From 9f01eb5d4936f12d57da84cdfbc2a3623e23a7eb Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 10 Dec 2019 11:16:39 +0530 Subject: nfs: Fix nfs_access_get_cached_rcu() sparse error This patch fixes the following sparse error: fs/nfs/dir.c:2353:14: error: incompatible types in comparison expression (different address spaces): fs/nfs/dir.c:2353:14: struct list_head [noderef] * fs/nfs/dir.c:2353:14: struct list_head * Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1320288ff9ec..55a29b0d52fc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2383,7 +2383,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; - lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || cred_fscmp(cred, cache->cred) != 0) -- cgit v1.2.3 From 82dd8419e225958f01708cda8a3fc6c3c5356228 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Dec 2019 11:38:57 -0800 Subject: rcu: Warn on for_each_leaf_node_cpu_mask() from non-leaf The for_each_leaf_node_cpu_mask() and for_each_leaf_node_possible_cpu() macros must be invoked only on leaf rcu_node structures. Failing to abide by this restriction can result in infinite loops on systems with more than 64 CPUs (or for more than 32 CPUs on 32-bit systems). This commit therefore adds WARN_ON_ONCE() calls to make misuse of these two macros easier to debug. Reported-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 05f936ed167a..f6ce173e35f6 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -325,7 +325,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ (cpu) <= rnp->grphi; \ (cpu) = cpumask_next((cpu), cpu_possible_mask)) @@ -335,7 +336,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #define rcu_find_next_bit(rnp, cpu, mask) \ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ - for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) -- cgit v1.2.3 From 24bb9eccf7ff335c16c2970ac7cd5c32a92821a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Dec 2019 19:55:50 -0800 Subject: rcu: Fix exp_funnel_lock()/rcu_exp_wait_wake() datarace The rcu_node structure's ->exp_seq_rq field is accessed locklessly, so updates must use WRITE_ONCE(). This commit therefore adds the needed WRITE_ONCE() invocation where it was missed. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dcbd75791f39..d7e04849c7ab 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -589,7 +589,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; + WRITE_ONCE(rnp->exp_seq_rq, s); spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ -- cgit v1.2.3 From 8a7e8f51714004112a0bbbf751f3dd0fcbbbc983 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jan 2020 16:48:05 -0800 Subject: rcu: Provide debug symbols and line numbers in KCSAN runs This commit adds "-g -fno-omit-frame-pointer" to ease interpretation of KCSAN output, but only for CONFIG_KCSAN=y kerrnels. Signed-off-by: Paul E. McKenney --- kernel/rcu/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 82d5fba48b2f..f91f2c2cf138 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,6 +3,10 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +ifeq ($(CONFIG_KCSAN),y) +KBUILD_CFLAGS += -g -fno-omit-frame-pointer +endif + obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o -- cgit v1.2.3 From 7672d647ddae37d2ea267159950bcc311e962434 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 11:38:51 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->qsmask update The rcu_node structure's ->qsmask field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..bb57c24dbe9a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1881,7 +1881,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); - rnp->qsmask &= ~mask; + WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask); trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, -- cgit v1.2.3 From b0c18c87730a4de6da0303fa99aea43e814233f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 12:12:06 -0800 Subject: rcu: Add WRITE_ONCE to rcu_node ->exp_seq_rq store The rcu_node structure's ->exp_seq_rq field is read locklessly, so this commit adds the WRITE_ONCE() to a load in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d7e04849c7ab..85b009e05637 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -314,7 +314,7 @@ static bool exp_funnel_lock(unsigned long s) sync_exp_work_done(s)); return true; } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ + WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. */ spin_unlock(&rnp->exp_lock); trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("nxtlvl")); -- cgit v1.2.3 From 0937d045732b5dd5e5df1fb6355d5f4e01f3c4d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 14:53:31 -0800 Subject: rcu: Add READ_ONCE() to rcu_node ->gp_seq The rcu_node structure's ->gp_seq field is read locklessly, so this commit adds the READ_ONCE() to several loads in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Not appropriate for backporting because this affects only tracing and warnings. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bb57c24dbe9a..236692d7762f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1126,8 +1126,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, - rnp->level, rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + gp_seq_req, rnp->level, + rnp->grplo, rnp->grphi, s); } /* @@ -1904,7 +1905,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); - oldmask = rnp_c->qsmask; + oldmask = READ_ONCE(rnp_c->qsmask); } /* @@ -2052,7 +2053,7 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; blkd = !!(rnp->qsmask & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; } -- cgit v1.2.3 From 2906d2154cd6ad6e4452cc25315d3cea7bb7f2d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:17:12 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_state ->gp_req_activity The rcu_state structure's ->gp_req_activity field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 236692d7762f..944390981aed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1200,7 +1200,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); if (!rcu_state.gp_kthread) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; @@ -1775,7 +1775,7 @@ static void rcu_gp_cleanup(void) rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); -- cgit v1.2.3 From 105abf82b0a62664edabcd4c5b4d9414c74ef399 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:44:23 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->qsmaskinitnext The rcu_state structure's ->qsmaskinitnext field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely for systems not doing incessant CPU-hotplug operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 944390981aed..346321aa3c90 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3379,7 +3379,7 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; oldmask ^= rnp->expmaskinitnext; @@ -3432,7 +3432,7 @@ void rcu_report_dead(unsigned int cpu) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } - rnp->qsmaskinitnext &= ~mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); -- cgit v1.2.3 From 0050c7b2d27c3cc126df59bd8094fb3d25b00ade Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:59:12 -0800 Subject: locking/rtmutex: rcu: Add WRITE_ONCE() to rt_mutex ->owner The rt_mutex structure's ->owner field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 851bbb10819d..c9f090d64f00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)val; + WRITE_ONCE(lock->owner, (struct task_struct *)val); } static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) -- cgit v1.2.3 From bfeebe24212d374f82bbf5b005371fe13acabb93 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:14:08 -0800 Subject: rcu: Add READ_ONCE() to rcu_segcblist ->tails[] The rcu_segcblist structure's ->tails[] array entries are read locklessly, so this commit adds the READ_ONCE() to a load in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5f4fd3b8777c..426a472e7308 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) { return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; + &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]); } /* -- cgit v1.2.3 From 8ff37290d6622e130553a38ec2662a728e46cdba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 4 Jan 2020 11:33:17 -0800 Subject: rcu: Add *_ONCE() for grace-period progress indicators The various RCU structures' ->gp_seq, ->gp_seq_needed, ->gp_req_activity, and ->gp_activity fields are read locklessly, so they must be updated with WRITE_ONCE() and, when read locklessly, with READ_ONCE(). This commit makes these changes. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_stall.h | 26 +++++++++++++++----------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 346321aa3c90..53946b169699 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1175,7 +1175,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, TPS("Prestarted")); goto unlock_out; } - rnp->gp_seq_needed = gp_seq_req; + WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req); if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf or internal node, and a @@ -1210,8 +1210,8 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { - rnp_start->gp_seq_needed = rnp->gp_seq_needed; - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed); + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); } if (rnp != rnp_start) raw_spin_unlock_rcu_node(rnp); @@ -1423,7 +1423,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -3276,12 +3276,12 @@ int rcutree_prepare_cpu(unsigned int cpu) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. */ - rdp->gp_seq = rnp->gp_seq; - rdp->gp_seq_needed = rnp->gp_seq; + rdp->gp_seq = READ_ONCE(rnp->gp_seq); + rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; + rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c6ea81cd4189..b5ba14864015 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -753,7 +753,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - (long)rnp->gp_seq, (long)rnp->completedqs); + (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 55f9b84790d3..43dc688c3785 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -592,21 +592,22 @@ void show_rcu_gp_kthreads(void) (long)READ_ONCE(rcu_get_root()->gp_seq_needed), READ_ONCE(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)rnp->gp_seq, - (long)rnp->gp_seq_needed); + rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), + (long)READ_ONCE(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) + ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)rdp->gp_seq_needed); + cpu, (long)READ_ONCE(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -631,7 +632,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, static atomic_t warned = ATOMIC_INIT(0); if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed))) return; j = jiffies; /* Expensive access, and in common case don't get here. */ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || @@ -642,7 +644,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) { @@ -655,9 +658,10 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_xchg(&warned, 1)) { if (rnp_root != rnp) /* irqs remain disabled. */ -- cgit v1.2.3 From 65bb0dc437c3e57a6cde2b81170c8af4b9c90735 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:08:02 +0100 Subject: rcu: Fix typos in file-header comments Convert to plural and add a note that this is for Tree RCU. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- kernel/rcu/tree.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 657e6a7d1c03..7ddb29cc7dba 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney + * Authors: Paul McKenney * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 53946b169699..a70f56bb56a7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update mechanism for mutual exclusion + * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma * Manfred Spraul - * Paul E. McKenney Hierarchical version + * Paul E. McKenney * * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. -- cgit v1.2.3 From a5b8950180f8e5acb802d1672e0b4d0ceee6126e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jan 2020 15:48:39 -0800 Subject: rcu: Add READ_ONCE() to rcu_data ->gpwrap The rcu_data structure's ->gpwrap field is read locklessly, and so this commit adds the required READ_ONCE() to a pair of laods in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_stall.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a70f56bb56a7..e851a12920e6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1322,7 +1322,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 43dc688c3785..bca637b274fb 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -602,7 +602,7 @@ void show_rcu_gp_kthreads(void) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || + if (READ_ONCE(rdp->gpwrap) || ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rdp->gp_seq_needed))) continue; -- cgit v1.2.3 From 2a2ae872ef7aa958f2152b8b24c6e94cf5f1d0df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2020 20:06:25 -0800 Subject: rcu: Add *_ONCE() to rcu_data ->rcu_forced_tick The rcu_data structure's ->rcu_forced_tick field is read locklessly, so this commit adds WRITE_ONCE() to all updates and READ_ONCE() to all lockless reads. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e851a12920e6..be59a5d7299d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -818,11 +818,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { raw_spin_lock_rcu_node(rdp->mynode); // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; + WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } raw_spin_unlock_rcu_node(rdp->mynode); @@ -899,7 +900,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - rdp->rcu_forced_tick = false; + WRITE_ONCE(rdp->rcu_forced_tick, false); } } -- cgit v1.2.3 From 3ca3b0e2cbe0050d1777a22b7fc13cad620eb2ba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2020 20:12:59 -0800 Subject: rcu: Add *_ONCE() to rcu_node ->boost_kthread_status The rcu_node structure's ->boost_kthread_status field is accessed locklessly, so this commit causes all updates to use WRITE_ONCE() and all reads to use READ_ONCE(). This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b5ba14864015..0f8b714f09f5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1032,18 +1032,18 @@ static int rcu_boost_kthread(void *arg) trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { - rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); - rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); if (more2boost) spincnt++; else spincnt = 0; if (spincnt > 10) { - rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); @@ -1082,7 +1082,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, - rnp->boost_kthread_status); + READ_ONCE(rnp->boost_kthread_status)); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit v1.2.3 From 90c018942c2babab73814648b37808fc3bf2ed1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 11:37:38 -0800 Subject: timer: Use hlist_unhashed_lockless() in timer_pending() The timer_pending() function is mostly used in lockless contexts, so Without proper annotations, KCSAN might detect a data-race [1]. Using hlist_unhashed_lockless() instead of hand-coding it seems appropriate (as suggested by Paul E. McKenney). [1] BUG: KCSAN: data-race in del_timer / detach_if_pending write to 0xffff88808697d870 of 8 bytes by task 10 on cpu 0: __hlist_del include/linux/list.h:764 [inline] detach_timer kernel/time/timer.c:815 [inline] detach_if_pending+0xcd/0x2d0 kernel/time/timer.c:832 try_to_del_timer_sync+0x60/0xb0 kernel/time/timer.c:1226 del_timer_sync+0x6b/0xa0 kernel/time/timer.c:1365 schedule_timeout+0x2d2/0x6e0 kernel/time/timer.c:1896 rcu_gp_fqs_loop+0x37c/0x580 kernel/rcu/tree.c:1639 rcu_gp_kthread+0x143/0x230 kernel/rcu/tree.c:1799 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff88808697d870 of 8 bytes by task 12060 on cpu 1: del_timer+0x3b/0xb0 kernel/time/timer.c:1198 sk_stop_timer+0x25/0x60 net/core/sock.c:2845 inet_csk_clear_xmit_timers+0x69/0xa0 net/ipv4/inet_connection_sock.c:523 tcp_clear_xmit_timers include/net/tcp.h:606 [inline] tcp_v4_destroy_sock+0xa3/0x3f0 net/ipv4/tcp_ipv4.c:2096 inet_csk_destroy_sock+0xf4/0x250 net/ipv4/inet_connection_sock.c:836 tcp_close+0x6f3/0x970 net/ipv4/tcp.c:2497 inet_release+0x86/0x100 net/ipv4/af_inet.c:427 __sock_release+0x85/0x160 net/socket.c:590 sock_close+0x24/0x30 net/socket.c:1268 __fput+0x1e1/0x520 fs/file_table.c:280 ____fput+0x1f/0x30 fs/file_table.c:313 task_work_run+0xf6/0x130 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x2b4/0x2c0 arch/x86/entry/common.c:163 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 12060 Comm: syz-executor.5 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, Signed-off-by: Eric Dumazet Cc: Thomas Gleixner [ paulmck: Pulled in Eric's later amendments. ] Signed-off-by: Paul E. McKenney --- include/linux/timer.h | 2 +- kernel/time/timer.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 1e6650ed066d..0dc19a8c39c9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -164,7 +164,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { } */ static inline int timer_pending(const struct timer_list * timer) { - return timer->entry.pprev != NULL; + return !hlist_unhashed_lockless(&timer->entry); } extern void add_timer_on(struct timer_list *timer, int cpu); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4820823515e9..568564ae3597 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -944,6 +944,7 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 +#define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) @@ -960,7 +961,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ - if (timer_pending(timer)) { + if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new @@ -1133,7 +1134,7 @@ EXPORT_SYMBOL(timer_reduce); void add_timer(struct timer_list *timer) { BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); @@ -1891,7 +1892,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_singleshot_timer_sync(&timer.timer); -- cgit v1.2.3 From 57721fd15a02f7df9dad1f3cca27f21e03ee118f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Jan 2020 19:17:02 -0800 Subject: rcu: Remove dead code from rcu_segcblist_insert_pend_cbs() The rcu_segcblist_insert_pend_cbs() function currently (partially) initializes the rcu_cblist that it pulls callbacks from. However, all the resulting stores are dead because all callers pass in the address of an on-stack cblist that is not used afterwards. This commit therefore removes this pointless initialization. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 426a472e7308..9a0f66133b4b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -381,8 +381,6 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. */ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); - rclp->head = NULL; - rclp->tail = &rclp->head; } /* -- cgit v1.2.3 From 59881bcd85a06565c53fd13ce3b0ad7fa55e560c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Jan 2020 15:29:04 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_state ->gp_start The rcu_state structure's ->gp_start field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index bca637b274fb..488b71d03030 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -102,7 +102,7 @@ static void record_gp_stall_check_time(void) unsigned long j = jiffies; unsigned long j1; - rcu_state.gp_start = j; + WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); /* Record ->gp_start before ->jiffies_stall. */ smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ -- cgit v1.2.3 From aa24f93753e256c4b14fe46f7261f150cff2a50c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Jan 2020 15:43:45 -0800 Subject: rcu: Fix rcu_barrier_callback() race condition The rcu_barrier_callback() function does an atomic_dec_and_test(), and if it is the last CPU to check in, does the required wakeup. Either way, it does an event trace. Unfortunately, this is susceptible to the following sequence of events: o CPU 0 invokes rcu_barrier_callback(), but atomic_dec_and_test() says that it is not last. But at this point, CPU 0 is delayed, perhaps due to an NMI, SMI, or vCPU preemption. o CPU 1 invokes rcu_barrier_callback(), and atomic_dec_and_test() says that it is last. So CPU 1 traces completion and does the needed wakeup. o The awakened rcu_barrier() function does cleanup and releases rcu_state.barrier_mutex. o Another CPU now acquires rcu_state.barrier_mutex and starts another round of rcu_barrier() processing, including updating rcu_state.barrier_sequence. o CPU 0 gets its act back together and does its tracing. Except that rcu_state.barrier_sequence has already been updated, so its tracing is incorrect and probably quite confusing. (Wait! Why did this CPU check in twice for one rcu_barrier() invocation???) This commit therefore causes rcu_barrier_callback() to take a snapshot of the value of rcu_state.barrier_sequence before invoking atomic_dec_and_test(), thus guaranteeing that the event-trace output is sensible, even if the timing of the event-trace output might still be confusing. (Wait! Why did the old rcu_barrier() complete before all of its CPUs checked in???) But being that this is RCU, only so much confusion can reasonably be eliminated. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to the mild consequences of the failure, namely a confusing event trace. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index be59a5d7299d..62383ce5161f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3077,15 +3077,22 @@ static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) /* * RCU callback function for rcu_barrier(). If we are last, wake * up the task executing rcu_barrier(). + * + * Note that the value of rcu_state.barrier_sequence must be captured + * before the atomic_dec_and_test(). Otherwise, if this CPU is not last, + * other CPUs might count the value down to zero before this CPU gets + * around to invoking rcu_barrier_trace(), which might result in bogus + * data from the next instance of rcu_barrier(). */ static void rcu_barrier_callback(struct rcu_head *rhp) { + unsigned long __maybe_unused s = rcu_state.barrier_sequence; + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { - rcu_barrier_trace(TPS("LastCB"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); } else { - rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("CB"), -1, s); } } -- cgit v1.2.3 From 4dfd5cd83dc4458049c7f6eb9c4f361acc4239ea Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Sat, 18 Jan 2020 22:24:18 +0530 Subject: rculist: Add brackets around cond argument in __list_check_rcu macro Passing a complex lockdep condition to __list_check_rcu results in false positive lockdep splat due to incorrect expression evaluation. For example, a lockdep check condition `cond1 || cond2` is evaluated as `!cond1 || cond2 && !rcu_read_lock_any_held()` which, according to operator precedence, evaluates to `!cond1 || (cond2 && !rcu_read_lock_any_held())`. This would result in a lockdep splat when cond1 is false and cond2 is true which is logically incorrect. Signed-off-by: Amol Grover Acked-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 9f313e4999fe..8214cdc715f2 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -60,9 +60,9 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ - RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(), \ + RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ - }) + }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) -- cgit v1.2.3 From 5648d6591230c811972543ff146ce969babdd732 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Jan 2020 12:30:22 -0800 Subject: rcu: Don't flag non-starting GPs before GP kthread is running Currently rcu_check_gp_start_stall() complains if a grace period takes too long to start, where "too long" is roughly one RCU CPU stall-warning interval. This has worked well, but there are some debugging Kconfig options (such as CONFIG_EFI_PGT_DUMP=y) that can make booting take a very long time, so much so that the stall-warning interval has expired before RCU's grace-period kthread has even been spawned. This commit therefore resets the rcu_state.gp_req_activity and rcu_state.gp_activity timestamps just before the grace-period kthread is spawned, and modifies the checks and adds ordering to ensure that if rcu_check_gp_start_stall() sees that the grace-period kthread has been spawned, that it will also see the resets applied to the rcu_state.gp_req_activity and rcu_state.gp_activity timestamps. Reported-by: Qian Cai Signed-off-by: Paul E. McKenney [ paulmck: Fix whitespace issues reported by Qian Cai. ] Tested-by: Qian Cai [ paulmck: Simplify grace-period wakeup check per Steve Rostedt feedback. ] --- kernel/rcu/tree.c | 28 ++++++++++++++++------------ kernel/rcu/tree_stall.h | 7 ++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62383ce5161f..4a4a975503e5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1202,7 +1202,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - if (!rcu_state.gp_kthread) { + if (!READ_ONCE(rcu_state.gp_kthread)) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } @@ -1237,12 +1237,13 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken (unless in - * an interrupt or softirq handler), and don't bother awakening when there - * is nothing for the grace-period kthread to do (as in several CPUs raced - * to awaken, and we lost), and finally don't try to awaken a kthread that - * has not yet been created. If all those checks are passed, track some - * debug information and awaken. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in an + * interrupt or softirq handler, in which case we just might immediately + * sleep upon return, resulting in a grace-period hang), and don't bother + * awakening when there is nothing for the grace-period kthread to do + * (as in several CPUs raced to awaken, we lost), and finally don't try + * to awaken a kthread that has not yet been created. If all those checks + * are passed, track some debug information and awaken. * * So why do the self-wakeup when in an interrupt or softirq handler * in the grace-period kthread's context? Because the kthread might have @@ -1252,10 +1253,10 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) */ static void rcu_gp_kthread_wake(void) { - if ((current == rcu_state.gp_kthread && - !in_irq() && !in_serving_softirq()) || - !READ_ONCE(rcu_state.gp_flags) || - !rcu_state.gp_kthread) + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); + + if ((current == t && !in_irq() && !in_serving_softirq()) || + !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); @@ -3554,7 +3555,10 @@ static int __init rcu_spawn_gp_kthread(void) } rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); - rcu_state.gp_kthread = t; + WRITE_ONCE(rcu_state.gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + // Reset .gp_activity and .gp_req_activity before setting .gp_kthread. + smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 488b71d03030..16ad7ad9a185 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -578,6 +578,7 @@ void show_rcu_gp_kthreads(void) unsigned long jw; struct rcu_data *rdp; struct rcu_node *rnp; + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; ja = j - READ_ONCE(rcu_state.gp_activity); @@ -585,8 +586,7 @@ void show_rcu_gp_kthreads(void) jw = j - READ_ONCE(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + rcu_state.gp_state, t ? t->state : 0x1ffffL, ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), (long)READ_ONCE(rcu_state.gp_seq), (long)READ_ONCE(rcu_get_root()->gp_seq_needed), @@ -633,7 +633,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), - READ_ONCE(rnp_root->gp_seq_needed))) + READ_ONCE(rnp_root->gp_seq_needed)) || + !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread. return; j = jiffies; /* Expensive access, and in common case don't get here. */ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || -- cgit v1.2.3 From 9ced454807191e44ef093aeeee68194be9ce3a1a Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 20 Jan 2020 22:42:15 +0000 Subject: rcu: Add missing annotation for rcu_nocb_bypass_lock() Sparse reports warning at rcu_nocb_bypass_lock() |warning: context imbalance in rcu_nocb_bypass_lock() - wrong count at exit To fix this, this commit adds an __acquires(&rdp->nocb_bypass_lock). Given that rcu_nocb_bypass_lock() does actually call raw_spin_lock() when raw_spin_trylock() fails, this not only fixes the warning but also improves on the readability of the code. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0f8b714f09f5..6db2cad7dab7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1486,6 +1486,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); * flag the contention. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) { lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) -- cgit v1.2.3 From 92c0b889f2ff6898710d49458b6eae1de50895c6 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Thu, 30 Jan 2020 00:30:09 +0000 Subject: rcu/nocb: Add missing annotation for rcu_nocb_bypass_unlock() Sparse reports warning at rcu_nocb_bypass_unlock() warning: context imbalance in rcu_nocb_bypass_unlock() - unexpected unlock The root cause is a missing annotation of rcu_nocb_bypass_unlock() which causes the warning. This commit therefore adds the missing __releases(&rdp->nocb_bypass_lock) annotation. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney Acked-by: Boqun Feng --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6db2cad7dab7..384651915d74 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1530,6 +1530,7 @@ static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) * Release the specified rcu_data structure's ->nocb_bypass_lock. */ static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) { lockdep_assert_irqs_disabled(); raw_spin_unlock(&rdp->nocb_bypass_lock); -- cgit v1.2.3 From faa059c397dec8a452c79e9dba64419113ea64e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2020 14:20:00 -0800 Subject: rcu: Optimize and protect atomic_cmpxchg() loop This commit reworks the atomic_cmpxchg() loop in rcu_eqs_special_set() to do only the initial read from the current CPU's rcu_data structure's ->dynticks field explicitly. On subsequent passes, this value is instead retained from the failing atomic_cmpxchg() operation. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a4a975503e5..6c624814ee2d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -342,14 +342,17 @@ bool rcu_eqs_special_set(int cpu) { int old; int new; + int new_old; struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + new_old = atomic_read(&rdp->dynticks); do { - old = atomic_read(&rdp->dynticks); + old = new_old; if (old & RCU_DYNTICK_CTRL_CTR) return false; new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); + new_old = atomic_cmpxchg(&rdp->dynticks, old, new); + } while (new_old != old); return true; } -- cgit v1.2.3 From 13817dd589f426aee9c36e3670e7cb2a7f067d23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2020 08:56:41 -0800 Subject: rcu: Tighten rcu_lockdep_assert_cblist_protected() check The ->nocb_lock lockdep assertion is currently guarded by cpu_online(), which is incorrect for no-CBs CPUs, whose callback lists must be protected by ->nocb_lock regardless of whether or not the corresponding CPU is online. This situation could result in failure to detect bugs resulting from failing to hold ->nocb_lock for offline CPUs. This commit therefore removes the cpu_online() guard. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 384651915d74..70b3c0f4ea37 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1579,8 +1579,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist) && - cpu_online(rdp->cpu)) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) lockdep_assert_held(&rdp->nocb_lock); } -- cgit v1.2.3 From 3d05031ae6de6ad084aa41263aed1343065233d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2020 14:55:29 -0800 Subject: rcu: Make nocb_gp_wait() double-check unexpected-callback warning Currently, nocb_gp_wait() unconditionally complains if there is a callback not already associated with a grace period. This assumes that either there was no such callback initially on the one hand, or that the rcu_advance_cbs() function assigned all such callbacks to a grace period on the other. However, in theory there are some situations that would prevent rcu_advance_cbs() from assigning all of the callbacks. This commit therefore checks for unassociated callbacks immediately after rcu_advance_cbs() returns, while the corresponding rcu_node structure's ->lock is still held. If there are unassociated callbacks at that point, the subsequent WARN_ON_ONCE() is disabled. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 70b3c0f4ea37..36e71c99970a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1931,6 +1931,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) struct rcu_data *rdp; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. + bool wasempty = false; /* * Each pass through the following loop checks for CBs and for the @@ -1970,10 +1971,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ needwake_gp = rcu_advance_cbs(rnp, rdp); + wasempty = rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL); raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ } // Need to wait on some grace period? - WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, + WARN_ON_ONCE(wasempty && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)); if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { if (!needwait_gp || -- cgit v1.2.3 From 34c881745549e78f31ec65f319457c82aacc53bd Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 20 Jan 2020 15:42:25 +0100 Subject: rcu: Support kfree_bulk() interface in kfree_rcu() The kfree_rcu() logic can be improved further by using kfree_bulk() interface along with "basic batching support" introduced earlier. The are at least two advantages of using "bulk" interface: - in case of large number of kfree_rcu() requests kfree_bulk() reduces the per-object overhead caused by calling kfree() per-object. - reduces the number of cache-misses due to "pointer chasing" between objects which can be far spread between each other. This approach defines a new kfree_rcu_bulk_data structure that stores pointers in an array with a specific size. Number of entries in that array depends on PAGE_SIZE making kfree_rcu_bulk_data structure to be exactly one page. Since it deals with "block-chain" technique there is an extra need in dynamic allocation when a new block is required. Memory is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that allows to skip direct reclaim under low memory condition to prevent stalling and fails silently under high memory pressure. The "emergency path" gets maintained when a system is run out of memory. In that case objects are linked into regular list. The "rcuperf" was run to analyze this change in terms of memory consumption and kfree_bulk() throughput. 1) Testing on the Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz, 12xCPUs with following parameters: kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1 kfree_vary_obj_size=1 dev.2020.01.10a branch Default / CONFIG_SLAB 53607352517 ns, loops: 200000, batches: 1885, memory footprint: 1248MB 53529637912 ns, loops: 200000, batches: 1921, memory footprint: 1193MB 53570175705 ns, loops: 200000, batches: 1929, memory footprint: 1250MB Patch / CONFIG_SLAB 23981587315 ns, loops: 200000, batches: 810, memory footprint: 1219MB 23879375281 ns, loops: 200000, batches: 822, memory footprint: 1190MB 24086841707 ns, loops: 200000, batches: 794, memory footprint: 1380MB Default / CONFIG_SLUB 51291025022 ns, loops: 200000, batches: 1713, memory footprint: 741MB 51278911477 ns, loops: 200000, batches: 1671, memory footprint: 719MB 51256183045 ns, loops: 200000, batches: 1719, memory footprint: 647MB Patch / CONFIG_SLUB 50709919132 ns, loops: 200000, batches: 1618, memory footprint: 456MB 50736297452 ns, loops: 200000, batches: 1633, memory footprint: 507MB 50660403893 ns, loops: 200000, batches: 1628, memory footprint: 429MB in case of CONFIG_SLAB there is double increase in performance and slightly higher memory usage. As for CONFIG_SLUB, the performance figures are better together with lower memory usage. 2) Testing on the HiKey-960, arm64, 8xCPUs with below parameters: CONFIG_SLAB=y kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB 89947009882 ns, loops: 200000, batches: 6715, memory footprint: 115MB rcuperf shows approximately ~12% better throughput in case of using "bulk" interface. The "drain logic" or its RCU callback does the work faster that leads to better throughput. Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 204 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 169 insertions(+), 35 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..51a3aa884a7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2689,22 +2689,47 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) + +/** + * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * @nr_records: Number of active pointers in the array + * @records: Array of the kfree_rcu() pointers + * @next: Next bulk object in the block chain + * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set + */ +struct kfree_rcu_bulk_data { + unsigned long nr_records; + void *records[KFREE_BULK_MAX_ENTR]; + struct kfree_rcu_bulk_data *next; + struct rcu_head *head_free_debug; +}; + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct kfree_rcu_bulk_data *bhead_free; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -2718,6 +2743,8 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; + struct kfree_rcu_bulk_data *bhead; + struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; @@ -2727,14 +2754,24 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static __always_inline void +debug_rcu_head_unqueue_bulk(struct rcu_head *head) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + for (; head; head = head->next) + debug_rcu_head_unqueue(head); +#endif +} + /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->head_free. + * It frees all the objects queued on ->bhead_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; struct rcu_head *head, *next; + struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; @@ -2744,22 +2781,41 @@ static void kfree_rcu_work(struct work_struct *work) spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; + bhead = krwp->bhead_free; + krwp->bhead_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); - // List "head" is now private, so traverse locklessly. + /* "bhead" is now private, so traverse locklessly. */ + for (; bhead; bhead = bnext) { + bnext = bhead->next; + + debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + + rcu_lock_acquire(&rcu_callback_map); + kfree_bulk(bhead->nr_records, bhead->records); + rcu_lock_release(&rcu_callback_map); + + if (cmpxchg(&krcp->bcached, NULL, bhead)) + free_page((unsigned long) bhead); + + cond_resched_tasks_rcu_qs(); + } + + /* + * Emergency case only. It can happen under low memory + * condition when an allocation gets failed, so the "bulk" + * path can not be temporary maintained. + */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; next = head->next; - // Potentially optimize with kfree_bulk in future. debug_rcu_head_unqueue(head); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { - /* Could be optimized with kfree_bulk() in future. */ + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) kfree((void *)head - offset); - } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2774,26 +2830,48 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + struct kfree_rcu_cpu_work *krwp; + bool queued = false; int i; - struct kfree_rcu_cpu_work *krwp = NULL; lockdep_assert_held(&krcp->lock); - for (i = 0; i < KFREE_N_BATCHES; i++) - if (!krcp->krw_arr[i].head_free) { - krwp = &(krcp->krw_arr[i]); - break; - } - // If a previous RCU batch is in progress, we cannot immediately - // queue another one, so return false to tell caller to retry. - if (!krwp) - return false; + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); - krwp->head_free = krcp->head; - krcp->head = NULL; - INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krwp->rcu_work); - return true; + /* + * Try to detach bhead or head and attach it over any + * available corresponding free channel. It can be that + * a previous RCU batch is in progress, it means that + * immediately to queue another one is not possible so + * return false to tell caller to retry. + */ + if ((krcp->bhead && !krwp->bhead_free) || + (krcp->head && !krwp->head_free)) { + /* Channel 1. */ + if (!krwp->bhead_free) { + krwp->bhead_free = krcp->bhead; + krcp->bhead = NULL; + } + + /* Channel 2. */ + if (!krwp->head_free) { + krwp->head_free = krcp->head; + krcp->head = NULL; + } + + /* + * One work is per one batch, so there are two "free channels", + * "bhead_free" and "head_free" the batch can handle. It can be + * that the work is in the pending state when two channels have + * been detached following each other, one by one. + */ + queue_rcu_work(system_wq, &krwp->rcu_work); + queued = true; + } + } + + return queued; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, @@ -2830,19 +2908,65 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } +static inline bool +kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, + struct rcu_head *head, rcu_callback_t func) +{ + struct kfree_rcu_bulk_data *bnode; + + if (unlikely(!krcp->initialized)) + return false; + + lockdep_assert_held(&krcp->lock); + + /* Check if a new block is required. */ + if (!krcp->bhead || + krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + bnode = xchg(&krcp->bcached, NULL); + if (!bnode) { + WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + } + + /* Switch to emergency path. */ + if (unlikely(!bnode)) + return false; + + /* Initialize the new block. */ + bnode->nr_records = 0; + bnode->next = krcp->bhead; + bnode->head_free_debug = NULL; + + /* Attach it to the head. */ + krcp->bhead = bnode; + } + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + head->func = func; + head->next = krcp->bhead->head_free_debug; + krcp->bhead->head_free_debug = head; +#endif + + /* Finally insert. */ + krcp->bhead->records[krcp->bhead->nr_records++] = + (void *) head - (unsigned long) func; + + return true; +} + /* - * Queue a request for lazy invocation of kfree() after a grace period. + * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace + * period. Please note there are two paths are maintained, one is the main one + * that uses kfree_bulk() interface and second one is emergency one, that is + * used only when the main path can not be maintained temporary, due to memory + * pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch - * will be kfree'd in workqueue context. This allows us to: - * - * 1. Batch requests together to reduce the number of grace periods during - * heavy kfree_rcu() load. - * - * 2. It makes it possible to use kfree_bulk() on a large number of - * kfree_rcu() requests thus reducing cache misses and the per-object - * overhead of kfree(). + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will + * be free'd in workqueue context. This allows us to: batch requests together to + * reduce the number of grace periods during heavy kfree_rcu() load. */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -2861,9 +2985,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) __func__, head); goto unlock_return; } - head->func = func; - head->next = krcp->head; - krcp->head = head; + + /* + * Under high memory pressure GFP_NOWAIT can fail, + * in that case the emergency path is maintained. + */ + if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + head->func = func; + head->next = krcp->head; + krcp->head = head; + } // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && @@ -3769,8 +3900,11 @@ static void __init kfree_rcu_batch_init(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); - for (i = 0; i < KFREE_N_BATCHES; i++) + for (i = 0; i < KFREE_N_BATCHES; i++) { + INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.3 From 613707929b304737e6eb841588772f1994f6702b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 20 Jan 2020 15:42:26 +0100 Subject: rcu: Add a trace event for kfree_rcu() use of kfree_bulk() The event is given three parameters, first one is the name of RCU flavour, second one is the number of elements in array for free and last one is an address of the array holding pointers to be freed by the kfree_bulk() function. To enable the trace event your kernel has to be build with CONFIG_RCU_TRACE=y, after that it is possible to track the events using ftrace subsystem. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 28 ++++++++++++++++++++++++++++ kernel/rcu/tree.c | 3 +++ 2 files changed, 31 insertions(+) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5e49b06e8104..49a49e68b916 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -623,6 +623,34 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_callback, __entry->rcuname, __entry->rhp, __entry->offset) ); +/* + * Tracepoint for the invocation of a single RCU callback of the special + * kfree_bulk() form. The first argument is the RCU flavor, the second + * argument is a number of elements in array to free, the third is an + * address of the array holding nr_records entries. + */ +TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback, + + TP_PROTO(const char *rcuname, unsigned long nr_records, void **p), + + TP_ARGS(rcuname, nr_records, p), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, nr_records) + __field(void **, p) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->nr_records = nr_records; + __entry->p = p; + ), + + TP_printk("%s bulk=0x%p nr_records=%lu", + __entry->rcuname, __entry->p, __entry->nr_records) +); + /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been * invoked. The first argument is the name of the RCU flavor, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51a3aa884a7c..909f97efb1ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2792,6 +2792,9 @@ static void kfree_rcu_work(struct work_struct *work) debug_rcu_head_unqueue_bulk(bhead->head_free_debug); rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, + bhead->nr_records, bhead->records); + kfree_bulk(bhead->nr_records, bhead->records); rcu_lock_release(&rcu_callback_map); -- cgit v1.2.3 From 80c503e0e68fbe271680ab48f0fe29bc034b01b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Jan 2020 09:19:01 -0800 Subject: locktorture: Print ratio of acquisitions, not failures The __torture_print_stats() function in locktorture.c carefully initializes local variable "min" to statp[0].n_lock_acquired, but then compares it to statp[i].n_lock_fail. Given that the .n_lock_fail field should normally be zero, and given the initialization, it seems reasonable to display the maximum and minimum number acquisitions instead of miscomputing the maximum and minimum number of failures. This commit therefore switches from failures to acquisitions. And this turns out to be not only a day-zero bug, but entirely my own fault. I hate it when that happens! Fixes: 0af3fe1efa53 ("locktorture: Add a lock-torture kernel module") Reported-by: Will Deacon Signed-off-by: Paul E. McKenney Acked-by: Will Deacon Cc: Davidlohr Bueso Cc: Josh Triplett Cc: Peter Zijlstra --- kernel/locking/locktorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 99475a66c94f..687c1d83dc20 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -696,10 +696,10 @@ static void __torture_print_stats(char *page, if (statp[i].n_lock_fail) fail = true; sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_fail) - max = statp[i].n_lock_fail; - if (min > statp[i].n_lock_fail) - min = statp[i].n_lock_fail; + if (max < statp[i].n_lock_acquired) + max = statp[i].n_lock_acquired; + if (min > statp[i].n_lock_acquired) + min = statp[i].n_lock_acquired; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", -- cgit v1.2.3 From 7aabb6f839622bc96a425d93f3f7373167be1e19 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Jan 2020 12:32:31 -0800 Subject: locktorture: Allow CPU-hotplug to be disabled via --bootargs The bootparam_hotplug_cpu() bash function was checking for CPU-hotplug kernel-boot parameters from --bootargs, but that check was specific to rcutorture ("rcutorture\.onoff_"). This commit therefore makes this check also work for locktorture ("torture\.onoff_"). Note that rcuperf does not do CPU-hotplug operations, so it is not necessary to make a similar change for rcuperf. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index c3a49fb4d6f6..12810229fddc 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -12,7 +12,7 @@ # Returns 1 if the specified boot-parameter string tells rcutorture to # test CPU-hotplug operations. bootparam_hotplug_cpu () { - echo "$1" | grep -q "rcutorture\.onoff_" + echo "$1" | grep -q "torture\.onoff_" } # checkarg --argname argtype $# arg mustmatch cannotmatch -- cgit v1.2.3 From c0e1472d80784206ead1dd803dd4bc10e282b17f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Jan 2020 12:58:15 -0800 Subject: locktorture: Use private random-number generators Both lock_torture_writer() and lock_torture_reader() use the "static" keyword on their DEFINE_TORTURE_RANDOM(rand) declarations, which means that a single instance of a random-number generator are shared among all the writers and another is shared among all the readers. Unfortunately, this random-number generator was not designed for concurrent access. This commit therefore removes both "static" keywords so that each reader and each writer gets its own random-number generator. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 687c1d83dc20..5baf904e8f39 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -618,7 +618,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { static int lock_torture_writer(void *arg) { struct lock_stress_stats *lwsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); set_user_nice(current, MAX_NICE); @@ -655,7 +655,7 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); set_user_nice(current, MAX_NICE); -- cgit v1.2.3 From 28e09a2e48486ce8ff0a72e21570d59b1243b308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2020 20:37:04 -0800 Subject: locktorture: Forgive apparent unfairness if CPU hotplug If CPU hotplug testing is enabled, a lock might appear to be maximally unfair just because one of the CPUs was offline almost all the time. This commit therefore forgives unfairness if CPU hotplug testing was enabled. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5baf904e8f39..5efbfc68ce99 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -704,7 +704,8 @@ static void __torture_print_stats(char *page, page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", write ? "Writes" : "Reads ", - sum, max, min, max / 2 > min ? "???" : "", + sum, max, min, + !onoff_interval && max / 2 > min ? "???" : "", fail, fail ? "!!!" : ""); if (fail) atomic_inc(&cxt.n_lock_torture_errors); -- cgit v1.2.3 From b5ea03709d12e98fa341aecfa6940cc9f49e8817 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Dec 2019 15:19:45 -0800 Subject: rcu: Clear ->core_needs_qs at GP end or self-reported QS The rcu_data structure's ->core_needs_qs field does not necessarily get cleared in a timely fashion after the corresponding CPUs' quiescent state has been reported. From a functional viewpoint, no harm done, but this can result in excessive invocation of RCU core processing, as witnessed by the kernel test robot, which saw greatly increased softirq overhead. This commit therefore restores the rcu_report_qs_rdp() function's clearing of this field, but only when running on the corresponding CPU. Cases where some other CPU reports the quiescent state (for example, on behalf of an idle CPU) are handled by setting this field appropriately within the __note_gp_changes() function's end-of-grace-period checks. This handling is carried out regardless of whether the end of a grace period actually happened, thus handling the case where a CPU goes non-idle after a quiescent state is reported on its behalf, but before the grace period ends. This fix also avoids cross-CPU updates to ->core_needs_qs, While in the area, this commit changes the __note_gp_changes() need_gp variable's name to need_qs because it is a quiescent state that is needed from the CPU in question. Fixes: ed93dfc6bc00 ("rcu: Confine ->core_needs_qs accesses to the corresponding CPU") Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..31d01f80a1f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1386,7 +1386,7 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; - bool need_gp; + bool need_qs; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -1400,10 +1400,13 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) unlikely(READ_ONCE(rdp->gpwrap))) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ + rdp->core_needs_qs = false; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { if (!offloaded) ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ + if (rdp->core_needs_qs) + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1415,9 +1418,9 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) * go looking for one. */ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); - need_gp = !!(rnp->qsmask & rdp->grpmask); - rdp->cpu_no_qs.b.norm = need_gp; - rdp->core_needs_qs = need_gp; + need_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_qs; + rdp->core_needs_qs = need_qs; zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ @@ -1987,6 +1990,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + if (rdp->cpu == smp_processor_id()) + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { -- cgit v1.2.3 From b2b00ddf193bf83dc561d965c67b18eb54ebcd83 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 11:56:10 -0700 Subject: rcu: React to callback overload by aggressively seeking quiescent states In default configutions, RCU currently waits at least 100 milliseconds before asking cond_resched() and/or resched_rcu() for help seeking quiescent states to end a grace period. But 100 milliseconds can be one good long time during an RCU callback flood, for example, as can happen when user processes repeatedly open and close files in a tight loop. These 100-millisecond gaps in successive grace periods during a callback flood can result in excessive numbers of callbacks piling up, unnecessarily increasing memory footprint. This commit therefore asks cond_resched() and/or resched_rcu() for help as early as the first FQS scan when at least one of the CPUs has more than 20,000 callbacks queued, a number that can be changed using the new rcutree.qovld kernel boot parameter. An auxiliary qovld_calc variable is used to avoid acquisition of locks that have not yet been initialized. Early tests indicate that this reduces the RCU-callback memory footprint during rcutorture floods by from 50% to 4x, depending on configuration. Reported-by: Joel Fernandes (Google) Reported-by: Tejun Heo [ paulmck: Fix bug located by Qian Cai. ] Signed-off-by: Paul E. McKenney Tested-by: Dexuan Cui Tested-by: Qian Cai --- Documentation/admin-guide/kernel-parameters.txt | 9 +++ kernel/rcu/tree.c | 75 +++++++++++++++++++++++-- kernel/rcu/tree.h | 4 ++ kernel/rcu/tree_plugin.h | 2 + 4 files changed, 86 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dbc22d684627..dbd4b4a65209 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3980,6 +3980,15 @@ Set threshold of queued RCU callbacks below which batch limiting is re-enabled. + rcutree.qovld= [KNL] + Set threshold of queued RCU callbacks beyond which + RCU's force-quiescent-state scan will aggressively + enlist help from cond_resched() and sched IPIs to + help CPUs more quickly reach quiescent states. + Set to less than zero to make this be set based + on rcutree.qhimark at boot time and to zero to + disable more aggressive help enlistment. + rcutree.rcu_idle_gp_delay= [KNL] Set wakeup interval for idle CPUs that have RCU callbacks (RCU_FAST_NO_HZ=y). diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31d01f80a1f6..48fba2257748 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; @@ -410,10 +411,15 @@ static long blimit = DEFAULT_RCU_BLIMIT; static long qhimark = DEFAULT_RCU_QHIMARK; #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ static long qlowmark = DEFAULT_RCU_QLOMARK; +#define DEFAULT_RCU_QOVLD_MULT 2 +#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) +static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ +static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); +module_param(qovld, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; @@ -1072,7 +1078,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || - time_after(jiffies, rcu_state.jiffies_resched))) { + time_after(jiffies, rcu_state.jiffies_resched) || + rcu_state.cbovld)) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); @@ -1089,8 +1096,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * So hit them over the head with the resched_cpu() hammer! */ if (tick_nohz_full_cpu(rdp->cpu) && - time_after(jiffies, - READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || + rcu_state.cbovld)) { WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); @@ -1704,8 +1711,9 @@ static void rcu_gp_fqs_loop(void) */ static void rcu_gp_cleanup(void) { - unsigned long gp_duration; + int cpu; bool needgp = false; + unsigned long gp_duration; unsigned long new_gp_seq; bool offloaded; struct rcu_data *rdp; @@ -1751,6 +1759,12 @@ static void rcu_gp_cleanup(void) needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ needgp = rcu_future_gp_cleanup(rnp) || needgp; + // Reset overload indication for CPUs no longer overloaded + if (rcu_is_leaf_node(rnp)) + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + check_cb_ovld_locked(rdp, rnp); + } sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -2299,10 +2313,13 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) struct rcu_data *rdp; struct rcu_node *rnp; + rcu_state.cbovld = rcu_state.cbovldnext; + rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { @@ -2583,6 +2600,48 @@ static void rcu_leak_callback(struct rcu_head *rhp) { } +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. The caller must hold the leaf rcu_node + * structure's ->lock. + */ +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) +{ + raw_lockdep_assert_held_rcu_node(rnp); + if (qovld_calc <= 0) + return; // Early boot and wildcard value set. + if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask); + else + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask); +} + +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. No locks need be held, but the + * caller must have disabled interrupts. + * + * Note that this function ignores the possibility that there are a lot + * of callbacks all of which have already seen the end of their respective + * grace periods. This omission is due to the need for no-CBs CPUs to + * be holding ->nocb_lock to do this check, which is too heavy for a + * common-case operation. + */ +static void check_cb_ovld(struct rcu_data *rdp) +{ + struct rcu_node *const rnp = rdp->mynode; + + if (qovld_calc <= 0 || + ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) == + !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask))) + return; // Early boot wildcard value or already set correctly. + raw_spin_lock_rcu_node(rnp); + check_cb_ovld_locked(rdp, rnp); + raw_spin_unlock_rcu_node(rnp); +} + /* * Helper function for call_rcu() and friends. The cpu argument will * normally be -1, indicating "currently running CPU". It may specify @@ -2626,6 +2685,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) rcu_segcblist_init(&rdp->cblist); } + check_cb_ovld(rdp); if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ @@ -3814,6 +3874,13 @@ void __init rcu_init(void) rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); srcu_init(); + + /* Fill in default value for rcutree.qovld boot parameter. */ + /* -After- the rcu_node ->lock fields are initialized! */ + if (qovld < 0) + qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; + else + qovld_calc = qovld; } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0c87e4c161c2..9dc2ec021da5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,8 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + unsigned long cbovldmask; + /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ @@ -321,6 +323,8 @@ struct rcu_state { atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ + u8 cbovld; /* Callback overload now? */ + u8 cbovldnext; /* ^ ^ next time? */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c6ea81cd4189..0be8fad08daa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -56,6 +56,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); if (qlowmark != DEFAULT_RCU_QLOMARK) pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (qovld != DEFAULT_RCU_QOVLD) + pr_info("\tBoot-time adjustment of callback overload leval to %ld.\n", qovld); if (jiffies_till_first_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) -- cgit v1.2.3 From 8c14263d351b4766a040346ee917b8d81583a460 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Nov 2019 01:10:55 -0800 Subject: rcu: React to callback overload by boosting RCU readers RCU priority boosting currently is not applied until the grace period is at least 250 milliseconds old (or the number of milliseconds specified by the CONFIG_RCU_BOOST_DELAY Kconfig option). Although this has worked well, it can result in OOM under conditions of RCU callback flooding. One can argue that the real-time systems using RCU priority boosting should carefully avoid RCU callback flooding, but one can just as well argue that an OOM is a rather obnoxious error message. This commit therefore disables the RCU priority boosting delay when there are excessive numbers of callbacks queued. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0be8fad08daa..4d4637c361b7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1079,7 +1079,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - ULONG_CMP_GE(jiffies, rnp->boost_time))) { + (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.3 From aa96a93ba2bbad5e1b24706daffa14c4f1c50d2a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 12 Dec 2019 17:36:43 +0000 Subject: rcu: Fix spelling mistake "leval" -> "level" This commit fixes a spelling mistake in a pr_info() message. Signed-off-by: Colin Ian King Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4d4637c361b7..0765784012f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) if (qlowmark != DEFAULT_RCU_QLOMARK) pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); if (qovld != DEFAULT_RCU_QOVLD) - pr_info("\tBoot-time adjustment of callback overload leval to %ld.\n", qovld); + pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld); if (jiffies_till_first_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) -- cgit v1.2.3 From b692dc4adfcff5430467bec8efed3c07d1772eaf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Feb 2020 07:29:02 -0800 Subject: rcu: Update __call_rcu() comments The __call_rcu() function's header comment refers to a cpu argument that no longer exists, and the comment of the return path from rcu_nocb_try_bypass() ignores the non-no-CBs CPU case. This commit therefore update both comments. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48fba2257748..0a9de9fd43ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2642,12 +2642,7 @@ static void check_cb_ovld(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); } -/* - * Helper function for call_rcu() and friends. The cpu argument will - * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier() - * is expected to specify a CPU. - */ +/* Helper function for call_rcu() and friends. */ static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -2688,7 +2683,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) check_cb_ovld(rdp); if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. - /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ + // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, -- cgit v1.2.3 From fcb7381265e6cceb1d54283878d145db52b9d9d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Jan 2020 11:59:58 -0800 Subject: rcu-tasks: *_ONCE() for rcu_tasks_cbs_head The RCU tasks list of callbacks, rcu_tasks_cbs_head, is sampled locklessly by rcu_tasks_kthread() when waiting for work to do. This commit therefore applies READ_ONCE() to that lockless sampling and WRITE_ONCE() to the single potential store outside of rcu_tasks_kthread. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6c4b862f57d6..a27df761b149 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,7 +528,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rhp->func = func; raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); needwake = !rcu_tasks_cbs_head; - *rcu_tasks_cbs_tail = rhp; + WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ @@ -658,7 +658,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* If there were none, wait a bit and start over. */ if (!list) { wait_event_interruptible(rcu_tasks_cbs_wq, - rcu_tasks_cbs_head); + READ_ONCE(rcu_tasks_cbs_head)); if (!rcu_tasks_cbs_head) { WARN_ON(signal_pending(current)); schedule_timeout_interruptible(HZ/10); -- cgit v1.2.3 From e1e9bdc00ade6651c11bf5628ee9d313ee6089ac Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 20 Jan 2020 22:41:26 +0000 Subject: rcu: Add missing annotation for exit_tasks_rcu_start() Sparse reports a warning at exit_tasks_rcu_start(void) |warning: context imbalance in exit_tasks_rcu_start() - wrong count at exit To fix this, this commit adds an __acquires(&tasks_rcu_exit_srcu). Given that exit_tasks_rcu_start() does actually call __srcu_read_lock(), this not only fixes the warning but also improves on the readability of the code. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a27df761b149..a04fe54bc4ab 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -801,7 +801,7 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) { preempt_disable(); current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); -- cgit v1.2.3 From 90ba11ba99e0a4cc75302335d10a225b27a44918 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 20 Jan 2020 22:41:54 +0000 Subject: rcu: Add missing annotation for exit_tasks_rcu_finish() Sparse reports a warning at exit_tasks_rcu_finish(void) |warning: context imbalance in exit_tasks_rcu_finish() |- wrong count at exit To fix this, this commit adds a __releases(&tasks_rcu_exit_srcu). Given that exit_tasks_rcu_finish() does actually call __srcu_read_lock(), this not only fixes the warning but also improves on the readability of the code. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a04fe54bc4ab..ede656c5e1e9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -809,7 +809,7 @@ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) } /* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { preempt_disable(); __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); -- cgit v1.2.3 From 7ff8b4502bc0f576450d4ecbda97fa30e8002ed1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Dec 2019 19:32:54 -0800 Subject: srcu: Fix __call_srcu()/process_srcu() datarace The srcu_node structure's ->srcu_gp_seq_needed_exp field is accessed locklessly, so updates must use WRITE_ONCE(). This commit therefore adds the needed WRITE_ONCE() invocations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 657e6a7d1c03..b1edac93e403 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - snp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); @@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - snp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } -- cgit v1.2.3 From 8c9e0cb32315835825ea1ad725a858a2d2ce4a8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Dec 2019 19:36:33 -0800 Subject: srcu: Fix __call_srcu()/srcu_get_delay() datarace The srcu_struct structure's ->srcu_gp_seq_needed_exp field is accessed locklessly, so updates must use WRITE_ONCE(). This commit therefore adds the needed WRITE_ONCE() invocations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b1edac93e403..79848f7d575d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - ssp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp); mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); } @@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && -- cgit v1.2.3 From 39f91504a03a7a2abdb52205106942fa4a548d0d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Dec 2019 19:39:35 -0800 Subject: srcu: Fix process_srcu()/srcu_batches_completed() datarace The srcu_struct structure's ->srcu_idx field is accessed locklessly, so reads must use READ_ONCE(). This commit therefore adds the needed READ_ONCE() invocation where it was missed. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 79848f7d575d..119a37319e67 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return ssp->srcu_idx; + return READ_ONCE(ssp->srcu_idx); } EXPORT_SYMBOL_GPL(srcu_batches_completed); -- cgit v1.2.3 From 710426068dc60f2d2e139478d6185710802cdc0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 11:42:05 -0800 Subject: srcu: Hold srcu_struct ->lock when updating ->srcu_gp_seq A read of the srcu_struct structure's ->srcu_gp_seq field should not need READ_ONCE() when that structure's ->lock is held. Except that this lock is not always held when updating this field. This commit therefore acquires the lock around updates and removes a now-unneeded READ_ONCE(). This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney [ paulmck: Switch from READ_ONCE() to lock per Peter Zilstra question. ] Acked-by: Peter Zijlstra (Intel) --- kernel/rcu/srcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 119a37319e67..c19c1df0d198 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -450,7 +450,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + state = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -1130,7 +1130,9 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); + spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + spin_unlock_irq_rcu_node(ssp); } if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { -- cgit v1.2.3 From 59ee0326ccf712f9a637d5df2465a16a784cbfb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Nov 2019 18:54:06 -0800 Subject: rcutorture: Suppress forward-progress complaints during early boot Some larger systems can take in excess of 50 seconds to complete their early boot initcalls prior to spawing init. This does not in any way help the forward-progress judgments of built-in rcutorture (when rcutorture is built as a module, the insmod or modprobe command normally cannot happen until some time after boot completes). This commit therefore suppresses such complaints until about the time that init is spawned. This also includes a fix to a stupid error located by kbuild test robot. [ paulmck: Apply kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney [ paulmck: Fix to nohz_full slow-expediting recovery logic, per bpetkov. ] [ paulmck: Restrict splat to CONFIG_PREEMPT_RT=y kernels and simplify. ] Tested-by: Borislav Petkov --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/rcutorture.c | 3 ++- kernel/rcu/tree_exp.h | 7 ++++++- kernel/rcu/update.c | 12 ++++++++++++ 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b2b2dc990da9..045c28b71f4f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -83,6 +83,7 @@ void rcu_scheduler_starting(void); static inline void rcu_scheduler_starting(void) { } #endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } +static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 2f787b9029d1..45f3f66bb04d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -54,6 +54,7 @@ void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); +bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1aeecc165b21..9ba49788cb48 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1067,7 +1067,8 @@ rcu_torture_writer(void *arg) if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && - !torture_must_stop()) + !torture_must_stop() && + rcu_inkernel_boot_has_ended()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dcbd75791f39..a72d16eb869e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -485,6 +485,7 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) static void synchronize_rcu_expedited_wait(void) { int cpu; + unsigned long j; unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; @@ -496,7 +497,7 @@ static void synchronize_rcu_expedited_wait(void) trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { @@ -508,6 +509,10 @@ static void synchronize_rcu_expedited_wait(void) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); } } + j = READ_ONCE(jiffies_till_first_fqs); + if (synchronize_rcu_expedited_wait_once(j + HZ)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)); } for (;;) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6c4b862f57d6..feaaec5747a3 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -183,6 +183,8 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); +static bool rcu_boot_ended __read_mostly; + /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -191,7 +193,17 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); + rcu_boot_ended = 1; +} + +/* + * Let rcutorture know when it is OK to turn it up to eleven. + */ +bool rcu_inkernel_boot_has_ended(void) +{ + return rcu_boot_ended; } +EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended); #endif /* #ifndef CONFIG_TINY_RCU */ -- cgit v1.2.3 From 90e23b6b81a9b374d2940cb0b33935d53664509e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Dec 2019 13:24:07 -0800 Subject: torture: Make results-directory date format completion-friendly The names of the per-test results directories are of the form 2019.11.29-20:42:19. This works, but the ":" characters make tab-based shell name completion a bit onerous because the user must remember to include a quote character somewhere before the first ":". This commit therefore changes the ":" characters to periods, as in 2019.12.01-20.48.01", which allows tab-based completion to work more naturally. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 78d18ab8e954..2315e2ec12d6 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -39,7 +39,7 @@ TORTURE_TRUST_MAKE="" resdir="" configs="" cpus=0 -ds=`date +%Y.%m.%d-%H:%M:%S` +ds=`date +%Y.%m.%d-%H.%M.%S` jitter="-1" usage () { -- cgit v1.2.3 From 435508095ab5b6870e8140948983920ce4684e9b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Dec 2019 15:58:41 -0800 Subject: rcutorture: Refrain from callback flooding during boot Additional rcutorture aggression can result in, believe it or not, boot times in excess of three minutes on large hyperthreaded systems. This is long enough for rcutorture to decide to do some callback flooding, which seems a bit excessive given that userspace cannot have started until long after boot, and it is userspace that does the real-world callback flooding. Worse yet, because Tiny RCU lacks forward-progress functionality, the looping-in-the-kernel tests can also be problematic during early boot. This commit therefore causes rcutorture to hold off on callback flooding until about the time that init is spawned, and the same for looping-in-the-kernel tests for Tiny RCU. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9ba49788cb48..08fa4ef23914 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1994,8 +1994,11 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - rcu_torture_fwd_prog_cr(rfp); + if (!IS_ENABLED(CONFIG_TINY_RCU) || + rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + if (rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ -- cgit v1.2.3 From a59ee765a6890e7f4281070008a2654337458311 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2019 10:49:11 -0800 Subject: torture: Forgive -EBUSY from boottime CPU-hotplug operations During boot, CPU hotplug is often disabled, for example by PCI probing. On large systems that take substantial time to boot, this can result in spurious RCU_HOTPLUG errors. This commit therefore forgives any boottime -EBUSY CPU-hotplug failures by adjusting counters to pretend that the corresponding attempt never happened. A non-splat record of the failed attempt is emitted to the console with the added string "(-EBUSY forgiven during boot)". Signed-off-by: Paul E. McKenney --- kernel/torture.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 7c13f5558b71..e377b5b17de8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -84,6 +84,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -99,10 +100,16 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, (*n_offl_attempts)++; ret = cpu_down(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_offl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: offline %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -137,6 +144,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -150,10 +158,16 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, (*n_onl_attempts)++; ret = cpu_up(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_onl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: online %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG -- cgit v1.2.3 From 58c53360b36d2077cbb843e7ad2bf75f0498271c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2019 11:29:01 -0800 Subject: rcutorture: Allow boottime stall warnings to be suppressed In normal production, an RCU CPU stall warning at boottime is often just as bad as at any other time. In fact, given the desire for fast boot, any sort of long-term stall at boot is a bad idea. However, heavy rcutorture testing on large hyperthreaded systems can generate boottime RCU CPU stalls as a matter of course. This commit therefore provides a kernel boot parameter that suppresses reporting of boottime RCU CPU stall warnings and similarly of rcutorture writer stalls. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ kernel/rcu/rcu.h | 17 +++++++++++++++++ kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_stall.h | 6 +++--- kernel/rcu/update.c | 8 +++++++- 6 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dbc22d684627..ee007b5c874f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4195,6 +4195,12 @@ rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. + rcupdate.rcu_cpu_stall_suppress_at_boot= [KNL] + Suppress RCU CPU stall warning messages and + rcutorture writer stall warnings that occur + during early boot, that is, during the time + before the init task is spawned. + rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 05f936ed167a..1779cbf33cd1 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +extern int rcu_cpu_stall_suppress_at_boot; + +static inline bool rcu_stall_is_suppressed_at_boot(void) +{ + return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); +} + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress; +} + #define rcu_ftrace_dump_stall_suppress() \ do { \ if (!rcu_cpu_stall_suppress) \ @@ -218,6 +230,11 @@ do { \ } while (0) #else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot(); +} #define rcu_ftrace_dump_stall_suppress() #define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08fa4ef23914..16c84ec182bd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1479,7 +1479,7 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL) { + rcu_torture_current != NULL && !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a72d16eb869e..c28d9f0034c3 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -518,7 +518,7 @@ static void synchronize_rcu_expedited_wait(void) for (;;) { if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 55f9b84790d3..7ee8a1cc0d8b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -383,7 +383,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -452,7 +452,7 @@ static void print_cpu_stall(void) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -504,7 +504,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index feaaec5747a3..085f08a898fe 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -476,13 +476,19 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall +// warnings. Also used by rcutorture even if stall warnings are excluded. +int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); +module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); + #ifdef CONFIG_TASKS_RCU /* -- cgit v1.2.3 From 4ab00bdd99a906c089b5c20ee7b5cb91e7c61123 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2019 15:53:28 -0800 Subject: rcutorture: Suppress boottime bad-sequence warnings In normal production, an excessively long wait on a grace period (synchronize_rcu(), for example) at boottime is often just as bad as at any other time. In fact, given the desire for fast boot, any sort of long wait at boot is a bad idea. However, heavy rcutorture testing on large hyperthreaded systems can generate such long waits during boot as a matter of course. This commit therefore causes the rcupdate.rcu_cpu_stall_suppress_at_boot kernel boot parameter to suppress reporting of bootime bad-sequence warning due to excessively long grace-period waits. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 16c84ec182bd..5efd9503df56 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1423,7 +1423,8 @@ rcu_torture_stats_print(void) pr_alert("%s%s ", torture_type, TORTURE_FLAG); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, - rcu_torture_current ? "ver" : "VER", + rcu_torture_current && !rcu_stall_is_suppressed_at_boot() + ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), -- cgit v1.2.3 From 8171d3e0dafd37a9c833904e5a936f4154a1e95b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Dec 2019 15:02:59 -0800 Subject: torture: Allow disabling of boottime CPU-hotplug torture operations In theory, RCU-hotplug operations are supposed to work as soon as there is more than one CPU online. However, in practice, in normal production there is no way to make them happen until userspace is up and running. Besides which, on smaller systems, rcutorture doesn't start doing hotplug operations until 30 seconds after the start of boot, which on most systems also means the better part of 30 seconds after the end of boot. This commit therefore provides a new torture.disable_onoff_at_boot kernel boot parameter that suppresses CPU-hotplug torture operations until about the time that init is spawned. Of course, if you know of a need for boottime CPU-hotplug operations, then you should avoid passing this argument to any of the torture tests. You might also want to look at the splats linked to below. Link: https://lore.kernel.org/lkml/20191206185208.GA25636@paulmck-ThinkPad-P72/ Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/torture.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ee007b5c874f..868f59a48580 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4873,6 +4873,10 @@ topology updates sent by the hypervisor to this LPAR. + torture.disable_onoff_at_boot= [KNL] + Prevent the CPU-hotplug component of torturing + until after init has spawned. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/kernel/torture.c b/kernel/torture.c index e377b5b17de8..8683375dc0c7 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -42,6 +42,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +static bool disable_onoff_at_boot; +module_param(disable_onoff_at_boot, bool, 0444); + static char *torture_type; static int verbose; @@ -229,6 +232,10 @@ torture_onoff(void *arg) VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } while (!torture_must_stop()) { + if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { + schedule_timeout_interruptible(HZ / 10); + continue; + } cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); if (!torture_offline(cpu, &n_offline_attempts, &n_offline_successes, -- cgit v1.2.3 From e0714247373b9c0253b002573f63f3e9698b7b30 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Dec 2019 12:11:56 -0800 Subject: rcutorture: Add 100-CPU configuration The small-system rcutorture configurations have served us well for a great many years, but it is now time to add a larger one. This commit does just that, but does not add it to the defaults in CFLIST. This allows the kvm.sh argument '--configs "4*CFLIST TREE10" to run four instances of each of the default configurations concurrently with one instance of the large configuration. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE10 | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE10 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 new file mode 100644 index 000000000000..2debe7891aeb --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=100 +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TREE_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n +CONFIG_DEBUG_OBJECTS=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=n -- cgit v1.2.3 From c0b94ffb66845d13f9ec537a28bd1466b4de2e14 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2019 12:04:33 -0800 Subject: rcutorture: Summarize summary of build and run results When running the default list of tests, the run summary of a successful (that is, failed to find any errors) run fits easily on a 24-line screen. But a run with something like "--configs '5*CFLIST'" will be 80 lines long, and it is all too easy to miss a failure message when scrolling back. This commit therefore prints out the number of runs with failing builds or runtime failures, but only if there are any such failures. For example, a run with a single build error and a single runtime error would print two lines like this: 1 runs with build errors. 1 runs with runtime errors. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index e5edd5198725..0326f4a5ff9c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -13,6 +13,9 @@ # # Authors: Paul E. McKenney +T=/tmp/kvm-recheck.sh.$$ +trap 'rm -f $T' 0 2 + PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . functions.sh for rd in "$@" @@ -68,4 +71,16 @@ do fi done done -EDITOR=echo kvm-find-errors.sh "${@: -1}" > /dev/null 2>&1 +EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1 +ret=$? +builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`" +if test "$builderrors" -gt 0 +then + echo $builderrors runs with build errors. +fi +runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`" +if test "$runerrors" -gt 0 +then + echo $runerrors runs with runtime errors. +fi +exit $ret -- cgit v1.2.3 From beabc806f5aaa158fc90a939215e8b44ee9d7acc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2019 12:08:31 -0800 Subject: rcutorture: Make kvm-find-errors.sh abort on bad directory Currently, kvm-find-errors.sh gives a usage prompt when given a bad directory, but then soldiers on, giving a series of confusing error messages. This commit therefore prints an error message and exits when given a bad directory, hopefully reducing confusion. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 1871d00bccd7..6f50722f251f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -20,7 +20,9 @@ rundir="${1}" if test -z "$rundir" -o ! -d "$rundir" then + echo Directory "$rundir" not found. echo Usage: $0 directory + exit 1 fi editor=${EDITOR-vi} -- cgit v1.2.3 From 202489101f2e6cee3f6dffc087a4abd5fdfcebda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Dec 2019 10:41:48 -0800 Subject: rcutorture: Fix rcu_torture_one_read()/rcu_torture_writer() data race The ->rtort_pipe_count field in the rcu_torture structure checks for too-short grace periods, and is therefore read by rcutorture's readers while being updated by rcutorture's writers. This commit therefore adds the needed READ_ONCE() and WRITE_ONCE() invocations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5efd9503df56..edd97465a0f7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; - i = rp->rtort_pipe_count; + i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + WRITE_ONCE(rp->rtort_pipe_count, i + 1); + if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg) if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; + WRITE_ONCE(old_rp->rtort_pipe_count, + old_rp->rtort_pipe_count + 1); switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1291,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) atomic_inc(&n_rcu_torture_mberror); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); - pipe_count = p->rtort_pipe_count; + pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; -- cgit v1.2.3 From 102c14d2f87976e8390d2cb892ccd14e3532e020 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Dec 2019 11:23:50 -0800 Subject: rcutorture: Fix stray access to rcu_fwd_cb_nodelay The rcu_fwd_cb_nodelay variable suppresses excessively long read-side delays while carrying out an rcutorture forward-progress test. As such, it is accessed both by readers and updaters, and most of the accesses therefore use *_ONCE(). Except for one in rcu_read_delay(), which this commit fixes. This data race was reported by KCSAN. Not appropriate for backporting due to this being rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index edd97465a0f7..124160a610fa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!rcu_fwd_cb_nodelay && + if (!READ_ONCE(rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); -- cgit v1.2.3 From f042a436c8dc9f9cfe8ed1ee5de372697269657d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:27:00 -0800 Subject: rcutorture: Add READ_ONCE() to rcu_torture_count and rcu_torture_batch The rcutorture rcu_torture_count and rcu_torture_batch per-CPU variables are read locklessly, so this commit adds the READ_ONCE() to a load in order to avoid various types of compiler vandalism^Woptimization. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 124160a610fa..0b9ce9a00623 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1413,8 +1413,8 @@ rcu_torture_stats_print(void) for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); + batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { -- cgit v1.2.3 From 5396d31d3a396039502f75a128bd8064819cba61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2020 19:58:13 -0800 Subject: rcutorture: Annotation lockless accesses to rcu_torture_current The rcutorture global variable rcu_torture_current is accessed locklessly, so it must use the RCU pointer load/store primitives. This commit therefore adds several that were missed. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being used only by rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0b9ce9a00623..7e01e9a87352 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1407,6 +1407,7 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; struct task_struct *wtp; @@ -1423,10 +1424,10 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); + rtcp = rcu_access_pointer(rcu_torture_current); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current && !rcu_stall_is_suppressed_at_boot() - ? "ver" : "VER", + rtcp, + rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1482,7 +1483,8 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL && !rcu_stall_is_suppressed()) { + rcu_access_pointer(rcu_torture_current) && + !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; -- cgit v1.2.3 From 12af660321263d7744d5d06b765c7b03fade3858 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 19 Dec 2019 11:22:42 -0500 Subject: rcuperf: Measure memory footprint during kfree_rcu() test During changes to kfree_rcu() code, we often check the amount of free memory. As an alternative to checking this manually, this commit adds a measurement in the test itself. It measures four times during the test for available memory, digitally filters these measurements to produce a running average with a weight of 0.5, and compares this digitally filtered value with the amount of available memory at the beginning of the test. Something like the following is printed at the end of the run: Total time taken by all kfree'ers: 6369738407 ns, loops: 10000, batches: 764, memory footprint: 216MB Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index da94b89cd531..a4a8d097d84d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -611,6 +612,7 @@ kfree_perf_thread(void *arg) long me = (long)arg; struct kfree_obj *alloc_ptr; u64 start_time, end_time; + long long mem_begin, mem_during = 0; VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); @@ -626,6 +628,12 @@ kfree_perf_thread(void *arg) } do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + for (i = 0; i < kfree_alloc_num; i++) { alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) @@ -645,9 +653,11 @@ kfree_perf_thread(void *arg) else b_rcu_gp_test_finished = cur_ops->get_gp_seq(); - pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + if (shutdown) { smp_mb(); /* Assign before wake. */ wake_up(&shutdown_wq); -- cgit v1.2.3 From 50d4b62970e21e9573daf0e3c1138b4d1ebcca47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2020 15:00:56 -0800 Subject: rcutorture: Make rcu_torture_barrier_cbs() post from corresponding CPU Currently, rcu_torture_barrier_cbs() posts callbacks from whatever CPU it is running on, which means that all these kthreads might well be posting from the same CPU, which would drastically reduce the effectiveness of this test. This commit therefore uses IPIs to make the callbacks be posted from the corresponding CPU (given by local variable myid). If the IPI fails (which can happen if the target CPU is offline or does not exist at all), the callback is posted on whatever CPU is currently running. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e01e9a87352..f82515cded34 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2053,6 +2053,14 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) atomic_inc(&barrier_cbs_invoked); } +/* IPI handler to get callback posted on desired CPU, if online. */ +static void rcu_torture_barrier1cb(void *rcu_void) +{ + struct rcu_head *rhp = rcu_void; + + cur_ops->call(rhp, rcu_torture_barrier_cbf); +} + /* kthread function to register callbacks used to test RCU barriers. */ static int rcu_torture_barrier_cbs(void *arg) { @@ -2076,9 +2084,11 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - local_irq_disable(); /* Just to test no-irq call_rcu(). */ - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - local_irq_enable(); + if (smp_call_function_single(myid, rcu_torture_barrier1cb, + &rcu, 1)) { + // IPI failed, so use direct call from current CPU. + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + } if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); -- cgit v1.2.3 From 9470a18fabd056e67ee12059dab04faf6e1f253c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Feb 2020 12:54:34 -0800 Subject: rcutorture: Manually clean up after rcu_barrier() failure Currently, if rcu_barrier() returns too soon, the test waits 100ms and then does another instance of the test. However, if rcu_barrier() were to have waited for more than 100ms too short a time, this could cause the test's rcu_head structures to be reused while they were still on RCU's callback lists. This can result in knock-on errors that obscure the original rcu_barrier() test failure. This commit therefore adds code that attempts to wait until all of the test's callbacks have been invoked. Of course, if RCU completely lost track of the corresponding rcu_head structures, this wait could be forever. This commit therefore also complains if this attempted recovery takes more than one second, and it also gives up when the test ends. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f82515cded34..5453bd557f43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2124,7 +2124,21 @@ static int rcu_torture_barrier(void *arg) pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", atomic_read(&barrier_cbs_invoked), n_barrier_cbs); - WARN_ON_ONCE(1); + WARN_ON(1); + // Wait manually for the remaining callbacks + i = 0; + do { + if (WARN_ON(i++ > HZ)) + i = INT_MIN; + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + } while (atomic_read(&barrier_cbs_invoked) != + n_barrier_cbs && + !torture_must_stop()); + smp_mb(); // Can't trust ordering if broken. + if (!torture_must_stop()) + pr_err("Recovered: barrier_cbs_invoked = %d\n", + atomic_read(&barrier_cbs_invoked)); } else { n_barrier_successes++; } -- cgit v1.2.3 From a144935ceaed277c3e640b85f4cff89d7cce4b8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Feb 2020 05:20:18 -0800 Subject: rcutorture: Set KCSAN Kconfig options to detect more data races This commit enables the KCSAN Kconfig options that (1) detect data races between reads and writes even when the writes do not change the variable's value and (2) detect data races involving plain C-language writes. These changes only affect scripted rcutorture runs and can be overridden using the kvm.sh --kconfig argument. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/CFcommon | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index e19a444a0684..0e92d85313aa 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -3,3 +3,5 @@ CONFIG_PRINTK_TIME=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n +CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n -- cgit v1.2.3 From dc8cb9df2b863c71027c6fee8ce370775d739fc3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 13 Feb 2020 16:38:21 -0500 Subject: doc: Add some more RCU list patterns in the kernel - Add more information about RCU list patterns taking examples from audit subsystem in the linux kernel. - Keep the current audit examples, even though the kernel has changed. - Modify inline text for better passage quality. - Fix typo in code-blocks and improve code comments. - Add text formatting (italics, bold and code) for better emphasis. Patch originally submitted at https://lore.kernel.org/patchwork/patch/1082804/ Co-developed-by: Amol Grover Signed-off-by: Amol Grover Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/listRCU.rst | 275 ++++++++++++++++++++++++++++++++---------- 1 file changed, 211 insertions(+), 64 deletions(-) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index 7956ff33042b..55d2b30db481 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -4,12 +4,61 @@ Using RCU to Protect Read-Mostly Linked Lists ============================================= One of the best applications of RCU is to protect read-mostly linked lists -("struct list_head" in list.h). One big advantage of this approach +(``struct list_head`` in list.h). One big advantage of this approach is that all of the required memory barriers are included for you in the list macros. This document describes several applications of RCU, with the best fits first. -Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates + +Example 1: Read-mostly list: Deferred Destruction +------------------------------------------------- + +A widely used usecase for RCU lists in the kernel is lockless iteration over +all processes in the system. ``task_struct::tasks`` represents the list node that +links all the processes. The list can be traversed in parallel to any list +additions or removals. + +The traversal of the list is done using ``for_each_process()`` which is defined +by the 2 macros:: + + #define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + + #define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +The code traversing the list of all processes typically looks like:: + + rcu_read_lock(); + for_each_process(p) { + /* Do something with p */ + } + rcu_read_unlock(); + +The simplified code for removing a process from a task list is:: + + void release_task(struct task_struct *p) + { + write_lock(&tasklist_lock); + list_del_rcu(&p->tasks); + write_unlock(&tasklist_lock); + call_rcu(&p->rcu, delayed_put_task_struct); + } + +When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under +``tasklist_lock`` writer lock protection, to remove the task from the list of +all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals +from corrupting the list. Readers using ``for_each_process()`` are not protected +with the ``tasklist_lock``. To prevent readers from noticing changes in the list +pointers, the ``task_struct`` object is freed only after one or more grace +periods elapse (with the help of call_rcu()). This deferring of destruction +ensures that any readers traversing the list will see valid ``p->tasks.next`` +pointers and deletion/freeing can happen in parallel with traversal of the list. +This pattern is also called an **existence lock**, since RCU pins the object in +memory until all existing readers finish. + + +Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates ---------------------------------------------------------------------- The best applications are cases where, if reader-writer locking were @@ -26,7 +75,7 @@ added or deleted, rather than being modified in place. A straightforward example of this use of RCU may be found in the system-call auditing support. For example, a reader-writer locked -implementation of audit_filter_task() might be as follows:: +implementation of ``audit_filter_task()`` might be as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -34,7 +83,7 @@ implementation of audit_filter_task() might be as follows:: enum audit_state state; read_lock(&auditsc_lock); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry(e, &audit_tsklist, list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { read_unlock(&auditsc_lock); @@ -58,7 +107,7 @@ This means that RCU can be easily applied to the read side, as follows:: enum audit_state state; rcu_read_lock(); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry_rcu(e, &audit_tsklist, list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { rcu_read_unlock(); @@ -69,18 +118,18 @@ This means that RCU can be easily applied to the read side, as follows:: return AUDIT_BUILD_CONTEXT; } -The read_lock() and read_unlock() calls have become rcu_read_lock() +The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock() and rcu_read_unlock(), respectively, and the list_for_each_entry() has -become list_for_each_entry_rcu(). The _rcu() list-traversal primitives +become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives insert the read-side memory barriers that are required on DEC Alpha CPUs. -The changes to the update side are also straightforward. A reader-writer -lock might be used as follows for deletion and insertion:: +The changes to the update side are also straightforward. A reader-writer lock +might be used as follows for deletion and insertion:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; write_lock(&auditsc_lock); list_for_each_entry(e, list, list) { @@ -113,9 +162,9 @@ Following are the RCU equivalents for these two functions:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; - /* Do not use the _rcu iterator here, since this is the only + /* No need to use the _rcu iterator here, since this is the only * deletion routine. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -139,41 +188,41 @@ Following are the RCU equivalents for these two functions:: return 0; } -Normally, the write_lock() and write_unlock() would be replaced by -a spin_lock() and a spin_unlock(), but in this case, all callers hold -audit_netlink_sem, so no additional locking is required. The auditsc_lock -can therefore be eliminated, since use of RCU eliminates the need for -writers to exclude readers. Normally, the write_lock() calls would -be converted into spin_lock() calls. +Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a +spin_lock() and a spin_unlock(). But in this case, all callers hold +``audit_filter_mutex``, so no additional locking is required. The +``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the +need for writers to exclude readers. The list_del(), list_add(), and list_add_tail() primitives have been replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu(). -The _rcu() list-manipulation primitives add memory barriers that are -needed on weakly ordered CPUs (most of them!). The list_del_rcu() -primitive omits the pointer poisoning debug-assist code that would -otherwise cause concurrent readers to fail spectacularly. +The **_rcu()** list-manipulation primitives add memory barriers that are needed on +weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the +pointer poisoning debug-assist code that would otherwise cause concurrent +readers to fail spectacularly. -So, when readers can tolerate stale data and when entries are either added -or deleted, without in-place modification, it is very easy to use RCU! +So, when readers can tolerate stale data and when entries are either added or +deleted, without in-place modification, it is very easy to use RCU! -Example 2: Handling In-Place Updates + +Example 3: Handling In-Place Updates ------------------------------------ -The system-call auditing code does not update auditing rules in place. -However, if it did, reader-writer-locked code to do so might look as -follows (presumably, the field_count is only permitted to decrease, -otherwise, the added fields would need to be filled in):: +The system-call auditing code does not update auditing rules in place. However, +if it did, the reader-writer-locked code to do so might look as follows +(assuming only ``field_count`` is updated, otherwise, the added fields would +need to be filled in):: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, __u32 newaction, __u32 newfield_count) { - struct audit_entry *e; - struct audit_newentry *ne; + struct audit_entry *e; + struct audit_entry *ne; write_lock(&auditsc_lock); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { e->rule.action = newaction; @@ -188,16 +237,16 @@ otherwise, the added fields would need to be filled in):: The RCU version creates a copy, updates the copy, then replaces the old entry with the newly updated entry. This sequence of actions, allowing -concurrent reads while doing a copy to perform an update, is what gives -RCU ("read-copy update") its name. The RCU code is as follows:: +concurrent reads while making a copy to perform an update, is what gives +RCU (*read-copy update*) its name. The RCU code is as follows:: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, __u32 newaction, __u32 newfield_count) { - struct audit_entry *e; - struct audit_newentry *ne; + struct audit_entry *e; + struct audit_entry *ne; list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -215,34 +264,45 @@ RCU ("read-copy update") its name. The RCU code is as follows:: return -EFAULT; /* No matching rule */ } -Again, this assumes that the caller holds audit_netlink_sem. Normally, -the reader-writer lock would become a spinlock in this sort of code. +Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the +writer lock would become a spinlock in this sort of code. -Example 3: Eliminating Stale Data +Another use of this pattern can be found in the openswitch driver's *connection +tracking table* code in ``ct_limit_set()``. The table holds connection tracking +entries and has a limit on the maximum entries. There is one such table +per-zone and hence one *limit* per zone. The zones are mapped to their limits +through a hashtable using an RCU-managed hlist for the hash chains. When a new +limit is set, a new limit object is allocated and ``ct_limit_set()`` is called +to replace the old limit object with the new one using list_replace_rcu(). +The old limit object is then freed after a grace period using kfree_rcu(). + + +Example 4: Eliminating Stale Data --------------------------------- -The auditing examples above tolerate stale data, as do most algorithms +The auditing example above tolerates stale data, as do most algorithms that are tracking external state. Because there is a delay from the time the external state changes before Linux becomes aware of the change, -additional RCU-induced staleness is normally not a problem. +additional RCU-induced staleness is generally not a problem. However, there are many examples where stale data cannot be tolerated. One example in the Linux kernel is the System V IPC (see the ipc_lock() -function in ipc/util.c). This code checks a "deleted" flag under a -per-entry spinlock, and, if the "deleted" flag is set, pretends that the +function in ipc/util.c). This code checks a *deleted* flag under a +per-entry spinlock, and, if the *deleted* flag is set, pretends that the entry does not exist. For this to be helpful, the search function must -return holding the per-entry spinlock, as ipc_lock() does in fact do. +return holding the per-entry lock, as ipc_lock() does in fact do. + +.. _quick_quiz: Quick Quiz: - Why does the search function need to return holding the per-entry lock for - this deleted-flag technique to be helpful? + For the deleted-flag technique to be helpful, why is it necessary + to hold the per-entry lock while returning from the search function? -:ref:`Answer to Quick Quiz ` +:ref:`Answer to Quick Quiz ` -If the system-call audit module were to ever need to reject stale data, -one way to accomplish this would be to add a "deleted" flag and a "lock" -spinlock to the audit_entry structure, and modify audit_filter_task() -as follows:: +If the system-call audit module were to ever need to reject stale data, one way +to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the +audit_entry structure, and modify ``audit_filter_task()`` as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -267,20 +327,20 @@ as follows:: } Note that this example assumes that entries are only added and deleted. -Additional mechanism is required to deal correctly with the -update-in-place performed by audit_upd_rule(). For one thing, -audit_upd_rule() would need additional memory barriers to ensure -that the list_add_rcu() was really executed before the list_del_rcu(). +Additional mechanism is required to deal correctly with the update-in-place +performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would +need additional memory barriers to ensure that the list_add_rcu() was really +executed before the list_del_rcu(). -The audit_del_rule() function would need to set the "deleted" -flag under the spinlock as follows:: +The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the +spinlock as follows:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; - /* Do not need to use the _rcu iterator here, since this + /* No need to use the _rcu iterator here, since this * is the only deletion routine. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -295,6 +355,91 @@ flag under the spinlock as follows:: return -EFAULT; /* No matching rule */ } +This too assumes that the caller holds ``audit_filter_mutex``. + + +Example 5: Skipping Stale Objects +--------------------------------- + +For some usecases, reader performance can be improved by skipping stale objects +during read-side list traversal if the object in concern is pending destruction +after one or more grace periods. One such example can be found in the timerfd +subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed - for example due to +setting of the system time, then all programmed timerfds that depend on this +clock get triggered and processes waiting on them to expire are woken up in +advance of their scheduled expiry. To facilitate this, all such timers are added +to an RCU-managed ``cancel_list`` when they are setup in +``timerfd_setup_cancel()``:: + + static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) + { + spin_lock(&ctx->cancel_lock); + if ((ctx->clockid == CLOCK_REALTIME && + (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } + spin_unlock(&ctx->cancel_lock); + } + +When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the +timerfd object is cleared, the object removed from the ``cancel_list`` and +destroyed:: + + int timerfd_release(struct inode *inode, struct file *file) + { + struct timerfd_ctx *ctx = file->private_data; + + spin_lock(&ctx->cancel_lock); + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } + spin_unlock(&ctx->cancel_lock); + + hrtimer_cancel(&ctx->t.tmr); + kfree_rcu(ctx, rcu); + return 0; + } + +If the ``CLOCK_REALTIME`` clock is set, for example by a time server, the +hrtimer framework calls ``timerfd_clock_was_set()`` which walks the +``cancel_list`` and wakes up processes waiting on the timerfd. While iterating +the ``cancel_list``, the ``might_cancel`` flag is consulted to skip stale +objects:: + + void timerfd_clock_was_set(void) + { + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs != ktime_mono_to_real(0)) { + ctx->moffs = KTIME_MAX; + ctx->ticks++; + wake_up_locked_poll(&ctx->wqh, EPOLLIN); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); + } + +The key point here is, because RCU-traversal of the ``cancel_list`` happens +while objects are being added and removed to the list, sometimes the traversal +can step on an object that has been removed from the list. In this example, it +is seen that it is better to skip such objects using a flag. + + Summary ------- @@ -303,19 +448,21 @@ the most amenable to use of RCU. The simplest case is where entries are either added or deleted from the data structure (or atomically modified in place), but non-atomic in-place modifications can be handled by making a copy, updating the copy, then replacing the original with the copy. -If stale data cannot be tolerated, then a "deleted" flag may be used +If stale data cannot be tolerated, then a *deleted* flag may be used in conjunction with a per-entry spinlock in order to allow the search function to reject newly deleted data. -.. _answer_quick_quiz_list: +.. _quick_quiz_answer: Answer to Quick Quiz: - Why does the search function need to return holding the per-entry - lock for this deleted-flag technique to be helpful? + For the deleted-flag technique to be helpful, why is it necessary + to hold the per-entry lock while returning from the search function? If the search function drops the per-entry lock before returning, then the caller will be processing stale data in any case. If it is really OK to be processing stale data, then you don't need a - "deleted" flag. If processing stale data really is a problem, + *deleted* flag. If processing stale data really is a problem, then you need to hold the per-entry lock across all of the code that uses the value that was returned. + +:ref:`Back to Quick Quiz ` -- cgit v1.2.3 From d18c265fbf19de7716e6e3054b752594133b0739 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:56 +0100 Subject: doc/RCU/Design: Remove remaining HTML tags in ReST files Commit ccc9971e2147 ("docs: rcu: convert some articles from html to ReST") has converted a few of html RCU docs into ReST files, but a few of html tags which not supported on rst is remaining. This commit converts those to ReST appropriate alternatives. Reviewed-by: Madhuparna Bhowmik Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 1a8b129cfc04..83ae3b79a643 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -4,7 +4,7 @@ A Tour Through TREE_RCU's Grace-Period Memory Ordering August 8, 2017 -This article was contributed by Paul E. McKenney +This article was contributed by Paul E. McKenney Introduction ============ @@ -48,7 +48,7 @@ Tree RCU Grace Period Memory Ordering Building Blocks The workhorse for RCU's grace-period memory ordering is the critical section for the ``rcu_node`` structure's -``->lock``. These critical sections use helper functions for lock +``->lock``. These critical sections use helper functions for lock acquisition, including ``raw_spin_lock_rcu_node()``, ``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``. Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``, @@ -102,9 +102,9 @@ lock-acquisition and lock-release functions:: 23 r3 = READ_ONCE(x); 24 } 25 - 26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0); + 26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0); -The ``WARN_ON()`` is evaluated at “the end of time”, +The ``WARN_ON()`` is evaluated at "the end of time", after all changes have propagated throughout the system. Without the ``smp_mb__after_unlock_lock()`` provided by the acquisition functions, this ``WARN_ON()`` could trigger, for example -- cgit v1.2.3 From c50a871409dc664f2f89d11926d9a6b8ab7f5b07 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:57 +0100 Subject: doc/RCU/listRCU: Fix typos in a example code snippets Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/listRCU.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index 55d2b30db481..e768f56e8fa3 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -226,7 +226,7 @@ need to be filled in):: list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { e->rule.action = newaction; - e->rule.file_count = newfield_count; + e->rule.field_count = newfield_count; write_unlock(&auditsc_lock); return 0; } @@ -255,7 +255,7 @@ RCU (*read-copy update*) its name. The RCU code is as follows:: return -ENOMEM; audit_copy_rule(&ne->rule, &e->rule); ne->rule.action = newaction; - ne->rule.file_count = newfield_count; + ne->rule.field_count = newfield_count; list_replace_rcu(&e->list, &ne->list); call_rcu(&e->rcu, audit_free_rule); return 0; -- cgit v1.2.3 From 3282b0469248ab25b3f40b95e9a3d357c9d946d5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:58 +0100 Subject: doc/RCU/listRCU: Update example function name listRCU.rst document gives an example with 'ipc_lock()', but the function has dropped off by commit 82061c57ce93 ("ipc: drop ipc_lock()"). Because the main logic of 'ipc_lock()' has melded in 'shm_lock()' by the commit, this commit updates the document to use 'shm_lock()' instead. Reviewed-by: Madhuparna Bhowmik Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/listRCU.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index e768f56e8fa3..2a643e293fb4 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -286,11 +286,11 @@ time the external state changes before Linux becomes aware of the change, additional RCU-induced staleness is generally not a problem. However, there are many examples where stale data cannot be tolerated. -One example in the Linux kernel is the System V IPC (see the ipc_lock() -function in ipc/util.c). This code checks a *deleted* flag under a +One example in the Linux kernel is the System V IPC (see the shm_lock() +function in ipc/shm.c). This code checks a *deleted* flag under a per-entry spinlock, and, if the *deleted* flag is set, pretends that the entry does not exist. For this to be helpful, the search function must -return holding the per-entry lock, as ipc_lock() does in fact do. +return holding the per-entry spinlock, as shm_lock() does in fact do. .. _quick_quiz: -- cgit v1.2.3 From be2895681d6d8b1eb5a052d4655dea0d7cc2d84d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:59 +0100 Subject: doc/RCU/rcu: Use ':ref:' for links to other docs Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 8dfb437dacc3..a1dd71d01862 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -11,8 +11,8 @@ must be long enough that any readers accessing the item being deleted have since dropped their references. For example, an RCU-protected deletion from a linked list would first remove the item from the list, wait for a grace period to elapse, then free the element. See the -Documentation/RCU/listRCU.rst file for more information on using RCU with -linked lists. +:ref:`Documentation/RCU/listRCU.rst ` for more information on +using RCU with linked lists. Frequently Asked Questions -------------------------- @@ -50,7 +50,7 @@ Frequently Asked Questions - If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? - See the Documentation/RCU/UP.rst file for more information. + See :ref:`Documentation/RCU/UP.rst ` for more information. - How can I see where RCU is currently used in the Linux kernel? @@ -68,9 +68,9 @@ Frequently Asked Questions - Why the name "RCU"? - "RCU" stands for "read-copy update". The file Documentation/RCU/listRCU.rst - has more information on where this name came from, search for - "read-copy update" to find it. + "RCU" stands for "read-copy update". + :ref:`Documentation/RCU/listRCU.rst ` has more information on where + this name came from, search for "read-copy update" to find it. - I hear that RCU is patented? What is with that? -- cgit v1.2.3 From 6a534b299ab20bdb345d3a6df0fc557c963fc50d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:08:00 +0100 Subject: doc/RCU/rcu: Use absolute paths for non-rst files Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index a1dd71d01862..2a830c51477e 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -75,7 +75,7 @@ Frequently Asked Questions - I hear that RCU is patented? What is with that? Yes, it is. There are several known patents related to RCU, - search for the string "Patent" in RTFP.txt to find them. + search for the string "Patent" in Documentation/RCU/RTFP.txt to find them. Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. There are now also LGPL implementations of user-level RCU @@ -88,5 +88,5 @@ Frequently Asked Questions - Where can I find more information on RCU? - See the RTFP.txt file in this directory. + See the Documentation/RCU/RTFP.txt file. Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/). -- cgit v1.2.3 From 06a649b314b3e579cfe1c50112c30e3127a36dba Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:08:01 +0100 Subject: doc/RCU/rcu: Use https instead of http if possible Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 2a830c51477e..0e03c6ef3147 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -79,7 +79,7 @@ Frequently Asked Questions Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. There are now also LGPL implementations of user-level RCU - available (http://liburcu.org/). + available (https://liburcu.org/). - I hear that RCU needs work in order to support realtime kernels? -- cgit v1.2.3 From 9671f30ee25151f680d2f4345b4e4c67bed6559c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Jan 2020 11:50:05 -0800 Subject: doc: Add rcutorture scripting to torture.txt For testing mainline, the kvm.sh rcutorture script is the preferred approach to testing. This commit therefore adds it to the torture.txt documentation. Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 147 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 7 deletions(-) diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index a41a0384d20c..af712a3c5b6a 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -124,9 +124,14 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than debugging. The final "T" entry contains the totals of the counters. -USAGE +USAGE ON SPECIFIC KERNEL BUILDS -The following script may be used to torture RCU: +It is sometimes desirable to torture RCU on a specific kernel build, +for example, when preparing to put that kernel build into production. +In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m +so that the test can be started using modprobe and terminated using rmmod. + +For example, the following script may be used to torture RCU: #!/bin/sh @@ -142,8 +147,136 @@ checked for such errors. The "rmmod" command forces a "SUCCESS", two are self-explanatory, while the last indicates that while there were no RCU failures, CPU-hotplug problems were detected. -However, the tools/testing/selftests/rcutorture/bin/kvm.sh script -provides better automation, including automatic failure analysis. -It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd. -See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions -on setting up such an initrd. + +USAGE ON MAINLINE KERNELS + +When using rcutorture to test changes to RCU itself, it is often +necessary to build a number of kernels in order to test that change +across a broad range of combinations of the relevant Kconfig options +and of the relevant kernel boot parameters. In this situation, use +of modprobe and rmmod can be quite time-consuming and error-prone. + +Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh +script is available for mainline testing for x86, arm64, and +powerpc. By default, it will run the series of tests specified by +tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test +running for 30 minutes within a guest OS using a minimal userspace +supplied by an automatically generated initrd. After the tests are +complete, the resulting build products and console output are analyzed +for errors and the results of the runs are summarized. + +On larger systems, rcutorture testing can be accelerated by passing the +--cpus argument to kvm.sh. For example, on a 64-CPU system, "--cpus 43" +would use up to 43 CPUs to run tests concurrently, which as of v5.4 would +complete all the scenarios in two batches, reducing the time to complete +from about eight hours to about one hour (not counting the time to build +the sixteen kernels). The "--dryrun sched" argument will not run tests, +but rather tell you how the tests would be scheduled into batches. This +can be useful when working out how many CPUs to specify in the --cpus +argument. + +Not all changes require that all scenarios be run. For example, a change +to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the +--configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'". +Large systems can run multiple copies of of the full set of scenarios, +for example, a system with 448 hardware threads can run five instances +of the full set concurrently. To make this happen: + + kvm.sh --cpus 448 --configs '5*CFLIST' + +Alternatively, such a system can run 56 concurrent instances of a single +eight-CPU scenario: + + kvm.sh --cpus 448 --configs '56*TREE04' + +Or 28 concurrent instances of each of two eight-CPU scenarios: + + kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04' + +Of course, each concurrent instance will use memory, which can be +limited using the --memory argument, which defaults to 512M. Small +values for memory may require disabling the callback-flooding tests +using the --bootargs parameter discussed below. + +Sometimes additional debugging is useful, and in such cases the --kconfig +parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'". + +Kernel boot arguments can also be supplied, for example, to control +rcutorture's module parameters. For example, to test a change to RCU's +CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". +This will of course result in the scripting reporting a failure, namely +the resuling RCU CPU stall warning. As noted above, reducing memory may +require disabling rcutorture's callback-flooding tests: + + kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ + --bootargs 'rcutorture.fwd_progress=0' + +Sometimes all that is needed is a full set of kernel builds. This is +what the --buildonly argument does. + +Finally, the --trust-make argument allows each kernel build to reuse what +it can from the previous kernel build. + +There are additional more arcane arguments that are documented in the +source code of the kvm.sh script. + +If a run contains failures, the number of buildtime and runtime failures +is listed at the end of the kvm.sh output, which you really should redirect +to a file. The build products and console output of each run is kept in +tools/testing/selftests/rcutorture/res in timestamped directories. A +given directory can be supplied to kvm-find-errors.sh in order to have +it cycle you through summaries of errors and full error logs. For example: + + tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \ + tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23 + +However, it is often more convenient to access the files directly. +Files pertaining to all scenarios in a run reside in the top-level +directory (2020.01.20-15.54.23 in the example above), while per-scenario +files reside in a subdirectory named after the scenario (for example, +"TREE04"). If a given scenario ran more than once (as in "--configs +'56*TREE04'" above), the directories corresponding to the second and +subsequent runs of that scenario include a sequence number, for example, +"TREE04.2", "TREE04.3", and so on. + +The most frequently used file in the top-level directory is testid.txt. +If the test ran in a git repository, then this file contains the commit +that was tested and any uncommitted changes in diff format. + +The most frequently used files in each per-scenario-run directory are: + +.config: This file contains the Kconfig options. + +Make.out: This contains build output for a specific scenario. + +console.log: This contains the console output for a specific scenario. + This file may be examined once the kernel has booted, but + it might not exist if the build failed. + +vmlinux: This contains the kernel, which can be useful with tools like + objdump and gdb. + +A number of additional files are available, but are less frequently used. +Many are intended for debugging of rcutorture itself or of its scripting. + +As of v5.4, a successful run with the default set of scenarios produces +the following summary at the end of the run on a 12-CPU system: + +SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ] +SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ] +SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ] +SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ] +TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ] +TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ] +TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ] +TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198 +TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631 +TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ] +TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844 +TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497 +CPU count limited from 16 to 12 +TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961 +TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997 +TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732 +CPU count limited from 16 to 12 +TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011 -- cgit v1.2.3 From 8149b5cbfa1540cd7542fd4e790a2874afbc5001 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 31 Jan 2020 21:52:37 +0100 Subject: Documentation/memory-barriers: Fix typos Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 7146da061693..e1c355e84edd 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -185,7 +185,7 @@ As a further example, consider this sequence of events: =============== =============== { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; Q = P; - P = &B D = *Q; + P = &B; D = *Q; There is an obvious data dependency here, as the value loaded into D depends on the address retrieved from P by CPU 2. At the end of the sequence, any of the @@ -569,7 +569,7 @@ following sequence of events: { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; - WRITE_ONCE(P, &B) + WRITE_ONCE(P, &B); Q = READ_ONCE(P); D = *Q; @@ -1721,7 +1721,7 @@ of optimizations: and WRITE_ONCE() are more selective: With READ_ONCE() and WRITE_ONCE(), the compiler need only forget the contents of the indicated memory locations, while with barrier() the compiler must - discard the value of all memory locations that it has currented + discard the value of all memory locations that it has currently cached in any machine registers. Of course, the compiler must also respect the order in which the READ_ONCE()s and WRITE_ONCE()s occur, though the CPU of course need not do so. @@ -1833,7 +1833,7 @@ Aside: In the case of data dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b -(eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) +(eg. is equal to 1) and load a[b] before b (eg. tmp = a[1]; if (b != 1) tmp = a[b]; ). There is also the problem of a compiler reloading b after having loaded a[b], thus having a newer copy of b than a[b]. A consensus has not yet been reached about these problems, however the READ_ONCE() -- cgit v1.2.3 From 0f11ad323dd3d316830152de63b5737a6261ad14 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2020 09:58:37 -0800 Subject: rcu: Mark rcu_state.gp_seq to detect concurrent writes The rcu_state structure's gp_seq field is only to be modified by the RCU grace-period kthread, which is single-threaded. This commit therefore enlists KCSAN's help in enforcing this restriction. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c624814ee2d..739788ff0f6e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1209,7 +1209,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1657,8 +1657,7 @@ static void rcu_gp_fqs_loop(void) WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( @@ -1672,13 +1671,11 @@ static void rcu_gp_fqs_loop(void) /* If time for quiescent-state forcing, do it. */ if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); first_gp_fqs = false; - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); @@ -1689,8 +1686,7 @@ static void rcu_gp_fqs_loop(void) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. */ j = jiffies; @@ -1782,7 +1778,7 @@ static void rcu_gp_cleanup(void) WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + rcu_state.gp_seq, TPS("newreq")); } else { WRITE_ONCE(rcu_state.gp_flags, @@ -1801,8 +1797,7 @@ static int __noreturn rcu_gp_kthread(void *unused) /* Handle grace-period start. */ for (;;) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); rcu_state.gp_state = RCU_GP_WAIT_GPS; swait_event_idle_exclusive(rcu_state.gp_wq, @@ -1815,8 +1810,7 @@ static int __noreturn rcu_gp_kthread(void *unused) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwaitsig")); } -- cgit v1.2.3 From 127e29815b4b2206c0a97ac1d83f92ffc0e25c34 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Feb 2020 06:17:33 -0800 Subject: rcu: Make rcu_barrier() account for offline no-CBs CPUs Currently, rcu_barrier() ignores offline CPUs, However, it is possible for an offline no-CBs CPU to have callbacks queued, and rcu_barrier() must wait for those callbacks. This commit therefore makes rcu_barrier() directly invoke the rcu_barrier_func() with interrupts disabled for such CPUs. This requires passing the CPU number into this function so that it can entrain the rcu_barrier() callback onto the correct CPU's callback list, given that the code must instead execute on the current CPU. While in the area, this commit fixes a bug where the first CPU's callback might have been invoked before rcu_segcblist_entrain() returned, which would also result in an early wakeup. Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs") Signed-off-by: Paul E. McKenney [ paulmck: Apply optimization feedback from Boqun Feng. ] Cc: # 5.5.x --- include/trace/events/rcu.h | 1 + kernel/rcu/tree.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5e49b06e8104..d56d54c17497 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -712,6 +712,7 @@ TRACE_EVENT_RCU(rcu_torture_read, * "Begin": rcu_barrier() started. * "EarlyExit": rcu_barrier() piggybacked, thus early exit. * "Inc1": rcu_barrier() piggyback check counter incremented. + * "OfflineNoCBQ": rcu_barrier() found offline no-CBs CPU with callbacks. * "OnlineQ": rcu_barrier() found online CPU with callbacks. * "OnlineNQ": rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 739788ff0f6e..35292c8ffd4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3097,9 +3097,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *unused) +static void rcu_barrier_func(void *cpu_in) { - struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; @@ -3126,7 +3127,7 @@ static void rcu_barrier_func(void *unused) */ void rcu_barrier(void) { - int cpu; + uintptr_t cpu; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -3149,13 +3150,14 @@ void rcu_barrier(void) rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* - * Initialize the count to one rather than to zero in order to - * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Exclude CPU-hotplug operations - * to ensure that no offline CPU has callbacks queued. + * Initialize the count to two rather than to zero in order + * to avoid a too-soon return to zero in case of an immediate + * invocation of the just-enqueued callback (or preemption of + * this task). Exclude CPU-hotplug operations to ensure that no + * offline non-offloaded CPU has callbacks queued. */ init_completion(&rcu_state.barrier_completion); - atomic_set(&rcu_state.barrier_cpu_count, 1); + atomic_set(&rcu_state.barrier_cpu_count, 2); get_online_cpus(); /* @@ -3165,13 +3167,23 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (!cpu_online(cpu) && + if (cpu_is_offline(cpu) && !rcu_segcblist_is_offloaded(&rdp->cblist)) continue; - if (rcu_segcblist_n_cbs(&rdp->cblist)) { + if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); + smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); + } else if (rcu_segcblist_n_cbs(&rdp->cblist) && + cpu_is_offline(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, + rcu_state.barrier_sequence); + local_irq_disable(); + rcu_barrier_func((void *)cpu); + local_irq_enable(); + } else if (cpu_is_offline(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, + rcu_state.barrier_sequence); } else { rcu_barrier_trace(TPS("OnlineNQ"), cpu, rcu_state.barrier_sequence); @@ -3183,7 +3195,7 @@ void rcu_barrier(void) * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) + if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count)) complete(&rcu_state.barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ -- cgit v1.2.3