summary refs log tree commit diff
path: root/kernel
diff options
context:
space:
mode:
author Stephen Rothwell <sfr@canb.auug.org.au> 2017-02-09 14:13:13 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au> 2017-02-09 14:13:16 +1100
commit e15246babb490d1285a70591b5544d57c427c4d5 (patch)
tree 974cb736e8c5049ed96d6fce94619d63fced84a7 /kernel
parent 37b9a1de57381e12e565bc5dda6be7528abbbb35 (diff)
parent 40b1f8f7b57f89ce2709a3d901a8f0bf3740fbcb (diff)
Merge remote-tracking branch 'rcu/rcu/next'
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/fork.c 4
-rw-r--r-- kernel/locking/lockdep.c 80
-rw-r--r-- kernel/locking/locktorture.c 6
-rw-r--r-- kernel/locking/rtmutex-debug.c 9
-rw-r--r-- kernel/membarrier.c 4
-rw-r--r-- kernel/rcu/rcu.h 4
-rw-r--r-- kernel/rcu/rcutorture.c 19
-rw-r--r-- kernel/rcu/srcu.c 169
-rw-r--r-- kernel/rcu/tiny.c 22
-rw-r--r-- kernel/rcu/tiny_plugin.h 4
-rw-r--r-- kernel/rcu/tree.c 429
-rw-r--r-- kernel/rcu/tree.h 26
-rw-r--r-- kernel/rcu/tree_exp.h 40
-rw-r--r-- kernel/rcu/tree_plugin.h 23
-rw-r--r-- kernel/rcu/tree_trace.c 9
-rw-r--r-- kernel/rcu/update.c 6
-rw-r--r-- kernel/sched/core.c 1
-rw-r--r-- kernel/signal.c 2
18 files changed, 501 insertions, 356 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 2de63d1d654c..84a907592d34 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1299,7 +1299,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
if (atomic_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
- * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
* without an RCU grace period, see __lock_task_sighand().
*/
kmem_cache_free(sighand_cachep, sighand);
@@ -2079,7 +2079,7 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7c38f8f3d97b..45acb86d3f56 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1142,10 +1142,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
return 0;
printk("\n");
- printk("======================================================\n");
- printk("[ INFO: possible circular locking dependency detected ]\n");
+ pr_warn("======================================================\n");
+ pr_warn("WARNING: possible circular locking dependency detected\n");
print_kernel_ident();
- printk("-------------------------------------------------------\n");
+ pr_warn("------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
@@ -1480,11 +1480,11 @@ print_bad_irq_dependency(struct task_struct *curr,
return 0;
printk("\n");
- printk("======================================================\n");
- printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ pr_warn("=====================================================\n");
+ pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
irqclass, irqclass);
print_kernel_ident();
- printk("------------------------------------------------------\n");
+ pr_warn("-----------------------------------------------------\n");
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1709,10 +1709,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
return 0;
printk("\n");
- printk("=============================================\n");
- printk("[ INFO: possible recursive locking detected ]\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: possible recursive locking detected\n");
print_kernel_ident();
- printk("---------------------------------------------\n");
+ pr_warn("--------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(next);
@@ -2059,10 +2059,10 @@ static void print_collision(struct task_struct *curr,
struct lock_chain *chain)
{
printk("\n");
- printk("======================\n");
- printk("[chain_key collision ]\n");
+ pr_warn("============================\n");
+ pr_warn("WARNING: chain_key collision\n");
print_kernel_ident();
- printk("----------------------\n");
+ pr_warn("----------------------------\n");
printk("%s/%d: ", current->comm, task_pid_nr(current));
printk("Hash chain already cached but the contents don't match!\n");
@@ -2358,10 +2358,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
return 0;
printk("\n");
- printk("=================================\n");
- printk("[ INFO: inconsistent lock state ]\n");
+ pr_warn("================================\n");
+ pr_warn("WARNING: inconsistent lock state\n");
print_kernel_ident();
- printk("---------------------------------\n");
+ pr_warn("--------------------------------\n");
printk("inconsistent {%s} -> {%s} usage.\n",
usage_str[prev_bit], usage_str[new_bit]);
@@ -2423,10 +2423,10 @@ print_irq_inversion_bug(struct task_struct *curr,
return 0;
printk("\n");
- printk("=========================================================\n");
- printk("[ INFO: possible irq lock inversion dependency detected ]\n");
+ pr_warn("========================================================\n");
+ pr_warn("WARNING: possible irq lock inversion dependency detected\n");
print_kernel_ident();
- printk("---------------------------------------------------------\n");
+ pr_warn("--------------------------------------------------------\n");
printk("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(this);
@@ -3168,10 +3168,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
return 0;
printk("\n");
- printk("==================================\n");
- printk("[ BUG: Nested lock was not taken ]\n");
+ pr_warn("==================================\n");
+ pr_warn("WARNING: Nested lock was not taken\n");
print_kernel_ident();
- printk("----------------------------------\n");
+ pr_warn("----------------------------------\n");
printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
print_lock(hlock);
@@ -3374,10 +3374,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
printk("\n");
- printk("=====================================\n");
- printk("[ BUG: bad unlock balance detected! ]\n");
+ pr_warn("=====================================\n");
+ pr_warn("WARNING: bad unlock balance detected!\n");
print_kernel_ident();
- printk("-------------------------------------\n");
+ pr_warn("-------------------------------------\n");
printk("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
@@ -3871,10 +3871,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
printk("\n");
- printk("=================================\n");
- printk("[ BUG: bad contention detected! ]\n");
+ pr_warn("=================================\n");
+ pr_warn("WARNING: bad contention detected!\n");
print_kernel_ident();
- printk("---------------------------------\n");
+ pr_warn("---------------------------------\n");
printk("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
@@ -4235,10 +4235,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
return;
printk("\n");
- printk("=========================\n");
- printk("[ BUG: held lock freed! ]\n");
+ pr_warn("=========================\n");
+ pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
- printk("-------------------------\n");
+ pr_warn("-------------------------\n");
printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
@@ -4293,11 +4293,11 @@ static void print_held_locks_bug(void)
return;
printk("\n");
- printk("=====================================\n");
- printk("[ BUG: %s/%d still has locks held! ]\n",
+ pr_warn("====================================\n");
+ pr_warn("WARNING: %s/%d still has locks held!\n",
current->comm, task_pid_nr(current));
print_kernel_ident();
- printk("-------------------------------------\n");
+ pr_warn("------------------------------------\n");
lockdep_print_held_locks(current);
printk("\nstack backtrace:\n");
dump_stack();
@@ -4362,7 +4362,7 @@ retry:
} while_each_thread(g, p);
printk("\n");
- printk("=============================================\n\n");
+ pr_warn("=============================================\n\n");
if (unlock)
read_unlock(&tasklist_lock);
@@ -4392,10 +4392,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (!debug_locks_off())
return;
printk("\n");
- printk("================================================\n");
- printk("[ BUG: lock held when returning to user space! ]\n");
+ pr_warn("================================================\n");
+ pr_warn("WARNING: lock held when returning to user space!\n");
print_kernel_ident();
- printk("------------------------------------------------\n");
+ pr_warn("------------------------------------------------\n");
printk("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
@@ -4412,10 +4412,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
printk("\n");
- printk("===============================\n");
- printk("[ INFO: suspicious RCU usage. ]\n");
+ pr_warn("=============================\n");
+ pr_warn("WARNING: suspicious RCU usage\n");
print_kernel_ident();
- printk("-------------------------------\n");
+ pr_warn("-----------------------------\n");
printk("%s:%d %s!\n", file, line, s);
printk("\nother info that might help us debug this:\n\n");
printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 9bffedd82884..28350dc8ecbb 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -852,6 +852,10 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+
+ kfree(cxt.lwsa);
+ kfree(cxt.lrsa);
+
end:
torture_cleanup_end();
}
@@ -997,6 +1001,8 @@ static int __init lock_torture_init(void)
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 62b6cee8ea7f..7f8a9e2ced6e 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -101,10 +101,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
return;
}
- printk("\n============================================\n");
- printk( "[ BUG: circular locking deadlock detected! ]\n");
- printk("%s\n", print_tainted());
- printk( "--------------------------------------------\n");
+ pr_warn("\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: circular locking deadlock detected!\n");
+ pr_warn("%s\n", print_tainted());
+ pr_warn("--------------------------------------------\n");
printk("%s/%d is deadlocking current task %s/%d\n\n",
task->comm, task_pid_nr(task),
current->comm, task_pid_nr(current));
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727a56e9..9f9284f37f8d 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/membarrier.h>
+#include <linux/tick.h>
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
@@ -51,6 +52,9 @@
*/
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -ENOSYS;
if (unlikely(flags))
return -EINVAL;
switch (cmd) {
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..8700a81daf56 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -109,12 +109,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
- RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
+ RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
} else {
- RCU_TRACE(trace_rcu_invoke_callback(rn, head));
+ RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
head->func(head);
rcu_lock_release(&rcu_callback_map);
return false;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 87c51225ceec..d81345be730e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -564,10 +564,25 @@ static void srcu_torture_stats(void)
pr_alert("%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
long c0, c1;
+ struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
- c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+ u0 = counts->unlock_count[!idx];
+ u1 = counts->unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->lock_count[!idx];
+ l1 = counts->lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..52ec4ddeacdc 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -65,6 +65,17 @@ static inline bool rcu_batch_empty(struct rcu_batch *b)
}
/*
+ * Are all batches empty for the specified srcu_struct?
+ */
+static inline bool rcu_all_batches_empty(struct srcu_struct *sp)
+{
+ return rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue);
+}
+
+/*
* Remove the callback at the head of the specified rcu_batch structure
* and return a pointer to it, or return NULL if the structure is empty.
*/
@@ -106,7 +117,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
rcu_batch_init(&sp->batch_check1);
rcu_batch_init(&sp->batch_done);
INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_array);
return sp->per_cpu_ref ? 0 : -ENOMEM;
}
@@ -141,114 +152,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
* rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
{
int cpu;
unsigned long sum = 0;
- unsigned long t;
for_each_possible_cpu(cpu) {
- t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
- sum += t;
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->lock_count[idx]);
}
return sum;
}
/*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
{
int cpu;
unsigned long sum = 0;
- unsigned long t;
for_each_possible_cpu(cpu) {
- t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
- sum += t;
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->unlock_count[idx]);
}
return sum;
}
/*
* Return true if the number of pre-existing readers is determined to
- * be stably zero. An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement. This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
*/
static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
{
- unsigned long seq;
+ unsigned long unlocks;
- seq = srcu_readers_seq_idx(sp, idx);
+ unlocks = srcu_readers_unlock_idx(sp, idx);
/*
- * The following smp_mb() A pairs with the smp_mb() B located in
- * __srcu_read_lock(). This pairing ensures that if an
- * __srcu_read_lock() increments its counter after the summation
- * in srcu_readers_active_idx(), then the corresponding SRCU read-side
- * critical section will see any changes made prior to the start
- * of the current SRCU grace period.
+ * Make sure that a lock is always counted if the corresponding unlock
+ * is counted. Needs to be a smp_mb() as the read side may contain a
+ * read from a variable that is written to before the synchronize_srcu()
+ * in the write side. In this case smp_mb()s A and B act like the store
+ * buffering pattern.
*
- * Also, if the above call to srcu_readers_seq_idx() saw the
- * increment of ->seq[], then the call to srcu_readers_active_idx()
- * must see the increment of ->c[].
+ * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
+ * synchronize_srcu() from being executed before the grace period ends.
*/
smp_mb(); /* A */
/*
- * Note that srcu_readers_active_idx() can incorrectly return
- * zero even though there is a pre-existing reader throughout.
- * To see this, suppose that task A is in a very long SRCU
- * read-side critical section that started on CPU 0, and that
- * no other reader exists, so that the sum of the counters
- * is equal to one. Then suppose that task B starts executing
- * srcu_readers_active_idx(), summing up to CPU 1, and then that
- * task C starts reading on CPU 0, so that its increment is not
- * summed, but finishes reading on CPU 2, so that its decrement
- * -is- summed. Then when task B completes its sum, it will
- * incorrectly get zero, despite the fact that task A has been
- * in its SRCU read-side critical section the whole time.
- *
- * We therefore do a validation step should srcu_readers_active_idx()
- * return zero.
- */
- if (srcu_readers_active_idx(sp, idx) != 0)
- return false;
-
- /*
- * The remainder of this function is the validation step.
- * The following smp_mb() D pairs with the smp_mb() C in
- * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
- * by srcu_readers_active_idx() above, then any destructive
- * operation performed after the grace period will happen after
- * the corresponding SRCU read-side critical section.
+ * If the locks are the same as the unlocks, then there must have
+ * been no readers on this index at some time in between. This does not
+ * mean that there are no more readers, as one could have read the
+ * current index but not have incremented the lock counter yet.
*
- * Note that there can be at most NR_CPUS worth of readers using
- * the old index, which is not enough to overflow even a 32-bit
- * integer. (Yes, this does mean that systems having more than
- * a billion or so CPUs need to be 64-bit systems.) Therefore,
- * the sum of the ->seq[] counters cannot possibly overflow.
- * Therefore, the only way that the return values of the two
- * calls to srcu_readers_seq_idx() can be equal is if there were
- * no increments of the corresponding rank of ->seq[] counts
- * in the interim. But the missed-increment scenario laid out
- * above includes an increment of the ->seq[] counter by
- * the corresponding __srcu_read_lock(). Therefore, if this
- * scenario occurs, the return values from the two calls to
- * srcu_readers_seq_idx() will differ, and thus the validation
- * step below suffices.
+ * Possible bug: There is no guarantee that there haven't been ULONG_MAX
+ * increments of ->lock_count[] since the unlocks were counted, meaning
+ * that this could return true even if there are still active readers.
+ * Since there are no memory barriers around srcu_flip(), the CPU is not
+ * required to increment ->completed before running
+ * srcu_readers_unlock_idx(), which means that there could be an
+ * arbitrarily large number of critical sections that execute after
+ * srcu_readers_unlock_idx() but use the old value of ->completed.
*/
- smp_mb(); /* D */
-
- return srcu_readers_seq_idx(sp, idx) == seq;
+ return srcu_readers_lock_idx(sp, idx) == unlocks;
}
/**
@@ -266,8 +240,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->lock_count[0]);
+ sum += READ_ONCE(cpuc->lock_count[1]);
+ sum -= READ_ONCE(cpuc->unlock_count[0]);
+ sum -= READ_ONCE(cpuc->unlock_count[1]);
}
return sum;
}
@@ -283,6 +261,11 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
{
if (WARN_ON(srcu_readers_active(sp)))
return; /* Leakage unless caller handles error. */
+ if (WARN_ON(!rcu_all_batches_empty(sp)))
+ return; /* Leakage unless caller handles error. */
+ flush_delayed_work(&sp->work);
+ if (WARN_ON(sp->running))
+ return; /* Caller forgot to stop doing call_srcu()? */
free_percpu(sp->per_cpu_ref);
sp->per_cpu_ref = NULL;
}
@@ -298,9 +281,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
idx = READ_ONCE(sp->completed) & 0x1;
- __this_cpu_inc(sp->per_cpu_ref->c[idx]);
+ __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +296,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_dec(sp->per_cpu_ref->c[idx]);
+ this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -349,12 +331,21 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
/*
* Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * use the other rank of the ->(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *sp)
{
- sp->completed++;
+ WRITE_ONCE(sp->completed, sp->completed + 1);
+
+ /*
+ * Ensure that if the updater misses an __srcu_read_unlock()
+ * increment, that task's next __srcu_read_lock() will see the
+ * above counter update. Note that both this memory barrier
+ * and the one in srcu_readers_active_idx_check() provide the
+ * guarantee for __srcu_read_lock().
+ */
+ smp_mb(); /* D */ /* Pairs with C. */
}
/*
@@ -392,6 +383,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
head->next = NULL;
head->func = func;
spin_lock_irqsave(&sp->queue_lock, flags);
+ smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
rcu_batch_queue(&sp->batch_queue, head);
if (!sp->running) {
sp->running = true;
@@ -425,6 +417,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
head->next = NULL;
head->func = wakeme_after_rcu;
spin_lock_irq(&sp->queue_lock);
+ smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
if (!sp->running) {
/* steal the processing owner */
sp->running = true;
@@ -444,8 +437,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
spin_unlock_irq(&sp->queue_lock);
}
- if (!done)
+ if (!done) {
wait_for_completion(&rcu.completion);
+ smp_mb(); /* Caller's later accesses after GP. */
+ }
+
}
/**
@@ -613,7 +609,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
/*
* Invoke a limited number of SRCU callbacks that have passed through
* their grace period. If there are more to do, SRCU will reschedule
- * the workqueue.
+ * the workqueue. Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
*/
static void srcu_invoke_callbacks(struct srcu_struct *sp)
{
@@ -638,15 +635,9 @@ static void srcu_reschedule(struct srcu_struct *sp)
{
bool pending = true;
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
+ if (rcu_all_batches_empty(sp)) {
spin_lock_irq(&sp->queue_lock);
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
+ if (rcu_all_batches_empty(sp)) {
sp->running = false;
pending = false;
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index b23a4d076f3d..2b6025ca43f0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -41,8 +41,6 @@
/* Forward declarations for tiny_plugin.h. */
struct rcu_ctrlblk;
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
rcu_callback_t func,
struct rcu_ctrlblk *rcp);
@@ -69,7 +67,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- RCU_TRACE(reset_cpu_stall_ticks(rcp));
+ RCU_TRACE(reset_cpu_stall_ticks(rcp);)
if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
@@ -115,7 +113,7 @@ void rcu_bh_qs(void)
*/
void rcu_check_callbacks(int user)
{
- RCU_TRACE(check_cpu_stalls());
+ RCU_TRACE(check_cpu_stalls();)
if (user)
rcu_sched_qs();
else if (!in_softirq())
@@ -133,7 +131,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
- RCU_TRACE(int cb_count = 0);
+ RCU_TRACE(int cb_count = 0;)
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
@@ -142,7 +140,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
return;
}
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
@@ -152,7 +150,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
- RCU_TRACE(rn = rcp->name);
+ RCU_TRACE(rn = rcp->name;)
while (list) {
next = list->next;
prefetch(next);
@@ -161,9 +159,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
__rcu_reclaim(rn, list);
local_bh_enable();
list = next;
- RCU_TRACE(cb_count++);
+ RCU_TRACE(cb_count++;)
}
- RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
RCU_TRACE(trace_rcu_batch_end(rcp->name,
cb_count, 0, need_resched(),
is_idle_task(current),
@@ -211,7 +209,7 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
- RCU_TRACE(rcp->qlen++);
+ RCU_TRACE(rcp->qlen++;)
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
@@ -244,8 +242,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..df3a60e19f07 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -162,8 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
static void check_cpu_stalls(void)
{
- RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
- RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
}
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb4e2056ccf3..508a6a4bc523 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -270,19 +270,169 @@ void rcu_bh_qs(void)
}
}
-static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+/*
+ * Steal a bit from the bottom of ->dynticks for idle entry/exit
+ * control. Initially this is for TLB flushing.
+ */
+#define RCU_DYNTICK_CTRL_MASK 0x1
+#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
+#ifndef rcu_eqs_special_exit
+#define rcu_eqs_special_exit() do { } while (0)
+#endif
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
+ .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
.dynticks_idle = ATOMIC_INIT(1),
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
-DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state.
+ */
+static void rcu_dynticks_eqs_enter(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ /* Better be in an extended quiescent state! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_CTR));
+ /* Better not have special action (TLB flush) pending! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_MASK));
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state.
+ */
+static void rcu_dynticks_eqs_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(seq & RCU_DYNTICK_CTRL_CTR));
+ if (seq & RCU_DYNTICK_CTRL_MASK) {
+ rcu_eqs_special_exit();
+ /* Prefer duplicate flushes to losing a flush. */
+ smp_mb__before_atomic(); /* NMI safety. */
+ atomic_and(~RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
+ }
+}
+
+/*
+ * Reset the current CPU's ->dynticks counter to indicate that the
+ * newly onlined CPU is no longer in an extended quiescent state.
+ * This will either leave the counter unchanged, or increment it
+ * to the next non-quiescent value.
+ *
+ * The non-atomic test/increment sequence works because the upper bits
+ * of the ->dynticks counter are manipulated only by the corresponding CPU,
+ * or when the corresponding CPU is offline.
+ */
+static void rcu_dynticks_eqs_online(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ return;
+ atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+}
+
+/*
+ * Is the current CPU in an extended quiescent state?
+ *
+ * No ordering, as we are sampling CPU-local information.
+ */
+bool rcu_dynticks_curr_cpu_in_eqs(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+}
+
+/*
+ * Snapshot the ->dynticks counter with full ordering so as to allow
+ * stable comparison of this counter with past and future snapshots.
+ */
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
+{
+ int snap = atomic_add_return(0, &rdtp->dynticks);
+
+ return snap & ~RCU_DYNTICK_CTRL_MASK;
+}
+
+/*
+ * Return true if the snapshot returned from rcu_dynticks_snap()
+ * indicates that RCU is in an extended quiescent state.
+ */
+static bool rcu_dynticks_in_eqs(int snap)
+{
+ return !(snap & RCU_DYNTICK_CTRL_CTR);
+}
+
+/*
+ * Return true if the CPU corresponding to the specified rcu_dynticks
+ * structure has spent some time in an extended quiescent state since
+ * rcu_dynticks_snap() returned the specified snapshot.
+ */
+static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
+{
+ return snap != rcu_dynticks_snap(rdtp);
+}
+
+/*
+ * Do a double-increment of the ->dynticks counter to emulate a
+ * momentary idle-CPU quiescent state.
+ */
+static void rcu_dynticks_momentary_idle(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
+ &rdtp->dynticks);
+
+ /* It is illegal to call this from idle state. */
+ WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+}
+
+/*
+ * Set the special (bottom) bit of the specified CPU so that it
+ * will take special action (such as flushing its TLB) on the
+ * next exit from an extended quiescent state. Returns true if
+ * the bit was successfully set, or false if the CPU was not in
+ * an extended quiescent state.
+ */
+bool rcu_eqs_special_set(int cpu)
+{
+ int old;
+ int new;
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ do {
+ old = atomic_read(&rdtp->dynticks);
+ if (old & RCU_DYNTICK_CTRL_CTR)
+ return false;
+ new = old | RCU_DYNTICK_CTRL_MASK;
+ } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
+ return true;
+}
/*
* Let the RCU core know that this CPU has gone through the scheduler,
@@ -291,48 +441,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* memory barriers to let the RCU core know about it, regardless of what
* this CPU might (or might not) do in the near future.
*
- * We inform the RCU core by emulating a zero-duration dyntick-idle
- * period, which we in turn do by incrementing the ->dynticks counter
- * by two.
+ * We inform the RCU core by emulating a zero-duration dyntick-idle period.
*
* The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- struct rcu_data *rdp;
- struct rcu_dynticks *rdtp;
- int resched_mask;
- struct rcu_state *rsp;
-
- /*
- * Yes, we can lose flag-setting operations. This is OK, because
- * the flag will be set again after some delay.
- */
- resched_mask = raw_cpu_read(rcu_sched_qs_mask);
- raw_cpu_write(rcu_sched_qs_mask, 0);
-
- /* Find the flavor that needs a quiescent state. */
- for_each_rcu_flavor(rsp) {
- rdp = raw_cpu_ptr(rsp->rda);
- if (!(resched_mask & rsp->flavor_mask))
- continue;
- smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (READ_ONCE(rdp->mynode->completed) !=
- READ_ONCE(rdp->cond_resched_completed))
- continue;
-
- /*
- * Pretend to be momentarily idle for the quiescent state.
- * This allows the grace-period kthread to record the
- * quiescent state, with no need for this CPU to do anything
- * further.
- */
- rdtp = this_cpu_ptr(&rcu_dynticks);
- smp_mb__before_atomic(); /* Earlier stuff before QS. */
- atomic_add(2, &rdtp->dynticks); /* QS. */
- smp_mb__after_atomic(); /* Later stuff after QS. */
- break;
- }
+ raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
+ rcu_dynticks_momentary_idle();
}
/*
@@ -342,12 +458,22 @@ static void rcu_momentary_dyntick_idle(void)
*/
void rcu_note_context_switch(void)
{
+ struct rcu_state *rsp;
+
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
rcu_preempt_note_context_switch();
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
+ goto out;
+ this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
+ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
+ for_each_rcu_flavor(rsp)
+ do_nocb_deferred_wakeup(this_cpu_ptr(rsp->rda));
+ this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
+out:
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -369,30 +495,30 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
void rcu_all_qs(void)
{
unsigned long flags;
+ struct rcu_state *rsp;
+ if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
+ return;
+ preempt_disable();
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
+ preempt_enable();
+ return;
+ }
+ this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
- if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) {
- /*
- * Yes, we just checked a per-CPU variable with preemption
- * enabled, so we might be migrated to some other CPU at
- * this point. That is OK because in that case, the
- * migration will supply the needed quiescent state.
- * We might end up needlessly disabling preemption and
- * invoking rcu_sched_qs() on the destination CPU, but
- * the probability and cost are both quite low, so this
- * should not be a problem in practice.
- */
- preempt_disable();
+ if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
rcu_sched_qs();
- preempt_enable();
- }
- this_cpu_inc(rcu_qs_ctr);
+ for_each_rcu_flavor(rsp)
+ do_nocb_deferred_wakeup(this_cpu_ptr(rsp->rda));
+ this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -611,7 +737,7 @@ static int
cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
{
return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
- rdp->nxttail[RCU_DONE_TAIL] != NULL;
+ rdp->nxttail[RCU_NEXT_TAIL] != NULL;
}
/*
@@ -673,7 +799,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -692,12 +818,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
do_nocb_deferred_wakeup(rdp);
}
rcu_prepare_for_idle();
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_eqs_enter();
rcu_dynticks_task_enter();
/*
@@ -826,15 +947,10 @@ void rcu_irq_exit_irqson(void)
*/
static void rcu_eqs_exit_common(long long oldval, int user)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
rcu_dynticks_task_exit();
- smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(atomic_read(&rdtp->dynticks) & 0x1));
+ rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -980,12 +1096,8 @@ void rcu_nmi_enter(void)
* to be in the outermost NMI handler that interrupted an RCU-idle
* period (observation due to Andy Lutomirski).
*/
- if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* atomic_inc() before later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_dynticks_eqs_exit();
incby = 1;
}
rdtp->dynticks_nmi_nesting += incby;
@@ -1010,7 +1122,7 @@ void rcu_nmi_exit(void)
* to us!)
*/
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
/*
* If the nesting level is not 1, the CPU wasn't RCU-idle, so
@@ -1023,11 +1135,7 @@ void rcu_nmi_exit(void)
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
rdtp->dynticks_nmi_nesting = 0;
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force delay to next write. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_eqs_enter();
}
/**
@@ -1040,7 +1148,7 @@ void rcu_nmi_exit(void)
*/
bool notrace __rcu_is_watching(void)
{
- return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
+ return !rcu_dynticks_curr_cpu_in_eqs();
}
/**
@@ -1123,9 +1231,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
static int dyntick_save_progress_counter(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
- rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
- if ((rdp->dynticks_snap & 0x1) == 0) {
+ if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
@@ -1144,12 +1252,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
- unsigned int curr;
- int *rcrmp;
- unsigned int snap;
-
- curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
- snap = (unsigned int)rdp->dynticks_snap;
+ unsigned long jtsq;
+ bool *rnhqp;
+ bool *ruqp;
+ unsigned long rjtsc;
+ struct rcu_node *rnp;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1159,27 +1266,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
return 1;
}
+ /* Compute and saturate jiffies_till_sched_qs. */
+ jtsq = jiffies_till_sched_qs;
+ rjtsc = rcu_jiffies_till_stall_check();
+ if (jtsq > rjtsc / 2) {
+ WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
+ jtsq = rjtsc / 2;
+ } else if (jtsq < 1) {
+ WRITE_ONCE(jiffies_till_sched_qs, 1);
+ jtsq = 1;
+ }
+
/*
- * Check for the CPU being offline, but only if the grace period
- * is old enough. We don't need to worry about the CPU changing
- * state: If we see it offline even once, it has been through a
- * quiescent state.
- *
- * The reason for insisting that the grace period be at least
- * one jiffy old is that CPUs that are not quite online and that
- * have just gone offline can still execute RCU read-side critical
- * sections.
+ * Has this CPU encountered a cond_resched_rcu_qs() since the
+ * beginning of the grace period? For this to be the case,
+ * the CPU has to have noticed the current grace period. This
+ * might not be the case for nohz_full CPUs looping in the kernel.
*/
- if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
- return 0; /* Grace period is not old enough. */
- barrier();
- if (cpu_is_offline(rdp->cpu)) {
+ rnp = rdp->mynode;
+ ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
+ if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
+ READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
+ READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ return 1;
+ } else {
+ /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
+ smp_store_release(ruqp, true);
+ }
+
+ /* Check for the CPU being offline. */
+ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
return 1;
@@ -1192,7 +1315,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* in-kernel CPU-bound tasks cannot advance grace periods.
* So if the grace period is old enough, make the CPU pay attention.
* Note that the unsynchronized assignments to the per-CPU
- * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * rcu_need_heavy_qs variable are safe. Yes, setting of
* bits can be lost, but they will be set again on the next
* force-quiescent-state pass. So lost bit sets do not result
* in incorrect behavior, merely in a grace period lasting
@@ -1206,25 +1329,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* is set too high, we override with half of the RCU CPU stall
* warning delay.
*/
- rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- WRITE_ONCE(rdp->cond_resched_completed,
- READ_ONCE(rdp->mynode->completed));
- smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- WRITE_ONCE(*rcrmp,
- READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
- }
+ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
+ if (!READ_ONCE(*rnhqp) &&
+ (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
+ time_after(jiffies, rdp->rsp->jiffies_resched))) {
+ WRITE_ONCE(*rnhqp, true);
+ /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
+ smp_store_release(ruqp, true);
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
- /* And if it has been a really long time, kick the CPU as well. */
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ /*
+ * If more than halfway to RCU CPU stall-warning time, do
+ * a resched_cpu() to try to loosen things up a bit.
+ */
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ resched_cpu(rdp->cpu);
return 0;
}
@@ -1277,7 +1397,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
}
/*
- * Dump stacks of all tasks running on stalled CPUs.
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1287,11 +1410,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (rnp->qsmask != 0) {
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
- }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1379,6 +1501,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_print_detail_task_stall(rsp);
} else {
if (READ_ONCE(rsp->gpnum) != gpnum ||
READ_ONCE(rsp->completed) == gpnum) {
@@ -1395,9 +1520,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
}
}
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall(rsp);
-
rcu_check_gp_kthread_starvation(rsp);
panic_on_rcu_stall();
@@ -1879,7 +2001,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
need_gp = !!(rnp->qsmask & rdp->grpmask);
rdp->cpu_no_qs.b.norm = need_gp;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
@@ -2467,10 +2589,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if ((rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
- rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
- rdp->gpwrap) {
+ if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum ||
+ rnp->completed == rnp->gpnum || rdp->gpwrap) {
/*
* The grace period in which this quiescent state was
@@ -2479,7 +2599,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* within the current grace period.
*/
rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -2525,8 +2645,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
+ if (rdp->cpu_no_qs.b.norm)
return;
/*
@@ -2649,14 +2768,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
- RCU_TRACE(unsigned long mask);
- RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+ RCU_TRACE(unsigned long mask;)
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return;
- RCU_TRACE(mask = rdp->grpmask);
+ RCU_TRACE(mask = rdp->grpmask;)
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
TPS("cpuofl"));
@@ -3478,11 +3597,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
rdp->n_rp_core_needs_qs++;
- } else if (rdp->core_needs_qs &&
- (!rdp->cpu_no_qs.b.norm ||
- rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
+ } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -3748,7 +3865,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
- WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -3765,7 +3882,6 @@ static void
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3778,8 +3894,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
- atomic_set(&rdp->dynticks->dynticks,
- (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+ rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -3788,7 +3903,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
* of the next grace period.
*/
rnp = rdp->mynode;
- mask = rdp->grpmask;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
if (!rdp->beenonline)
WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
@@ -3796,7 +3910,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
rdp->cpu_no_qs.b.norm = true;
- rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
+ rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -3872,7 +3986,7 @@ void rcu_cpu_starting(unsigned int cpu)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp) {
- rdp = this_cpu_ptr(rsp->rda);
+ rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -4035,7 +4149,6 @@ static void __init rcu_init_one(struct rcu_state *rsp)
static const char * const fqs[] = RCU_FQS_NAME_INIT;
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
- static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
@@ -4057,8 +4170,6 @@ static void __init rcu_init_one(struct rcu_state *rsp)
for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
rcu_init_levelspread(levelspread, levelcnt);
- rsp->flavor_mask = fl_mask;
- fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fe98dd24adf8..ee48fe56909e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -51,11 +51,7 @@
#ifdef CONFIG_RCU_FANOUT_LEAF
#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
-# ifdef CONFIG_64BIT
-# define RCU_FANOUT_LEAF 64
-# else
-# define RCU_FANOUT_LEAF 32
-# endif
+#define RCU_FANOUT_LEAF 16
#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
@@ -112,6 +108,9 @@ struct rcu_dynticks {
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
+ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
+ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
+ bool rcu_urgent_qs; /* GP old, need light quiescent state. */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
long long dynticks_idle_nesting;
/* irq/process nesting level from idle. */
@@ -481,7 +480,6 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS + 1];
/* Hierarchy levels (+1 to */
/* shut bogus gcc warning) */
- u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
call_rcu_func_t call; /* call_rcu() flavor. */
int ncpus; /* # CPUs seen so far. */
@@ -521,7 +519,6 @@ struct rcu_state {
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
@@ -595,6 +592,9 @@ extern struct rcu_state rcu_bh_state;
extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+bool rcu_eqs_special_set(int cpu);
+
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
@@ -688,18 +688,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #ifdef CONFIG_RCU_TRACE */
/*
- * Place this after a lock-acquisition primitive to guarantee that
- * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
- * if the UNLOCK and LOCK are executed by the same CPU or if the
- * UNLOCK and LOCK operate on the same lock variable.
- */
-#ifdef CONFIG_PPC
-#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
-#else /* #ifdef CONFIG_PPC */
-#define smp_mb__after_unlock_lock() do { } while (0)
-#endif /* #else #ifdef CONFIG_PPC */
-
-/*
* Wrappers for the rcu_node::lock acquire and release.
*
* Because the rcu_nodes form a tree, the tree traversal locking will observe
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e59e1849b89a..a1f52bbe9db6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,16 +20,26 @@
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-/* Wrapper functions for expedited grace periods. */
+/*
+ * Record the start of an expedited grace period.
+ */
static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
{
rcu_seq_start(&rsp->expedited_sequence);
}
+
+/*
+ * Record the end of an expedited grace period.
+ */
static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
{
rcu_seq_end(&rsp->expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
+
+/*
+ * Take a snapshot of the expedited-grace-period counter.
+ */
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
unsigned long s;
@@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
return s;
}
+
+/*
+ * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true
+ * if a full expedited grace period has elapsed since that snapshot
+ * was taken.
+ */
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
{
return rcu_seq_done(&rsp->expedited_sequence, s);
@@ -315,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
return;
}
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
resched_cpu(smp_processor_id());
}
@@ -356,12 +374,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
mask_ofl_test = 0;
for_each_leaf_node_possible_cpu(rnp, cpu) {
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
rdp->exp_dynticks_snap =
- atomic_add_return(0, &rdtp->dynticks);
+ rcu_dynticks_snap(rdp->dynticks);
if (raw_smp_processor_id() == cpu ||
- !(rdp->exp_dynticks_snap & 0x1) ||
+ rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
!(rnp->qsmaskinitnext & rdp->grpmask))
mask_ofl_test |= rdp->grpmask;
}
@@ -380,13 +397,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
if (!(mask_ofl_ipi & mask))
continue;
retry_ipi:
- if (atomic_add_return(0, &rdtp->dynticks) !=
- rdp->exp_dynticks_snap) {
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+ rdp->exp_dynticks_snap)) {
mask_ofl_test |= mask;
continue;
}
@@ -623,6 +639,11 @@ void synchronize_sched_expedited(void)
{
struct rcu_state *rsp = &rcu_sched_state;
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched_expedited() in RCU read-side critical section");
+
/* If only one CPU, this is automatically a grace period. */
if (rcu_blocking_is_gp())
return;
@@ -692,6 +713,11 @@ void synchronize_rcu_expedited(void)
{
struct rcu_state *rsp = rcu_state_p;
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
+
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 56583e764ebf..92450d620322 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
ticks_value, ticks_title,
- atomic_read(&rdtp->dynticks) & 0xfff,
+ rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
@@ -1766,6 +1766,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
return;
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
swake_up(&rdp_leader->nocb_wq);
}
@@ -1851,14 +1852,18 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
return;
}
len = atomic_long_read(&rdp->nocb_q_count);
- if (old_rhpp == &rdp->nocb_head) {
+ if (old_rhpp == &rdp->nocb_head ||
+ (rcu_nocb_need_deferred_wakeup(rdp) &&
+ !irqs_disabled_flags(flags))) {
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
+ /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmptyIsDeferred"));
}
@@ -1870,7 +1875,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
+ /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvfIsDeferred"));
}
@@ -2196,7 +2203,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
int ndw;
- if (!rcu_nocb_need_deferred_wakeup(rdp))
+ if (!rcu_nocb_need_deferred_wakeup(rdp) ||
+ unlikely(rcu_scheduler_active != RCU_SCHEDULER_RUNNING))
return;
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
@@ -2366,8 +2374,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
}
/*
- * Each pass through this loop sets up one rcu_data structure and
- * spawns one rcu_nocb_kthread().
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
*/
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index b1f28972872c..65b43be38e68 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -45,8 +45,6 @@
#define RCU_TREE_NONCORE
#include "tree.h"
-DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
@@ -121,10 +119,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
rdp->cpu_no_qs.b.norm,
- rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+ rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
rdp->core_needs_qs);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
- atomic_read(&rdp->dynticks->dynticks),
+ rcu_dynticks_snap(rdp->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
@@ -194,9 +192,8 @@ static int show_rcuexp(struct seq_file *m, void *v)
s2 += atomic_long_read(&rdp->exp_workdone2);
s3 += atomic_long_read(&rdp->exp_workdone3);
}
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
rsp->expedited_sequence, s0, s1, s2, s3,
- atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
return 0;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4f6db7e6a117..9e03db9ea9c0 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void)
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting =
- ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
@@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
*/
void rcu_end_inkernel_boot(void)
{
- if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d01f9d047397..dd4cefd4604f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4977,6 +4977,7 @@ int __sched _cond_resched(void)
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
diff --git a/kernel/signal.c b/kernel/signal.c
index 3603d93a1968..1c57784885a5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1232,7 +1232,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
}
/*
* This sighand can be already freed and even reused, but
- * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
* initializes ->siglock: this slab can't go away, it has
* the same object type, ->siglock can't be reinitialized.
*