diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-02-09 14:13:13 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-02-09 14:13:16 +1100 |
commit | e15246babb490d1285a70591b5544d57c427c4d5 (patch) | |
tree | 974cb736e8c5049ed96d6fce94619d63fced84a7 /kernel | |
parent | 37b9a1de57381e12e565bc5dda6be7528abbbb35 (diff) | |
parent | 40b1f8f7b57f89ce2709a3d901a8f0bf3740fbcb (diff) |
Merge remote-tracking branch 'rcu/rcu/next'
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/fork.c | 4 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 80 | ||||
-rw-r--r-- | kernel/locking/locktorture.c | 6 | ||||
-rw-r--r-- | kernel/locking/rtmutex-debug.c | 9 | ||||
-rw-r--r-- | kernel/membarrier.c | 4 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 4 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 19 | ||||
-rw-r--r-- | kernel/rcu/srcu.c | 169 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 22 | ||||
-rw-r--r-- | kernel/rcu/tiny_plugin.h | 4 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 429 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 26 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 40 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 23 | ||||
-rw-r--r-- | kernel/rcu/tree_trace.c | 9 | ||||
-rw-r--r-- | kernel/rcu/update.c | 6 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/signal.c | 2 |
18 files changed, 501 insertions, 356 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index 2de63d1d654c..84a907592d34 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1299,7 +1299,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); @@ -2079,7 +2079,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7c38f8f3d97b..45acb86d3f56 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1142,10 +1142,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); + pr_warn("======================================================\n"); + pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); - printk("-------------------------------------------------------\n"); + pr_warn("------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1480,11 +1480,11 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + pr_warn("=====================================================\n"); + pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); - printk("------------------------------------------------------\n"); + pr_warn("-----------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1709,10 +1709,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, return 0; printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); - printk("---------------------------------------------\n"); + pr_warn("--------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -2059,10 +2059,10 @@ static void print_collision(struct task_struct *curr, struct lock_chain *chain) { printk("\n"); - printk("======================\n"); - printk("[chain_key collision ]\n"); + pr_warn("============================\n"); + pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); - printk("----------------------\n"); + pr_warn("----------------------------\n"); printk("%s/%d: ", current->comm, task_pid_nr(current)); printk("Hash chain already cached but the contents don't match!\n"); @@ -2358,10 +2358,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, return 0; printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); + pr_warn("================================\n"); + pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("--------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2423,10 +2423,10 @@ print_irq_inversion_bug(struct task_struct *curr, return 0; printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + pr_warn("========================================================\n"); + pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); - printk("---------------------------------------------------------\n"); + pr_warn("--------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -3168,10 +3168,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; printk("\n"); - printk("==================================\n"); - printk("[ BUG: Nested lock was not taken ]\n"); + pr_warn("==================================\n"); + pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); - printk("----------------------------------\n"); + pr_warn("----------------------------------\n"); printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); @@ -3374,10 +3374,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! ]\n"); + pr_warn("=====================================\n"); + pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3871,10 +3871,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! ]\n"); + pr_warn("=================================\n"); + pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -4235,10 +4235,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, return; printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! ]\n"); + pr_warn("=========================\n"); + pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); - printk("-------------------------\n"); + pr_warn("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -4293,11 +4293,11 @@ static void print_held_locks_bug(void) return; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", + pr_warn("====================================\n"); + pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); @@ -4362,7 +4362,7 @@ retry: } while_each_thread(g, p); printk("\n"); - printk("=============================================\n\n"); + pr_warn("=============================================\n\n"); if (unlock) read_unlock(&tasklist_lock); @@ -4392,10 +4392,10 @@ asmlinkage __visible void lockdep_sys_exit(void) if (!debug_locks_off()) return; printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); + pr_warn("================================================\n"); + pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); - printk("------------------------------------------------\n"); + pr_warn("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); @@ -4412,10 +4412,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ printk("\n"); - printk("===============================\n"); - printk("[ INFO: suspicious RCU usage. ]\n"); + pr_warn("=============================\n"); + pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); - printk("-------------------------------\n"); + pr_warn("-----------------------------\n"); printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9bffedd82884..28350dc8ecbb 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -852,6 +852,10 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + + kfree(cxt.lwsa); + kfree(cxt.lrsa); + end: torture_cleanup_end(); } @@ -997,6 +1001,8 @@ static int __init lock_torture_init(void) GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + kfree(writer_tasks); + writer_tasks = NULL; firsterr = -ENOMEM; goto unwind; } diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 62b6cee8ea7f..7f8a9e2ced6e 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -101,10 +101,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! ]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); + pr_warn("\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: circular locking deadlock detected!\n"); + pr_warn("%s\n", print_tainted()); + pr_warn("--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), current->comm, task_pid_nr(current)); diff --git a/kernel/membarrier.c b/kernel/membarrier.c index 536c727a56e9..9f9284f37f8d 100644 --- a/kernel/membarrier.c +++ b/kernel/membarrier.c @@ -16,6 +16,7 @@ #include <linux/syscalls.h> #include <linux/membarrier.h> +#include <linux/tick.h> /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, @@ -51,6 +52,9 @@ */ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) { + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -ENOSYS; if (unlikely(flags)) return -EINVAL; switch (cmd) { diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..8700a81daf56 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -109,12 +109,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + RCU_TRACE(trace_rcu_invoke_callback(rn, head);) head->func(head); rcu_lock_release(&rcu_callback_map); return false; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 87c51225ceec..d81345be730e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,10 +564,25 @@ static void srcu_torture_stats(void) pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; long c0, c1; + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; + u0 = counts->unlock_count[!idx]; + u1 = counts->unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->lock_count[!idx]; + l1 = counts->lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9b9cdd549caa..52ec4ddeacdc 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -65,6 +65,17 @@ static inline bool rcu_batch_empty(struct rcu_batch *b) } /* + * Are all batches empty for the specified srcu_struct? + */ +static inline bool rcu_all_batches_empty(struct srcu_struct *sp) +{ + return rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue); +} + +/* * Remove the callback at the head of the specified rcu_batch structure * and return a pointer to it, or return NULL if the structure is empty. */ @@ -106,7 +117,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) rcu_batch_init(&sp->batch_check1); rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -141,114 +152,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->seq[] values for the + * Returns approximate total of the readers' ->lock_count[] values for the * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); } return sum; } /* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); } return sum; } /* * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. + * be zero. */ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { - unsigned long seq; + unsigned long unlocks; - seq = srcu_readers_seq_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(sp, idx); /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. + * Make sure that a lock is always counted if the corresponding unlock + * is counted. Needs to be a smp_mb() as the read side may contain a + * read from a variable that is written to before the synchronize_srcu() + * in the write side. In this case smp_mb()s A and B act like the store + * buffering pattern. * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the + * synchronize_srcu() from being executed before the grace period ends. */ smp_mb(); /* A */ /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. - * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. - */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does not + * mean that there are no more readers, as one could have read the + * current index but not have incremented the lock counter yet. * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. + * Possible bug: There is no guarantee that there haven't been ULONG_MAX + * increments of ->lock_count[] since the unlocks were counted, meaning + * that this could return true even if there are still active readers. + * Since there are no memory barriers around srcu_flip(), the CPU is not + * required to increment ->completed before running + * srcu_readers_unlock_idx(), which means that there could be an + * arbitrarily large number of critical sections that execute after + * srcu_readers_unlock_idx() but use the old value of ->completed. */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; + return srcu_readers_lock_idx(sp, idx) == unlocks; } /** @@ -266,8 +240,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); } return sum; } @@ -283,6 +261,11 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ + if (WARN_ON(!rcu_all_batches_empty(sp))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + if (WARN_ON(sp->running)) + return; /* Caller forgot to stop doing call_srcu()? */ free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } @@ -298,9 +281,8 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->c[idx]); + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - __this_cpu_inc(sp->per_cpu_ref->seq[idx]); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -314,7 +296,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_dec(sp->per_cpu_ref->c[idx]); + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -349,12 +331,21 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) /* * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows + * use the other rank of the ->(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) { - sp->completed++; + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ } /* @@ -392,6 +383,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->next = NULL; head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; @@ -425,6 +417,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->next = NULL; head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (!sp->running) { /* steal the processing owner */ sp->running = true; @@ -444,8 +437,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) spin_unlock_irq(&sp->queue_lock); } - if (!done) + if (!done) { wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ + } + } /** @@ -613,7 +609,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) /* * Invoke a limited number of SRCU callbacks that have passed through * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { @@ -638,15 +635,9 @@ static void srcu_reschedule(struct srcu_struct *sp) { bool pending = true; - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { + if (rcu_all_batches_empty(sp)) { spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { + if (rcu_all_batches_empty(sp)) { sp->running = false; pending = false; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b23a4d076f3d..2b6025ca43f0 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -41,8 +41,6 @@ /* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_ctrlblk *rcp); @@ -69,7 +67,7 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp)); + RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -115,7 +113,7 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls()); + RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -133,7 +131,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0); + RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -142,7 +140,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -152,7 +150,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name); + RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); @@ -161,9 +159,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) __rcu_reclaim(rn, list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++); + RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), @@ -211,7 +209,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); + RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -244,8 +242,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..df3a60e19f07 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -162,8 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) static void check_cpu_stalls(void) { - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) } #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb4e2056ccf3..508a6a4bc523 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,19 +270,169 @@ void rcu_bh_qs(void) } } -static DEFINE_PER_CPU(int, rcu_sched_qs_mask); +/* + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. + */ +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) +#endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, .dynticks_idle = ATOMIC_INIT(1), #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; -DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); -EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); +/* + * Record entry into an extended quiescent state. This is only to be + * called when not already in an extended quiescent state. + */ +static void rcu_dynticks_eqs_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int seq; + + /* + * CPUs seeing atomic_inc_return() must see prior RCU read-side + * critical sections, and we also must force ordering with the + * next idle sojourn. + */ + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); +} + +/* + * Record exit from an extended quiescent state. This is only to be + * called from an extended quiescent state. + */ +static void rcu_dynticks_eqs_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int seq; + + /* + * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * and we also must force ordering with the next RCU read-side + * critical section. + */ + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + rcu_eqs_special_exit(); + /* Prefer duplicate flushes to losing a flush. */ + smp_mb__before_atomic(); /* NMI safety. */ + atomic_and(~RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + } +} + +/* + * Reset the current CPU's ->dynticks counter to indicate that the + * newly onlined CPU is no longer in an extended quiescent state. + * This will either leave the counter unchanged, or increment it + * to the next non-quiescent value. + * + * The non-atomic test/increment sequence works because the upper bits + * of the ->dynticks counter are manipulated only by the corresponding CPU, + * or when the corresponding CPU is offline. + */ +static void rcu_dynticks_eqs_online(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) + return; + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); +} + +/* + * Is the current CPU in an extended quiescent state? + * + * No ordering, as we are sampling CPU-local information. + */ +bool rcu_dynticks_curr_cpu_in_eqs(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); +} + +/* + * Snapshot the ->dynticks counter with full ordering so as to allow + * stable comparison of this counter with past and future snapshots. + */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + int snap = atomic_add_return(0, &rdtp->dynticks); + + return snap & ~RCU_DYNTICK_CTRL_MASK; +} + +/* + * Return true if the snapshot returned from rcu_dynticks_snap() + * indicates that RCU is in an extended quiescent state. + */ +static bool rcu_dynticks_in_eqs(int snap) +{ + return !(snap & RCU_DYNTICK_CTRL_CTR); +} + +/* + * Return true if the CPU corresponding to the specified rcu_dynticks + * structure has spent some time in an extended quiescent state since + * rcu_dynticks_snap() returned the specified snapshot. + */ +static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +{ + return snap != rcu_dynticks_snap(rdtp); +} + +/* + * Do a double-increment of the ->dynticks counter to emulate a + * momentary idle-CPU quiescent state. + */ +static void rcu_dynticks_momentary_idle(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); + + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); +} + +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. + */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; +} /* * Let the RCU core know that this CPU has gone through the scheduler, @@ -291,48 +441,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. * - * We inform the RCU core by emulating a zero-duration dyntick-idle - * period, which we in turn do by incrementing the ->dynticks counter - * by two. + * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - struct rcu_data *rdp; - struct rcu_dynticks *rdtp; - int resched_mask; - struct rcu_state *rsp; - - /* - * Yes, we can lose flag-setting operations. This is OK, because - * the flag will be set again after some delay. - */ - resched_mask = raw_cpu_read(rcu_sched_qs_mask); - raw_cpu_write(rcu_sched_qs_mask, 0); - - /* Find the flavor that needs a quiescent state. */ - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (!(resched_mask & rsp->flavor_mask)) - continue; - smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (READ_ONCE(rdp->mynode->completed) != - READ_ONCE(rdp->cond_resched_completed)) - continue; - - /* - * Pretend to be momentarily idle for the quiescent state. - * This allows the grace-period kthread to record the - * quiescent state, with no need for this CPU to do anything - * further. - */ - rdtp = this_cpu_ptr(&rcu_dynticks); - smp_mb__before_atomic(); /* Earlier stuff before QS. */ - atomic_add(2, &rdtp->dynticks); /* QS. */ - smp_mb__after_atomic(); /* Later stuff after QS. */ - break; - } + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); + rcu_dynticks_momentary_idle(); } /* @@ -342,12 +458,22 @@ static void rcu_momentary_dyntick_idle(void) */ void rcu_note_context_switch(void) { + struct rcu_state *rsp; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); + for_each_rcu_flavor(rsp) + do_nocb_deferred_wakeup(this_cpu_ptr(rsp->rda)); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); +out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -369,30 +495,30 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); void rcu_all_qs(void) { unsigned long flags; + struct rcu_state *rsp; + if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { - /* - * Yes, we just checked a per-CPU variable with preemption - * enabled, so we might be migrated to some other CPU at - * this point. That is OK because in that case, the - * migration will supply the needed quiescent state. - * We might end up needlessly disabling preemption and - * invoking rcu_sched_qs() on the destination CPU, but - * the probability and cost are both quite low, so this - * should not be a problem in practice. - */ - preempt_disable(); + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) rcu_sched_qs(); - preempt_enable(); - } - this_cpu_inc(rcu_qs_ctr); + for_each_rcu_flavor(rsp) + do_nocb_deferred_wakeup(this_cpu_ptr(rsp->rda)); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -611,7 +737,7 @@ static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_DONE_TAIL] != NULL; + rdp->nxttail[RCU_NEXT_TAIL] != NULL; } /* @@ -673,7 +799,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -692,12 +818,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); /* @@ -826,15 +947,10 @@ void rcu_irq_exit_irqson(void) */ static void rcu_eqs_exit_common(long long oldval, int user) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) rcu_dynticks_task_exit(); - smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -980,12 +1096,8 @@ void rcu_nmi_enter(void) * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (!(atomic_read(&rdtp->dynticks) & 0x1)) { - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* atomic_inc() before later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + if (rcu_dynticks_curr_cpu_in_eqs()) { + rcu_dynticks_eqs_exit(); incby = 1; } rdtp->dynticks_nmi_nesting += incby; @@ -1010,7 +1122,7 @@ void rcu_nmi_exit(void) * to us!) */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so @@ -1023,11 +1135,7 @@ void rcu_nmi_exit(void) /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ rdtp->dynticks_nmi_nesting = 0; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); } /** @@ -1040,7 +1148,7 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; + return !rcu_dynticks_curr_cpu_in_eqs(); } /** @@ -1123,9 +1231,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - if ((rdp->dynticks_snap & 0x1) == 0) { + if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) @@ -1144,12 +1252,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - unsigned int curr; - int *rcrmp; - unsigned int snap; - - curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned int)rdp->dynticks_snap; + unsigned long jtsq; + bool *rnhqp; + bool *ruqp; + unsigned long rjtsc; + struct rcu_node *rnp; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1159,27 +1266,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; } + /* Compute and saturate jiffies_till_sched_qs. */ + jtsq = jiffies_till_sched_qs; + rjtsc = rcu_jiffies_till_stall_check(); + if (jtsq > rjtsc / 2) { + WRITE_ONCE(jiffies_till_sched_qs, rjtsc); + jtsq = rjtsc / 2; + } else if (jtsq < 1) { + WRITE_ONCE(jiffies_till_sched_qs, 1); + jtsq = 1; + } + /* - * Check for the CPU being offline, but only if the grace period - * is old enough. We don't need to worry about the CPU changing - * state: If we see it offline even once, it has been through a - * quiescent state. - * - * The reason for insisting that the grace period be at least - * one jiffy old is that CPUs that are not quite online and that - * have just gone offline can still execute RCU read-side critical - * sections. + * Has this CPU encountered a cond_resched_rcu_qs() since the + * beginning of the grace period? For this to be the case, + * the CPU has to have noticed the current grace period. This + * might not be the case for nohz_full CPUs looping in the kernel. */ - if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) - return 0; /* Grace period is not old enough. */ - barrier(); - if (cpu_is_offline(rdp->cpu)) { + rnp = rdp->mynode; + ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + return 1; + } else { + /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ + smp_store_release(ruqp, true); + } + + /* Check for the CPU being offline. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; @@ -1192,7 +1315,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * in-kernel CPU-bound tasks cannot advance grace periods. * So if the grace period is old enough, make the CPU pay attention. * Note that the unsynchronized assignments to the per-CPU - * rcu_sched_qs_mask variable are safe. Yes, setting of + * rcu_need_heavy_qs variable are safe. Yes, setting of * bits can be lost, but they will be set again on the next * force-quiescent-state pass. So lost bit sets do not result * in incorrect behavior, merely in a grace period lasting @@ -1206,25 +1329,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - WRITE_ONCE(rdp->cond_resched_completed, - READ_ONCE(rdp->mynode->completed)); - smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - WRITE_ONCE(*rcrmp, - READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - } + rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + if (!READ_ONCE(*rnhqp) && + (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched))) { + WRITE_ONCE(*rnhqp, true); + /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ + smp_store_release(ruqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } - /* And if it has been a really long time, kick the CPU as well. */ - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + /* + * If more than halfway to RCU CPU stall-warning time, do + * a resched_cpu() to try to loosen things up a bit. + */ + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + resched_cpu(rdp->cpu); return 0; } @@ -1277,7 +1397,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1287,11 +1410,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1379,6 +1501,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(rsp); } else { if (READ_ONCE(rsp->gpnum) != gpnum || READ_ONCE(rsp->completed) == gpnum) { @@ -1395,9 +1520,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } } - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); - rcu_check_gp_kthread_starvation(rsp); panic_on_rcu_stall(); @@ -1879,7 +2001,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); @@ -2467,10 +2589,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || - rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || - rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2479,7 +2599,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2525,8 +2645,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) + if (rdp->cpu_no_qs.b.norm) return; /* @@ -2649,14 +2768,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask); - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + RCU_TRACE(unsigned long mask;) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask); + RCU_TRACE(mask = rdp->grpmask;) trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), TPS("cpuofl")); @@ -3478,11 +3597,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs.b.norm || - rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { + } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; return 1; } @@ -3748,7 +3865,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -3765,7 +3882,6 @@ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3778,8 +3894,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -3788,7 +3903,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) * of the next grace period. */ rnp = rdp->mynode; - mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); @@ -3796,7 +3910,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -3872,7 +3986,7 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); + rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4035,7 +4149,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) static const char * const fqs[] = RCU_FQS_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ @@ -4057,8 +4170,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; rcu_init_levelspread(levelspread, levelcnt); - rsp->flavor_mask = fl_mask; - fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..ee48fe56909e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -51,11 +51,7 @@ #ifdef CONFIG_RCU_FANOUT_LEAF #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT_LEAF 64 -# else -# define RCU_FANOUT_LEAF 32 -# endif +#define RCU_FANOUT_LEAF 16 #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) @@ -112,6 +108,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ @@ -481,7 +480,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ @@ -521,7 +519,6 @@ struct rcu_state { struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ @@ -595,6 +592,9 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); + #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); @@ -688,18 +688,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #ifdef CONFIG_RCU_TRACE */ /* - * Place this after a lock-acquisition primitive to guarantee that - * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies - * if the UNLOCK and LOCK are executed by the same CPU or if the - * UNLOCK and LOCK operate on the same lock variable. - */ -#ifdef CONFIG_PPC -#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ -#define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ - -/* * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e59e1849b89a..a1f52bbe9db6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,16 +20,26 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ -/* Wrapper functions for expedited grace periods. */ +/* + * Record the start of an expedited grace period. + */ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) { rcu_seq_start(&rsp->expedited_sequence); } + +/* + * Record the end of an expedited grace period. + */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } + +/* + * Take a snapshot of the expedited-grace-period counter. + */ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { unsigned long s; @@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); return s; } + +/* + * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true + * if a full expedited grace period has elapsed since that snapshot + * was taken. + */ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { return rcu_seq_done(&rsp->expedited_sequence, s); @@ -315,6 +331,8 @@ static void sync_sched_exp_handler(void *data) return; } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); resched_cpu(smp_processor_id()); } @@ -356,12 +374,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, mask_ofl_test = 0; for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); rdp->exp_dynticks_snap = - atomic_add_return(0, &rdtp->dynticks); + rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - !(rdp->exp_dynticks_snap & 0x1) || + rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -380,13 +397,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (atomic_add_return(0, &rdtp->dynticks) != - rdp->exp_dynticks_snap) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } @@ -623,6 +639,11 @@ void synchronize_sched_expedited(void) { struct rcu_state *rsp = &rcu_sched_state; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched_expedited() in RCU read-side critical section"); + /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; @@ -692,6 +713,11 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..92450d620322 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], ticks_value, ticks_title, - atomic_read(&rdtp->dynticks) & 0xfff, + rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, @@ -1766,6 +1766,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) return; if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); swake_up(&rdp_leader->nocb_wq); } @@ -1851,14 +1852,18 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, return; } len = atomic_long_read(&rdp->nocb_q_count); - if (old_rhpp == &rdp->nocb_head) { + if (old_rhpp == &rdp->nocb_head || + (rcu_nocb_need_deferred_wakeup(rdp) && + !irqs_disabled_flags(flags))) { if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } @@ -1870,7 +1875,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvfIsDeferred")); } @@ -2196,7 +2203,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + if (!rcu_nocb_need_deferred_wakeup(rdp) || + unlikely(rcu_scheduler_active != RCU_SCHEDULER_RUNNING)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); @@ -2366,8 +2374,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) } /* - * Each pass through this loop sets up one rcu_data structure and - * spawns one rcu_nocb_kthread(). + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..65b43be38e68 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -45,8 +45,6 @@ #define RCU_TREE_NONCORE #include "tree.h" -DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -121,10 +119,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), + rcu_dynticks_snap(rdp->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); @@ -194,9 +192,8 @@ static int show_rcuexp(struct seq_file *m, void *v) s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s0, s1, s2, s3, - atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4f6db7e6a117..9e03db9ea9c0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); */ void rcu_end_inkernel_boot(void) { - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d01f9d047397..dd4cefd4604f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4977,6 +4977,7 @@ int __sched _cond_resched(void) preempt_schedule_common(); return 1; } + rcu_all_qs(); return 0; } EXPORT_SYMBOL(_cond_resched); diff --git a/kernel/signal.c b/kernel/signal.c index 3603d93a1968..1c57784885a5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1232,7 +1232,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } /* * This sighand can be already freed and even reused, but - * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * |