From f27bc4873fa8b75cc1eba7b641eda7375dc72ccf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Sun, 4 May 2014 15:38:38 -0700
Subject: rcu: Document deadlock-avoidance information for rcu_read_unlock()

Reported-by: Oleg Nesterov
Signed-off-by: Paul E. McKenney
Reviewed-by: Lai Jiangshan
---
 include/linux/rcupdate.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6a94cc8b1ca0..c56ad15204ec 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -858,6 +858,34 @@ static inline void rcu_read_lock(void)
 /**
  * rcu_read_unlock() - marks the end of an RCU read-side critical section.
  *
+ * In most situations, rcu_read_unlock() is immune from deadlock.
+ * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
+ * is responsible for deboosting, which it does via rt_mutex_unlock().
+ * Unfortunately, this function acquires the scheduler's runqueue and
+ * priority-inheritance spinlocks.  This means that deadlock could result
+ * if the caller of rcu_read_unlock() already holds one of these locks or
+ * any lock that is ever acquired while holding them.
+ *
+ * That said, RCU readers are never priority boosted unless they were
+ * preempted.  Therefore, one way to avoid deadlock is to make sure
+ * that preemption never happens within any RCU read-side critical
+ * section whose outermost rcu_read_unlock() is called with one of
+ * rt_mutex_unlock()'s locks held.  Such preemption can be avoided in
+ * a number of ways, for example, by invoking preempt_disable() before
+ * the critical section's outermost rcu_read_lock().
+ *
+ * Given that the set of locks acquired by rt_mutex_unlock() might change
+ * at any time, a somewhat more future-proofed approach is to make sure
+ * that preemption never happens within any RCU read-side critical
+ * section whose outermost rcu_read_unlock() is called with irqs disabled.
+ * This approach relies on the fact that rt_mutex_unlock() currently only
+ * acquires irq-disabled locks.
+ *
+ * The second of these two approaches is best in most situations;
+ * however, the first approach can also be useful, at least to those
+ * developers willing to keep abreast of the set of locks acquired by
+ * rt_mutex_unlock().
+ *
  * See rcu_read_lock() for more information.
  */
 static inline void rcu_read_unlock(void)
-- cgit v1.2.3
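[ Editorial note, not part of the patch series: the new rcu_read_unlock()
  comment above describes two ways to keep the outermost rcu_read_unlock()
  deadlock-free under CONFIG_RCU_BOOST.  The sketch below illustrates the
  first one, disabling preemption across the whole read-side critical
  section so the reader can never be preempted and therefore never boosted.
  "struct foo", "foo_global", and foo_peek_val() are invented for the
  example.  The second, more future-proof option is simply to arrange for
  irqs to be disabled across the outermost rcu_read_unlock(). ]

#include <linux/preempt.h>
#include <linux/rcupdate.h>

struct foo {
	int val;
};

static struct foo __rcu *foo_global;	/* hypothetical RCU-protected pointer */

static int foo_peek_val(void)
{
	struct foo *p;
	int ret = -1;

	/*
	 * A reader that is never preempted is never priority boosted,
	 * so its outermost rcu_read_unlock() never calls
	 * rt_mutex_unlock() and thus cannot deadlock on the runqueue
	 * or priority-inheritance locks, even if the caller already
	 * holds one of them.
	 */
	preempt_disable();
	rcu_read_lock();
	p = rcu_dereference(foo_global);
	if (p)
		ret = p->val;
	rcu_read_unlock();
	preempt_enable();

	return ret;
}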
From ab74fdfd4e11ec040f21cf87edc14fc9f62cc934 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Sun, 4 May 2014 15:41:21 -0700
Subject: rcu: Handle obsolete references to TINY_PREEMPT_RCU

Signed-off-by: Paul E. McKenney
Reviewed-by: Lai Jiangshan
---
 include/linux/rcupdate.h | 17 ++++++++---------
 init/Kconfig             |  2 +-
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c56ad15204ec..d231aa17b1d7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -826,15 +826,14 @@ static inline void rcu_preempt_sleep_check(void)
  * read-side critical section that would block in a !PREEMPT kernel.
  * But if you want the full story, read on!
  *
- * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
- * is illegal to block while in an RCU read-side critical section.  In
- * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
- * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
- * be preempted, but explicit blocking is illegal.  Finally, in preemptible
- * RCU implementations in real-time (with -rt patchset) kernel builds,
- * RCU read-side critical sections may be preempted and they may also
- * block, but only when acquiring spinlocks that are subject to priority
- * inheritance.
+ * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
+ * it is illegal to block while in an RCU read-side critical section.
+ * In preemptible RCU implementations (TREE_PREEMPT_RCU) in CONFIG_PREEMPT
+ * kernel builds, RCU read-side critical sections may be preempted,
+ * but explicit blocking is illegal.  Finally, in preemptible RCU
+ * implementations in real-time (with -rt patchset) kernel builds, RCU
+ * read-side critical sections may be preempted and they may also block, but
+ * only when acquiring spinlocks that are subject to priority inheritance.
  */
 static inline void rcu_read_lock(void)
 {
diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99af1b9..977b37806e95 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -505,7 +505,7 @@ config PREEMPT_RCU
 	def_bool TREE_PREEMPT_RCU
 	help
 	  This option enables preemptible-RCU code that is common between
-	  the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+	  TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
 
 config RCU_STALL_COMMON
 	def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
-- cgit v1.2.3
From abaa93d9e1de2c29297e69ddba8ddd38f15064cf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 12 Jun 2014 13:30:25 -0700
Subject: rcu: Simplify priority boosting by putting rt_mutex in rcu_node

RCU priority boosting currently checks for boosting via a pointer in
task_struct.  However, this is not needed: As Oleg noted, if the rt_mutex
is placed in the rcu_node instead of on the booster's stack, the boostee
can simply check it to see if it owns the lock.  This commit makes this
change, shrinking task_struct by one pointer and the kernel by thirteen
lines.

Suggested-by: Oleg Nesterov
Signed-off-by: Paul E. McKenney
---
 include/linux/init_task.h |  9 +--------
 include/linux/sched.h     |  6 ------
 kernel/rcu/tree.h         |  3 +++
 kernel/rcu/tree_plugin.h  | 25 +++++++++++--------------
 4 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6df7f9fe0d01..2bb4c4f3531a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -102,12 +102,6 @@ extern struct group_info init_groups;
 #define INIT_IDS
 #endif
 
-#ifdef CONFIG_RCU_BOOST
-#define INIT_TASK_RCU_BOOST()					\
-	.rcu_boost_mutex = NULL,
-#else
-#define INIT_TASK_RCU_BOOST()
-#endif
 #ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_TREE_PREEMPT()				\
 	.rcu_blocked_node = NULL,
@@ -119,8 +113,7 @@ extern struct group_info init_groups;
 	.rcu_read_lock_nesting = 0,				\
 	.rcu_read_unlock_special = 0,				\
 	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),	\
-	INIT_TASK_RCU_TREE_PREEMPT()				\
-	INIT_TASK_RCU_BOOST()
+	INIT_TASK_RCU_TREE_PREEMPT()
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 306f4f0c987a..3cfbc05e66e6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1270,9 +1270,6 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#ifdef CONFIG_RCU_BOOST
-	struct rt_mutex *rcu_boost_mutex;
-#endif /* #ifdef CONFIG_RCU_BOOST */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
@@ -2009,9 +2006,6 @@ static inline void rcu_copy_process(struct task_struct *p)
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	p->rcu_blocked_node = NULL;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#ifdef CONFIG_RCU_BOOST
-	p->rcu_boost_mutex = NULL;
-#endif /* #ifdef CONFIG_RCU_BOOST */
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 3eeb919e26a2..60fb0eaa2d16 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -177,6 +177,9 @@ struct rcu_node {
 				/*  to carry out the boosting is fully */
 				/*  released with no future boostee accesses */
 				/*  before that rt_mutex is re-initialized. */
+	struct rt_mutex boost_mtx;
+				/* Used only for the priority-boosting */
+				/*  side effect, not as a lock. */
 	unsigned long boost_time;
 				/* When to start boosting (jiffies). */
 	struct task_struct *boost_kthread_task;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9c811879d31e..719587af7b10 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
 #define RCU_KTHREAD_PRIO 1
 
 #ifdef CONFIG_RCU_BOOST
+#include "../locking/rtmutex_common.h"
 #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
 #else
 #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 	unsigned long flags;
 	struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
-	struct rt_mutex *rbmp = NULL;
+	bool drop_boost_mutex = false;
 #endif /* #ifdef CONFIG_RCU_BOOST */
 	struct rcu_node *rnp;
 	int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		if (&t->rcu_node_entry == rnp->boost_tasks)
 			rnp->boost_tasks = np;
-		/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
-		if (t->rcu_boost_mutex) {
-			rbmp = t->rcu_boost_mutex;
-			t->rcu_boost_mutex = NULL;
-		}
+		/* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+		drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
 		/*
@@ -427,8 +425,8 @@ void rcu_read_unlock_special(struct task_struct *t)
 
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
-		if (rbmp) {
-			rt_mutex_unlock(rbmp);
+		if (drop_boost_mutex) {
+			rt_mutex_unlock(&rnp->boost_mtx);
 			complete(&rnp->boost_completion);
 		}
 #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1151,7 +1149,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
 static int rcu_boost(struct rcu_node *rnp)
 {
 	unsigned long flags;
-	struct rt_mutex mtx;
 	struct task_struct *t;
 	struct list_head *tb;
 
@@ -1202,14 +1199,14 @@ static int rcu_boost(struct rcu_node *rnp)
 	 * section.
 	 */
 	t = container_of(tb, struct task_struct, rcu_node_entry);
-	rt_mutex_init_proxy_locked(&mtx, t);
-	t->rcu_boost_mutex = &mtx;
+	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
 	init_completion(&rnp->boost_completion);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
-	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
+	/* Lock only for side effect: boosts task t's priority. */
+	rt_mutex_lock(&rnp->boost_mtx);
+	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
 
-	/* Wait until boostee is done accessing mtx before reinitializing. */
+	/* Wait for boostee to be done w/boost_mtx before reinitializing. */
 	wait_for_completion(&rnp->boost_completion);
 
 	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
-- cgit v1.2.3
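[ Editorial note, not part of the patch series: a minimal sketch of the
  rt_mutex proxy-lock pattern the commit above relies on, reduced to its
  two halves.  boost_reader() and deboost_self() are invented wrapper
  names that mirror rcu_boost() and rcu_read_unlock_special(); the
  surrounding RCU machinery, and the rnp->lock that the real code holds
  while initializing the proxy lock, are omitted. ]

#include <linux/completion.h>
#include <linux/rtmutex.h>
#include <linux/sched.h>
#include "../locking/rtmutex_common.h"	/* rt_mutex_init_proxy_locked(), rt_mutex_owner(), as the patch includes */

/* Booster side (cf. rcu_boost()): make @boostee the owner, then block on the lock. */
static void boost_reader(struct rt_mutex *mtx, struct completion *done,
			 struct task_struct *boostee)
{
	rt_mutex_init_proxy_locked(mtx, boostee);	/* boostee now "owns" mtx */
	init_completion(done);
	rt_mutex_lock(mtx);	/* Blocks until boostee unlocks; PI boosts boostee meanwhile. */
	rt_mutex_unlock(mtx);	/* Keep lockdep happy. */
	wait_for_completion(done);	/* Don't reinitialize mtx until boostee is done with it. */
}

/* Boostee side (cf. rcu_read_unlock_special()): deboost if we were boosted. */
static void deboost_self(struct rt_mutex *mtx, struct completion *done)
{
	bool drop_boost_mutex = rt_mutex_owner(mtx) == current;

	if (drop_boost_mutex) {
		rt_mutex_unlock(mtx);	/* Release the proxy lock, dropping the boosted priority. */
		complete(done);		/* Tell the booster that mtx is no longer referenced. */
	}
}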
From c0f489d2c6fec8994c642c2ec925eb858727dc7b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 4 Jun 2014 13:46:03 -0700
Subject: rcu: Bind grace-period kthreads to non-NO_HZ_FULL CPUs

Binding the grace-period kthreads to the timekeeping CPU resulted in
significant performance decreases for some workloads.  For more detail,
see:

https://lkml.org/lkml/2014/6/3/395 for benchmark numbers
https://lkml.org/lkml/2014/6/4/218 for CPU statistics

It turns out that it is necessary to bind the grace-period kthreads
to the timekeeping CPU only when all CPUs other than CPU 0 are
nohz_full CPUs on the one hand, or when CONFIG_NO_HZ_FULL_SYSIDLE=y
on the other.  In other cases, it suffices to bind the grace-period
kthreads to the set of non-nohz_full CPUs.

This commit therefore creates a tick_nohz_not_full_mask that is the
complement of tick_nohz_full_mask, and then binds the grace-period
kthread to the set of CPUs indicated by this new mask, which covers
the CONFIG_NO_HZ_FULL_SYSIDLE=n case.  The CONFIG_NO_HZ_FULL_SYSIDLE=y
case still binds the grace-period kthreads to the timekeeping CPU.
This commit also includes the tick_nohz_full_enabled() check suggested
by Frederic Weisbecker.

Reported-by: Jet Chen
Signed-off-by: Paul E. McKenney
[ paulmck: Created housekeeping_affine() and housekeeping_mask per
  fweisbec feedback. ]
---
 include/linux/tick.h     | 20 ++++++++++++++++++++
 kernel/rcu/tree_plugin.h | 14 +++++++++-----
 kernel/time/tick-sched.c | 10 ++++++++++
 3 files changed, 39 insertions(+), 5 deletions(-)

(limited to 'include')
diff --git a/include/linux/tick.h b/include/linux/tick.h
index b84773cb9f4c..06cc093ab7ad 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 
@@ -162,6 +163,7 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 #ifdef CONFIG_NO_HZ_FULL
 extern bool tick_nohz_full_running;
 extern cpumask_var_t tick_nohz_full_mask;
+extern cpumask_var_t housekeeping_mask;
 
 static inline bool tick_nohz_full_enabled(void)
 {
@@ -194,6 +196,24 @@ static inline void tick_nohz_full_kick_all(void) { }
 static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
 #endif
 
+static inline bool is_housekeeping_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return cpumask_test_cpu(cpu, housekeeping_mask);
+#endif
+	return true;
+}
+
+static inline void housekeeping_affine(struct task_struct *t)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		set_cpus_allowed_ptr(t, housekeeping_mask);
+
+#endif
+}
+
 static inline void tick_nohz_full_check(void)
 {
 	if (tick_nohz_full_enabled())
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 719587af7b10..b39ba7239bd6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2846,12 +2846,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
  */
 static void rcu_bind_gp_kthread(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	int cpu = tick_do_timer_cpu;
+	int __maybe_unused cpu;
 
-	if (cpu < 0 || cpu >= nr_cpu_ids)
+	if (!tick_nohz_full_enabled())
 		return;
-	if (raw_smp_processor_id() != cpu)
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+	cpu = tick_do_timer_cpu;
+	if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
 		set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+	if (!is_housekeeping_cpu(raw_smp_processor_id()))
+		housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..f784d83e29f1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
+cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
 
 static bool can_stop_full_tick(void)
@@ -281,6 +282,7 @@ static int __init tick_nohz_full_setup(char *str)
 	int cpu;
 
 	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+	alloc_bootmem_cpumask_var(&housekeeping_mask);
 	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
 		pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
 		return 1;
@@ -291,6 +293,8 @@ static int __init tick_nohz_full_setup(char *str)
 		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
 		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
 	}
+	cpumask_andnot(housekeeping_mask,
+		       cpu_possible_mask, tick_nohz_full_mask);
 	tick_nohz_full_running = true;
 
 	return 1;
@@ -332,9 +336,15 @@ static int tick_nohz_init_all(void)
 		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
 		return err;
 	}
+	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+		pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
+		return err;
+	}
 	err = 0;
 	cpumask_setall(tick_nohz_full_mask);
 	cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+	cpumask_clear(housekeeping_mask);
+	cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
 	tick_nohz_full_running = true;
 #endif
 	return err;
-- cgit v1.2.3
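[ Editorial note, not part of the patch series: a sketch of how another
  kthread could use the helpers introduced above to stay off nohz_full
  CPUs, mirroring what rcu_bind_gp_kthread() now does in the SYSIDLE=n
  case.  my_kthread_fn() is invented for the example.  When
  CONFIG_NO_HZ_FULL is off, or nohz_full= was not specified on the boot
  command line, is_housekeeping_cpu() returns true and
  housekeeping_affine() is a no-op, so the check below is harmless. ]

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/tick.h>

static int my_kthread_fn(void *unused)
{
	/*
	 * If we happen to be running on a nohz_full CPU, restrict our
	 * affinity to the housekeeping (non-nohz_full) CPUs so that we
	 * do not disturb adaptive-ticks CPUs.
	 */
	if (!is_housekeeping_cpu(raw_smp_processor_id()))
		housekeeping_affine(current);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}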