From 43295d73adc8d3780e9f34206663e336678aaff8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:40 +0200 Subject: sched/wakeup: Split out the wakeup ->__state check RT kernels have a slightly more complicated handling of wakeups due to 'sleeping' spin/rwlocks. If a task is blocked on such a lock then the original state of the task is preserved over the blocking period, and any regular (non lock related) wakeup has to be targeted at the saved state to ensure that these wakeups are not lost. Once the task acquires the lock it restores the task state from the saved state. To avoid cluttering try_to_wake_up() with that logic, split the wakeup state check out into an inline helper and use it at both places where task::__state is checked against the state argument of try_to_wake_up(). No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.088945085@linutronix.de --- kernel/sched/core.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..961991e06337 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3561,6 +3561,22 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) rq_unlock(rq, &rf); } +/* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + if (READ_ONCE(p->__state) & state) { + *success = 1; + return true; + } + return false; +} + /* * Notes on Program-Order guarantees on SMP systems. * @@ -3700,10 +3716,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; trace_sched_waking(p); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); @@ -3718,14 +3733,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto unlock; trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; - /* * Ensure we load p->on_rq _after_ p->state, otherwise it would * be possible to, falsely, observe p->on_rq == 0 and get stuck -- cgit v1.2.3 From 5f220be21418541422335288b6e2360a5ce0613c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:44 +0200 Subject: sched/wakeup: Prepare for RT sleeping spin/rwlocks Waiting for spinlocks and rwlocks on non RT enabled kernels is task::state preserving. Any wakeup which matches the state is valid. RT enabled kernels substitutes them with 'sleeping' spinlocks. This creates an issue vs. task::__state. In order to block on the lock, the task has to overwrite task::__state and a consecutive wakeup issued by the unlocker sets the state back to TASK_RUNNING. As a consequence the task loses the state which was set before the lock acquire and also any regular wakeup targeted at the task while it is blocked on the lock. To handle this gracefully, add a 'saved_state' member to task_struct which is used in the following way: 1) When a task blocks on a 'sleeping' spinlock, the current state is saved in task::saved_state before it is set to TASK_RTLOCK_WAIT. 2) When the task unblocks and after acquiring the lock, it restores the saved state. 3) When a regular wakeup happens for a task while it is blocked then the state change of that wakeup is redirected to operate on task::saved_state. This is also required when the task state is running because the task might have been woken up from the lock wait and has not yet restored the saved state. To make it complete, provide the necessary helpers to save and restore the saved state along with the necessary documentation how the RT lock blocking is supposed to work. For non-RT kernels there is no functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.258751046@linutronix.de --- include/linux/sched.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 33 ++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c72cf6aaabf..02714b9a3ff9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -143,9 +143,22 @@ struct task_group; current->task_state_change = _THIS_IP_; \ } while (0) +# define debug_rtlock_wait_set_state() \ + do { \ + current->saved_state_change = current->task_state_change;\ + current->task_state_change = _THIS_IP_; \ + } while (0) + +# define debug_rtlock_wait_restore_state() \ + do { \ + current->task_state_change = current->saved_state_change;\ + } while (0) + #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) +# define debug_rtlock_wait_set_state() do { } while (0) +# define debug_rtlock_wait_restore_state() do { } while (0) #endif /* @@ -213,6 +226,51 @@ struct task_group; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ } while (0) +/* + * PREEMPT_RT specific variants for "sleeping" spin/rwlocks + * + * RT's spin/rwlock substitutions are state preserving. The state of the + * task when blocking on the lock is saved in task_struct::saved_state and + * restored after the lock has been acquired. These operations are + * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT + * lock related wakeups while the task is blocked on the lock are + * redirected to operate on task_struct::saved_state to ensure that these + * are not dropped. On restore task_struct::saved_state is set to + * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. + * + * The lock operation looks like this: + * + * current_save_and_set_rtlock_wait_state(); + * for (;;) { + * if (try_lock()) + * break; + * raw_spin_unlock_irq(&lock->wait_lock); + * schedule_rtlock(); + * raw_spin_lock_irq(&lock->wait_lock); + * set_current_state(TASK_RTLOCK_WAIT); + * } + * current_restore_rtlock_saved_state(); + */ +#define current_save_and_set_rtlock_wait_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(¤t->pi_lock); \ + current->saved_state = current->__state; \ + debug_rtlock_wait_set_state(); \ + WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ + raw_spin_unlock(¤t->pi_lock); \ + } while (0); + +#define current_restore_rtlock_saved_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(¤t->pi_lock); \ + debug_rtlock_wait_restore_state(); \ + WRITE_ONCE(current->__state, current->saved_state); \ + current->saved_state = TASK_RUNNING; \ + raw_spin_unlock(¤t->pi_lock); \ + } while (0); + #define get_current_state() READ_ONCE(current->__state) /* Task command name length: */ @@ -668,6 +726,11 @@ struct task_struct { #endif unsigned int __state; +#ifdef CONFIG_PREEMPT_RT + /* saved state for "spinlock sleepers" */ + unsigned int saved_state; +#endif + /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. @@ -1357,6 +1420,9 @@ struct task_struct { struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; +# ifdef CONFIG_PREEMPT_RT + unsigned long saved_state_change; +# endif #endif int pagefault_disabled; #ifdef CONFIG_MMU diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 961991e06337..e407c6ac4a26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3566,14 +3566,47 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. + * + * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + * bits set. This allows to distinguish all wakeup scenarios. */ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + if (READ_ONCE(p->__state) & state) { *success = 1; return true; } + +#ifdef CONFIG_PREEMPT_RT + /* + * Saved state preserves the task state across blocking on + * an RT lock. If the state matches, set p::saved_state to + * TASK_RUNNING, but do not wake the task because it waits + * for a lock wakeup. Also indicate success because from + * the regular waker's point of view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + *success = 1; + } +#endif return false; } -- cgit v1.2.3 From b4bfa3fcfe3b827ddb8b16edd45896caac5a1194 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:46 +0200 Subject: sched/core: Rework the __schedule() preempt argument PREEMPT_RT needs to hand a special state into __schedule() when a task blocks on a 'sleeping' spin/rwlock. This is required to handle rcu_note_context_switch() correctly without having special casing in the RCU code. From an RCU point of view the blocking on the sleeping spinlock is equivalent to preemption, because the task might be in a read side critical section. schedule_debug() also has a check which would trigger with the !preempt case, but that could be handled differently. To avoid adding another argument and extra checks which cannot be optimized out by the compiler, the following solution has been chosen: - Replace the boolean 'preempt' argument with an unsigned integer 'sched_mode' argument and define constants to hand in: (0 == no preemption, 1 = preemption). - Add two masks to apply on that mode: one for the debug/rcu invocations, and one for the actual scheduling decision. For a non RT kernel these masks are UINT_MAX, i.e. all bits are set, which allows the compiler to optimize the AND operation out, because it is not masking out anything. IOW, it's not different from the boolean. RT enabled kernels will define these masks separately. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.315473019@linutronix.de --- kernel/sched/core.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e407c6ac4a26..ebc24e136222 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5819,6 +5819,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SCHED_CORE */ +/* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE 0x0 +#define SM_PREEMPT 0x1 +#define SM_MASK_PREEMPT (~0U) + /* * __schedule() is the main scheduler function. * @@ -5858,7 +5870,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -5871,13 +5883,13 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, preempt); + schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(preempt); + rcu_note_context_switch(!!sched_mode); /* * Make sure that signal_pending_state()->signal_pending() below @@ -5911,7 +5923,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ prev_state = READ_ONCE(prev->__state); - if (!preempt && prev_state) { + if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { @@ -5977,7 +5989,7 @@ static void __sched notrace __schedule(bool preempt) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(preempt, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -5998,7 +6010,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -6059,7 +6071,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(SM_NONE); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -6087,7 +6099,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_NONE); } while (need_resched()); } @@ -6140,7 +6152,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -6219,7 +6231,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -6368,7 +6380,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); -- cgit v1.2.3 From 6991436c2b5d91d5358d9914ae2df22b9a1d1dc9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:48 +0200 Subject: sched/core: Provide a scheduling point for RT locks RT enabled kernels substitute spin/rwlocks with 'sleeping' variants based on rtmutexes. Blocking on such a lock is similar to preemption versus: - I/O scheduling and worker handling, because these functions might block on another substituted lock, or come from a lock contention within these functions. - RCU considers this like a preemption, because the task might be in a read side critical section. Add a separate scheduling point for this, and hand a new scheduling mode argument to __schedule() which allows, along with separate mode masks, to handle this gracefully from within the scheduler, without proliferating that to other subsystems like RCU. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.372319055@linutronix.de --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 02714b9a3ff9..746dfc06a35c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -288,6 +288,9 @@ extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); +#ifdef CONFIG_PREEMPT_RT + extern void schedule_rtlock(void); +#endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ebc24e136222..c89c1d45dd0b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5829,7 +5829,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ #define SM_NONE 0x0 #define SM_PREEMPT 0x1 -#define SM_MASK_PREEMPT (~0U) +#define SM_RTLOCK_WAIT 0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT (~0U) +#else +# define SM_MASK_PREEMPT SM_PREEMPT +#endif /* * __schedule() is the main scheduler function. @@ -6134,6 +6140,18 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + do { + preempt_disable(); + __schedule(SM_RTLOCK_WAIT); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { -- cgit v1.2.3