diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 13:33:57 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 13:33:57 -0700 |
commit | 16b3d0cf5bad844daaf436ad2e9061de0fe36e5c (patch) | |
tree | d553a51e6d95fb166df7fa62264e9a27e4c438a4 /kernel/sched/core.c | |
parent | 42dec9a936e7696bea1f27d3c5a0068cd9aa95fd (diff) | |
parent | 2ea46c6fc9452ac100ad907b051d797225847e33 (diff) |
Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Clean up SCHED_DEBUG: move the decades old mess of sysctl, procfs and
debugfs interfaces to a unified debugfs interface.
- Signals: Allow caching one sigqueue object per task, to improve
performance & latencies.
- Improve newidle_balance() irq-off latencies on systems with a large
number of CPU cgroups.
- Improve energy-aware scheduling
- Improve the PELT metrics for certain workloads
- Reintroduce select_idle_smt() to improve load-balancing locality -
but without the previous regressions
- Add 'scheduler latency debugging': warn after long periods of pending
need_resched. This is an opt-in feature that requires the enabling of
the LATENCY_WARN scheduler feature, or the use of the
resched_latency_warn_ms=xx boot parameter.
- CPU hotplug fixes for HP-rollback, and for the 'fail' interface. Fix
remaining balance_push() vs. hotplug holes/races
- PSI fixes, plus allow /proc/pressure/ files to be written by
CAP_SYS_RESOURCE tasks as well
- Fix/improve various load-balancing corner cases vs. capacity margins
- Fix sched topology on systems with NUMA diameter of 3 or above
- Fix PF_KTHREAD vs to_kthread() race
- Minor rseq optimizations
- Misc cleanups, optimizations, fixes and smaller updates
* tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
cpumask/hotplug: Fix cpu_dying() state tracking
kthread: Fix PF_KTHREAD vs to_kthread() race
sched/debug: Fix cgroup_path[] serialization
sched,psi: Handle potential task count underflow bugs more gracefully
sched: Warn on long periods of pending need_resched
sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning
sched/debug: Rename the sched_debug parameter to sched_verbose
sched,fair: Alternative sched_slice()
sched: Move /proc/sched_debug to debugfs
sched,debug: Convert sysctl sched_domains to debugfs
debugfs: Implement debugfs_create_str()
sched,preempt: Move preempt_dynamic to debug.c
sched: Move SCHED_DEBUG sysctl to debugfs
sched: Don't make LATENCYTOP select SCHED_DEBUG
sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
cpumask: Introduce DYING mask
cpumask: Make cpu_{online,possible,present,active}() inline
rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs()
...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 217 |
1 files changed, 113 insertions, 104 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 347127e73422..9143163fa678 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT -#endif + +/* + * Print a warning if need_resched is set for the given duration (if + * LATENCY_WARN is enabled). + * + * If sysctl_resched_latency_warn_once is set, only one warning will be shown + * per boot. + */ +__read_mostly int sysctl_resched_latency_warn_ms = 100; +__read_mostly int sysctl_resched_latency_warn_once = 1; +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. @@ -737,7 +747,7 @@ static void nohz_csd_func(void *info) /* * Release the rq::nohz_csd. */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu)); WARN_ON(!(flags & NOHZ_KICK_MASK)); rq->idle_balance = idle_cpu(cpu); @@ -1811,7 +1821,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) return cpu_online(cpu); /* Regular kernel threads don't get to stay during offline. */ - if (cpu_rq(cpu)->balance_push) + if (cpu_dying(cpu)) return false; /* But are allowed during online. */ @@ -1927,6 +1937,12 @@ static int migration_cpu_stop(void *data) rq_lock(rq, &rf); /* + * If we were passed a pending, then ->stop_pending was set, thus + * p->migration_pending must have remained stable. + */ + WARN_ON_ONCE(pending && pending != p->migration_pending); + + /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. @@ -1936,8 +1952,7 @@ static int migration_cpu_stop(void *data) goto out; if (pending) { - if (p->migration_pending == pending) - p->migration_pending = NULL; + p->migration_pending = NULL; complete = true; } @@ -1976,8 +1991,7 @@ static int migration_cpu_stop(void *data) * somewhere allowed, we're done. */ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { - if (p->migration_pending == pending) - p->migration_pending = NULL; + p->migration_pending = NULL; complete = true; goto out; } @@ -2165,16 +2179,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * * (1) In the cases covered above. There is one more where the completion is * signaled within affine_move_task() itself: when a subsequent affinity request - * cancels the need for an active migration. Consider: + * occurs after the stopper bailed out due to the targeted task still being + * Migrate-Disable. Consider: * * Initial conditions: P0->cpus_mask = [0, 1] * - * P0@CPU0 P1 P2 - * - * migrate_disable(); - * <preempted> + * CPU0 P1 P2 + * <P0> + * migrate_disable(); + * <preempted> * set_cpus_allowed_ptr(P0, [1]); * <blocks> + * <migration/0> + * migration_cpu_stop() + * is_migration_disabled() + * <bails> * set_cpus_allowed_ptr(P0, [0, 1]); * <signal completion> * <awakes> @@ -4244,8 +4263,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq; - /* * New tasks start with FORK_PREEMPT_COUNT, see there and * finish_task_switch() for details. @@ -4255,7 +4272,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * PREEMPT_COUNT kernels). */ - rq = finish_task_switch(prev); + finish_task_switch(prev); preempt_enable(); if (current->set_child_tid) @@ -4520,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_SCHED_DEBUG +static u64 cpu_resched_latency(struct rq *rq) +{ + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); + u64 resched_latency, now = rq_clock(rq); + static bool warned_once; + + if (sysctl_resched_latency_warn_once && warned_once) + return 0; + + if (!need_resched() || !latency_warn_ms) + return 0; + + if (system_state == SYSTEM_BOOTING) + return 0; + + if (!rq->last_seen_need_resched_ns) { + rq->last_seen_need_resched_ns = now; + rq->ticks_without_resched = 0; + return 0; + } + + rq->ticks_without_resched++; + resched_latency = now - rq->last_seen_need_resched_ns; + if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) + return 0; + + warned_once = true; + + return resched_latency; +} + +static int __init setup_resched_latency_warn_ms(char *str) +{ + long val; + + if ((kstrtol(str, 0, &val))) { + pr_warn("Unable to set resched_latency_warn_ms\n"); + return 1; + } + + sysctl_resched_latency_warn_ms = val; + return 1; +} +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); +#else +static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } +#endif /* CONFIG_SCHED_DEBUG */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -4531,6 +4597,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; struct rq_flags rf; unsigned long thermal_pressure; + u64 resched_latency; arch_scale_freq_tick(); sched_clock_tick(); @@ -4541,11 +4608,15 @@ void scheduler_tick(void) thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); - psi_task_tick(rq); rq_unlock(rq, &rf); + if (sched_feat(LATENCY_WARN) && resched_latency) + resched_latency_warn(cpu, resched_latency); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -5040,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); +#ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +#endif if (likely(prev != next)) { rq->nr_switches++; @@ -5365,23 +5439,23 @@ enum { preempt_dynamic_full, }; -static int preempt_dynamic_mode = preempt_dynamic_full; +int preempt_dynamic_mode = preempt_dynamic_full; -static int sched_dynamic_mode(const char *str) +int sched_dynamic_mode(const char *str) { if (!strcmp(str, "none")) - return 0; + return preempt_dynamic_none; if (!strcmp(str, "voluntary")) - return 1; + return preempt_dynamic_voluntary; if (!strcmp(str, "full")) - return 2; + return preempt_dynamic_full; - return -1; + return -EINVAL; } -static void sched_dynamic_update(int mode) +void sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in @@ -5438,77 +5512,8 @@ static int __init setup_preempt_mode(char *str) } __setup("preempt=", setup_preempt_mode); -#ifdef CONFIG_SCHED_DEBUG - -static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[16]; - int mode; - - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - mode = sched_dynamic_mode(strstrip(buf)); - if (mode < 0) - return mode; - - sched_dynamic_update(mode); - - *ppos += cnt; - - return cnt; -} - -static int sched_dynamic_show(struct seq_file *m, void *v) -{ - static const char * preempt_modes[] = { - "none", "voluntary", "full" - }; - int i; - - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { - if (preempt_dynamic_mode == i) - seq_puts(m, "("); - seq_puts(m, preempt_modes[i]); - if (preempt_dynamic_mode == i) - seq_puts(m, ")"); - - seq_puts(m, " "); - } - - seq_puts(m, "\n"); - return 0; -} - -static int sched_dynamic_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_dynamic_show, NULL); -} - -static const struct file_operations sched_dynamic_fops = { - .open = sched_dynamic_open, - .write = sched_dynamic_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug_dynamic(void) -{ - debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops); - return 0; -} -late_initcall(sched_init_debug_dynamic); - -#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_PREEMPT_DYNAMIC */ - /* * This is the entry point to schedule() from kernel preemption * off of irq context. @@ -7633,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work); /* * Ensure we only run per-cpu kthreads once the CPU goes !active. + * + * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only + * effective when the hotplug motion is down. */ static void balance_push(struct rq *rq) { @@ -7640,12 +7648,19 @@ static void balance_push(struct rq *rq) lockdep_assert_held(&rq->lock); SCHED_WARN_ON(rq->cpu != smp_processor_id()); + /* * Ensure the thing is persistent until balance_push_set(.on = false); */ rq->balance_callback = &balance_push_callback; /* + * Only active while going offline. + */ + if (!cpu_dying(rq->cpu)) + return; + + /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. * @@ -7653,7 +7668,7 @@ static void balance_push(struct rq *rq) * histerical raisins. */ if (rq->idle == push_task || - ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || + kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { /* @@ -7698,7 +7713,6 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, &rf); - rq->balance_push = on; if (on) { WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback; @@ -7823,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu) struct rq_flags rf; /* - * Make sure that when the hotplug state machine does a roll-back - * we clear balance_push. Ideally that would happen earlier... + * Clear the balance_push callback and prepare to schedule + * regular tasks. */ balance_push_set(cpu, false); @@ -8009,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - /* - * Now that the CPU is offline, make sure we're welcome - * to new tasks once we come back up. - */ - balance_push_set(cpu, false); - calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); @@ -8199,7 +8207,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->balance_callback = NULL; + rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -8246,6 +8254,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); @@ -8970,7 +8979,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* - * Likewise, bound things on the otherside by preventing insane quota + * Likewise, bound things on the other side by preventing insane quota * periods. This also allows us to normalize in computing quota * feasibility. */ |