diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 11 | ||||
-rw-r--r-- | kernel/cpu_pm.c | 233 | ||||
-rw-r--r-- | kernel/events/core.c | 4 | ||||
-rw-r--r-- | kernel/hrtimer.c | 6 | ||||
-rw-r--r-- | kernel/irq/manage.c | 5 | ||||
-rw-r--r-- | kernel/irq/pm.c | 48 | ||||
-rw-r--r-- | kernel/irq/spurious.c | 6 | ||||
-rw-r--r-- | kernel/jump_label.c | 3 | ||||
-rw-r--r-- | kernel/kmod.c | 4 | ||||
-rw-r--r-- | kernel/power/Kconfig | 4 | ||||
-rw-r--r-- | kernel/power/suspend.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 27 | ||||
-rw-r--r-- | kernel/sched_fair.c | 57 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 | ||||
-rw-r--r-- | kernel/time.c | 2 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 1 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 62 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 11 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 58 |
25 files changed, 477 insertions, 84 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eca595e2fd5..988cb3da703 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o +obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e691818d7e4..a3f638ac3de 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } +/* task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (frozen(task)) + if (is_task_frozen_enough(task)) nfrozen++; } @@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) continue; - if (frozen(task)) + if (is_task_frozen_enough(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c new file mode 100644 index 00000000000..249152e1530 --- /dev/null +++ b/kernel/cpu_pm.c @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2011 Google, Inc. + * + * Author: + * Colin Cross <ccross@android.com> + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/cpu_pm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/spinlock.h> +#include <linux/syscore_ops.h> + +static DEFINE_RWLOCK(cpu_pm_notifier_lock); +static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); + +static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +{ + int ret; + + ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, + nr_to_call, nr_calls); + + return notifier_to_errno(ret); +} + +/** + * cpu_pm_register_notifier - register a driver with cpu_pm + * @nb: notifier block to register + * + * Add a driver to a list of drivers that are notified about + * CPU and CPU cluster low power entry and exit. + * + * This function may sleep, and has the same return conditions as + * raw_notifier_chain_register. + */ +int cpu_pm_register_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + write_lock_irqsave(&cpu_pm_notifier_lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb); + write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); + +/** + * cpu_pm_unregister_notifier - unregister a driver with cpu_pm + * @nb: notifier block to be unregistered + * + * Remove a driver from the CPU PM notifier list. + * + * This function may sleep, and has the same return conditions as + * raw_notifier_chain_unregister. + */ +int cpu_pm_unregister_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + write_lock_irqsave(&cpu_pm_notifier_lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); + +/** + * cpm_pm_enter - CPU low power entry notifier + * + * Notifies listeners that a single CPU is entering a low power state that may + * cause some blocks in the same power domain as the cpu to reset. + * + * Must be called on the affected CPU with interrupts disabled. Platform is + * responsible for ensuring that cpu_pm_enter is not called twice on the same + * CPU before cpu_pm_exit is called. Notified drivers can include VFP + * co-processor, interrupt controller and it's PM extensions, local CPU + * timers context save/restore which shouldn't be interrupted. Hence it + * must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_pm_enter(void) +{ + int nr_calls; + int ret = 0; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); + if (ret) + /* + * Inform listeners (nr_calls - 1) about failure of CPU PM + * PM entry who are notified earlier to prepare for it. + */ + cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_enter); + +/** + * cpm_pm_exit - CPU low power exit notifier + * + * Notifies listeners that a single CPU is exiting a low power state that may + * have caused some blocks in the same power domain as the cpu to reset. + * + * Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_pm_exit(void) +{ + int ret; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_exit); + +/** + * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * + * Notifies listeners that all cpus in a power domain are entering a low power + * state that may cause some blocks in the same power domain to reset. + * + * Must be called after cpu_pm_enter has been called on all cpus in the power + * domain, and before cpu_pm_exit has been called on any cpu in the power + * domain. Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_cluster_pm_enter(void) +{ + int nr_calls; + int ret = 0; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); + if (ret) + /* + * Inform listeners (nr_calls - 1) about failure of CPU cluster + * PM entry who are notified earlier to prepare for it. + */ + cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); + +/** + * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * + * Notifies listeners that all cpus in a power domain are exiting form a + * low power state that may have caused some blocks in the same power domain + * to reset. + * + * Must be called after cpu_pm_exit has been called on all cpus in the power + * domain, and before cpu_pm_exit has been called on any cpu in the power + * domain. Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_cluster_pm_exit(void) +{ + int ret; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); + +#ifdef CONFIG_PM +static int cpu_pm_suspend(void) +{ + int ret; + + ret = cpu_pm_enter(); + if (ret) + return ret; + + ret = cpu_cluster_pm_enter(); + return ret; +} + +static void cpu_pm_resume(void) +{ + cpu_cluster_pm_exit(); + cpu_pm_exit(); +} + +static struct syscore_ops cpu_pm_syscore_ops = { + .suspend = cpu_pm_suspend, + .resume = cpu_pm_resume, +}; + +static int cpu_pm_init(void) +{ + register_syscore_ops(&cpu_pm_syscore_ops); + return 0; +} +core_initcall(cpu_pm_init); +#endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f857782d06..fbe38f2e8ed 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5758,6 +5758,7 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + event->pmu = pmu; ret = pmu->event_init(event); if (ret) pmu = ERR_PTR(ret); @@ -5765,6 +5766,7 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + event->pmu = pmu; ret = pmu->event_init(event); if (!ret) goto unlock; @@ -5891,8 +5893,6 @@ done: return ERR_PTR(err); } - event->pmu = pmu; - if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) jump_label_inc(&perf_sched_events); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a9205e32a05..2043c08d36c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { + struct timerqueue_node *next_timer; if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - if (&timer->node == timerqueue_getnext(&base->active)) { + next_timer = timerqueue_getnext(&base->active); + timerqueue_del(&base->active, &timer->node); + if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - timerqueue_del(&base->active, &timer->node); if (!timerqueue_getnext(&base->active)) base->cpu_base->active_bases &= ~(1 << base->index); out: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b956fa2030..d6c4adc2804 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -620,8 +620,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -629,7 +630,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return -1; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f76fc00c987..15e53b1766a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -9,6 +9,7 @@ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/syscore_ops.h> #include "internals.h" @@ -39,25 +40,58 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); -/** - * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() - * - * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQS_SUSPENDED flag set. - */ -void resume_device_irqs(void) +static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; + bool is_early = desc->action && + desc->action->flags & IRQF_EARLY_RESUME; + + if (is_early != want_early) + continue; raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +/** + * irq_pm_syscore_ops - enable interrupt lines early + * + * Enable all interrupt lines with %IRQF_EARLY_RESUME set. + */ +static void irq_pm_syscore_resume(void) +{ + resume_irqs(true); +} + +static struct syscore_ops irq_pm_syscore_ops = { + .resume = irq_pm_syscore_resume, +}; + +static int __init irq_pm_init_ops(void) +{ + register_syscore_ops(&irq_pm_syscore_ops); + return 0; +} + +device_initcall(irq_pm_init_ops); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously + * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag + * set as well as those with %IRQF_FORCE_RESUME. + */ +void resume_device_irqs(void) +{ + resume_irqs(false); +} EXPORT_SYMBOL_GPL(resume_device_irqs); /** diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index aa57d5da18c..dc813a948be 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) */ action = desc->action; if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || !action->next) + (action->flags & __IRQF_TIMER) || + (action->handler(irq, action->dev_id) == IRQ_HANDLED) || + !action->next) goto out; /* Already running on another processor */ @@ -115,7 +117,7 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; - if (atomic_inc_return(&irq_poll_active) == 1) + if (atomic_inc_return(&irq_poll_active) != 1) goto out; irq_poll_cpu = smp_processor_id(); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3..e6f1f24ad57 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -66,8 +66,9 @@ void jump_label_inc(struct jump_label_key *key) return; jump_label_lock(); - if (atomic_add_return(1, &key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) jump_label_update(key, JUMP_LABEL_ENABLE); + atomic_inc(&key->enabled); jump_label_unlock(); } diff --git a/kernel/kmod.c b/kernel/kmod.c index ddc7644c130..a4bea97c75b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...) atomic_inc(&kmod_concurrent); if (atomic_read(&kmod_concurrent) > max_modprobes) { /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg++ < 5) + if (kmod_loop_msg < 5) { printk(KERN_ERR "request_module: runaway loop modprobe %s\n", module_name); + kmod_loop_msg++; + } atomic_dec(&kmod_concurrent); return -ENOMEM; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3744c594b19..80a85971cf6 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,3 +235,7 @@ config PM_GENERIC_DOMAINS config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS + +config CPU_PM + bool + depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6b71ad2208..d3caa763498 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -315,7 +315,7 @@ int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) + if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) return enter_state(state); return -EINVAL; } diff --git a/kernel/sched.c b/kernel/sched.c index b50b0f0c9aa..036311afeed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -510,7 +510,7 @@ struct rq { unsigned long cpu_power; - unsigned char idle_at_tick; + unsigned char idle_balance; /* For active balancing */ int post_schedule; int active_balance; @@ -1272,6 +1272,18 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static inline bool got_nohz_idle_kick(void) +{ + return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ + return false; +} + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -2591,7 +2603,7 @@ void scheduler_ipi(void) struct rq *rq = this_rq(); struct task_struct *list = xchg(&rq->wake_list, NULL); - if (!list) + if ((!list) && !got_nohz_idle_kick()) return; /* @@ -2609,6 +2621,14 @@ void scheduler_ipi(void) */ irq_enter(); sched_ttwu_do_pending(list); + + /* + * Check if someone kicked us for doing the nohz idle load balance. + */ + if (unlikely(got_nohz_idle_kick() && !need_resched())) { + this_rq()->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } irq_exit(); } @@ -4116,7 +4136,7 @@ void scheduler_tick(void) perf_event_task_tick(); #ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); + rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif } @@ -8133,7 +8153,6 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_balance_kick = 0; - init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); #endif #endif init_rq_hrtick(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bc8ee999381..2107c3c3049 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -91,6 +91,8 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; static const struct sched_class fair_sched_class; +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -2667,6 +2669,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -2774,12 +2781,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); + if (local_group) { + if (idle != CPU_NEWLY_IDLE) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } else if (time_after_eq(jiffies, group->sgp->next_update)) + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -3612,22 +3622,6 @@ out_unlock: } #ifdef CONFIG_NO_HZ - -static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); - -static void trigger_sched_softirq(void *data) -{ - raise_softirq_irqoff(SCHED_SOFTIRQ); -} - -static inline void init_sched_softirq_csd(struct call_single_data *csd) -{ - csd->func = trigger_sched_softirq; - csd->info = NULL; - csd->flags = 0; - csd->priv = 0; -} - /* * idle load balancing details * - One of the idle CPUs nominates itself as idle load_balancer, while @@ -3793,11 +3787,16 @@ static void nohz_balancer_kick(int cpu) } if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { - struct call_single_data *cp; - cpu_rq(ilb_cpu)->nohz_balance_kick = 1; - cp = &per_cpu(remote_sched_softirq_cb, cpu); - __smp_call_function_single(ilb_cpu, cp, 0); + + smp_mb(); + /* + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target cpu which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_send_reschedule(ilb_cpu); } return; } @@ -3879,8 +3878,6 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. @@ -4030,7 +4027,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) if (time_before(now, nohz.next_balance)) return 0; - if (rq->idle_at_tick) + if (idle_cpu(cpu)) return 0; first_pick_cpu = atomic_read(&nohz.first_pick_cpu); @@ -4066,7 +4063,7 @@ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? + enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; rebalance_domains(this_cpu, idle); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 2e74677cb04..85f8bd92a64 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -47,7 +47,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, 0) +SCHED_FEAT(ARCH_POWER, 1) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) diff --git a/kernel/time.c b/kernel/time.c index 8e8dc6d705c..d7760621452 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval); /* * Convert jiffies/jiffies_64 to clock_t and back. */ -clock_t jiffies_to_clock_t(long x) +clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index e4c699dfa4e..13dfaaba406 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -286,6 +286,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0980f0d9a0..8f77da18fef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -494,6 +494,22 @@ void clocksource_touch_watchdog(void) } /** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more then 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** * clocksource_max_deferment - Returns max time the clocksource can be deferred * @cs: Pointer to clocksource * @@ -505,25 +521,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which - * is equivalent to the below. - * max_cycles < (2^63)/cs->mult - * max_cycles < 2^(log2((2^63)/cs->mult)) - * max_cycles < 2^(log2(2^63) - log2(cs->mult)) - * max_cycles < 2^(63 - log2(cs->mult)) - * max_cycles < 1 << (63 - log2(cs->mult)) + * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) + * which is equivalent to the below. + * max_cycles < (2^63)/(cs->mult + cs->maxadj) + * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) + * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) + * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) + * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is * determined by the minimum of max_cycles and cs->mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. */ max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, + cs->shift); /* * To ensure that the clocksource does not wrap whilst we are idle, @@ -531,7 +550,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) * note a margin of 12.5% is used because this can be computed with * a shift, versus say 10% which would require division. */ - return max_nsecs - (max_nsecs >> 5); + return max_nsecs - (max_nsecs >> 3); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -642,7 +661,6 @@ static void clocksource_enqueue(struct clocksource *cs) void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; - /* * Calc the maximum number of seconds which we can run before * wrapping around. For clocksources which have a mask > 32bit @@ -653,7 +671,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) * ~ 0.06ppm granularity for NTP. We apply the same 12.5% * margin as we do in clocksource_max_deferment() */ - sec = (cs->mask - (cs->mask >> 5)); + sec = (cs->mask - (cs->mask >> 3)); do_div(sec, freq); do_div(sec, scale); if (!sec) @@ -663,6 +681,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); + + /* + * for clocksources that have large mults, to avoid overflow. + * Since mult may be adjusted by ntp, add an safety extra margin + * + */ + cs->maxadj = clocksource_max_adjustment(cs); + while ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult)) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + cs->max_idle_ns = clocksource_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -703,6 +735,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); */ int clocksource_register(struct clocksource *cs) { + /* calculate max adjustment for given mult/shift */ + cs->maxadj = clocksource_max_adjustment(cs); + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + /* calculate max idle time permitted for this clocksource */ cs->max_idle_ns = clocksource_max_deferment(cs); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c7218d13273..7a90d021b79 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; - clockevents_exchange_device(NULL, dev); + clockevents_exchange_device(tick_broadcast_device.evtdev, dev); tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e850..6f9798bf240 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -249,6 +249,8 @@ ktime_t ktime_get(void) secs = xtime.tv_sec + wall_to_monotonic.tv_sec; nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); /* @@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) *ts = xtime; tomono = wall_to_monotonic; nsecs = timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); @@ -820,6 +824,13 @@ static void timekeeping_adjust(s64 offset) } else return; + WARN_ONCE(timekeeper.clock->maxadj && + (timekeeper.mult + adj > timekeeper.clock->mult + + timekeeper.clock->maxadj), + "Adjusting %s more then 11%% (%ld vs %ld)\n", + timekeeper.clock->name, (long)timekeeper.mult + adj, + (long)timekeeper.clock->mult + + timekeeper.clock->maxadj); timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e782..48d3762b828 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,7 +151,6 @@ void clear_ftrace_function(void) ftrace_pid_function = ftrace_stub; } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e7aa00caa38..3c9b9686be2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3817,8 +3817,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (info->read < PAGE_SIZE) goto read; - info->read = 0; - trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, @@ -3828,6 +3826,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) return 0; + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b066c6dd41a..773cb84adc8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1079,7 +1079,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { - __get_system(system); system->nr_events++; return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd..bd3c6369f80 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1766,7 +1766,7 @@ static int replace_system_preds(struct event_subsystem *system, * replace the filter for the call. */ filter = call->filter; - call->filter = filter_item->filter; + rcu_assign_pointer(call->filter, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1821,7 +1821,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) filter = call->filter; if (!filter) goto out_unlock; - call->filter = NULL; + RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); @@ -1862,7 +1862,7 @@ out: * string */ tmp = call->filter; - call->filter = filter; + rcu_assign_pointer(call->filter, filter); if (tmp) { /* Make sure the call is done with the filter */ synchronize_sched(); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5fb3697bf0e..00d527c945a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp) } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_probe(struct trace_probe *tp) { + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(tp)) + return -EBUSY; + __unregister_trace_probe(tp); list_del(&tp->list); unregister_probe_event(tp); + + return 0; } /* Register a trace_probe and probe_event */ @@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp) /* Delete old (same name) event if exist */ old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { - unregister_trace_probe(old_tp); + ret = unregister_trace_probe(old_tp); + if (ret < 0) + goto end; free_trace_probe(old_tp); } @@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb, mutex_lock(&probe_lock); list_for_each_entry(tp, &probe_list, list) { if (trace_probe_within_module(tp, mod)) { + /* Don't need to check busy - this should have gone. */ __unregister_trace_probe(tp); ret = __register_trace_probe(tp); if (ret) @@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv) return -ENOENT; } /* delete an event */ - unregister_trace_probe(tp); - free_trace_probe(tp); + ret = unregister_trace_probe(tp); + if (ret == 0) + free_trace_probe(tp); mutex_unlock(&probe_lock); - return 0; + return ret; } if (argc < 2) { @@ -1317,18 +1327,29 @@ error: return ret; } -static void release_all_trace_probes(void) +static int release_all_trace_probes(void) { struct trace_probe *tp; + int ret = 0; mutex_lock(&probe_lock); + /* Ensure no probe is in use. */ + list_for_each_entry(tp, &probe_list, list) + if (trace_probe_is_enabled(tp)) { + ret = -EBUSY; + goto end; + } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { tp = list_entry(probe_list.next, struct trace_probe, list); unregister_trace_probe(tp); free_trace_probe(tp); } + +end: mutex_unlock(&probe_lock); + + return ret; } /* Probes listing interfaces */ @@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = { static int probes_open(struct inode *inode, struct file *file) { - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - release_all_trace_probes(); + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_trace_probes(); + if (ret < 0) + return ret; + } return seq_open(file, &probes_seq_op); } @@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* Disable trace points before removing it */ + tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting test probe.\n"); + warn++; + } else + disable_trace_probe(tp, TP_FLAG_TRACE); + + tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting 2nd test probe.\n"); + warn++; + } else + disable_trace_probe(tp, TP_FLAG_TRACE); + ret = command_trace_probe("-:testprobe"); if (WARN_ON_ONCE(ret)) { pr_warning("error on deleting a probe.\n"); |