author     Nicolas Pitre <nicolas.pitre@linaro.org>   2011-11-14 08:44:38 -0500
committer  Nicolas Pitre <nicolas.pitre@linaro.org>   2011-11-14 08:44:38 -0500
commit     73f1b47518caba450b9bae0940c129e27a6dd163
tree       fa54617038fa300ed10f69b1b318c19baa50760e
parent     c2fdcfc81097255816194a584e6faabd3914cfc5
parent     71b2adaed013af61626bcab902ba521ff6cbe928
Merge branch 'sched_mc_for_11.11' of git://git.linaro.org/people/vingu/kernel into linaro-3.1
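
This merge brings in the ARM sched_mc topology work: arch/arm gains a cpufreq-driven cpu_power scale and, for Cortex-A9 MP in power-saving mode, an emulated multi-package topology that gathers tasks on one virtual package, while the scheduler side rate-limits update_group_power() and kicks nohz idle balancing through the scheduler IPI. The virtual-package trick pairs CPUs of the same socket by the low bit of their id (see power_cpu_topology_mask_CA9() in the diff below). A minimal user-space sketch of that pairing rule, assuming a single socket and four CPUs (illustration only, not the kernel code):

#include <stdio.h>

/* Mirror of the (cpuid & 0x1) == (cpu & 0x1) test used by
 * power_cpu_topology_mask_CA9(); the socket_id comparison is omitted
 * because a single socket is assumed here. */
int main(void)
{
    unsigned int cpuid, cpu, nr_cpus = 4;

    for (cpuid = 0; cpuid < nr_cpus; cpuid++) {
        printf("cpu%u core siblings:", cpuid);
        for (cpu = 0; cpu < nr_cpus; cpu++)
            if ((cpuid & 0x1) == (cpu & 0x1))
                printf(" cpu%u", cpu);
        printf("\n");
    }
    return 0;
}

For four CPUs this prints {cpu0, cpu2} and {cpu1, cpu3}: two emulated packages, so the sched_mc power policy can evacuate one of them.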
-rw-r--r--  arch/arm/include/asm/topology.h  |  35
-rw-r--r--  arch/arm/kernel/topology.c       | 485
-rw-r--r--  drivers/cpufreq/db8500-cpufreq.c |   7
-rw-r--r--  include/linux/sched.h            |   1
-rw-r--r--  kernel/sched.c                   |  27
-rw-r--r--  kernel/sched_fair.c              |  57
-rw-r--r--  kernel/sched_features.h          |   2
7 files changed, 553 insertions(+), 61 deletions(-)
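
The heart of the new arch/arm/kernel/topology.c code is a frequency-to-cpu_power table: set_cpufreq_scale() divides the current frequency by the table step, clamps the index to the last entry, and a cpufreq transition notifier refreshes the value on every frequency change. A standalone sketch of that lookup, reusing the Cortex-A9 table and the 200 MHz (200000 kHz) step from the patch below (the kernel stores the result in the per-cpu cpu_scale variable rather than returning it):

#include <stdio.h>

#define CPU_MAX_FREQ 10
#define CPU_TOPO_FREQ_STEP 200000   /* 200 MHz step, in kHz */

/* Same contents as table_ca9_power in the patch: in power-save mode,
 * frequencies below 600 MHz report an inflated cpu_power of 4096
 * instead of the default SCHED_POWER_SCALE of 1024. */
static unsigned int table_ca9_power[CPU_MAX_FREQ] = {
    4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
};

static unsigned int power_of(unsigned int freq_khz)
{
    unsigned int idx = freq_khz / CPU_TOPO_FREQ_STEP;

    if (idx >= CPU_MAX_FREQ)        /* clamp, as set_cpufreq_scale() does */
        idx = CPU_MAX_FREQ - 1;
    return table_ca9_power[idx];
}

int main(void)
{
    printf("500 MHz  -> cpu_power %u\n", power_of(500000));   /* 4096 */
    printf("1000 MHz -> cpu_power %u\n", power_of(1000000));  /* 1024 */
    return 0;
}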
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index a7e457ed27c..f7f02e392ef 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -25,7 +25,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
+const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #else
 
@@ -34,6 +34,39 @@ static inline void store_cpu_topology(unsigned int cpuid) { }
 
 #endif
 
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) {			\
+	.min_interval		= 1,				\
+	.max_interval		= 4,				\
+	.busy_factor		= 64,				\
+	.imbalance_pct		= 125,				\
+	.cache_nice_tries	= 1,				\
+	.busy_idx		= 2,				\
+	.idle_idx		= 1,				\
+	.newidle_idx		= 0,				\
+	.wake_idx		= 0,				\
+	.forkexec_idx		= 0,				\
+								\
+	.flags			= 1*SD_LOAD_BALANCE		\
+				| 1*SD_BALANCE_NEWIDLE		\
+				| 1*SD_BALANCE_EXEC		\
+				| 1*SD_BALANCE_FORK		\
+				| 0*SD_BALANCE_WAKE		\
+				| 1*SD_WAKE_AFFINE		\
+				| 0*SD_PREFER_LOCAL		\
+				| 0*SD_SHARE_CPUPOWER		\
+				| 0*SD_SHARE_PKG_RESOURCES	\
+				| 0*SD_SERIALIZE		\
+				| arch_sd_sibling_asym_packing()	\
+				| sd_balance_for_package_power()	\
+				| sd_power_saving_flags()	\
+				,				\
+	.last_balance		= jiffies,			\
+	.balance_interval	= 1,				\
+}
+#endif
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 1040c00405d..053ce9cbc0b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -18,6 +18,17 @@
 #include <linux/node.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+
+#ifdef CONFIG_CPU_FREQ
+#include <linux/cpufreq.h>
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/uaccess.h> /* for copy_from_user */
+#endif
 
 #include <asm/cputype.h>
 #include <asm/topology.h>
@@ -43,12 +54,327 @@
 
 struct cputopo_arm cpu_topology[NR_CPUS];
 
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+/*
+ * cpu power scale management
+ */
+
+/*
+ * a per cpu data structure should be better because each cpu is mainly
+ * using its own cpu_power even it's not always true because of
+ * no_hz_idle_balance
+ */
+static DEFINE_PER_CPU(unsigned int, cpu_scale);
+
+/*
+ * cpu topology mask management
+ */
+
+unsigned int advanced_topology = 1;
+
+static void normal_cpu_topology_mask(void);
+static void (*set_cpu_topology_mask)(void) = normal_cpu_topology_mask;
+
+#ifdef CONFIG_CPU_FREQ
+/*
+ * This struct describes parameters to compute cpu_power
+ */
+struct cputopo_power {
+	int id;
+	int max; /* max idx in the table */
+	unsigned int step; /* frequency step for the table */
+	unsigned int *table; /* table of cpu_power */
+};
+
+/* default table with one default cpu_power value */
+unsigned int table_default_power[1] = {
+	1024
+};
+
+static struct cputopo_power default_cpu_power = {
+	.max = 1,
+	.step = 1,
+	.table = table_default_power,
+};
+
+/* CA-9 table with cpufreq modifying cpu_power */
+#define CPU_MAX_FREQ 10
+/* we use a 200Mhz step for scaling cpu power */
+#define CPU_TOPO_FREQ_STEP 200000
+/* This table sets the cpu_power scale of a cpu according to 2 inputs which are
+ * the frequency and the sched_mc mode. The content of this table could be SoC
+ * specific so we should add a method to overwrite this default table.
+ * TODO: Study how to use DT for setting this table
+ */
+unsigned int table_ca9_power[CPU_MAX_FREQ] = {
+/* freq< 200   400   600   800  1000  1200  1400  1600  1800 other*/
+	4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024, /* Power save mode CA9 MP */
+};
+
+static struct cputopo_power CA9_cpu_power = {
+	.max = CPU_MAX_FREQ,
+	.step = CPU_TOPO_FREQ_STEP,
+	.table = table_ca9_power,
+};
+
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 1
+/* This table list all possible cpu power configuration */
+struct cputopo_power *table_config[2] = {
+	&default_cpu_power,
+	&CA9_cpu_power,
+};
+
+struct cputopo_scale {
+	int id;
+	int freq;
+	struct cputopo_power *power;
+};
+
+/*
+ * The table will be mostly used by one cpu which will update the
+ * configuration for all cpu on a cpufreq notification
+ * or a sched_mc level change
+ */
+static struct cputopo_scale cpu_power[NR_CPUS];
+
+static void set_cpufreq_scale(unsigned int cpuid, unsigned int freq)
+{
+	unsigned int idx;
+
+	cpu_power[cpuid].freq = freq;
+
+	idx = freq / cpu_power[cpuid].power->step;
+	if (idx >= cpu_power[cpuid].power->max)
+		idx = cpu_power[cpuid].power->max - 1;
+
+	per_cpu(cpu_scale, cpuid) = cpu_power[cpuid].power->table[idx];
+	smp_wmb();
+}
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+	cpu_power[cpu].id = idx;
+	cpu_power[cpu].power = table_config[idx];
+
+	set_cpufreq_scale(cpu, cpu_power[cpu].freq);
+}
+
+static int topo_cpufreq_transition(struct notifier_block *nb,
+	unsigned long state, void *data)
+{
+	struct cpufreq_freqs *freqs = data;
+
+	if (state == CPUFREQ_POSTCHANGE || state == CPUFREQ_RESUMECHANGE)
+		set_cpufreq_scale(freqs->cpu, freqs->new);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block topo_cpufreq_nb = {
+	.notifier_call = topo_cpufreq_transition,
+};
+
+static int topo_cpufreq_init(void)
+{
+	unsigned int cpu;
+
+	/* TODO set initial value according to current freq */
+
+	/* init core mask */
+	for_each_possible_cpu(cpu) {
+		cpu_power[cpu].freq = 0;
+		cpu_power[cpu].power = &default_cpu_power;
+	}
+
+	return cpufreq_register_notifier(&topo_cpufreq_nb,
+		CPUFREQ_TRANSITION_NOTIFIER);
+}
+#else
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 0
+/* This table list all possible cpu power configuration */
+unsigned int table_config[1] = {
+	1024,
+};
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+	per_cpu(cpu_scale, cpu) = table_config[idx];
+}
+
+static inline int topo_cpufreq_init(void) {return 0; }
+#endif
+
+static int init_cpu_power_scale(void)
+{
+	/* register cpufreq notifer */
+	topo_cpufreq_init();
+
+	/* Do we need to change default config */
+	advanced_topology = 1;
+
+	/* Force a cpu topology update */
+	rebuild_sched_domains();
+
+	return 0;
+}
+
+core_initcall(init_cpu_power_scale);
+
+/*
+ * Update the cpu power
+ */
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(cpu_scale, cpu);
+}
+
+/*
+ * sched_domain flag configuration
+ */
+/* TODO add a config flag for this function */
+int arch_sd_sibling_asym_packing(void)
+{
+	if (sched_smt_power_savings || sched_mc_power_savings)
+		return SD_ASYM_PACKING;
+	return 0;
+}
+
+/*
+ * default topology function
+ */
+
+const struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	return &cpu_topology[cpu].core_sibling;
 }
 
 /*
+ * clear cpu topology masks
+ */
+static void clear_cpu_topology_mask(void)
+{
+	unsigned int cpuid;
+	for_each_possible_cpu(cpuid) {
+		struct cputopo_arm *cpuid_topo = &(cpu_topology[cpuid]);
+		cpumask_clear(&cpuid_topo->core_sibling);
+		cpumask_clear(&cpuid_topo->thread_sibling);
+	}
+	smp_wmb();
+}
+
+/*
+ * default_cpu_topology_mask set the core and thread mask as described in the
+ * ARM ARM
+ */
+static inline void default_cpu_topology_mask(unsigned int cpuid)
+{
+	struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
+
+		if (cpuid_topo->socket_id == cpu_topo->socket_id) {
+			cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+			if (cpu != cpuid)
+				cpumask_set_cpu(cpu,
+					&cpuid_topo->core_sibling);
+
+			if (cpuid_topo->core_id == cpu_topo->core_id) {
+				cpumask_set_cpu(cpuid,
+					&cpu_topo->thread_sibling);
+				if (cpu != cpuid)
+					cpumask_set_cpu(cpu,
+						&cpuid_topo->thread_sibling);
+			}
+		}
+	}
+	smp_wmb();
+}
+
+static void normal_cpu_topology_mask(void)
+{
+	unsigned int cpuid;
+
+	for_each_possible_cpu(cpuid) {
+		default_cpu_topology_mask(cpuid);
+		set_power_scale(cpuid, ARM_CORTEX_A9_DEFAULT_SCALE);
+	}
+	smp_wmb();
+}
+
+/*
+ * For Cortex-A9 MPcore, we emulate a multi-package topology in power mode.
+ * The goal is to gathers tasks on 1 virtual package
+ */
+static void power_cpu_topology_mask_CA9(void)
+{
+	unsigned int cpuid, cpu;
+
+	for_each_possible_cpu(cpuid) {
+		struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
+
+		for_each_possible_cpu(cpu) {
+			struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
+
+			if ((cpuid_topo->socket_id == cpu_topo->socket_id)
+			&& ((cpuid & 0x1) == (cpu & 0x1))) {
+				cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+				if (cpu != cpuid)
+					cpumask_set_cpu(cpu,
+						&cpuid_topo->core_sibling);
+
+				if (cpuid_topo->core_id == cpu_topo->core_id) {
+					cpumask_set_cpu(cpuid,
+						&cpu_topo->thread_sibling);
+					if (cpu != cpuid)
+						cpumask_set_cpu(cpu,
+							&cpuid_topo->thread_sibling);
+				}
+			}
+		}
+		set_power_scale(cpuid, ARM_CORTEX_A9_POWER_SCALE);
+	}
+	smp_wmb();
+}
+
+#define ARM_FAMILY_MASK 0xFF0FFFF0
+#define ARM_CORTEX_A9_FAMILY 0x410FC090
+
+/* update_cpu_topology_policy select a cpu topology policy according to the
+ * available cores.
+ * TODO: The current version assumes that all cores are exactly the same which
+ * might not be true. We need to update it to take into account various
+ * configuration among which system with different kind of core.
+ */
+static int update_cpu_topology_policy(void)
+{
+	unsigned long cpuid;
+
+	if (sched_mc_power_savings == POWERSAVINGS_BALANCE_NONE) {
+		set_cpu_topology_mask = normal_cpu_topology_mask;
+		return 0;
+	}
+
+	cpuid = read_cpuid_id();
+	cpuid &= ARM_FAMILY_MASK;
+
+	switch (cpuid) {
+	case ARM_CORTEX_A9_FAMILY:
+		set_cpu_topology_mask = power_cpu_topology_mask_CA9;
+		break;
+	default:
+		set_cpu_topology_mask = normal_cpu_topology_mask;
+		break;
+	}
+
+	return 0;
+}
+
+/*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
  * which prevents simultaneous write access to cpu_topology array
@@ -57,7 +383,6 @@ void store_cpu_topology(unsigned int cpuid)
 {
 	struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
 	unsigned int mpidr;
-	unsigned int cpu;
 
 	/* If the cpu topology has been already set, just return */
 	if (cpuid_topo->core_id != -1)
@@ -99,26 +424,11 @@ void store_cpu_topology(unsigned int cpuid)
 		cpuid_topo->socket_id = -1;
 	}
 
-	/* update core and thread sibling masks */
-	for_each_possible_cpu(cpu) {
-		struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
-
-		if (cpuid_topo->socket_id == cpu_topo->socket_id) {
-			cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
-			if (cpu != cpuid)
-				cpumask_set_cpu(cpu,
-					&cpuid_topo->core_sibling);
-
-			if (cpuid_topo->core_id == cpu_topo->core_id) {
-				cpumask_set_cpu(cpuid,
-					&cpu_topo->thread_sibling);
-				if (cpu != cpuid)
-					cpumask_set_cpu(cpu,
-						&cpuid_topo->thread_sibling);
-			}
-		}
-	}
-	smp_wmb();
+	/*
+	 * The core and thread sibling masks can also be updated during the
+	 * call of arch_update_cpu_topology
+	 */
+	default_cpu_topology_mask(cpuid);
 
 	printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
 		cpuid, cpu_topology[cpuid].thread_id,
@@ -127,6 +437,27 @@ void store_cpu_topology(unsigned int cpuid)
 }
 
 /*
+ * arch_update_cpu_topology is called by the scheduler before building
+ * a new sched_domain hierarchy.
+ */
+int arch_update_cpu_topology(void)
+{
+	if (!advanced_topology)
+		return 0;
+
+	/* clear core threads mask */
+	clear_cpu_topology_mask();
+
+	/* set topology policy */
+	update_cpu_topology_policy();
+
+	/* set topology mask and power */
+	(*set_cpu_topology_mask)();
+
+	return 1;
+}
+
+/*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
 */
@@ -143,6 +474,116 @@ void init_cpu_topology(void)
 		cpu_topo->socket_id = -1;
 		cpumask_clear(&cpu_topo->core_sibling);
 		cpumask_clear(&cpu_topo->thread_sibling);
+
+		per_cpu(cpu_scale, cpu) = SCHED_POWER_SCALE;
 	}
 	smp_wmb();
 }
+
+/*
+ * debugfs interface for scaling cpu power
+ */
+
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *topo_debugfs_root;
+
+static ssize_t dbg_write(struct file *file, const char __user *buf,
+						size_t size, loff_t *off)
+{
+	unsigned int *value = file->f_dentry->d_inode->i_private;
+	char cdata[128];
+	unsigned long tmp;
+	unsigned int cpu;
+
+	if (size < (sizeof(cdata)-1)) {
+		if (copy_from_user(cdata, buf, size))
+			return -EFAULT;
+		cdata[size] = 0;
+		if (!strict_strtoul(cdata, 10, &tmp)) {
+			*value = tmp;
+
+#ifdef CONFIG_CPU_FREQ
+			for_each_online_cpu(cpu)
+				set_power_scale(cpu, cpu_power[cpu].id);
+#endif
+		}
+		return size;
+	}
+	return -EINVAL;
+}
+
+static ssize_t dbg_read(struct file *file, char __user *buf,
+						size_t size, loff_t *off)
+{
+	unsigned int *value = file->f_dentry->d_inode->i_private;
+	char cdata[128];
+	unsigned int len;
+
+	len = sprintf(cdata, "%u\n", *value);
+	return simple_read_from_buffer(buf, size, off, cdata, len);
+}
+
+static const struct file_operations debugfs_fops = {
+	.read = dbg_read,
+	.write = dbg_write,
+};
+
+static struct dentry *topo_debugfs_register(unsigned int cpu,
+						struct dentry *parent)
+{
+	struct dentry *cpu_d, *d;
+	char cpu_name[16];
+
+	sprintf(cpu_name, "cpu%u", cpu);
+
+	cpu_d = debugfs_create_dir(cpu_name, parent);
+	if (!cpu_d)
+		return NULL;
+
+	d = debugfs_create_file("cpu_power", S_IRUGO | S_IWUGO,
+		cpu_d, &per_cpu(cpu_scale, cpu), &debugfs_fops);
+	if (!d)
+		goto err_out;
+
+#ifdef CONFIG_CPU_FREQ
+	d = debugfs_create_file("scale", S_IRUGO | S_IWUGO,
+		cpu_d, &cpu_power[cpu].id, &debugfs_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("freq", S_IRUGO,
+		cpu_d, &cpu_power[cpu].freq, &debugfs_fops);
+	if (!d)
+		goto err_out;
+#endif
+	return cpu_d;
+
+err_out:
+	debugfs_remove_recursive(cpu_d);
+	return NULL;
+}
+
+static int __init topo_debugfs_init(void)
+{
+	struct dentry *d;
+	unsigned int cpu;
+
+	d = debugfs_create_dir("cpu_topo", NULL);
+	if (!d)
+		return -ENOMEM;
+	topo_debugfs_root = d;
+
+	for_each_possible_cpu(cpu) {
+		d = topo_debugfs_register(cpu, topo_debugfs_root);
+		if (d == NULL)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	debugfs_remove_recursive(topo_debugfs_root);
+	return -ENOMEM;
+}
+
+late_initcall(topo_debugfs_init);
+#endif
diff --git a/drivers/cpufreq/db8500-cpufreq.c b/drivers/cpufreq/db8500-cpufreq.c
index d90456a809f..e0acaceca57 100644
--- a/drivers/cpufreq/db8500-cpufreq.c
+++ b/drivers/cpufreq/db8500-cpufreq.c
@@ -72,13 +72,13 @@ static int db8500_cpufreq_target(struct cpufreq_policy *policy,
 
 	freqs.old = policy->cur;
 	freqs.new = freq_table[idx].frequency;
-	freqs.cpu = policy->cpu;
 
 	if (freqs.old == freqs.new)
 		return 0;
 
 	/* pre-change notification */
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	for_each_cpu(freqs.cpu, policy->cpus)
+		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
 	/* request the PRCM unit for opp change */
 	if (prcmu_set_arm_opp(idx2opp[idx])) {
@@ -87,7 +87,8 @@ static int db8500_cpufreq_target(struct cpufreq_policy *policy,
 	}
 
 	/* post change notification */
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	for_each_cpu(freqs.cpu, policy->cpus)
+		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
 	return 0;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41d0237fd44..8610921984b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -901,6 +901,7 @@ struct sched_group_power {
 	 * single CPU.
 	 */
 	unsigned int power, power_orig;
+	unsigned long next_update;
 };
 
 struct sched_group {
diff --git a/kernel/sched.c b/kernel/sched.c
index b50b0f0c9aa..036311afeed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -510,7 +510,7 @@ struct rq {
 
 	unsigned long cpu_power;
 
-	unsigned char idle_at_tick;
+	unsigned char idle_balance;
 	/* For active balancing */
 	int post_schedule;
 	int active_balance;
@@ -1272,6 +1272,18 @@ void wake_up_idle_cpu(int cpu)
 		smp_send_reschedule(cpu);
 }
 
+static inline bool got_nohz_idle_kick(void)
+{
+	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -2591,7 +2603,7 @@ void scheduler_ipi(void)
 	struct rq *rq = this_rq();
 	struct task_struct *list = xchg(&rq->wake_list, NULL);
 
-	if (!list)
+	if ((!list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -2609,6 +2621,14 @@ void scheduler_ipi(void)
 	 */
 	irq_enter();
 	sched_ttwu_do_pending(list);
+
+	/*
+	 * Check if someone kicked us for doing the nohz idle load balance.
+	 */
+	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+		this_rq()->idle_balance = 1;
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+	}
 	irq_exit();
 }
 
@@ -4116,7 +4136,7 @@ void scheduler_tick(void)
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-	rq->idle_at_tick = idle_cpu(cpu);
+	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
 }
@@ -8133,7 +8153,6 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_balance_kick = 0;
-		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
 #endif
 #endif
 		init_rq_hrtick(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee999381..2107c3c3049 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -91,6 +91,8 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
 static const struct sched_class fair_sched_class;
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -2667,6 +2669,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgp->next_update = jiffies + interval;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
@@ -2774,12 +2781,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group) {
-		if (balance_cpu != this_cpu) {
-			*balance = 0;
-			return;
-		}
-		update_group_power(sd, this_cpu);
+	if (local_group) {
+		if (idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != this_cpu) {
+				*balance = 0;
+				return;
+			}
+			update_group_power(sd, this_cpu);
+		} else if (time_after_eq(jiffies, group->sgp->next_update))
+			update_group_power(sd, this_cpu);
 	}
 
 	/* Adjust by relative CPU power of the group */
@@ -3612,22 +3622,6 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
-
-static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
-
-static void trigger_sched_softirq(void *data)
-{
-	raise_softirq_irqoff(SCHED_SOFTIRQ);
-}
-
-static inline void init_sched_softirq_csd(struct call_single_data *csd)
-{
-	csd->func = trigger_sched_softirq;
-	csd->info = NULL;
-	csd->flags = 0;
-	csd->priv = 0;
-}
-
 /*
  * idle load balancing details
  * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3793,11 +3787,16 @@ static void nohz_balancer_kick(int cpu)
 	}
 
 	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-		struct call_single_data *cp;
-
 		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-		cp = &per_cpu(remote_sched_softirq_cb, cpu);
-		__smp_call_function_single(ilb_cpu, cp, 0);
+
+		smp_mb();
+		/*
+		 * Use smp_send_reschedule() instead of resched_cpu().
+		 * This way we generate a sched IPI on the target cpu which
+		 * is idle. And the softirq performing nohz idle load balance
+		 * will be run before returning from the IPI.
+		 */
+		smp_send_reschedule(ilb_cpu);
 	}
 	return;
 }
@@ -3879,8 +3878,6 @@ void select_nohz_load_balancer(int stop_tick)
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
@@ -4030,7 +4027,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 	if (time_before(now, nohz.next_balance))
 		return 0;
 
-	if (rq->idle_at_tick)
+	if (idle_cpu(cpu))
 		return 0;
 
 	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4063,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
 	struct rq *this_rq = cpu_rq(this_cpu);
-	enum cpu_idle_type idle = this_rq->idle_at_tick ?
+	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
 	rebalance_domains(this_cpu, idle);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb04..85f8bd92a64 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -47,7 +47,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 /*
  * Use arch dependent cpu power functions
  */
-SCHED_FEAT(ARCH_POWER, 0)
+SCHED_FEAT(ARCH_POWER, 1)
 
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
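
One scheduler detail worth calling out: update_group_power() now stamps sgp->next_update = jiffies + clamp(balance_interval, 1, max_load_balance_interval), and update_sg_lb_stats() refreshes group power on the CPU_NEWLY_IDLE path only once that stamp expires, where it previously never refreshed it at all. A rough standalone model of the throttling arithmetic, with a plain counter standing in for jiffies and HZ = 100 assumed (illustration only, not the kernel code):

#include <stdio.h>

#define HZ 100

static unsigned long jiffies;       /* stand-in for the kernel tick counter */
static unsigned long next_update;
static const unsigned long max_load_balance_interval = HZ / 10;

static void maybe_update_group_power(unsigned long balance_interval_ms)
{
    /* msecs_to_jiffies() followed by clamp(), as in update_group_power() */
    unsigned long interval = balance_interval_ms * HZ / 1000;

    if (interval < 1)
        interval = 1;
    if (interval > max_load_balance_interval)
        interval = max_load_balance_interval;

    if (jiffies >= next_update) {   /* time_after_eq(), ignoring wraparound */
        printf("jiffy %2lu: update_group_power()\n", jiffies);
        next_update = jiffies + interval;
    }
}

int main(void)
{
    /* a newly-idle balance attempt on every tick only refreshes the
     * group power every 10 jiffies (100 ms interval, clamped) */
    for (jiffies = 0; jiffies < 30; jiffies++)
        maybe_update_group_power(100);
    return 0;
}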