author     Nicolas Pitre <nicolas.pitre@linaro.org>   2011-11-14 08:44:38 -0500
committer  Nicolas Pitre <nicolas.pitre@linaro.org>   2011-11-14 08:44:38 -0500
commit     73f1b47518caba450b9bae0940c129e27a6dd163
tree       fa54617038fa300ed10f69b1b318c19baa50760e
parent     c2fdcfc81097255816194a584e6faabd3914cfc5
parent     71b2adaed013af61626bcab902ba521ff6cbe928
Merge branch 'sched_mc_for_11.11' of git://git.linaro.org/people/vingu/kernel into linaro-3.1
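
This merge brings in the ARM sched_mc topology work: arch/arm gains a cpufreq-driven cpu_power scale and, for Cortex-A9 MP in power-saving mode, an emulated multi-package topology that gathers tasks on one virtual package, while the scheduler side rate-limits update_group_power() and kicks nohz idle balancing through the scheduler IPI. The virtual-package trick pairs CPUs of the same socket by the low bit of their id (see power_cpu_topology_mask_CA9() in the diff below). A minimal user-space sketch of that pairing rule, assuming a single socket and four CPUs (illustration only, not the kernel code):

#include <stdio.h>

/* Mirror of the (cpuid & 0x1) == (cpu & 0x1) test used by
 * power_cpu_topology_mask_CA9(); the socket_id comparison is omitted
 * because a single socket is assumed here. */
int main(void)
{
    unsigned int cpuid, cpu, nr_cpus = 4;

    for (cpuid = 0; cpuid < nr_cpus; cpuid++) {
        printf("cpu%u core siblings:", cpuid);
        for (cpu = 0; cpu < nr_cpus; cpu++)
            if ((cpuid & 0x1) == (cpu & 0x1))
                printf(" cpu%u", cpu);
        printf("\n");
    }
    return 0;
}

For four CPUs this prints {cpu0, cpu2} and {cpu1, cpu3}: two emulated packages, so the sched_mc power policy can evacuate one of them.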
-rw-r--r--  arch/arm/include/asm/topology.h  |  35
-rw-r--r--  arch/arm/kernel/topology.c       | 485
-rw-r--r--  drivers/cpufreq/db8500-cpufreq.c |   7
-rw-r--r--  include/linux/sched.h            |   1
-rw-r--r--  kernel/sched.c                   |  27
-rw-r--r--  kernel/sched_fair.c              |  57
-rw-r--r--  kernel/sched_features.h          |   2
7 files changed, 553 insertions(+), 61 deletions(-)
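
The heart of the new arch/arm/kernel/topology.c code is a frequency-to-cpu_power table: set_cpufreq_scale() divides the current frequency by the table step, clamps the index to the last entry, and a cpufreq transition notifier refreshes the value on every frequency change. A standalone sketch of that lookup, reusing the Cortex-A9 table and the 200 MHz (200000 kHz) step from the patch below (the kernel stores the result in the per-cpu cpu_scale variable rather than returning it):

#include <stdio.h>

#define CPU_MAX_FREQ 10
#define CPU_TOPO_FREQ_STEP 200000   /* 200 MHz step, in kHz */

/* Same contents as table_ca9_power in the patch: in power-save mode,
 * frequencies below 600 MHz report an inflated cpu_power of 4096
 * instead of the default SCHED_POWER_SCALE of 1024. */
static unsigned int table_ca9_power[CPU_MAX_FREQ] = {
    4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
};

static unsigned int power_of(unsigned int freq_khz)
{
    unsigned int idx = freq_khz / CPU_TOPO_FREQ_STEP;

    if (idx >= CPU_MAX_FREQ)        /* clamp, as set_cpufreq_scale() does */
        idx = CPU_MAX_FREQ - 1;
    return table_ca9_power[idx];
}

int main(void)
{
    printf("500 MHz  -> cpu_power %u\n", power_of(500000));   /* 4096 */
    printf("1000 MHz -> cpu_power %u\n", power_of(1000000));  /* 1024 */
    return 0;
}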
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index a7e457ed27c..f7f02e392ef 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -25,7 +25,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
+const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #else
 
@@ -34,6 +34,39 @@ static inline void store_cpu_topology(unsigned int cpuid) { }
 
 #endif
 
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) {			\
+	.min_interval		= 1,				\
+	.max_interval		= 4,				\
+	.busy_factor		= 64,				\
+	.imbalance_pct		= 125,				\
+	.cache_nice_tries	= 1,				\
+	.busy_idx		= 2,				\
+	.idle_idx		= 1,				\
+	.newidle_idx		= 0,				\
+	.wake_idx		= 0,				\
+	.forkexec_idx		= 0,				\
+								\
+	.flags			= 1*SD_LOAD_BALANCE		\
+				| 1*SD_BALANCE_NEWIDLE		\
+				| 1*SD_BALANCE_EXEC		\
+				| 1*SD_BALANCE_FORK		\
+				| 0*SD_BALANCE_WAKE		\
+				| 1*SD_WAKE_AFFINE		\
+				| 0*SD_PREFER_LOCAL		\
+				| 0*SD_SHARE_CPUPOWER		\
+				| 0*SD_SHARE_PKG_RESOURCES	\
+				| 0*SD_SERIALIZE		\
+				| arch_sd_sibling_asym_packing()	\
+				| sd_balance_for_package_power()	\
+				| sd_power_saving_flags()	\
+				,				\
+	.last_balance		= jiffies,			\
+	.balance_interval	= 1,				\
+}
+#endif
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 1040c00405d..053ce9cbc0b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -18,6 +18,17 @@
 #include <linux/node.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+
+#ifdef CONFIG_CPU_FREQ
+#include <linux/cpufreq.h>
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/uaccess.h> /* for copy_from_user */
+#endif
 
 #include <asm/cputype.h>
 #include <asm/topology.h>
@@ -43,12 +54,327 @@
 
 struct cputopo_arm cpu_topology[NR_CPUS];
 
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+/*
+ * cpu power scale management
+ */
+
+/*
+ * a per cpu data structure should be better because each cpu is mainly
+ * using its own cpu_power even it's not always true because of
+ * no_hz_idle_balance
+ */
+static DEFINE_PER_CPU(unsigned int, cpu_scale);
+
+/*
+ * cpu topology mask management
+ */
+
+unsigned int advanced_topology = 1;
+
+static void normal_cpu_topology_mask(void);
+static void (*set_cpu_topology_mask)(void) = normal_cpu_topology_mask;
+
+#ifdef CONFIG_CPU_FREQ
+/*
+ * This struct describes parameters to compute cpu_power
+ */
+struct cputopo_power {
+	int id;
+	int max; /* max idx in the table */
+	unsigned int step; /* frequency step for the table */
+	unsigned int *table; /* table of cpu_power */
+};
+
+/* default table with one default cpu_power value */
+unsigned int table_default_power[1] = {
+	1024
+};
+
+static struct cputopo_power default_cpu_power = {
+	.max = 1,
+	.step = 1,
+	.table = table_default_power,
+};
+
+/* CA-9 table with cpufreq modifying cpu_power */
+#define CPU_MAX_FREQ 10
+/* we use a 200Mhz step for scaling cpu power */
+#define CPU_TOPO_FREQ_STEP 200000
+/* This table sets the cpu_power scale of a cpu according to 2 inputs which are
+ * the frequency and the sched_mc mode. The content of this table could be SoC
+ * specific so we should add a method to overwrite this default table.
+ * TODO: Study how to use DT for setting this table
+ */
+unsigned int table_ca9_power[CPU_MAX_FREQ] = {
+/* freq< 200   400   600   800  1000  1200  1400  1600  1800 other*/
+	4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024, /* Power save mode CA9 MP */
+};
+
+static struct cputopo_power CA9_cpu_power = {
+	.max = CPU_MAX_FREQ,
+	.step = CPU_TOPO_FREQ_STEP,
+	.table = table_ca9_power,
+};
+
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 1
+/* This table list all possible cpu power configuration */
+struct cputopo_power *table_config[2] = {
+	&default_cpu_power,
+	&CA9_cpu_power,
+};
+
+struct cputopo_scale {
+	int id;
+	int freq;
+	struct cputopo_power *power;
+};
+
+/*
+ * The table will be mostly used by one cpu which will update the
+ * configuration for all cpu on a cpufreq notification
+ * or a sched_mc level change
+ */
+static struct cputopo_scale cpu_power[NR_CPUS];
+
+static void set_cpufreq_scale(unsigned int cpuid, unsigned int freq)
+{
+	unsigned int idx;
+
+	cpu_power[cpuid].freq = freq;
+
+	idx = freq / cpu_power[cpuid].power->step;
+	if (idx >= cpu_power[cpuid].power->max)
+		idx = cpu_power[cpuid].power->max - 1;
+
+	per_cpu(cpu_scale, cpuid) = cpu_power[cpuid].power->table[idx];
+	smp_wmb();
+}
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+	cpu_power[cpu].id = idx;
+	cpu_power[cpu].power = table_config[idx];
+
+	set_cpufreq_scale(cpu, cpu_power[cpu].freq);
+}
+
+static int topo_cpufreq_transition(struct notifier_block *nb,
+	unsigned long state, void *data)
+{
+	struct cpufreq_freqs *freqs = data;
+
+	if (state == CPUFREQ_POSTCHANGE || state == CPUFREQ_RESUMECHANGE)
+		set_cpufreq_scale(freqs->cpu, freqs->new);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block topo_cpufreq_nb = {
+	.notifier_call = topo_cpufreq_transition,
+};
+
+static int topo_cpufreq_init(void)
+{
+	unsigned int cpu;
+
+	/* TODO set initial value according to current freq */
+
+	/* init core mask */
+	for_each_possible_cpu(cpu) {
+		cpu_power[cpu].freq = 0;
+		cpu_power[cpu].power = &default_cpu_power;
+	}
+
+	return cpufreq_register_notifier(&topo_cpufreq_nb,
+		CPUFREQ_TRANSITION_NOTIFIER);
+}
+#else
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 0
+/* This table list all possible cpu power configuration */
+unsigned int table_config[1] = {
+	1024,
+};
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+	per_cpu(cpu_scale, cpu) = table_config[idx];
+}
+
+static inline int topo_cpufreq_init(void) {return 0; }
+#endif
+
+static int init_cpu_power_scale(void)
+{
+	/* register cpufreq notifer */
+	topo_cpufreq_init();
+
+	/* Do we need to change default config */
+	advanced_topology = 1;
+
+	/* Force a cpu topology update */
+	rebuild_sched_domains();
+
+	return 0;
+}
+
+core_initcall(init_cpu_power_scale);
+
+/*
+ * Update the cpu power
+ */
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(cpu_scale, cpu);
+}
+
+/*
+ * sched_domain flag configuration
+ */
+/* TODO add a config flag for this function */
+int arch_sd_sibling_asym_packing(void)
+{
+	if (sched_smt_power_savings || sched_mc_power_savings)
+		return SD_ASYM_PACKING;
+	return 0;
+}
+
+/*
+ * default topology function
+ */
+
+const struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	return &cpu_topology[cpu].core_sibling;
 }
 
 /*
+ * clear cpu topology masks
+ */
+static void clear_cpu_topology_mask(void)
+{
+	unsigned int cpuid;
+	for_each_possible_cpu(cpuid) {
+		struct cputopo_arm *cpuid_topo = &(cpu_topology[cpuid]);
+		cpumask_clear(&cpuid_topo->core_sibling);
+		cpumask_clear(&cpuid_topo->thread_sibling);
+	}
+	smp_wmb();
+}
+
+/*
+ * default_cpu_topology_mask set the core and thread mask as described in the
+ * ARM ARM
+ */
+static inline void default_cpu_topology_mask(unsigned int cpuid)
+{
+	struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
+
+		if (cpuid_topo->socket_id == cpu_topo->socket_id) {
+			cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+			if (cpu != cpuid)
+				cpumask_set_cpu(cpu,
+					&cpuid_topo->core_sibling);
+
+			if (cpuid_topo->core_id == cpu_topo->core_id) {
+				cpumask_set_cpu(cpuid,
+					&cpu_topo->thread_sibling);
+				if (cpu != cpuid)
+					cpumask_set_cpu(cpu,
+						&cpuid_topo->thread_sibling);
+			}
+		}
+	}
+	smp_wmb();
+}
+
+static void normal_cpu_topology_mask(void)
+{
+	unsigned int cpuid;
+
+	for_each_possible_cpu(cpuid) {
+		default_cpu_topology_mask(cpuid);
+		set_power_scale(cpuid, ARM_CORTEX_A9_DEFAULT_SCALE);
+	}
+	smp_wmb();
+}
+
+/*
+ * For Cortex-A9 MPcore, we emulate a multi-package topology in power mode.
+ * The goal is to gathers tasks on 1 virtual package
+ */
+static void power_cpu_topology_mask_CA9(void)
+{
+	unsigned int cpuid, cpu;
+
+	for_each_possible_cpu(cpuid) {
+		struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
+
+		for_each_possible_cpu(cpu) {
+			struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
+
+			if ((cpuid_topo->socket_id == cpu_topo->socket_id)
+			&& ((cpuid & 0x1) == (cpu & 0x1))) {
+				cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+				if (cpu != cpuid)
+					cpumask_set_cpu(cpu,
+						&cpuid_topo->core_sibling);
+
+				if (cpuid_topo->core_id == cpu_topo->core_id) {
+					cpumask_set_cpu(cpuid,
+						&cpu_topo->thread_sibling);
+					if (cpu != cpuid)
+						cpumask_set_cpu(cpu,
+							&cpuid_topo->thread_sibling);
+				}
+			}
+		}
+		set_power_scale(cpuid, ARM_CORTEX_A9_POWER_SCALE);
+	}
+	smp_wmb();
+}
+
+#define ARM_FAMILY_MASK 0xFF0FFFF0
+#define ARM_CORTEX_A9_FAMILY 0x410FC090
+
+/* update_cpu_topology_policy select a cpu topology policy according to the
+ * available cores.
+ * TODO: The current version assumes that all cores are exactly the same which
+ * might not be true. We need to update it to take into account various
+ * configuration among which system with different kind of core.
+ */
+static int update_cpu_topology_policy(void)
+{
+	unsigned long cpuid;
+
+	if (sched_mc_power_savings == POWERSAVINGS_BALANCE_NONE) {
+		set_cpu_topology_mask = normal_cpu_topology_mask;
+		return 0;
+	}
+
+	cpuid = read_cpuid_id();
+	cpuid &= ARM_FAMILY_MASK;
+
+	switch (cpuid) {
+	case ARM_CORTEX_A9_FAMILY:
+		set_cpu_topology_mask = power_cpu_topology_mask_CA9;
+		break;
+	default:
+		set_cpu_topology_mask = normal_cpu_topology_mask;
+		break;
+	}
+
+	return 0;
+}
+
+/*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
  * which prevents simultaneous write access to cpu_topology array
@@ -57,7 +383,6 @@ void store_cpu_topology(unsigned int cpuid)
 {
 	struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
 	unsigned int mpidr;
-	unsigned int cpu;
 
 	/* If the cpu topology has been already set, just return */
 	if (cpuid_topo->core_id != -1)
@@ -99,26 +424,11 @@ void store_cpu_topology(unsigned int cpuid)
 		cpuid_topo->socket_id = -1;
 	}
 
-	/* update core and thread sibling masks */
-	for_each_possible_cpu(cpu) {
-		struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
-
-		if (cpuid_topo->socket_id == cpu_topo->socket_id) {
-			cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
-			if (cpu != cpuid)
-				cpumask_set_cpu(cpu,
-					&cpuid_topo->core_sibling);
-
-			if (cpuid_topo->core_id == cpu_topo->core_id) {
-				cpumask_set_cpu(cpuid,
-					&cpu_topo->thread_sibling);
-				if (cpu != cpuid)
-					cpumask_set_cpu(cpu,
-						&cpuid_topo->thread_sibling);
-			}
-		}
-	}
-	smp_wmb();
+	/*
+	 * The core and thread sibling masks can also be updated during the
+	 * call of arch_update_cpu_topology
+	 */
+	default_cpu_topology_mask(cpuid);
 
 	printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
 		cpuid, cpu_topology[cpuid].thread_id,
@@ -127,6 +437,27 @@ void store_cpu_topology(unsigned int cpuid)
 }
 
 /*
+ * arch_update_cpu_topology is called by the scheduler before building
+ * a new sched_domain hierarchy.
+ */
+int arch_update_cpu_topology(void)
+{
+	if (!advanced_topology)
+		return 0;
+
+	/* clear core threads mask */
+	clear_cpu_topology_mask();
+
+	/* set topology policy */
+	update_cpu_topology_policy();
+
+	/* set topology mask and power */
+	(*set_cpu_topology_mask)();
+
+	return 1;
+}
+
+/*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
 */
@@ -143,6 +474,116 @@ void init_cpu_topology(void)
 		cpu_topo->socket_id = -1;
 		cpumask_clear(&cpu_topo->core_sibling);
 		cpumask_clear(&cpu_topo->thread_sibling);
+
+		per_cpu(cpu_scale, cpu) = SCHED_POWER_SCALE;
 	}
 	smp_wmb();
 }
+
+/*
+ * debugfs interface for scaling cpu power
+ */
+
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *topo_debugfs_root;
+
+static ssize_t dbg_write(struct file *file, const char __user *buf,
+						size_t size, loff_t *off)
+{
+	unsigned int *value = file->f_dentry->d_inode->i_private;
+	char cdata[128];
+	unsigned long tmp;
+	unsigned int cpu;
+
+	if (size < (sizeof(cdata)-1)) {
+		if (copy_from_user(cdata, buf, size))
+			return -EFAULT;
+		cdata[size] = 0;
+		if (!strict_strtoul(cdata, 10, &tmp)) {
+			*value = tmp;
+
+#ifdef CONFIG_CPU_FREQ
+			for_each_online_cpu(cpu)
+				set_power_scale(cpu, cpu_power[cpu].id);
+#endif
+		}
+		return size;
+	}
+	return -EINVAL;
+}
+
+static ssize_t dbg_read(struct file *file, char __user *buf,
+						size_t size, loff_t *off)
+{
+	unsigned int *value = file->f_dentry->d_inode->i_private;
+	char cdata[128];
+	unsigned int len;
+
+	len = sprintf(cdata, "%u\n", *value);
+	return simple_read_from_buffer(buf, size, off, cdata, len);
+}
+
+static const struct file_operations debugfs_fops = {
+	.read = dbg_read,
+	.write = dbg_write,
+};
+
+static struct dentry *topo_debugfs_register(unsigned int cpu,
+						struct dentry *parent)
+{
+	struct dentry *cpu_d, *d;
+	char cpu_name[16];
+
+	sprintf(cpu_name, "cpu%u", cpu);
+
+	cpu_d = debugfs_create_dir(cpu_name, parent);
+	if (!cpu_d)
+		return NULL;
+
+	d = debugfs_create_file("cpu_power", S_IRUGO | S_IWUGO,
+		cpu_d, &per_cpu(cpu_scale, cpu), &debugfs_fops);
+	if (!d)
+		goto err_out;
+
+#ifdef CONFIG_CPU_FREQ
+	d = debugfs_create_file("scale", S_IRUGO | S_IWUGO,
+		cpu_d, &cpu_power[cpu].id, &debugfs_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("freq", S_IRUGO,
+		cpu_d, &cpu_power[cpu].freq, &debugfs_fops);
+	if (!d)
+		goto err_out;
+#endif
+	return cpu_d;
+
+err_out:
+	debugfs_remove_recursive(cpu_d);
+	return NULL;
+}
+
+static int __init topo_debugfs_init(void)
+{
+	struct dentry *d;
+	unsigned int cpu;
+
+	d = debugfs_create_dir("cpu_topo", NULL);
+	if (!d)
+		return -ENOMEM;
+	topo_debugfs_root = d;
+
+	for_each_possible_cpu(cpu) {
+		d = topo_debugfs_register(cpu, topo_debugfs_root);
+		if (d == NULL)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	debugfs_remove_recursive(topo_debugfs_root);
+	return -ENOMEM;
+}
+
+late_initcall(topo_debugfs_init);
+#endif
diff --git a/drivers/cpufreq/db8500-cpufreq.c b/drivers/cpufreq/db8500-cpufreq.c
index d90456a809f..e0acaceca57 100644
--- a/drivers/cpufreq/db8500-cpufreq.c
+++ b/drivers/cpufreq/db8500-cpufreq.c
@@ -72,13 +72,13 @@ static int db8500_cpufreq_target(struct cpufreq_policy *policy,
 
 	freqs.old = policy->cur;
 	freqs.new = freq_table[idx].frequency;
-	freqs.cpu = policy->cpu;
 
 	if (freqs.old == freqs.new)
 		return 0;
 
 	/* pre-change notification */
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	for_each_cpu(freqs.cpu, policy->cpus)
+		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
 	/* request the PRCM unit for opp change */
 	if (prcmu_set_arm_opp(idx2opp[idx])) {
@@ -87,7 +87,8 @@ static int db8500_cpufreq_target(struct cpufreq_policy *policy,
 	}
 
 	/* post change notification */
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	for_each_cpu(freqs.cpu, policy->cpus)
+		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
 	return 0;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41d0237fd44..8610921984b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -901,6 +901,7 @@ struct sched_group_power {
 	 * single CPU.
 	 */
 	unsigned int power, power_orig;
+	unsigned long next_update;
 };
 
 struct sched_group {
diff --git a/kernel/sched.c b/kernel/sched.c
index b50b0f0c9aa..036311afeed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -510,7 +510,7 @@ struct rq {
 
 	unsigned long cpu_power;
 
-	unsigned char idle_at_tick;
+	unsigned char idle_balance;
 	/* For active balancing */
 	int post_schedule;
 	int active_balance;
@@ -1272,6 +1272,18 @@ void wake_up_idle_cpu(int cpu)
 		smp_send_reschedule(cpu);
 }
 
+static inline bool got_nohz_idle_kick(void)
+{
+	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -2591,7 +2603,7 @@ void scheduler_ipi(void)
 	struct rq *rq = this_rq();
 	struct task_struct *list = xchg(&rq->wake_list, NULL);
 
-	if (!list)
+	if ((!list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -2609,6 +2621,14 @@ void scheduler_ipi(void)
 	 */
 	irq_enter();
 	sched_ttwu_do_pending(list);
+
+	/*
+	 * Check if someone kicked us for doing the nohz idle load balance.
+	 */
+	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+		this_rq()->idle_balance = 1;
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+	}
 	irq_exit();
 }
 
@@ -4116,7 +4136,7 @@ void scheduler_tick(void)
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-	rq->idle_at_tick = idle_cpu(cpu);
+	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
 }
@@ -8133,7 +8153,6 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_balance_kick = 0;
-		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
 #endif
 #endif
 		init_rq_hrtick(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee999381..2107c3c3049 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -91,6 +91,8 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
 static const struct sched_class fair_sched_class;
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -2667,6 +2669,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgp->next_update = jiffies + interval;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
@@ -2774,12 +2781,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group) {
-		if (balance_cpu != this_cpu) {
-			*balance = 0;
-			return;
-		}
-		update_group_power(sd, this_cpu);
+	if (local_group) {
+		if (idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != this_cpu) {
+				*balance = 0;
+				return;
+			}
+			update_group_power(sd, this_cpu);
+		} else if (time_after_eq(jiffies, group->sgp->next_update))
+			update_group_power(sd, this_cpu);
 	}
 
 	/* Adjust by relative CPU power of the group */
@@ -3612,22 +3622,6 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
-
-static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
-
-static void trigger_sched_softirq(void *data)
-{
-	raise_softirq_irqoff(SCHED_SOFTIRQ);
-}
-
-static inline void init_sched_softirq_csd(struct call_single_data *csd)
-{
-	csd->func = trigger_sched_softirq;
-	csd->info = NULL;
-	csd->flags = 0;
-	csd->priv = 0;
-}
-
 /*
  * idle load balancing details
  * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3793,11 +3787,16 @@ static void nohz_balancer_kick(int cpu)
 	}
 
 	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-		struct call_single_data *cp;
-
 		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-		cp = &per_cpu(remote_sched_softirq_cb, cpu);
-		__smp_call_function_single(ilb_cpu, cp, 0);
+
+		smp_mb();
+		/*
+		 * Use smp_send_reschedule() instead of resched_cpu().
+		 * This way we generate a sched IPI on the target cpu which
+		 * is idle. And the softirq performing nohz idle load balance
+		 * will be run before returning from the IPI.
+		 */
+		smp_send_reschedule(ilb_cpu);
 	}
 	return;
 }
@@ -3879,8 +3878,6 @@ void select_nohz_load_balancer(int stop_tick)
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
@@ -4030,7 +4027,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 	if (time_before(now, nohz.next_balance))
 		return 0;
 
-	if (rq->idle_at_tick)
+	if (idle_cpu(cpu))
 		return 0;
 
 	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4063,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
 	struct rq *this_rq = cpu_rq(this_cpu);
-	enum cpu_idle_type idle = this_rq->idle_at_tick ?
+	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
 	rebalance_domains(this_cpu, idle);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb04..85f8bd92a64 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -47,7 +47,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 /*
  * Use arch dependent cpu power functions
  */
-SCHED_FEAT(ARCH_POWER, 0)
+SCHED_FEAT(ARCH_POWER, 1)
 
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
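
One scheduler detail worth calling out: update_group_power() now stamps sgp->next_update = jiffies + clamp(balance_interval, 1, max_load_balance_interval), and update_sg_lb_stats() refreshes group power on the CPU_NEWLY_IDLE path only once that stamp expires, where it previously never refreshed it at all. A rough standalone model of the throttling arithmetic, with a plain counter standing in for jiffies and HZ = 100 assumed (illustration only, not the kernel code):

#include <stdio.h>

#define HZ 100

static unsigned long jiffies;       /* stand-in for the kernel tick counter */
static unsigned long next_update;
static const unsigned long max_load_balance_interval = HZ / 10;

static void maybe_update_group_power(unsigned long balance_interval_ms)
{
    /* msecs_to_jiffies() followed by clamp(), as in update_group_power() */
    unsigned long interval = balance_interval_ms * HZ / 1000;

    if (interval < 1)
        interval = 1;
    if (interval > max_load_balance_interval)
        interval = max_load_balance_interval;

    if (jiffies >= next_update) {   /* time_after_eq(), ignoring wraparound */
        printf("jiffy %2lu: update_group_power()\n", jiffies);
        next_update = jiffies + interval;
    }
}

int main(void)
{
    /* a newly-idle balance attempt on every tick only refreshes the
     * group power every 10 jiffies (100 ms interval, clamped) */
    for (jiffies = 0; jiffies < 30; jiffies++)
        maybe_update_group_power(100);
    return 0;
}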