From b820cdb85eb3ba38404f09c4a89b0607aabf5e71 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 14 Nov 2011 10:16:23 +0100
Subject: sched: Use resched IPI to kick off the nohz idle balance

The current use of the smp call function to kick off the nohz idle
balance can deadlock in the following scenario:

1. cpu-A did a generic_exec_single() to cpu-B and, after queuing its call
   single data (csd) to the call single queue, cpu-A took a timer
   interrupt. The actual IPI to cpu-B to process the call single queue
   has not yet been sent.

2. As part of the timer interrupt handler, cpu-A decided to kick cpu-B
   for idle load balancing (it sets cpu-B's rq->nohz_balance_kick to 1)
   and __smp_call_function_single() with nowait will queue the csd to
   cpu-B's queue. But generic_exec_single() won't send an IPI to cpu-B,
   as the call single queue was not empty.

3. cpu-A is busy with lots of interrupts.

4. Meanwhile cpu-B is entering and exiting idle and notices that its
   rq->nohz_balance_kick is set to '1'. So it goes ahead, runs the idle
   load balancer and clears its rq->nohz_balance_kick.

5. At this point, the csd queued in step-2 above is still locked and
   waiting to be serviced on cpu-B.

6. cpu-A is still busy with interrupt load, takes another timer interrupt
   and, as part of it, decides to kick cpu-B for another idle load
   balancing (as it finds cpu-B's rq->nohz_balance_kick cleared in step-4
   above) and does __smp_call_function_single() with the same csd that is
   still locked.

7. And we get a deadlock waiting for the csd_lock() in
   __smp_call_function_single().

The main issue here is that cpu-B can service the idle load balancer kick
request from cpu-A even without receiving the IPI; this leads to multiple
__smp_call_function_single() calls on the same csd, which deadlocks.

To kick a cpu, the scheduler already has the reschedule vector reserved.
Use that mechanism (kick_process()) instead of the generic smp call
function mechanism to kick off the nohz idle load balancing and avoid the
deadlock.

[ This issue is present in 2.6.35+ kernels, but it is marked for -stable
  only from v3.0+ as the proposed fix depends on the recently introduced
  scheduler_ipi(). ]

Reported-by: Prarit Bhargava
Signed-off-by: Suresh Siddha
Cc: stable@kernel.org # v3.0+
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20111003220934.834943260@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar
---
 kernel/sched.c      | 21 +++++++++++++++++++--
 kernel/sched_fair.c | 29 +++++++++--------------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index b50b0f0c9aa..b57cf9e09b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1272,6 +1272,18 @@ void wake_up_idle_cpu(int cpu)
         smp_send_reschedule(cpu);
 }
 
+static inline bool got_nohz_idle_kick(void)
+{
+        return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+        return false;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -2591,7 +2603,7 @@ void scheduler_ipi(void)
         struct rq *rq = this_rq();
         struct task_struct *list = xchg(&rq->wake_list, NULL);
 
-        if (!list)
+        if ((!list) && !got_nohz_idle_kick())
                 return;
 
         /*
@@ -2609,6 +2621,12 @@ void scheduler_ipi(void)
         */
         irq_enter();
         sched_ttwu_do_pending(list);
+
+        /*
+         * Check if someone kicked us for doing the nohz idle load balance.
+         */
+        if (unlikely(got_nohz_idle_kick() && !need_resched()))
+                raise_softirq_irqoff(SCHED_SOFTIRQ);
         irq_exit();
 }
 
@@ -8133,7 +8151,6 @@ void __init sched_init(void)
                 rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
                 rq->nohz_balance_kick = 0;
-                init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
 #endif
 #endif
                 init_rq_hrtick(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee999381..22cbad091ba 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3612,22 +3612,6 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
-
-static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
-
-static void trigger_sched_softirq(void *data)
-{
-        raise_softirq_irqoff(SCHED_SOFTIRQ);
-}
-
-static inline void init_sched_softirq_csd(struct call_single_data *csd)
-{
-        csd->func = trigger_sched_softirq;
-        csd->info = NULL;
-        csd->flags = 0;
-        csd->priv = 0;
-}
-
 /*
  * idle load balancing details
  * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3793,11 +3777,16 @@ static void nohz_balancer_kick(int cpu)
         }
 
         if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-                struct call_single_data *cp;
-
                 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-                cp = &per_cpu(remote_sched_softirq_cb, cpu);
-                __smp_call_function_single(ilb_cpu, cp, 0);
+
+                smp_mb();
+                /*
+                 * Use smp_send_reschedule() instead of resched_cpu().
+                 * This way we generate a sched IPI on the target cpu which
+                 * is idle. And the softirq performing nohz idle load balance
+                 * will be run before returning from the IPI.
+                 */
+                smp_send_reschedule(ilb_cpu);
         }
         return;
 }
--
cgit v1.2.3
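The protocol the patch switches to is easier to see next to a model. Below is a hedged user-space sketch (plain C11 threads and atomics, not kernel code) of the flag-plus-kick pattern: the sender only sets a per-cpu flag and raises a kick (a semaphore post stands in for the reschedule IPI), so there is no shared csd slot that can still be locked when a second kick is sent. All names are illustrative.

    /* gcc -pthread sketch of the kick protocol; illustrative only */
    #include <stdatomic.h>
    #include <semaphore.h>
    #include <pthread.h>
    #include <stdio.h>

    static atomic_int nohz_balance_kick;
    static sem_t kick;                 /* stands in for the resched IPI */

    static void *idle_thread(void *arg)
    {
        sem_wait(&kick);               /* "IPI" arrives */
        if (atomic_exchange(&nohz_balance_kick, 0))
            puts("running nohz idle balance");
        return NULL;
    }

    static void kick_cpu(void)
    {
        atomic_store(&nohz_balance_kick, 1);  /* set the flag first ... */
        sem_post(&kick);                      /* ... then kick; safe to repeat */
    }

    int main(void)
    {
        pthread_t t;
        sem_init(&kick, 0, 0);
        pthread_create(&t, NULL, idle_thread, NULL);
        kick_cpu();
        pthread_join(t, NULL);
        return 0;
    }

Unlike a call_single_data entry, repeating kick_cpu() here is always harmless, which is exactly the property the reschedule vector gives the scheduler.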
From e10d3fe8389c3fd1037d4ac4c34b01186477c1af Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Mon, 3 Oct 2011 15:09:01 -0700
Subject: sched: Request for idle balance during nohz idle load balance

rq's idle_at_tick is set to idle/busy during the timer tick, depending on
whether the cpu was idle or not. This will be used later in the load
balance that is done in softirq context (which is a process context in
-RT kernels).

For nohz kernels, on the cpu doing the nohz idle load balance on behalf
of all the idle cpus, rq->idle_at_tick might have a stale value (recorded
at its last timer tick, presumably while it was busy). As the nohz idle
load balancing is done at the same place as the regular load balancing,
the nohz idle load balancing was bailing out whenever it saw the rq's
idle_at_tick not set, leading to poor system utilization.

Rename rq's idle_at_tick to idle_balance and set it when someone requests
a nohz idle balance on an idle cpu.

Reported-by: Srivatsa Vaddagiri
Signed-off-by: Suresh Siddha
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20111003220934.892350549@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar
---
 kernel/sched.c      | 8 +++++---
 kernel/sched_fair.c | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index b57cf9e09b5..036311afeed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -510,7 +510,7 @@ struct rq {
 
         unsigned long cpu_power;
 
-        unsigned char idle_at_tick;
+        unsigned char idle_balance;
         /* For active balancing */
         int post_schedule;
         int active_balance;
@@ -2625,8 +2625,10 @@ void scheduler_ipi(void)
         /*
         * Check if someone kicked us for doing the nohz idle load balance.
         */
-        if (unlikely(got_nohz_idle_kick() && !need_resched()))
+        if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+                this_rq()->idle_balance = 1;
                 raise_softirq_irqoff(SCHED_SOFTIRQ);
+        }
         irq_exit();
 }
 
@@ -4134,7 +4136,7 @@ void scheduler_tick(void)
         perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-        rq->idle_at_tick = idle_cpu(cpu);
+        rq->idle_balance = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
 #endif
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22cbad091ba..e476a5abca0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4019,7 +4019,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
         if (time_before(now, nohz.next_balance))
                 return 0;
 
-        if (rq->idle_at_tick)
+        if (idle_cpu(cpu))
                 return 0;
 
         first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4055,7 +4055,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 {
         int this_cpu = smp_processor_id();
         struct rq *this_rq = cpu_rq(this_cpu);
-        enum cpu_idle_type idle = this_rq->idle_at_tick ?
+        enum cpu_idle_type idle = this_rq->idle_balance ?
                                                 CPU_IDLE : CPU_NOT_IDLE;
 
         rebalance_domains(this_cpu, idle);
--
cgit v1.2.3
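The behavioural change in run_rebalance_domains() boils down to which per-rq flag selects the balance mode. A minimal stand-alone model of the stale-snapshot problem (field names mirror the patch; this is not the kernel code):

    #include <stdio.h>

    enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE };

    struct rq_model {
        int idle_at_tick;  /* old: snapshot taken at the last (busy) tick */
        int idle_balance;  /* new: set by scheduler_ipi() when the kick arrives */
    };

    static enum cpu_idle_type balance_mode_old(const struct rq_model *rq)
    {
        return rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE;
    }

    static enum cpu_idle_type balance_mode_new(const struct rq_model *rq)
    {
        return rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE;
    }

    int main(void)
    {
        /* cpu was busy at its last tick, then went idle and got kicked */
        struct rq_model rq = { .idle_at_tick = 0, .idle_balance = 1 };

        printf("old: %s\n", balance_mode_old(&rq) == CPU_IDLE ?
               "CPU_IDLE" : "CPU_NOT_IDLE");   /* bails out of nohz balance */
        printf("new: %s\n", balance_mode_new(&rq) == CPU_IDLE ?
               "CPU_IDLE" : "CPU_NOT_IDLE");   /* runs it in idle mode */
        return 0;
    }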
From 4fc861754b3a20632fd883cf0d8a12cbfa9138ee Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Wed, 17 Aug 2011 10:09:12 +0200
Subject: ARM: ux500: send cpufreq notification on all cores

The same clock is used for all cpus, so we must notify the frequency
change to each one in order to update the configuration of every twd
clockevent.

Changes since V1:
* use policy->cpus instead of online_cpus

Signed-off-by: Vincent Guittot
---
 drivers/cpufreq/db8500-cpufreq.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/db8500-cpufreq.c b/drivers/cpufreq/db8500-cpufreq.c
index d90456a809f..e0acaceca57 100644
--- a/drivers/cpufreq/db8500-cpufreq.c
+++ b/drivers/cpufreq/db8500-cpufreq.c
@@ -72,13 +72,13 @@ static int db8500_cpufreq_target(struct cpufreq_policy *policy,
 
         freqs.old = policy->cur;
         freqs.new = freq_table[idx].frequency;
-        freqs.cpu = policy->cpu;
 
         if (freqs.old == freqs.new)
                 return 0;
 
         /* pre-change notification */
-        cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+        for_each_cpu(freqs.cpu, policy->cpus)
+                cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
         /* request the PRCM unit for opp change */
         if (prcmu_set_arm_opp(idx2opp[idx])) {
@@ -87,7 +87,8 @@ static int db8500_cpufreq_target(struct cpufreq_policy *policy,
         }
 
         /* post change notification */
-        cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+        for_each_cpu(freqs.cpu, policy->cpus)
+                cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
         return 0;
 }
--
cgit v1.2.3
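Because each core's twd clockevent follows the one shared PLL, the notification has to be replayed once per cpu in policy->cpus. A stand-alone sketch of that pattern, with a plain bitmask standing in for the kernel's cpumask and a printf standing in for cpufreq_notify_transition() (illustrative only):

    #include <stdio.h>

    struct freqs { unsigned int cpu, old_khz, new_khz; };

    static void notify_one(const struct freqs *f)
    {
        printf("cpu%u: %u kHz -> %u kHz\n", f->cpu, f->old_khz, f->new_khz);
    }

    static void notify_all(unsigned long policy_cpus, struct freqs *f)
    {
        for (unsigned int cpu = 0; cpu < 8 * sizeof(policy_cpus); cpu++) {
            if (!(policy_cpus & (1UL << cpu)))
                continue;
            f->cpu = cpu;   /* as in the patch: freqs.cpu is the loop cursor */
            notify_one(f);
        }
    }

    int main(void)
    {
        struct freqs f = { 0, 800000, 400000 };
        notify_all(0x3, &f);  /* both db8500 cores share the one clock */
        return 0;
    }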
From d78f58161a7f5bbf304955bc01b455110578b9bc Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Fri, 21 Oct 2011 09:02:47 +0200
Subject: sched: Ensure cpu_power periodic update

With a lot of small tasks, the softirq sched is nearly never called when
no_hz is enabled. The load_balance is then mainly called with the
newly_idle mode, which doesn't update the cpu_power.

Add a next_update field which ensures a maximum update period when there
is short activity.

Signed-off-by: Vincent Guittot
---
 include/linux/sched.h |  1 +
 kernel/sched_fair.c   | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41d0237fd44..8610921984b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -901,6 +901,7 @@ struct sched_group_power {
         * single CPU.
         */
         unsigned int power, power_orig;
+        unsigned long next_update;
 };
 
 struct sched_group {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee999381..320b7a05ef9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -91,6 +91,8 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
 static const struct sched_class fair_sched_class;
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -2667,6 +2669,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
         struct sched_domain *child = sd->child;
         struct sched_group *group, *sdg = sd->groups;
         unsigned long power;
+        unsigned long interval;
+
+        interval = msecs_to_jiffies(sd->balance_interval);
+        interval = clamp(interval, 1UL, max_load_balance_interval);
+        sdg->sgp->next_update = jiffies + interval;
 
         if (!child) {
                 update_cpu_power(sd, cpu);
@@ -2774,12 +2781,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
-        if (idle != CPU_NEWLY_IDLE && local_group) {
-                if (balance_cpu != this_cpu) {
-                        *balance = 0;
-                        return;
-                }
-                update_group_power(sd, this_cpu);
+        if (local_group) {
+                if (idle != CPU_NEWLY_IDLE) {
+                        if (balance_cpu != this_cpu) {
+                                *balance = 0;
+                                return;
+                        }
+                        update_group_power(sd, this_cpu);
+                } else if (time_after_eq(jiffies, group->sgp->next_update))
+                        update_group_power(sd, this_cpu);
         }
 
         /* Adjust by relative CPU power of the group */
@@ -3879,8 +3889,6 @@ void select_nohz_load_balancer(int stop_tick)
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
--
cgit v1.2.3
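The mechanism is a per-group deadline: every call to update_group_power() arms next_update = jiffies + clamp(balance_interval, 1, HZ/10), and the newly-idle path only recomputes the power once that deadline passes. A compact stand-alone model of the same rate limiting (plain integers stand in for jiffies; names are illustrative):

    #include <stdio.h>

    #define MAX_INTERVAL 100   /* stands in for max_load_balance_interval */

    struct group { unsigned long next_update; unsigned long power; };

    static unsigned long clampul(unsigned long v, unsigned long lo,
                                 unsigned long hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    static void maybe_update_power(struct group *g, unsigned long now,
                                   unsigned long balance_interval)
    {
        if (now < g->next_update)   /* time_after_eq() in the kernel */
            return;
        g->power = 1024;            /* recompute cpu_power here */
        g->next_update = now + clampul(balance_interval, 1, MAX_INTERVAL);
    }

    int main(void)
    {
        struct group g = { 0, 0 };

        /* a 250-tick interval is clamped to 100, so updates stay periodic */
        for (unsigned long now = 0; now < 500; now += 60)
            maybe_update_power(&g, now, 250);
        printf("power=%lu next_update=%lu\n", g.power, g.next_update);
        return 0;
    }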
From 3e264fa39f87c799108accff1064db21402066bb Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 12 Sep 2011 17:21:54 +0200
Subject: ARM: cpu topology: Enable ARCH_POWER

Signed-off-by: Vincent Guittot
---
 kernel/sched_features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb04..85f8bd92a64 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -47,7 +47,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 /*
  * Use arch dependent cpu power functions
  */
-SCHED_FEAT(ARCH_POWER, 0)
+SCHED_FEAT(ARCH_POWER, 1)
 
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
--
cgit v1.2.3
From be6697e1d8f118fc2afc4baaf7e9798fa77da656 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Fri, 14 Oct 2011 14:04:05 +0200
Subject: ARM: cpu topology: fix warning

kernel/sched.c:7354:2: warning: initialization from incompatible pointer type

Align the cpu_coregroup_mask prototype with the sched_domain_mask_f
typedef: use 'int cpu' instead of 'unsigned int cpu'.

Signed-off-by: Vincent Guittot
---
 arch/arm/include/asm/topology.h | 2 +-
 arch/arm/kernel/topology.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index a7e457ed27c..58b8b84adcd 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -25,7 +25,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
+const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #else
 
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 1040c00405d..8200deaa14f 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -43,7 +43,7 @@
 
 struct cputopo_arm cpu_topology[NR_CPUS];
 
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+const struct cpumask *cpu_coregroup_mask(int cpu)
 {
         return &cpu_topology[cpu].core_sibling;
 }
--
cgit v1.2.3
From bb0e9400c60ef4ed4859324d75c49fcc85f3accd Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Thu, 10 Nov 2011 11:08:35 +0100
Subject: ARM: cpu topology: Add update_cpu_topology function

The arch_update_cpu_topology function is called by the scheduler before
building its sched_domain hierarchy. Prepare the update of the cpu
topology masks in this function, in addition to setting them in
store_cpu_topology, which is executed only once per cpu.

Signed-off-by: Vincent Guittot
---
 arch/arm/kernel/topology.c | 108 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 21 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 8200deaa14f..90352cbb6d2 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -43,11 +43,75 @@
 
 struct cputopo_arm cpu_topology[NR_CPUS];
 
+/*
+ * cpu topology mask management
+ */
+
+unsigned int advanced_topology = 0;
+
+/*
+ * default topology function
+ */
+
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
         return &cpu_topology[cpu].core_sibling;
 }
 
+/*
+ * clear cpu topology masks
+ */
+static void clear_cpu_topology_mask(void)
+{
+        unsigned int cpuid;
+        for_each_possible_cpu(cpuid) {
+                struct cputopo_arm *cpuid_topo = &(cpu_topology[cpuid]);
+                cpumask_clear(&cpuid_topo->core_sibling);
+                cpumask_clear(&cpuid_topo->thread_sibling);
+        }
+        smp_wmb();
+}
+
+/*
+ * default_cpu_topology_mask set the core and thread mask as described in the
+ * ARM ARM
+ */
+static inline void default_cpu_topology_mask(unsigned int cpuid)
+{
+        struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
+        unsigned int cpu;
+
+        for_each_possible_cpu(cpu) {
+                struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
+
+                if (cpuid_topo->socket_id == cpu_topo->socket_id) {
+                        cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+                        if (cpu != cpuid)
+                                cpumask_set_cpu(cpu,
+                                        &cpuid_topo->core_sibling);
+
+                        if (cpuid_topo->core_id == cpu_topo->core_id) {
+                                cpumask_set_cpu(cpuid,
+                                        &cpu_topo->thread_sibling);
+                                if (cpu != cpuid)
+                                        cpumask_set_cpu(cpu,
+                                                &cpuid_topo->thread_sibling);
+                        }
+                }
+        }
+        smp_wmb();
+}
+
+static void normal_cpu_topology_mask(void)
+{
+        unsigned int cpuid;
+
+        for_each_possible_cpu(cpuid) {
+                default_cpu_topology_mask(cpuid);
+        }
+        smp_wmb();
+}
+
 /*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -57,7 +121,6 @@ void store_cpu_topology(unsigned int cpuid)
 {
         struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
         unsigned int mpidr;
-        unsigned int cpu;
 
         /* If the cpu topology has been already set, just return */
         if (cpuid_topo->core_id != -1)
@@ -99,26 +162,11 @@ void store_cpu_topology(unsigned int cpuid)
                 cpuid_topo->socket_id = -1;
         }
 
-        /* update core and thread sibling masks */
-        for_each_possible_cpu(cpu) {
-                struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
-
-                if (cpuid_topo->socket_id == cpu_topo->socket_id) {
-                        cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
-                        if (cpu != cpuid)
-                                cpumask_set_cpu(cpu,
-                                        &cpuid_topo->core_sibling);
-
-                        if (cpuid_topo->core_id == cpu_topo->core_id) {
-                                cpumask_set_cpu(cpuid,
-                                        &cpu_topo->thread_sibling);
-                                if (cpu != cpuid)
-                                        cpumask_set_cpu(cpu,
-                                                &cpuid_topo->thread_sibling);
-                        }
-                }
-        }
-        smp_wmb();
+        /*
+         * The core and thread sibling masks can also be updated during the
+         * call of arch_update_cpu_topology
+         */
+        default_cpu_topology_mask(cpuid);
 
         printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
                 cpuid, cpu_topology[cpuid].thread_id,
@@ -126,6 +174,24 @@ void store_cpu_topology(unsigned int cpuid)
                 cpu_topology[cpuid].socket_id, mpidr);
 }
 
+/*
+ * arch_update_cpu_topology is called by the scheduler before building
+ * a new sched_domain hierarchy.
+ */
+int arch_update_cpu_topology(void)
+{
+        if (!advanced_topology)
+                return 0;
+
+        /* clear core mask */
+        clear_cpu_topology_mask();
+
+        /* update core and thread sibling masks */
+        normal_cpu_topology_mask();
+
+        return 1;
+}
+
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
--
cgit v1.2.3
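The contract of this hook, as the scheduler core uses it, is small: before (re)building sched domains the core asks the architecture whether the topology changed; the arch rewrites its masks and returns non-zero to request a rebuild. A miniature model of that handshake (assumed control flow, not the kernel implementation):

    #include <stdio.h>

    static int advanced_topology;      /* off: keep the boot-time masks */

    static void clear_masks(void)  { puts("clear sibling masks"); }
    static void update_masks(void) { puts("recompute sibling masks"); }

    /* stands in for the arch override of arch_update_cpu_topology() */
    static int arch_update_cpu_topology_model(void)
    {
        if (!advanced_topology)
            return 0;
        clear_masks();
        update_masks();
        return 1;
    }

    /* stands in for the scheduler's rebuild path */
    static void rebuild_sched_domains_model(void)
    {
        if (arch_update_cpu_topology_model())
            puts("rebuilding sched domains with new masks");
        else
            puts("keeping current sched domains");
    }

    int main(void)
    {
        rebuild_sched_domains_model();   /* topology unchanged */
        advanced_topology = 1;
        rebuild_sched_domains_model();   /* triggers a rebuild */
        return 0;
    }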
From 91a253b3eb52425b0a445b4dae7bd737b551cd48 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 14 Nov 2011 09:25:32 +0100
Subject: ARM: cpu topology: modify cpu topology

Modify the CPU topology policy according to the sched_mc level and the
Cortex family.

Signed-off-by: Vincent Guittot
---
 arch/arm/kernel/topology.c | 80 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 90352cbb6d2..af1c3e68da6 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -47,7 +48,7 @@ struct cputopo_arm cpu_topology[NR_CPUS];
  * cpu topology mask management
  */
 
-unsigned int advanced_topology = 0;
+unsigned int advanced_topology = 1;
 
 /*
  * default topology function
@@ -112,6 +113,74 @@ static void normal_cpu_topology_mask(void)
         smp_wmb();
 }
 
+/*
+ * For Cortex-A9 MPcore, we emulate a multi-package topology in power mode.
+ * The goal is to gathers tasks on 1 virtual package
+ */
+static void power_cpu_topology_mask_CA9(void)
+{
+
+        unsigned int cpuid, cpu;
+
+        for_each_possible_cpu(cpuid) {
+                struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
+
+                for_each_possible_cpu(cpu) {
+                        struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
+
+                        if ((cpuid_topo->socket_id == cpu_topo->socket_id)
+                        && ((cpuid & 0x1) == (cpu & 0x1))) {
+                                cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+                                if (cpu != cpuid)
+                                        cpumask_set_cpu(cpu,
+                                                &cpuid_topo->core_sibling);
+
+                                if (cpuid_topo->core_id == cpu_topo->core_id) {
+                                        cpumask_set_cpu(cpuid,
+                                                &cpu_topo->thread_sibling);
+                                        if (cpu != cpuid)
+                                                cpumask_set_cpu(cpu,
+                                                        &cpuid_topo->thread_sibling);
+                                }
+                        }
+                }
+        }
+        smp_wmb();
+}
+
+#define ARM_FAMILY_MASK 0xFF0FFFF0
+#define ARM_CORTEX_A9_FAMILY 0x410FC090
+
+/* update_cpu_topology_policy select a cpu topology policy according to the
+ * available cores.
+ * TODO: The current version assumes that all cores are exactly the same which
+ * might not be true. We need to update it to take into account various
+ * configuration among which system with different kind of core.
+ */
+static int update_cpu_topology_policy(void)
+{
+        unsigned long cpuid;
+
+        if (sched_mc_power_savings == POWERSAVINGS_BALANCE_NONE) {
+                set_cpu_topology_mask = normal_cpu_topology_mask;
+                return 0;
+        }
+
+        cpuid = read_cpuid_id();
+        cpuid &= ARM_FAMILY_MASK;
+
+        switch (cpuid) {
+        case ARM_CORTEX_A9_FAMILY:
+                set_cpu_topology_mask = power_cpu_topology_mask_CA9;
+                break;
+        default:
+                set_cpu_topology_mask = normal_cpu_topology_mask;
+                break;
+        }
+
+        return 0;
+}
+
 /*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -183,11 +252,14 @@ int arch_update_cpu_topology(void)
         if (!advanced_topology)
                 return 0;
 
-        /* clear core mask */
+        /* clear core threads mask */
         clear_cpu_topology_mask();
 
-        /* update core and thread sibling masks */
-        normal_cpu_topology_mask();
+        /* set topology policy */
+        update_cpu_topology_policy();
+
+        /* set topology mask*/
+        (*set_cpu_topology_mask)();
 
         return 1;
 }
--
cgit v1.2.3
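The family test relies on the layout of the ARM main ID register: MIDR 0x410FC090 decomposes into implementer 0x41 (ARM) and part number 0xC09 (Cortex-A9), plus variant and revision fields that ARM_FAMILY_MASK (0xFF0FFFF0) discards, so any A9 stepping matches. A stand-alone check with a hardcoded MIDR in place of the CP15 read via read_cpuid_id() (the sample value is an assumption):

    #include <stdio.h>

    #define ARM_FAMILY_MASK      0xFF0FFFF0u
    #define ARM_CORTEX_A9_FAMILY 0x410FC090u

    int main(void)
    {
        unsigned int midr = 0x413FC092;  /* e.g. a Cortex-A9 r3p2 */

        /* masking drops variant (bits 23:20) and revision (bits 3:0) */
        if ((midr & ARM_FAMILY_MASK) == ARM_CORTEX_A9_FAMILY)
            puts("Cortex-A9 family: use the power (virtual package) masks");
        else
            puts("other core: keep the default masks");
        return 0;
    }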
From 96881acee427b188f713e8cdfa1e7702ba99477d Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Thu, 10 Nov 2011 15:10:39 +0100
Subject: ARM: scheduler: add a cpu_power function

Add an architecture-specific function for setting cpu_power.

Signed-off-by: Vincent Guittot
---
 arch/arm/kernel/topology.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index af1c3e68da6..9d80e2254ec 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,12 +44,32 @@
 
 struct cputopo_arm cpu_topology[NR_CPUS];
 
+/*
+ * cpu power scale management
+ */
+
+/*
+ * a per cpu data structure should be better because each cpu is mainly
+ * using its own cpu_power even it's not always true because of
+ * no_hz_idle_balance
+ */
+static DEFINE_PER_CPU(unsigned int, cpu_scale);
+
 /*
  * cpu topology mask management
  */
 
 unsigned int advanced_topology = 1;
 
+/*
+ * Update the cpu power
+ */
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        return per_cpu(cpu_scale, cpu);
+}
+
 /*
  * default topology function
  */
@@ -281,6 +301,8 @@ void init_cpu_topology(void)
                 cpu_topo->socket_id = -1;
                 cpumask_clear(&cpu_topo->core_sibling);
                 cpumask_clear(&cpu_topo->thread_sibling);
+
+                per_cpu(cpu_scale, cpu) = SCHED_POWER_SCALE;
         }
         smp_wmb();
 }
--
cgit v1.2.3
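The value returned by this hook is a fixed-point factor relative to SCHED_POWER_SCALE (1024): with the ARCH_POWER feature enabled, update_cpu_power() multiplies its running power by the arch factor and shifts back down by SCHED_POWER_SHIFT. A simplified sketch of that arithmetic (constants match kernels of this era; the surrounding bookkeeping is omitted):

    #include <stdio.h>

    #define SCHED_POWER_SHIFT 10
    #define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

    /* what arch_scale_freq_power() would return for this cpu */
    static unsigned long cpu_scale = 4096;

    static unsigned long compute_cpu_power(void)
    {
        unsigned long power = SCHED_POWER_SCALE;

        power *= cpu_scale;           /* apply the arch-provided factor */
        power >>= SCHED_POWER_SHIFT;  /* back to SCHED_POWER_SCALE units */
        return power;
    }

    int main(void)
    {
        /* 4096/1024 = 4x the reference capacity for this cpu */
        printf("cpu_power = %lu\n", compute_cpu_power());
        return 0;
    }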
From 80fce0df1841a6261d20ca30c9b572c372a64045 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 14 Nov 2011 09:07:00 +0100
Subject: ARM: cpu topology: Modify cpu_power according to sched_mc level

Signed-off-by: Vincent Guittot
---
 arch/arm/kernel/topology.c | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 9d80e2254ec..2774c5d6cca 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -61,6 +62,43 @@ static DEFINE_PER_CPU(unsigned int, cpu_scale);
 
 unsigned int advanced_topology = 1;
 
+static void normal_cpu_topology_mask(void);
+static void (*set_cpu_topology_mask)(void) = normal_cpu_topology_mask;
+
+/* This table sets the cpu_power scale of a cpu according to the sched_mc mode.
+ * The content of this table could be SoC specific so we should add a method to
+ * overwrite this default table.
+ * TODO: Study how to use DT for setting this table
+ */
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 1
+/* This table list all possible cpu power configuration */
+unsigned int table_config[2] = {
+        1024,
+        4096
+};
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+        per_cpu(cpu_scale, cpu) = table_config[idx];
+}
+
+static int init_cpu_power_scale(void)
+{
+        /* Do we need to change default config */
+        advanced_topology = 1;
+
+        /* force topology update */
+        arch_update_cpu_topology();
+
+        /* Force a cpu topology update */
+        rebuild_sched_domains();
+
+        return 0;
+}
+
+core_initcall(init_cpu_power_scale);
+
 /*
  * Update the cpu power
  */
@@ -129,6 +167,7 @@ static void normal_cpu_topology_mask(void)
 
         for_each_possible_cpu(cpuid) {
                 default_cpu_topology_mask(cpuid);
+                set_power_scale(cpuid, ARM_CORTEX_A9_DEFAULT_SCALE);
         }
         smp_wmb();
 }
@@ -139,7 +178,6 @@ static void normal_cpu_topology_mask(void)
  */
 static void power_cpu_topology_mask_CA9(void)
 {
-
         unsigned int cpuid, cpu;
 
         for_each_possible_cpu(cpuid) {
@@ -164,6 +202,7 @@ static void power_cpu_topology_mask_CA9(void)
                                 }
                         }
                 }
+                set_power_scale(cpuid, ARM_CORTEX_A9_POWER_SCALE);
         }
         smp_wmb();
 }
@@ -278,7 +317,7 @@ int arch_update_cpu_topology(void)
         /* set topology policy */
         update_cpu_topology_policy();
 
-        /* set topology mask*/
+        /* set topology mask and power */
         (*set_cpu_topology_mask)();
 
         return 1;
--
cgit v1.2.3
From 93a4dc80b8b9140076a016a56fc93a27abba8845 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Thu, 10 Nov 2011 15:14:41 +0100
Subject: ARM: cpu topology: Modify cpu_power according to cpufreq

Signed-off-by: Vincent Guittot
---
 arch/arm/kernel/topology.c | 134 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 126 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 2774c5d6cca..a1b1f7f95ec 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -21,6 +21,10 @@
 #include
 #include
 
+#ifdef CONFIG_CPU_FREQ
+#include
+#endif
+
 #include
 #include
 
@@ -54,6 +58,7 @@ struct cputopo_arm cpu_topology[NR_CPUS];
  * using its own cpu_power even it's not always true because of
  * no_hz_idle_balance
  */
+
 static DEFINE_PER_CPU(unsigned int, cpu_scale);
 
 /*
@@ -65,17 +70,127 @@ unsigned int advanced_topology = 1;
 
 static void normal_cpu_topology_mask(void);
 static void (*set_cpu_topology_mask)(void) = normal_cpu_topology_mask;
 
-/* This table sets the cpu_power scale of a cpu according to the sched_mc mode.
- * The content of this table could be SoC specific so we should add a method to
- * overwrite this default table.
+#ifdef CONFIG_CPU_FREQ
+/*
+ * This struct describes parameters to compute cpu_power
+ */
+struct cputopo_power {
+        int id;
+        int max;             /* max idx in the table */
+        unsigned int step;   /* frequency step for the table */
+        unsigned int *table; /* table of cpu_power */
+};
+
+/* default table with one default cpu_power value */
+unsigned int table_default_power[1] = {
+        1024
+};
+
+static struct cputopo_power default_cpu_power = {
+        .max = 1,
+        .step = 1,
+        .table = table_default_power,
+};
+
+/* CA-9 table with cpufreq modifying cpu_power */
+#define CPU_MAX_FREQ 10
+/* we use a 200Mhz step for scaling cpu power */
+#define CPU_TOPO_FREQ_STEP 200000
+/* This table sets the cpu_power scale of a cpu according to 2 inputs which are
+ * the frequency and the sched_mc mode. The content of this table could be SoC
+ * specific so we should add a method to overwrite this default table.
  * TODO: Study how to use DT for setting this table
  */
+unsigned int table_ca9_power[CPU_MAX_FREQ] = {
+/* freq< 200   400   600   800   1000  1200  1400  1600  1800  other*/
+        4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024, /* Power save mode CA9 MP */
+};
+
+static struct cputopo_power CA9_cpu_power = {
+        .max = CPU_MAX_FREQ,
+        .step = CPU_TOPO_FREQ_STEP,
+        .table = table_ca9_power,
+};
+
 #define ARM_CORTEX_A9_DEFAULT_SCALE 0
 #define ARM_CORTEX_A9_POWER_SCALE 1
 /* This table list all possible cpu power configuration */
-unsigned int table_config[2] = {
+struct cputopo_power *table_config[2] = {
+        &default_cpu_power,
+        &CA9_cpu_power,
+};
+
+struct cputopo_scale {
+        int id;
+        int freq;
+        struct cputopo_power *power;
+};
+
+/*
+ * The table will be mostly used by one cpu which will update the
+ * configuration for all cpu on a cpufreq notification
+ * or a sched_mc level change
+ */
+static struct cputopo_scale cpu_power[NR_CPUS];
+
+static void set_cpufreq_scale(unsigned int cpuid, unsigned int freq)
+{
+        unsigned int idx;
+
+        cpu_power[cpuid].freq = freq;
+
+        idx = freq / cpu_power[cpuid].power->step;
+        if (idx >= cpu_power[cpuid].power->max)
+                idx = cpu_power[cpuid].power->max - 1;
+
+        per_cpu(cpu_scale, cpuid) = cpu_power[cpuid].power->table[idx];
+        smp_wmb();
+}
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+        cpu_power[cpu].id = idx;
+        cpu_power[cpu].power = table_config[idx];
+
+        set_cpufreq_scale(cpu, cpu_power[cpu].freq);
+}
+
+static int topo_cpufreq_transition(struct notifier_block *nb,
+        unsigned long state, void *data)
+{
+        struct cpufreq_freqs *freqs = data;
+
+        if (state == CPUFREQ_POSTCHANGE || state == CPUFREQ_RESUMECHANGE)
+                set_cpufreq_scale(freqs->cpu, freqs->new);
+
+        return NOTIFY_OK;
+}
+
+static struct notifier_block topo_cpufreq_nb = {
+        .notifier_call = topo_cpufreq_transition,
+};
+
+static int topo_cpufreq_init(void)
+{
+        unsigned int cpu;
+
+        /* TODO set initial value according to current freq */
+
+        /* init core mask */
+        for_each_possible_cpu(cpu) {
+                cpu_power[cpu].freq = 0;
+                cpu_power[cpu].power = &default_cpu_power;
+        }
+
+        return cpufreq_register_notifier(&topo_cpufreq_nb,
+                CPUFREQ_TRANSITION_NOTIFIER);
+}
+#else
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 0
+/* This table list all possible cpu power configuration */
+unsigned int table_config[1] = {
         1024,
-        4096
 };
@@ -83,14 +198,17 @@ static void set_power_scale(unsigned int cpu, unsigned int idx)
         per_cpu(cpu_scale, cpu) = table_config[idx];
 }
 
+static inline int topo_cpufreq_init(void) {return 0; }
+#endif
+
 static int init_cpu_power_scale(void)
 {
+        /* register cpufreq notifer */
+        topo_cpufreq_init();
+
         /* Do we need to change default config */
         advanced_topology = 1;
 
-        /* force topology update */
-        arch_update_cpu_topology();
-
         /* Force a cpu topology update */
         rebuild_sched_domains();
--
cgit v1.2.3
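The cpufreq-to-cpu_power mapping is plain bucketing: idx = freq / step, clamped to the last table entry. With the CA9 table above (a 200 MHz step expressed in kHz, 10 entries), 600 MHz lands in bucket 3 and reads 1024, while 200 MHz reads 4096. A stand-alone version of set_cpufreq_scale()'s index arithmetic:

    #include <stdio.h>

    #define CPU_TOPO_FREQ_STEP 200000u   /* kHz per bucket */
    #define CPU_MAX_FREQ 10

    static const unsigned int table_ca9_power[CPU_MAX_FREQ] = {
        4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
    };

    static unsigned int power_for_freq(unsigned int freq_khz)
    {
        unsigned int idx = freq_khz / CPU_TOPO_FREQ_STEP;

        if (idx >= CPU_MAX_FREQ)
            idx = CPU_MAX_FREQ - 1;      /* above 1.8 GHz: last entry */
        return table_ca9_power[idx];
    }

    int main(void)
    {
        printf("600 MHz -> %u\n", power_for_freq(600000));   /* 1024 */
        printf("200 MHz -> %u\n", power_for_freq(200000));   /* 4096 */
        return 0;
    }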
From 64ed398797f236ed708828e672dff130854e9d3b Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Tue, 20 Sep 2011 11:09:32 +0200
Subject: ARM: cpu topology: Add asym topology flag for using cpu0 1st

Modify the CPU sched_domain flags so that cpu0 is used first in sched_mc
powersave mode.

Signed-off-by: Vincent Guittot
---
 arch/arm/include/asm/topology.h | 33 +++++++++++++++++++++++++++++++++
 arch/arm/kernel/topology.c      | 11 +++++++++++
 2 files changed, 44 insertions(+)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd..f7f02e392ef 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -34,6 +34,39 @@ static inline void store_cpu_topology(unsigned int cpuid) { }
 
 #endif
 
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) {                     \
+        .min_interval           = 1,                            \
+        .max_interval           = 4,                            \
+        .busy_factor            = 64,                           \
+        .imbalance_pct          = 125,                          \
+        .cache_nice_tries       = 1,                            \
+        .busy_idx               = 2,                            \
+        .idle_idx               = 1,                            \
+        .newidle_idx            = 0,                            \
+        .wake_idx               = 0,                            \
+        .forkexec_idx           = 0,                            \
+                                                                \
+        .flags                  = 1*SD_LOAD_BALANCE             \
+                                | 1*SD_BALANCE_NEWIDLE          \
+                                | 1*SD_BALANCE_EXEC             \
+                                | 1*SD_BALANCE_FORK             \
+                                | 0*SD_BALANCE_WAKE             \
+                                | 1*SD_WAKE_AFFINE              \
+                                | 0*SD_PREFER_LOCAL             \
+                                | 0*SD_SHARE_CPUPOWER           \
+                                | 0*SD_SHARE_PKG_RESOURCES      \
+                                | 0*SD_SERIALIZE                \
+                                | arch_sd_sibling_asym_packing() \
+                                | sd_balance_for_package_power() \
+                                | sd_power_saving_flags()       \
+                                ,                               \
+        .last_balance           = jiffies,                      \
+        .balance_interval       = 1,                            \
+}
+#endif
+
 #include
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index a1b1f7f95ec..945b9802ad6 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -226,6 +226,17 @@ unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
         return per_cpu(cpu_scale, cpu);
 }
 
+/*
+ * sched_domain flag configuration
+ */
+/* TODO add a config flag for this function */
+int arch_sd_sibling_asym_packing(void)
+{
+        if (sched_smt_power_savings || sched_mc_power_savings)
+                return SD_ASYM_PACKING;
+        return 0;
+}
+
 /*
  * default topology function
  */
--
cgit v1.2.3
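The 1*FLAG | 0*FLAG idiom in SD_CPU_INIT keeps every candidate flag visible and greppable while the disabled ones compile away to zero; runtime contributions such as arch_sd_sibling_asym_packing() are simply OR'ed into the same expression. A toy illustration with made-up flag values (the kernel's real values differ):

    #include <stdio.h>

    #define SD_LOAD_BALANCE  0x0001
    #define SD_BALANCE_EXEC  0x0002
    #define SD_ASYM_PACKING  0x0004

    static int sched_mc_power_savings = 1;   /* assumption for the example */

    static int arch_sd_sibling_asym_packing(void)
    {
        return sched_mc_power_savings ? SD_ASYM_PACKING : 0;
    }

    int main(void)
    {
        int flags = 1*SD_LOAD_BALANCE
                  | 0*SD_BALANCE_EXEC          /* kept visible, compiled out */
                  | arch_sd_sibling_asym_packing();

        printf("flags = 0x%x\n", flags);       /* 0x5 with power savings on */
        return 0;
    }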
From 0a54c5632af98dfb98a092042524472ac7277047 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 12 Sep 2011 17:40:50 +0200
Subject: ARM: cpu topology: Add debugfs interface for cpu_power

Signed-off-by: Vincent Guittot
---
 arch/arm/kernel/topology.c | 113 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 945b9802ad6..053ce9cbc0b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -25,6 +25,11 @@
 #include
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+#include
+#include /* for copy_from_user */
+#endif
+
 #include
 #include
 
@@ -474,3 +479,111 @@ void init_cpu_topology(void)
         }
         smp_wmb();
 }
+
+/*
+ * debugfs interface for scaling cpu power
+ */
+
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *topo_debugfs_root;
+
+static ssize_t dbg_write(struct file *file, const char __user *buf,
+                                                size_t size, loff_t *off)
+{
+        unsigned int *value = file->f_dentry->d_inode->i_private;
+        char cdata[128];
+        unsigned long tmp;
+        unsigned int cpu;
+
+        if (size < (sizeof(cdata)-1)) {
+                if (copy_from_user(cdata, buf, size))
+                        return -EFAULT;
+                cdata[size] = 0;
+                if (!strict_strtoul(cdata, 10, &tmp)) {
+                        *value = tmp;
+
+#ifdef CONFIG_CPU_FREQ
+                        for_each_online_cpu(cpu)
+                                set_power_scale(cpu, cpu_power[cpu].id);
+#endif
+                }
+                return size;
+        }
+        return -EINVAL;
+}
+
+static ssize_t dbg_read(struct file *file, char __user *buf,
+                                                size_t size, loff_t *off)
+{
+        unsigned int *value = file->f_dentry->d_inode->i_private;
+        char cdata[128];
+        unsigned int len;
+
+        len = sprintf(cdata, "%u\n", *value);
+        return simple_read_from_buffer(buf, size, off, cdata, len);
+}
+
+static const struct file_operations debugfs_fops = {
+        .read = dbg_read,
+        .write = dbg_write,
+};
+
+static struct dentry *topo_debugfs_register(unsigned int cpu,
+                                                struct dentry *parent)
+{
+        struct dentry *cpu_d, *d;
+        char cpu_name[16];
+
+        sprintf(cpu_name, "cpu%u", cpu);
+
+        cpu_d = debugfs_create_dir(cpu_name, parent);
+        if (!cpu_d)
+                return NULL;
+
+        d = debugfs_create_file("cpu_power", S_IRUGO | S_IWUGO,
+                cpu_d, &per_cpu(cpu_scale, cpu), &debugfs_fops);
+        if (!d)
+                goto err_out;
+
+#ifdef CONFIG_CPU_FREQ
+        d = debugfs_create_file("scale", S_IRUGO | S_IWUGO,
+                cpu_d, &cpu_power[cpu].id, &debugfs_fops);
+        if (!d)
+                goto err_out;
+
+        d = debugfs_create_file("freq", S_IRUGO,
+                cpu_d, &cpu_power[cpu].freq, &debugfs_fops);
+        if (!d)
+                goto err_out;
+#endif
+        return cpu_d;
+
+err_out:
+        debugfs_remove_recursive(cpu_d);
+        return NULL;
+}
+
+static int __init topo_debugfs_init(void)
+{
+        struct dentry *d;
+        unsigned int cpu;
+
+        d = debugfs_create_dir("cpu_topo", NULL);
+        if (!d)
+                return -ENOMEM;
+        topo_debugfs_root = d;
+
+        for_each_possible_cpu(cpu) {
+                d = topo_debugfs_register(cpu, topo_debugfs_root);
+                if (d == NULL)
+                        goto err_out;
+        }
+        return 0;
+
+err_out:
+        debugfs_remove_recursive(topo_debugfs_root);
+        return -ENOMEM;
+}
+
+late_initcall(topo_debugfs_init);
+#endif
--
cgit v1.2.3
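Assuming debugfs is mounted at /sys/kernel/debug, the files created above show up as /sys/kernel/debug/cpu_topo/cpuN/{cpu_power,scale,freq}. A small user-space reader/writer as a usage example (the path and mount point are assumptions; requires root and CONFIG_DEBUG_FS):

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/kernel/debug/cpu_topo/cpu0/cpu_power";
        unsigned int value;
        FILE *f;

        f = fopen(path, "r");
        if (!f) {
            perror(path);          /* debugfs not mounted, or no permission */
            return 1;
        }
        if (fscanf(f, "%u", &value) == 1)
            printf("cpu0 cpu_power = %u\n", value);
        fclose(f);

        f = fopen(path, "w");      /* override the per-cpu scale by hand */
        if (f) {
            fprintf(f, "512\n");
            fclose(f);
        }
        return 0;
    }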