summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c828
1 files changed, 543 insertions, 285 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7dd0ec169be..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw)
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
- * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
@@ -682,18 +682,100 @@ void init_entity_runnable_average(struct sched_entity *se)
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
-#else
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, a simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = now;
+ return;
+ }
+ }
+
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
+}
+
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -1254,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1321,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
@@ -1428,7 +1492,6 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1454,16 +1517,9 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2437,37 +2493,33 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
+#endif
cfs_rq->nr_running--;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
-
- return tg_weight;
-}
+ load = scale_load_down(cfs_rq->load.weight);
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2486,6 +2538,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -2550,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = {
};
/*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
+ * lower integers. See Documentation/scheduler/sched-avg.txt how these
+ * were generated:
+ */
+static const u32 __accumulated_sum_N32[] = {
+ 0, 23371, 35056, 40899, 43820, 45281,
+ 46011, 46376, 46559, 46650, 46696, 46719,
+};
+
+/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
@@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n)
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
- /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
- do {
- contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
- contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
- n -= LOAD_AVG_PERIOD;
- } while (n > LOAD_AVG_PERIOD);
-
+ /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+ contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+ n %= LOAD_AVG_PERIOD;
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
}
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
-
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
@@ -2819,25 +2873,87 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ unsigned long max = rq->cpu_capacity_orig;
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_clock(rq),
+ min(cfs_rq->avg.util_avg, max), max);
+ }
+}
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed utilization. It is expected
+ * that one calls update_tg_load_avg() on this condition, but after you've
+ * modified the cfs_rq avg (attach/detach), such that we propagate the new
+ * avg up.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed_load = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- sa->load_avg = max_t(long, sa->load_avg - r, 0);
- sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
- removed = 1;
+ sub_positive(&sa->load_avg, r);
+ sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
+ removed_load = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sa->util_avg = max_t(long, sa->util_avg - r, 0);
- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->util_avg, r);
+ sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+ removed_util = 1;
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2848,7 +2964,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- return decayed || removed;
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
+ return decayed || removed_load;
}
/* Update task and its cfs_rq load average */
@@ -2867,33 +2986,18 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
update_tg_load_avg(cfs_rq, 0);
-
- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- unsigned long max = rq->cpu_capacity_orig;
-
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
- }
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2902,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time.
+ *
+ * Or we're fresh through post_init_entity_util_avg().
*/
if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2919,18 +3025,30 @@ skip_aging:
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+
+ cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
&se->avg, se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -2948,7 +3066,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->curr == se, NULL);
}
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
@@ -3003,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
last_update_time = cfs_rq_last_update_time(cfs_rq);
@@ -3030,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
static inline void update_load_avg(struct sched_entity *se, int not_used)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3177,7 +3304,7 @@ static inline void check_schedstat_required(void)
trace_sched_stat_iowait_enabled() ||
trace_sched_stat_blocked_enabled() ||
trace_sched_stat_runtime_enabled()) {
- pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
"stat_blocked and stat_runtime require the "
"kernel parameter schedstats=enabled or "
"kernel.sched_schedstats=1\n");
@@ -3185,20 +3312,61 @@ static inline void check_schedstat_required(void)
#endif
}
+
+/*
+ * MIGRATION
+ *
+ * dequeue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU and an up-to-date min_vruntime on the destination CPU.
+ */
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
+ bool curr = cfs_rq->curr == se;
+
/*
- * Update the normalized vruntime before updating min_vruntime
- * through calling update_curr().
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;
+ update_curr(cfs_rq);
+
/*
- * Update run-time statistics of the 'current'.
+ * Otherwise, renormalise after, such that we're placed at the current
+ * moment in time, instead of some random moment in the past. Being
+ * placed in the past could significantly boost this task to the
+ * fairness detriment of existing tasks.
*/
- update_curr(cfs_rq);
+ if (renorm && !curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3214,7 +3382,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
- if (se != cfs_rq->curr)
+ if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3578,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
@@ -3716,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
-#endif
return 0;
}
@@ -4089,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -4228,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4336,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -4390,15 +4574,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && parent_entity(se))
- set_next_buddy(parent_entity(se));
-
- /* avoid re-evaluating load for this entity */
- se = parent_entity(se);
+ if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ set_next_buddy(se);
break;
}
flags |= DEQUEUE_SLEEP;
@@ -4422,7 +4605,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
*/
@@ -4488,13 +4671,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
}
return load;
}
+#endif /* CONFIG_NO_HZ_COMMON */
/**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * __cpu_load_update - update the rq->cpu_load[] statistics
* @this_rq: The rq to update statistics for
* @this_load: The current load
* @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
*
* Update rq->cpu_load[] statistics. This function is usually called every
* scheduler tick (TICK_NSEC).
@@ -4523,12 +4706,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
* load[i]_n = (1 - 1/2^i)^n * load[i]_0
*
* see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
*/
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
{
- unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+ unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
int i, scale;
this_rq->nr_load_updates++;
@@ -4541,6 +4724,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
old_load = decay_load_missed(old_load, pending_updates - 1, i);
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
@@ -4551,6 +4735,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
*/
old_load += tickless_load;
}
+#endif
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4573,10 +4758,23 @@ static unsigned long weighted_cpuload(const int cpu)
}
#ifdef CONFIG_NO_HZ_COMMON
-static void __update_cpu_load_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load,
- int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load)
{
unsigned long pending_updates;
@@ -4588,28 +4786,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
* In the NOHZ_FULL case, we were non-idle, we should consider
* its weighted load.
*/
- __update_cpu_load(this_rq, load, pending_updates, active);
+ cpu_load_update(this_rq, load, pending_updates);
}
}
/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
-static void update_cpu_load_idle(struct rq *this_rq)
+static void cpu_load_update_idle(struct rq *this_rq)
{
/*
* bail if there's load or we're actually up-to-date.
@@ -4617,38 +4802,71 @@ static void update_cpu_load_idle(struct rq *this_rq)
if (weighted_cpuload(cpu_of(this_rq)))
return;
- __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+ cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
}
/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely into synchronized cpu_load[*] updates.
*/
-void update_cpu_load_nohz(int active)
+void cpu_load_update_nohz_start(void)
{
struct rq *this_rq = this_rq();
+
+ /*
+ * This is all lockless but should be fine. If weighted_cpuload changes
+ * concurrently we'll exit nohz. And cpu_load write can race with
+ * cpu_load_update_idle() but both updater would be writing the same.
+ */
+ this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+}
+
+/*
+ * Account the tickless load in the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
+ struct rq *this_rq = this_rq();
+ unsigned long load;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
+ load = weighted_cpuload(cpu_of(this_rq));
raw_spin_lock(&this_rq->lock);
- __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
+ update_rq_clock(this_rq);
+ cpu_load_update_nohz(this_rq, curr_jiffies, load);
raw_spin_unlock(&this_rq->lock);
}
-#endif /* CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ /* See the mess around cpu_load_update_nohz(). */
+ this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
+ cpu_load_update(this_rq, load, 1);
+}
/*
* Called from scheduler_tick()
*/
-void update_cpu_load_active(struct rq *this_rq)
+void cpu_load_update_active(struct rq *this_rq)
{
unsigned long load = weighted_cpuload(cpu_of(this_rq));
- /*
- * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1, 1);
+
+ if (tick_nohz_tick_stopped())
+ cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+ else
+ cpu_load_update_periodic(this_rq, load);
}
/*
@@ -4706,46 +4924,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
}
-static void record_wakee(struct task_struct *p)
-{
- /*
- * Rough decay (wiping) for cost saving, don't worry
- * about the boundary, really active task won't care
- * about the loss.
- */
- if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
- current->wakee_flips >>= 1;
- current->wakee_flip_decay_ts = jiffies;
- }
-
- if (current->last_wakee != p) {
- current->last_wakee = p;
- current->wakee_flips++;
- }
-}
-
-static void task_waking_fair(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
-
- se->vruntime -= min_vruntime;
- record_wakee(p);
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
@@ -4805,19 +4983,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
@@ -4861,17 +5044,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Only decay a single time; tasks that have less then 1 wakeup per
+ * jiffy will not have built up many flips.
+ */
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
+
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
* A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees. In order
- * to determine whether we should let the load spread vs consolodating to
- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other. With
- * both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size. Waker/wakee
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
*/
static int wake_wide(struct task_struct *p)
{
@@ -5176,8 +5381,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (sd_flag & SD_BALANCE_WAKE)
+ if (sd_flag & SD_BALANCE_WAKE) {
+ record_wakee(p);
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ }
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -5257,6 +5464,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
static void migrate_task_rq_fair(struct task_struct *p)
{
/*
+ * As blocked tasks retain absolute vruntime the migration needs to
+ * deal with this by subtracting the old and adding the new
+ * min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
+ if (p->state == TASK_WAKING) {
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
+ }
+
+ /*
* We are supposed to update the task to "current" time, then its up to date
* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
* what current time is, so simply throw away the out-of-date time. This
@@ -5439,7 +5672,7 @@ preempt:
}
static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -5552,9 +5785,9 @@ idle:
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
new_tasks = idle_balance(rq);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -5653,7 +5886,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* W_i,0 = \Sum_j w_i,j (2)
*
* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
- * is derived from the nice value as per prio_to_weight[].
+ * is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
* weight:
@@ -6155,7 +6388,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6216,7 +6449,7 @@ static inline void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6625,6 +6858,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
+ /* No ASYM_PACKING if target cpu is already busy */
+ if (env->idle == CPU_NOT_IDLE)
+ return true;
/*
* ASYM_PACKING needs to move all the work to the lowest
* numbered CPUs in the group, therefore mark all groups
@@ -6634,7 +6870,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (!sds->busiest)
return true;
- if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ /* Prefer to move from highest possible cpu's work */
+ if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
return true;
}
@@ -6780,6 +7017,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
+ if (env->idle == CPU_NOT_IDLE)
+ return 0;
+
if (!sds->busiest)
return 0;
@@ -6888,9 +7128,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/*
- * In the presence of smp nice balancing, certain scenarios can have
- * max load less than avg load(as we skip the groups at or below
- * its cpu_capacity, while calculating max_load..)
+ * Avg load of busiest sg can be less and avg load of local sg can
+ * be greater than avg load across all sgs of sd because avg load
+ * factors in sg capacity and sgs with smaller group_type are
+ * skipped when updating the busiest sg:
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
@@ -6903,11 +7144,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running *
- SCHED_LOAD_SCALE;
- if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
+ if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
- else
+ load_above_capacity *= NICE_0_LOAD;
+ load_above_capacity /= busiest->group_capacity;
+ } else
load_above_capacity = ~0UL;
}
@@ -6915,9 +7157,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load. At the same time,
- * we also don't want to reduce the group load below the group capacity
- * (so that we can implement power-savings policies etc). Thus we look
- * for the minimum possible imbalance.
+ * we also don't want to reduce the group load below the group
+ * capacity. Thus we look for the minimum possible imbalance.
*/
max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
@@ -6941,10 +7182,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
+ * if there is an imbalance.
*
* Also calculates the amount of weighted load which should be moved
* to restore balance.
@@ -6952,9 +7190,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
- * - If no imbalance and user has opted for power-savings balance,
- * return the least loaded group whose CPUs can be
- * put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
{
@@ -6972,8 +7207,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest = &sds.busiest_stat;
/* ASYM feature bypasses nice load balance check */
- if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(env, &sds))
+ if (check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
@@ -7398,10 +7632,7 @@ more_balance:
&busiest->active_balance_work);
}
- /*
- * We've kicked active balancing, reset the failure
- * counter.
- */
+ /* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
@@ -7636,10 +7867,13 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ /* Active balancing done, reset the failure counter. */
+ sd->nr_balance_failed = 0;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
@@ -7710,7 +7944,7 @@ static void nohz_balancer_kick(void)
return;
}
-static inline void nohz_balance_exit_idle(int cpu)
+void nohz_balance_exit_idle(unsigned int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
@@ -7783,18 +8017,6 @@ void nohz_balance_enter_idle(int cpu)
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DYING:
- nohz_balance_exit_idle(smp_processor_id());
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
#endif
static DEFINE_SPINLOCK(balancing);
@@ -7956,7 +8178,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- update_cpu_load_idle(rq);
+ cpu_load_update_idle(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}
@@ -8139,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8176,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -8233,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (!vruntime_normalized(p)) {
/*
@@ -8244,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
}
/* Catch up with the cfs_rq and remove our load when we leave */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8261,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif
/* Synchronize task with its cfs_rq */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8321,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -8333,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -8352,8 +8590,9 @@ void free_fair_sched_group(struct task_group *tg)
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8368,6 +8607,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -8391,6 +8632,23 @@ err:
return 0;
}
+void online_fair_sched_group(struct task_group *tg)
+{
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ sync_throttle(tg, i);
+ raw_spin_unlock_irq(&rq->lock);
+ }
+}
+
void unregister_fair_sched_group(struct task_group *tg)
{
unsigned long flags;
@@ -8495,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
+void online_fair_sched_group(struct task_group *tg) { }
+
void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8537,7 +8797,6 @@ const struct sched_class fair_sched_class = {
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
- .task_waking = task_waking_fair,
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
@@ -8555,7 +8814,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
};
@@ -8599,7 +8858,6 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */