summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c232
1 files changed, 161 insertions, 71 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df..d4bd299d67ab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,39 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+#include <linux/task_work.h>
+
+#include <asm/switch_to.h>
+
+#include <linux/sched/cond_resched.h>
+
#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -1259,10 +1291,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -1271,6 +1303,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1317,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -1293,8 +1327,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist >= maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1312,8 +1345,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1489,6 +1522,7 @@ struct task_numa_env {
int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
@@ -1503,7 +1537,7 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight);
+ int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1918,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
- env->dst_stats.weight);
+ env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1949,8 +1983,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -1985,7 +2021,7 @@ static int task_numa_migrate(struct task_struct *p)
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -2083,13 +2119,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
@@ -2243,7 +2279,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -2262,7 +2298,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -2401,6 +2437,21 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ /* Cannot migrate task to CPU-less node */
+ if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
+ int near_nid = max_nid;
+ int distance, near_distance = INT_MAX;
+
+ for_each_node_state(nid, N_CPU) {
+ distance = node_distance(max_nid, nid);
+ if (distance < near_distance) {
+ near_nid = nid;
+ near_distance = distance;
+ }
+ }
+ max_nid = near_nid;
+ }
+
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
@@ -2825,6 +2876,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -3028,9 +3081,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3381,7 +3436,6 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3449,15 +3503,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3466,23 +3519,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3493,19 +3553,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
+ s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3532,7 +3598,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3549,19 +3615,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- se->avg.load_sum = runnable_sum;
-
- delta = load_avg - se->avg.load_avg;
- if (!delta)
+ delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
return;
- se->avg.load_avg = load_avg;
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3652,7 +3721,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3677,15 +3746,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sa->load_sum = sa->load_avg * divider;
+ sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
- sa->util_sum = sa->util_avg * divider;
+ sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sa->runnable_sum = sa->runnable_avg * divider;
+ sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3772,17 +3858,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = get_pelt_divider(&cfs_rq->avg);
-
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -8539,6 +8626,8 @@ group_type group_classify(unsigned int imbalance_pct,
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
+ *
+ * Return: true if @dst_cpu can pull tasks, false otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
@@ -8614,6 +8703,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
@@ -9003,9 +9093,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
* This is an approximation as the number of running tasks may not be
* related to the number of busy CPUs due to sched_setaffinity.
*/
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
{
- return (dst_running < (dst_weight >> 2));
+ return running <= imb_numa_nr;
}
/*
@@ -9139,12 +9229,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
#endif
/*
- * Otherwise, keep the task on this node to stay close
- * its wakeup source and improve locality. If there is
- * a real need of migration, periodic load balance will
- * take care of it.
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below threshold where an imbalance is
+ * allowed. If there is a real need of migration,
+ * periodic load balance will take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+ if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
return NULL;
}
@@ -9236,9 +9327,9 @@ next_group:
#define NUMA_IMBALANCE_MIN 2
static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight)
+ int dst_running, int imb_numa_nr)
{
- if (!allow_numa_imbalance(dst_running, dst_weight))
+ if (!allow_numa_imbalance(dst_running, imb_numa_nr))
return imbalance;
/*
@@ -9350,7 +9441,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running, busiest->group_weight);
+ local->sum_nr_running + 1, env->sd->imb_numa_nr);
}
return;
@@ -9421,12 +9512,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
@@ -10315,7 +10405,7 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
* anywhere yet.
*/
@@ -10324,7 +10414,7 @@ static inline int find_new_ilb(void)
int ilb;
const struct cpumask *hk_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
@@ -10340,7 +10430,7 @@ static inline int find_new_ilb(void)
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -10553,7 +10643,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
return;
/*
@@ -10769,7 +10859,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/