summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt15
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/auditsc.c87
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/cgroup/cpuset.c16
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/rstat.c5
-rw-r--r--kernel/configs/debug.config1
-rw-r--r--kernel/cpu.c15
-rw-r--r--kernel/dma/contiguous.c4
-rw-r--r--kernel/dma/mapping.c2
-rw-r--r--kernel/dma/swiotlb.c23
-rw-r--r--kernel/entry/common.c37
-rw-r--r--kernel/events/callchain.c4
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/exit.c17
-rw-r--r--kernel/fork.c278
-rw-r--r--kernel/futex/core.c2
-rw-r--r--kernel/irq/chip.c29
-rw-r--r--kernel/irq/cpuhotplug.c4
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/irqdesc.c27
-rw-r--r--kernel/irq/irqdomain.c9
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kthread.c9
-rw-r--r--kernel/locking/lockdep.c43
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/lockdep_proc.c51
-rw-r--r--kernel/locking/percpu-rwsem.c5
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/suspend_test.c8
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/printk/printk.c85
-rw-r--r--kernel/printk/printk_ringbuffer.c52
-rw-r--r--kernel/printk/printk_ringbuffer.h2
-rw-r--r--kernel/rcu/rcu_segcblist.h4
-rw-r--r--kernel/rcu/rcutorture.c41
-rw-r--r--kernel/rcu/tasks.h8
-rw-r--r--kernel/rcu/tree.c331
-rw-r--r--kernel/rcu/tree.h18
-rw-r--r--kernel/rcu/tree_exp.h5
-rw-r--r--kernel/rcu/tree_nocb.h18
-rw-r--r--kernel/rcu/tree_plugin.h37
-rw-r--r--kernel/rcu/tree_stall.h35
-rw-r--r--kernel/rcu/update.c7
-rw-r--r--kernel/rseq.c8
-rw-r--r--kernel/sched/Makefile28
-rw-r--r--kernel/sched/autogroup.c26
-rw-r--r--kernel/sched/autogroup.h6
-rw-r--r--kernel/sched/build_policy.c52
-rw-r--r--kernel/sched/build_utility.c109
-rw-r--r--kernel/sched/clock.c44
-rw-r--r--kernel/sched/completion.c2
-rw-r--r--kernel/sched/core.c494
-rw-r--r--kernel/sched/core_sched.c3
-rw-r--r--kernel/sched/cpuacct.c12
-rw-r--r--kernel/sched/cpudeadline.c3
-rw-r--r--kernel/sched/cpufreq.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c20
-rw-r--r--kernel/sched/cpupri.c1
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c155
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c113
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/isolation.c163
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/membarrier.c1
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/psi.c61
-rw-r--r--kernel/sched/rt.c51
-rw-r--r--kernel/sched/sched.h354
-rw-r--r--kernel/sched/stats.c1
-rw-r--r--kernel/sched/stats.h4
-rw-r--r--kernel/sched/stop_task.c1
-rw-r--r--kernel/sched/swait.c1
-rw-r--r--kernel/sched/topology.c268
-rw-r--r--kernel/sched/wait.c1
-rw-r--r--kernel/sched/wait_bit.c2
-rw-r--r--kernel/signal.c40
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/stacktrace.c3
-rw-r--r--kernel/sysctl.c20
-rw-r--r--kernel/time/Kconfig9
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/time/tick-sched.c77
-rw-r--r--kernel/time/tick-sched.h4
-rw-r--r--kernel/torture.c6
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/fgraph.c4
-rw-r--r--kernel/trace/ftrace.c10
-rw-r--r--kernel/trace/trace.c73
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c104
-rw-r--r--kernel/trace/trace_events_hist.c33
-rw-r--r--kernel/trace/trace_events_synth.c14
-rw-r--r--kernel/trace/trace_events_user.c1690
-rw-r--r--kernel/trace/trace_osnoise.c35
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
-rw-r--r--kernel/watch_queue.c25
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c62
109 files changed, 4282 insertions, 1387 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index ce77f0265660..8c6de5a9ecc4 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -96,8 +96,9 @@ config PREEMPTION
config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
- default y
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
@@ -132,4 +133,14 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
-
+config ARCH_WANTS_RT_DELAYED_SIGNALS
+ bool
+ help
+ This option is selected by architectures where raising signals
+ can happen in atomic contexts on PREEMPT_RT enabled kernels. This
+ option delays raising the signal until the return to user space
+ loop where it is also delivered. X86 requires this to deliver
+ signals from trap handlers which run on IST stacks.
+
+config RT_DELAYED_SIGNALS
+ def_bool PREEMPT_RT && ARCH_WANTS_RT_DELAYED_SIGNALS
diff --git a/kernel/audit.h b/kernel/audit.h
index c4498090a5bd..58b66543b4d5 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -201,6 +201,10 @@ struct audit_context {
struct {
char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a83928cbdcb7..ea2ee1181921 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1340,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1454,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -2849,31 +2901,26 @@ void __audit_fanotify(unsigned int response)
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a557eea7166f..795c2818e2a3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6243,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6305,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5de18448016c..9390bfd9f1cd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/*
* There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -833,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -863,7 +863,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (root_load_balance &&
@@ -952,7 +952,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -1181,7 +1181,7 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
@@ -2027,7 +2027,7 @@ out:
}
/*
- * update_prstate - update partititon_root_state
+ * update_prstate - update partition_root_state
* cs: the cpuset to update
* new_prs: new partition root state
*
@@ -2879,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -3076,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..29ea74f0eab3 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -315,7 +315,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -325,11 +325,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 07df6d93c4df..e8db8d938661 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -16,6 +16,7 @@ CONFIG_SYMBOLIC_ERRNAME=y
#
# Compile-time checks and compiler options
#
+CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_FRAME_WARN=2048
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35e..5797c2a7a93f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,6 +34,7 @@
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
+#include <linux/random.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -1488,8 +1489,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
- primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
@@ -1659,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
@@ -1782,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..6ea80ae42622 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
- phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
- phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
@@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if ((rmem->base & mask) || (rmem->size & mask)) {
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9478eccd1c8e..559461a826ba 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -745,7 +745,6 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -761,7 +760,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
size_t dma_max_mapping_size(struct device *dev)
{
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index bfc56cb21705..6db1c475ec82 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -627,10 +627,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
tlb_addr = slot_addr(mem->start, index) + offset;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE ||
- dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ /*
+ * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
+ * to the tlb buffer, if we knew for sure the device will
+ * overwirte the entire current content. But we don't. Thus
+ * unconditional bounce may prevent leaking swiotlb content (i.e.
+ * kernel memory) to user-space.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -697,10 +701,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
- else
- BUG_ON(dir != DMA_FROM_DEVICE);
+ /*
+ * Unconditional bounce is necessary to avoid corruption on
+ * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite
+ * the whole lengt of the bounce buffer.
+ */
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ BUG_ON(!valid_dma_direction(dir));
}
void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..ed10a95a6b1d 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -3,6 +3,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
+#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -148,6 +149,18 @@ static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}
+#ifdef CONFIG_RT_DELAYED_SIGNALS
+static inline void raise_delayed_signal(void)
+{
+ if (unlikely(current->forced_info.si_signo)) {
+ force_sig_info(&current->forced_info);
+ current->forced_info.si_signo = 0;
+ }
+}
+#else
+static inline void raise_delayed_signal(void) { }
+#endif
+
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -162,6 +175,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_NEED_RESCHED)
schedule();
+ raise_delayed_signal();
+
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
@@ -380,7 +395,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
-void irqentry_exit_cond_resched(void)
+void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
@@ -392,7 +407,17 @@ void irqentry_exit_cond_resched(void)
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
@@ -420,13 +445,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMPT_DYNAMIC
- static_call(irqentry_exit_cond_resched)();
-#else
+ if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
-#endif
- }
+
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
- mm_segment_t fs;
-
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6859229497b1..cfde994ce61c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6746,7 +6746,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6764,9 +6763,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -10531,8 +10528,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
if (state == IF_STATE_END) {
ret = -EINVAL;
- if (kernel && event->attr.exclude_kernel)
- goto fail;
/*
* ACTION "filter" must have a non-zero length region
@@ -10574,8 +10569,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6357c3580d07..6418083901d4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
+ /* For try_to_free_swap() below */
lock_page(old_page);
mmu_notifier_invalidate_range_start(&range);
@@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
put_page(old_page);
err = 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..f5459e28b861 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -735,21 +735,7 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- WARN_ON(blk_needs_flush_plug(tsk));
-
- /*
- * If do_dead is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- *
- * On uptodate architectures force_uaccess_begin is a noop. On
- * architectures that still have set_fs/get_fs in addition to handling
- * oopses handles kernel threads that run as set_fs(KERNEL_DS) by
- * default.
- */
- force_uaccess_begin();
+ WARN_ON(tsk->plug);
kcov_task_exit(tsk);
@@ -845,6 +831,7 @@ void __noreturn do_exit(long code)
put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index f1e89007f228..d74c30b6733b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -185,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -193,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -210,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}
-#endif
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
-#ifdef CONFIG_VMAP_STACK
+ int i;
+ int ret;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ /*
+ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+ * ignore this page.
+ */
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm;
void *stack;
int i;
@@ -232,9 +292,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
+ if (memcg_charge_kernel_stack(s)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
+
tsk->stack_vm_area = s;
tsk->stack = s->addr;
- return s->addr;
+ return 0;
}
/*
@@ -247,71 +312,95 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
-#else
+ tsk->stack_vm_area = vm;
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+# else /* !CONFIG_VMAP_STACK */
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
-#endif
+ return -ENOMEM;
}
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
+}
- if (vm) {
- int i;
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
-
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+static struct kmem_cache *thread_stack_cache;
- return;
- }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ call_rcu(rh, thread_stack_free_rcu);
}
-# else
-static struct kmem_cache *thread_stack_cache;
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
void thread_stack_cache_init(void)
@@ -321,8 +410,26 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ unsigned long *stack;
+
+ stack = arch_alloc_thread_stack_node(tsk, node);
+ tsk->stack = stack;
+ return stack ? 0 : -ENOMEM;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
+}
+
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -379,50 +486,34 @@ void vm_area_free(struct vm_area_struct *vma)
static void account_kernel_stack(struct task_struct *tsk, int account)
{
- void *stack = task_stack_page(tsk);
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
+ void *stack = task_stack_page(tsk);
+
/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ account_kernel_stack(tsk, -1);
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
int i;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
}
-#endif
- return 0;
}
static void release_task_stack(struct task_struct *tsk)
@@ -430,12 +521,7 @@ static void release_task_stack(struct task_struct *tsk)
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -874,8 +960,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -884,32 +968,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
-
- if (err)
- goto free_stack;
+ account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
@@ -953,8 +1023,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;
- account_kernel_stack(tsk, 1);
-
kcov_task_init(tsk);
kmap_local_fork(tsk);
@@ -967,12 +1035,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_IOMMU_SVA
+ tsk->pasid_activated = 0;
+#endif
+
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
return tsk;
free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -1019,13 +1092,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static void mm_init_pasid(struct mm_struct *mm)
-{
-#ifdef CONFIG_IOMMU_SUPPORT
- mm->pasid = INIT_PASID;
-#endif
-}
-
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1054,7 +1120,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
- mm_init_pasid(mm);
+ mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1121,6 +1187,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ mm_pasid_drop(mm);
mmdrop(mm);
}
@@ -2451,6 +2518,7 @@ bad_fork_cleanup_count:
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 51dd822a8060..b22ef1efe751 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -302,7 +302,7 @@ again:
* found it, but truncated or holepunched or subjected to
* invalidate_complete_page2 before we got the page lock (also
* cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
+ * so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c09324663088..54af0deb239b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -38,7 +38,7 @@ struct irqaction chained_action = {
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
if (!desc)
return -EINVAL;
- if (!chip)
- chip = &no_irq_chip;
-
- desc->irq_data.chip = chip;
+ desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
@@ -1073,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -1558,6 +1555,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+static struct device *irq_get_parent_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->dev;
+
+ return NULL;
+}
+
/**
* irq_chip_pm_get - Enable power for an IRQ chip
* @data: Pointer to interrupt specific data
@@ -1567,12 +1572,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
int irq_chip_pm_get(struct irq_data *data)
{
+ struct device *dev = irq_get_parent_device(data);
int retval;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
- retval = pm_runtime_get_sync(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev) {
+ retval = pm_runtime_get_sync(dev);
if (retval < 0) {
- pm_runtime_put_noidle(data->chip->parent_device);
+ pm_runtime_put_noidle(dev);
return retval;
}
}
@@ -1590,10 +1596,11 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
+ struct device *dev = irq_get_parent_device(data);
int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
- retval = pm_runtime_put(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
return (retval < 0) ? retval : 0;
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 39a41c56ad4f..1ed2b1739363 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
{
const struct cpumask *hk_mask;
- if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
return false;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
return false;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..2b43f5f5033d 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
seq_printf(m, "chip: None\n");
return;
}
- seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
- seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
ARRAY_SIZE(irqchip_flags));
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2267e6527db3..939d21cd55c3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
@@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
+
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
@@ -676,7 +699,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- WARN_ON_ONCE(!in_irq());
+ WARN_ON_ONCE(!in_hardirq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index bf38c546aa25..d5ce96510549 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
@@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f23ffd30385b..c03f71d5ec10 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* online.
*/
if (irqd_affinity_is_managed(data) &&
- housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
const struct cpumask *hk_mask, *prog_mask;
static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
static struct cpumask tmp_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
raw_spin_lock(&tmp_mask_lock);
cpumask_and(&tmp_mask, mask, hk_mask);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 38c6dd822da8..50265f69a135 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ struct kthread {
int result;
int (*threadfn)(void *);
void *data;
- mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
@@ -356,7 +355,7 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -722,7 +721,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
@@ -1441,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm)
mmdrop(active_mm);
else
smp_mb();
-
- to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1457,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- force_uaccess_end(to_kthread(tsk)->oldfs);
-
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f8a0212189ca..c06cab6546ed 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -183,11 +183,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -338,7 +336,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
* elements. These elements are linked together by the lock_entry member in
* struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
static LIST_HEAD(free_lock_classes);
/**
@@ -1252,6 +1250,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1317,6 +1316,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* of classes.
*/
list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
@@ -6000,6 +6002,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -6011,13 +6015,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
static void reinit_class(struct lock_class *class)
{
- void *const p = class;
- const unsigned int offset = offsetof(struct lock_class, key);
-
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
- memset(p + offset, 0, sizeof(*class) - offset);
+ memset_startat(class, 0, key);
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
@@ -6290,7 +6291,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
lockdep_reset_lock_reg(lock);
}
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
void lockdep_unregister_key(struct lock_class_key *key)
{
struct hlist_head *hash_head = keyhashentry(key);
@@ -6305,10 +6312,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ lockdep_lock();
- pf = get_pending_free();
hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
if (k == key) {
hlist_del_rcu(&k->hash_entry);
@@ -6316,11 +6321,13 @@ void lockdep_unregister_key(struct lock_class_key *key)
break;
}
}
- WARN_ON_ONCE(!found);
- __lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
- graph_unlock();
-out_irq:
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ }
+ lockdep_unlock();
raw_local_irq_restore(flags);
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index ecb8662e7a4e..bbe9000260d0 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
@@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks;
extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -205,7 +208,6 @@ struct lockdep_stats {
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#define __debug_atomic_inc(ptr) \
this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b8d9a050c337..15fdc7fa5c68 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps += lockdep_count_forward_deps(class);
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
@@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
@@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
iter->stats = lock_stats(class);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 70a32a576f3f..c9fdae94e098 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
@@ -162,7 +163,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -211,7 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 69aba4abe104..acde5d6f1254 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1048,7 +1048,7 @@ out_nolock:
/*
* Wait until we successfully acquire the write lock
*/
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
long count;
diff --git a/kernel/module.c b/kernel/module.c
index 46a5c2ed1928..6cea788fd965 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -335,7 +335,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
/*
* A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit. nfsd and lockd use this.
+ * is running can call this to safely exit.
*/
void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
{
diff --git a/kernel/padata.c b/kernel/padata.c
index 18d3a5c699d8..e5819bb8bd1d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -181,7 +181,7 @@ int padata_do_parallel(struct padata_shell *ps,
goto out;
if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
- if (!cpumask_weight(pd->cpumask.cbcpu))
+ if (cpumask_empty(pd->cpumask.cbcpu))
goto out;
/* Select an alternate fallback CPU and notify the caller. */
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e6af502c2fd7..938d5c78b421 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,7 +28,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
#include <linux/secretmem.h>
@@ -689,8 +688,10 @@ static int load_image_and_restore(void)
lock_device_hotplug();
error = create_basic_memory_bitmaps();
- if (error)
+ if (error) {
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Unlock;
+ }
error = swsusp_read(&flags);
swsusp_close(FMODE_READ | FMODE_EXCL);
@@ -1328,7 +1329,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index d20526c5be15..b663a97f5867 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
- return 0;
+ return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
- return 0;
+ return 1;
}
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
- return 0;
+ return 1;
}
printk(warn_bad_state, suspend_type);
- return 0;
+ return 1;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ad10359030a4..91fffdd2c7fb 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
-#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -89,7 +88,7 @@ struct swap_map_page_list {
struct swap_map_page_list *next;
};
-/**
+/*
* The swap_map_handle structure is used for handling swap in
* a file-alike way
*/
@@ -117,7 +116,7 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
-/**
+/*
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
*/
@@ -171,7 +170,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
-/**
+/*
* alloc_swapdev_block - allocate a swap page and register that it has
* been allocated, so that it can be freed in case of an error.
*/
@@ -190,7 +189,7 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
-/**
+/*
* free_all_swap_pages - free swap pages allocated for saving image data.
* It also frees the extents used to register which swap entries had been
* allocated.
@@ -277,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
+ bio = bio_alloc(hib_resume_bdev, 1, op | op_flags,
+ GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio_set_dev(bio, hib_resume_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
pr_err("Adding page to bio failed at %llu\n",
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 82abfaf3c2aa..da03c15ecc89 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -93,6 +93,12 @@ EXPORT_SYMBOL_GPL(console_drivers);
*/
int __read_mostly suppress_printk;
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+static int __read_mostly suppress_panic_printk;
+
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
@@ -146,8 +152,10 @@ static int __control_devkmsg(char *str)
static int __init control_devkmsg(char *str)
{
- if (__control_devkmsg(str) < 0)
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
+ }
/*
* Set sysctl string accordingly:
@@ -166,7 +174,7 @@ static int __init control_devkmsg(char *str)
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
- return 0;
+ return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
@@ -257,6 +265,11 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
+static bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -1843,6 +1856,16 @@ static int console_trylock_spinning(void)
if (console_trylock())
return 1;
+ /*
+ * It's unsafe to spin once a panic has begun. If we are the
+ * panic CPU, we may have already halted the owner of the
+ * console_sem. If we are not the panic CPU, then we should
+ * avoid taking console_sem, so the panic CPU has a better
+ * chance of cleanly acquiring it later.
+ */
+ if (panic_in_progress())
+ return 0;
+
printk_safe_enter_irqsave(flags);
raw_spin_lock(&console_owner_lock);
@@ -2218,6 +2241,10 @@ asmlinkage int vprintk_emit(int facility, int level,
if (unlikely(suppress_printk))
return 0;
+ if (unlikely(suppress_panic_printk) &&
+ atomic_read(&panic_cpu) != raw_smp_processor_id())
+ return 0;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
@@ -2324,6 +2351,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
}
#endif
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+ if (!user_specified)
+ return;
+
+ /*
+ * @c console was defined by the user on the command line.
+ * Do not clear when added twice also by SPCR or the device tree.
+ */
+ c->user_specified = true;
+ /* At least one console defined by the user on the command line. */
+ console_set_on_cmdline = 1;
+}
+
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options, bool user_specified)
{
@@ -2340,8 +2381,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
if (strcmp(c->name, name) == 0 && c->index == idx) {
if (!brl_options)
preferred_console = i;
- if (user_specified)
- c->user_specified = true;
+ set_user_specified(c, user_specified);
return 0;
}
}
@@ -2351,7 +2391,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
preferred_console = i;
strlcpy(c->name, name, sizeof(c->name));
c->options = options;
- c->user_specified = user_specified;
+ set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
c->index = idx;
@@ -2417,7 +2457,6 @@ static int __init console_setup(char *str)
*s = 0;
__add_preferred_console(buf, idx, options, brl_options, true);
- console_set_on_cmdline = 1;
return 1;
}
__setup("console=", console_setup);
@@ -2574,6 +2613,25 @@ static int have_callable_console(void)
}
/*
+ * Return true when this CPU should unlock console_sem without pushing all
+ * messages to the console. This reduces the chance that the console is
+ * locked when the panic CPU tries to use it.
+ */
+static bool abandon_console_lock_in_panic(void)
+{
+ if (!panic_in_progress())
+ return false;
+
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
+/*
* Can we actually use the console at this time on this cpu?
*
* Console drivers may assume that per-cpu resources have been allocated. So
@@ -2603,6 +2661,7 @@ void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[CONSOLE_LOG_MAX];
+ static int panic_console_dropped;
unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;
@@ -2657,6 +2716,10 @@ skip:
if (console_seq != r.info->seq) {
console_dropped += r.info->seq - console_seq;
console_seq = r.info->seq;
+ if (panic_in_progress() && panic_console_dropped++ > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
+ }
}
if (suppress_message_printing(r.info->level)) {
@@ -2716,6 +2779,10 @@ skip:
if (handover)
return;
+ /* Allow panic_cpu to take over the consoles safely */
+ if (abandon_console_lock_in_panic())
+ break;
+
if (do_cond_resched)
cond_resched();
}
@@ -2733,7 +2800,7 @@ skip:
* flush, no worries.
*/
retry = prb_read_valid(prb, next_seq, NULL);
- if (retry && console_trylock())
+ if (retry && !abandon_console_lock_in_panic() && console_trylock())
goto again;
}
EXPORT_SYMBOL(console_unlock);
@@ -3228,7 +3295,7 @@ static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
/* If trylock fails, someone else is doing the printing */
@@ -3262,7 +3329,7 @@ void defer_console_output(void)
return;
preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
+ this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
}
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 8a7b7362c0dd..2b7b6ddab4f7 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
* state has been re-checked. A memcpy() for all of @desc
* cannot be used because of the atomic_t @state_var field.
*/
- memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
- sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ if (desc_out) {
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ }
if (seq_out)
*seq_out = info->seq; /* also part of desc_read:C */
if (caller_id_out)
@@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
d_state = get_desc_state(id, state_val);
out:
- atomic_long_set(&desc_out->state_var, state_val);
+ if (desc_out)
+ atomic_long_set(&desc_out->state_var, state_val);
return d_state;
}
@@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, id);
}
/**
@@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e)
*/
void prb_final_commit(struct prb_reserved_entry *e)
{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+
_prb_commit(e, desc_finalized);
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, e->id);
}
/*
@@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
*/
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
- u64 seq = 0;
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ unsigned long id;
+ u64 seq;
+
+ /* Check if the cached @id still points to a valid @seq. */
+ id = atomic_long_read(&desc_ring->last_finalized_id);
+ d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
- /* Search forward from the oldest descriptor. */
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
+ } else {
+ /*
+ * The information about the last finalized sequence number
+ * has gone. It should happen only when there is a flood of
+ * new messages and the ringbuffer is rapidly recycled.
+ * Give up and start from the beginning.
+ */
+ seq = 0;
+ }
+
+ /*
+ * The information about the last finalized @seq might be inaccurate.
+ * Search forward to find the current one.
+ */
while (_prb_read_valid(rb, &seq, NULL, NULL))
seq++;
@@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb,
rb->desc_ring.infos = infos;
atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
rb->text_data_ring.size_bits = textbits;
rb->text_data_ring.data = text_buf;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 73cc80e01cef..18cd25e489b8 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -75,6 +75,7 @@ struct prb_desc_ring {
struct printk_info *infos;
atomic_long_t head_id;
atomic_long_t tail_id;
+ atomic_long_t last_finalized_id;
};
/*
@@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \
.infos = &_##name##_infos[0], \
.head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
.tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \
}, \
.text_data_ring = { \
.size_bits = (avgtextbits) + (descbits), \
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index e373fbe44da5..431cee212467 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -56,13 +56,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
int flags)
{
- rsclp->flags |= flags;
+ WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
}
static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
int flags)
{
- rsclp->flags &= ~flags;
+ WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
}
static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 422f7e4cc08d..55d049c39608 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -284,7 +284,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
/*
* Allocate an element from the rcu_tortures pool.
@@ -387,7 +387,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
+ if (!atomic_read(&rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -674,6 +674,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
@@ -708,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
@@ -997,7 +999,7 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
/* Wait for the next test interval. */
- oldstarttime = boost_starttime;
+ oldstarttime = READ_ONCE(boost_starttime);
while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
if (stutter_wait("rcu_torture_boost"))
@@ -1041,10 +1043,11 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime && !kthread_should_stop()) {
+ while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
if (oldstarttime == boost_starttime) {
- boost_starttime = jiffies + test_boost_interval * HZ;
+ WRITE_ONCE(boost_starttime,
+ jiffies + test_boost_interval * HZ);
n_rcu_torture_boosts++;
}
mutex_unlock(&boost_mutex);
@@ -1276,7 +1279,7 @@ rcu_torture_writer(void *arg)
boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
- !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ !atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
boot_ended)
@@ -2180,7 +2183,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
__func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
@@ -2193,7 +2195,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
gps_old = gps;
}
pr_cont("\n");
- mutex_unlock(&rcu_fwd_mutex);
}
/* Callback function for continuous-flood RCU callbacks. */
@@ -2281,6 +2282,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
unsigned long stopat;
static DEFINE_TORTURE_RANDOM(trs);
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (!cur_ops->sync)
return; // Cannot do need_resched() forward progress testing without ->sync.
if (cur_ops->call && cur_ops->cb_barrier) {
@@ -2289,7 +2291,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
}
/* Tight loop containing cond_resched(). */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
@@ -2325,6 +2327,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
cur_ops->sync(); /* Wait for running CB to complete. */
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for queued callbacks. */
}
@@ -2333,7 +2336,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
destroy_rcu_head_on_stack(&fcs.rh);
}
schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -2353,13 +2356,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
unsigned long stopat;
unsigned long stoppedat;
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
if (!cur_ops->call)
return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
@@ -2414,6 +2418,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree(rfp);
@@ -2427,11 +2432,13 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
+ mutex_unlock(&rcu_fwd_mutex);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
@@ -2511,7 +2518,7 @@ static int rcu_torture_fwd_prog(void *args)
firsttime = false;
WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
} else {
- while (READ_ONCE(rcu_fwd_seq) == oldseq)
+ while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
schedule_timeout_interruptible(1);
oldseq = READ_ONCE(rcu_fwd_seq);
}
@@ -2905,8 +2912,10 @@ rcu_torture_cleanup(void)
int i;
if (torture_cleanup_begin()) {
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
return;
}
if (!cur_ops) {
@@ -2961,8 +2970,10 @@ rcu_torture_cleanup(void)
* Wait for all RCU callbacks to fire, then do torture-type-specific
* cleanup operations.
*/
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index d64f0b1d8cd3..99cf3a13954c 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -123,7 +123,7 @@ static struct rcu_tasks rt_name = \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
.name = n, \
- .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1, \
+ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
.percpu_enqueue_lim = 1, \
.percpu_dequeue_lim = 1, \
.barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
@@ -302,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
if (unlikely(needadjust)) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
- WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1);
+ WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
@@ -417,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim > 1) {
- WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1);
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
smp_store_release(&rtp->percpu_enqueue_lim, 1);
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
@@ -496,7 +496,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_FLAG_RCU);
+ housekeeping_affine(current, HK_TYPE_RCU);
WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4c25a6283b0..a4b8189455d5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -87,11 +87,12 @@ static struct rcu_state rcu_state = {
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+ .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
.name = RCU_NAME,
.abbr = RCU_ABBR,
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
- .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+ .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -153,7 +154,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
-/* rcuc/rcub kthread realtime priority */
+/* rcuc/rcub/rcuop kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -222,6 +223,16 @@ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
}
/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -1086,9 +1097,8 @@ void rcu_irq_enter_irqson(void)
* Just check whether or not this CPU has non-offloaded RCU callbacks
* queued.
*/
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
+int rcu_needs_cpu(void)
{
- *nextevt = KTIME_MAX;
return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
@@ -1167,15 +1177,20 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
bool rcu_lockdep_current_cpu_online(void)
{
struct rcu_data *rdp;
- struct rcu_node *rnp;
bool ret = false;
if (in_nmi() || !rcu_scheduler_fully_active)
return true;
preempt_disable_notrace();
rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
ret = true;
preempt_enable_notrace();
return ret;
@@ -1260,8 +1275,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* For more detail, please refer to the "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
- bool onl;
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
struct rcu_node *rnp1;
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
@@ -1270,9 +1284,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
- __func__, rdp->cpu, ".o"[onl],
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
return 1; /* Break things loose after complaining. */
@@ -1739,7 +1752,6 @@ static void rcu_strict_gp_boundary(void *unused)
*/
static noinline_for_stack bool rcu_gp_init(void)
{
- unsigned long firstseq;
unsigned long flags;
unsigned long oldmask;
unsigned long mask;
@@ -1782,22 +1794,17 @@ static noinline_for_stack bool rcu_gp_init(void)
* of RCU's Requirements documentation.
*/
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+ /* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- // Wait for CPU-hotplug operations that might have
- // started before this grace period did.
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
- firstseq = READ_ONCE(rnp->ofl_seq);
- if (firstseq & 0x1)
- while (firstseq == READ_ONCE(rnp->ofl_seq))
- schedule_timeout_idle(1); // Can't wake unless RCU is watching.
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
- raw_spin_lock(&rcu_state.ofl_lock);
- raw_spin_lock_irq_rcu_node(rnp);
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
continue;
}
@@ -1832,8 +1839,9 @@ static noinline_for_stack bool rcu_gp_init(void)
rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -2850,10 +2858,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
int spincnt;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
+ WRITE_ONCE(*j, jiffies);
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@@ -2874,6 +2884,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(*j, jiffies);
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
@@ -2894,7 +2905,7 @@ static int __init rcu_spawn_core_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ if (use_softirq)
return 0;
WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
@@ -2995,9 +3006,47 @@ static void check_cb_ovld(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp);
}
-/* Helper function for call_rcu() and friends. */
-static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func)
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
static atomic_t doublefrees;
unsigned long flags;
@@ -3011,7 +3060,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
/*
* Probable double call_rcu(), so leak the callback.
* Use rcu:rcu_callback trace event to find the previous
- * time callback was passed to __call_rcu().
+ * time callback was passed to call_rcu().
*/
if (atomic_inc_return(&doublefrees) < 4) {
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
@@ -3022,8 +3071,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
head->func = func;
head->next = NULL;
- local_irq_save(flags);
kasan_record_aux_stack_noalloc(head);
+ local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
@@ -3060,51 +3109,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
local_irq_restore(flags);
}
}
-
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed. However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested. In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section. On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu(). It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section. Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
-{
- __call_rcu(head, func);
-}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -3984,13 +3988,16 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
}
/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
*/
-static void rcu_barrier_func(void *cpu_in)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
{
- uintptr_t cpu = (uintptr_t)cpu_in;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+ unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ lockdep_assert_held(&rcu_state.barrier_lock);
+ if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+ return;
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
@@ -4000,10 +4007,26 @@ static void rcu_barrier_func(void *cpu_in)
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
+ smp_store_release(&rdp->barrier_seq_snap, gseq);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_handler(void *cpu_in)
+{
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(cpu != rdp->cpu);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+ raw_spin_lock(&rcu_state.barrier_lock);
+ rcu_barrier_entrain(rdp);
+ raw_spin_unlock(&rcu_state.barrier_lock);
}
/**
@@ -4017,6 +4040,8 @@ static void rcu_barrier_func(void *cpu_in)
void rcu_barrier(void)
{
uintptr_t cpu;
+ unsigned long flags;
+ unsigned long gseq;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
@@ -4027,15 +4052,16 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
- rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
}
/* Mark the start of the barrier operation. */
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
rcu_seq_start(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
@@ -4047,7 +4073,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- cpus_read_lock();
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
/*
* Force each CPU with callbacks to register a new callback.
@@ -4056,29 +4082,31 @@ void rcu_barrier(void)
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (cpu_is_offline(cpu) &&
- !rcu_rdp_is_offloaded(rdp))
+retry:
+ if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
continue;
- if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
- rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
- smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
- } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
- cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
- rcu_state.barrier_sequence);
- local_irq_disable();
- rcu_barrier_func((void *)cpu);
- local_irq_enable();
- } else if (cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ if (!rcu_rdp_cpu_online(rdp)) {
+ rcu_barrier_entrain(rdp);
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+ schedule_timeout_uninterruptible(1);
+ goto retry;
}
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
}
- cpus_read_unlock();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -4093,6 +4121,12 @@ void rcu_barrier(void)
/* Mark the end of the barrier operation. */
rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
rcu_seq_end(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ }
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_state.barrier_mutex);
@@ -4140,6 +4174,7 @@ rcu_boot_init_percpu_data(int cpu)
INIT_WORK(&rdp->strict_work, strict_work_handler);
WARN_ON_ONCE(rdp->dynticks_nesting != 1);
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+ rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
@@ -4287,12 +4322,13 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
rcu_dynticks_eqs_online();
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ raw_spin_lock(&rcu_state.barrier_lock);
+ raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
+ raw_spin_unlock(&rcu_state.barrier_lock);
newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
/* Allow lockless access for expedited grace periods. */
@@ -4304,15 +4340,18 @@ void rcu_cpu_starting(unsigned int cpu)
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+ /* rcu_report_qs_rnp() *really* wants some flags to restore */
+ unsigned long flags2;
+
+ local_irq_save(flags2);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
- rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
} else {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_rcu_node(rnp);
}
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
@@ -4326,7 +4365,7 @@ void rcu_cpu_starting(unsigned int cpu)
*/
void rcu_report_dead(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, seq_flags;
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -4340,10 +4379,8 @@ void rcu_report_dead(unsigned int cpu)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock(&rcu_state.ofl_lock);
+ local_irq_save(seq_flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
@@ -4354,10 +4391,8 @@ void rcu_report_dead(unsigned int cpu)
}
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- raw_spin_unlock(&rcu_state.ofl_lock);
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(seq_flags);
rdp->cpu_started = false;
}
@@ -4380,7 +4415,9 @@ void rcutree_migrate_callbacks(int cpu)
rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
- local_irq_save(flags);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+ rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
@@ -4390,10 +4427,10 @@ void rcutree_migrate_callbacks(int cpu)
needwake = rcu_advance_cbs(my_rnp, rdp) ||
rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
- WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
- !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4440,26 +4477,10 @@ static int rcu_pm_notify(struct notifier_block *self,
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
- int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct sched_param sp;
struct task_struct *t;
- /* Force priority into range. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
- && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
- kthread_prio = 2;
- else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
- kthread_prio = 1;
- else if (kthread_prio < 0)
- kthread_prio = 0;
- else if (kthread_prio > 99)
- kthread_prio = 99;
-
- if (kthread_prio != kthread_prio_in)
- pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
- kthread_prio, kthread_prio_in);
-
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
@@ -4570,6 +4591,7 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
+ mutex_init(&rnp->boost_kthread_mutex);
}
}
@@ -4585,6 +4607,28 @@ static void __init rcu_init_one(void)
}
/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+ int kthread_prio_in = kthread_prio;
+
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+ && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+ kthread_prio = 2;
+ else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("%s: Limited prio to %d from %d\n",
+ __func__, kthread_prio, kthread_prio_in);
+}
+
+/*
* Compute the rcu_node tree geometry from kernel parameters. This cannot
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
@@ -4744,6 +4788,7 @@ void __init rcu_init(void)
kfree_rcu_batch_init();
rcu_bootup_announce();
+ sanitize_kthread_prio();
rcu_init_geometry();
rcu_init_one();
if (dump_tree)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 486fc901bd08..926673ebe355 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,8 +56,6 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
- unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
- /* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
/* to complete. */
@@ -110,6 +108,9 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
+ struct mutex boost_kthread_mutex;
+ /* Exclusion for thread spawning and affinity */
+ /* manipulation. */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
@@ -190,6 +191,7 @@ struct rcu_data {
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
+ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
int exp_dynticks_snap; /* Double-check need for IPI. */
@@ -203,6 +205,8 @@ struct rcu_data {
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+ struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+ /* spawning */
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
@@ -237,6 +241,7 @@ struct rcu_data {
/* rcuc per-CPU kthread or NULL. */
unsigned int rcu_cpu_kthread_status;
char rcu_cpu_has_work;
+ unsigned long rcuc_activity;
/* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
@@ -302,9 +307,8 @@ struct rcu_state {
/* The following fields are guarded by the root rcu_node's lock. */
- u8 boost ____cacheline_internodealigned_in_smp;
- /* Subject to priority boost. */
- unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+ /* Grace-period sequence #. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -323,6 +327,8 @@ struct rcu_state {
/* rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */
+
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
@@ -355,7 +361,7 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
- raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+ arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
};
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 237a79989aba..60197ea24ceb 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -502,7 +502,8 @@ static void synchronize_rcu_expedited_wait(void)
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ mask = READ_ONCE(rnp->expmask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
@@ -656,7 +657,7 @@ static void rcu_exp_handler(void *unused)
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(rdp);
} else {
WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index eeafb546a7a0..636d0546a4e9 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1169,7 +1169,7 @@ void __init rcu_init_nohz(void)
struct rcu_data *rdp;
#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
@@ -1226,6 +1226,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
rcu_cblist_init(&rdp->nocb_bypass);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
}
/*
@@ -1238,6 +1239,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_data *rdp_gp;
struct task_struct *t;
+ struct sched_param sp;
if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
return;
@@ -1247,20 +1249,30 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
return;
/* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
if (!rdp_gp->nocb_gp_kthread) {
t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
"rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
return;
+ }
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/* Spawn the kthread for this CPU. */
t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
return;
+
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
}
@@ -1348,7 +1360,7 @@ static void __init rcu_organize_nocb_kthreads(void)
*/
void rcu_bind_current_to_nocb(void)
{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+ if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
}
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c5b45c2f68a1..8360d86db1c0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -330,7 +330,7 @@ void rcu_note_context_switch(bool preempt)
* then queue the task as required based on the states
* of any ongoing and expedited grace periods.
*/
- WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
trace_rcu_preempt_task(rcu_state.name,
t->pid,
@@ -556,16 +556,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- /* Unboost if we were boosted. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
} else {
local_irq_restore(flags);
}
@@ -773,7 +773,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
int cpu;
int i;
struct list_head *lhp;
- bool onl;
struct rcu_data *rdp;
struct rcu_node *rnp1;
@@ -797,9 +796,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_cont("\n");
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
- cpu, ".o"[onl],
+ cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
}
@@ -996,12 +994,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
*/
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
#ifdef CONFIG_RCU_BOOST
struct sched_param sp;
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ WRITE_ONCE(rdp->rcuc_activity, jiffies);
}
#ifdef CONFIG_RCU_BOOST
@@ -1172,15 +1173,14 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct sched_param sp;
struct task_struct *t;
+ mutex_lock(&rnp->boost_kthread_mutex);
if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
- return;
-
- rcu_state.boost = 1;
+ goto out;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- return;
+ goto out;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1188,6 +1188,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+
+ out:
+ mutex_unlock(&rnp->boost_kthread_mutex);
}
/*
@@ -1210,14 +1213,16 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
+ mutex_lock(&rnp->boost_kthread_mutex);
for_each_leaf_node_possible_cpu(rnp, cpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU));
- if (cpumask_weight(cm) == 0)
- cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm))
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
set_cpus_allowed_ptr(t, cm);
+ mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
}
@@ -1291,7 +1296,7 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_FLAG_RCU);
+ housekeeping_affine(current, HK_TYPE_RCU);
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 21bebf7c9030..0c5d8516516a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -379,6 +379,15 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
return j > 2 * HZ;
}
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+ unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -430,6 +439,29 @@ static void print_cpu_stall_info(int cpu)
falsepositive ? " (false positive?)" : "");
}
+static void rcuc_kthread_dump(struct rcu_data *rdp)
+{
+ int cpu;
+ unsigned long j;
+ struct task_struct *rcuc;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return;
+
+ if (!rcu_is_rcuc_kthread_starving(rdp, &j))
+ return;
+
+ pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
+ sched_show_task(rcuc);
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+}
+
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
@@ -601,6 +633,9 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
+ if (!use_softirq)
+ rcuc_kthread_dump(rdp);
+
rcu_dump_cpu_stacks();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 156892c22bb5..180ff9c41fa8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -407,6 +407,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
+void finish_rcuwait(struct rcuwait *w)
+{
+ rcu_assign_pointer(w->task, NULL);
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL_GPL(finish_rcuwait);
+
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 6d45ac3dae7f..97ac20b4f738 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
int ret;
#ifdef CONFIG_64BIT
- if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
+ if (get_user(ptr, &t->rseq->rseq_cs))
return -EFAULT;
#else
- if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+ if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
return -EFAULT;
#endif
if (!ptr) {
@@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t)
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
- return put_user(0UL, &t->rseq->rseq_cs.ptr64);
+ return put_user(0UL, &t->rseq->rseq_cs);
#else
- if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+ if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
return -EFAULT;
return 0;
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c83b37af155b..976092b7bd45 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
# The compilers are complaining about unused variables inside an if(0) scope
# block. This is daft, shut them up.
@@ -25,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
-
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-obj-$(CONFIG_CPU_FREQ) += cpufreq.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-obj-$(CONFIG_PSI) += psi.o
-obj-$(CONFIG_SCHED_CORE) += core_sched.o
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_policy.o
+obj-y += build_utility.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 8629b37d118e..16092b49ff6a 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,14 +1,35 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Auto-group scheduling implementation:
*/
-#include <linux/nospec.h>
-#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_autogroup_sysctls[] = {
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif
+
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
@@ -198,6 +219,7 @@ void sched_autogroup_exit(struct signal_struct *sig)
static int __init setup_autogroup(char *str)
{
sysctl_sched_autogroup_enabled = 0;
+ sched_autogroup_sysctl_init();
return 1;
}
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b96419974a1f..90d69f2c5eaf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
+ extern unsigned int sysctl_sched_autogroup_enabled;
int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
@@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
}
#endif /* CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..e0104b45029a
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c and fair.c, the other two big
+ * compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/livepatch.h>
+#include <linux/psi.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+
+#include "autogroup.h"
+#include "stats.h"
+#include "pelt.h"
+
+/* Source code modules: */
+
+#include "idle.c"
+
+#include "rt.c"
+
+#ifdef CONFIG_SMP
+# include "cpudeadline.c"
+# include "pelt.c"
+#endif
+
+#include "cputime.c"
+#include "deadline.c"
+
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..eec0849b2aae
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c, fair.c, smp.c and policy.c, the other
+ * big compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+
+#include "sched.h"
+#include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include "debug.c"
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#ifdef CONFIG_SMP
+# include "cpupri.c"
+# include "stop_task.c"
+# include "topology.c"
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c2b2859ddd82..d9272d9061a3 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -53,15 +53,13 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include "sched.h"
-#include <linux/sched_clock.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -95,28 +93,28 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-static inline struct sched_clock_data *this_scd(void)
+notrace static inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
return &per_cpu(sched_clock_data, cpu);
}
-int sched_clock_stable(void)
+notrace int sched_clock_stable(void)
{
return static_branch_likely(&__sched_clock_stable);
}
-static void __scd_stamp(struct sched_clock_data *scd)
+notrace static void __scd_stamp(struct sched_clock_data *scd)
{
scd->tick_gtod = ktime_get_ns();
scd->tick_raw = sched_clock();
}
-static void __set_sched_clock_stable(void)
+notrace static void __set_sched_clock_stable(void)
{
struct sched_clock_data *scd;
@@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void)
* The only way to fully avoid random clock jumps is to boot with:
* "tsc=unstable".
*/
-static void __sched_clock_work(struct work_struct *work)
+notrace static void __sched_clock_work(struct work_struct *work)
{
struct sched_clock_data *scd;
int cpu;
@@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work)
static DECLARE_WORK(sched_clock_work, __sched_clock_work);
-static void __clear_sched_clock_stable(void)
+notrace static void __clear_sched_clock_stable(void)
{
if (!sched_clock_stable())
return;
@@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void)
schedule_work(&sched_clock_work);
}
-void clear_sched_clock_stable(void)
+notrace void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
@@ -196,7 +194,7 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-static void __sched_clock_gtod_offset(void)
+notrace static void __sched_clock_gtod_offset(void)
{
struct sched_clock_data *scd = this_scd();
@@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-static inline u64 wrap_min(u64 x, u64 y)
+notrace static inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-static inline u64 wrap_max(u64 x, u64 y)
+notrace static inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -262,7 +260,7 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 sched_clock_local(struct sched_clock_data *scd)
+notrace static u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -295,7 +293,7 @@ again:
return clock;
}
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
@@ -362,7 +360,7 @@ again:
*
* See cpu_clock().
*/
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
@@ -386,7 +384,7 @@ u64 sched_clock_cpu(int cpu)
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
{
struct sched_clock_data *scd;
@@ -403,7 +401,7 @@ void sched_clock_tick(void)
sched_clock_local(scd);
}
-void sched_clock_tick_stable(void)
+notrace void sched_clock_tick_stable(void)
{
if (!sched_clock_stable())
return;
@@ -423,7 +421,7 @@ void sched_clock_tick_stable(void)
/*
* We are going deep-idle (irqs are disabled):
*/
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_idle_sleep_event(void)
{
sched_clock_cpu(smp_processor_id());
}
@@ -432,7 +430,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(void)
+notrace void sched_clock_idle_wakeup_event(void)
{
unsigned long flags;
@@ -458,7 +456,7 @@ void __init sched_clock_init(void)
local_irq_enable();
}
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
if (!static_branch_likely(&sched_clock_running))
return 0;
@@ -476,7 +474,7 @@ u64 sched_clock_cpu(int cpu)
* On bare metal this function should return the same as local_clock.
* Architectures and sub-architectures can override this.
*/
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
{
return local_clock();
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a778554f9dad..35f15c26ed54 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Generic wait-for-completion handler;
*
@@ -11,7 +12,6 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include "sched.h"
/**
* complete: - signals a single thread waiting on this completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..d575b4914925 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,26 +6,90 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-#undef CREATE_TRACE_POINTS
-
-#include "sched.h"
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
-#include <linux/nospec.h>
#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/sched/wake_q.h>
#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+
+#include <uapi/linux/sched/types.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
-#include "../smpboot.h"
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
+#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
+#include "stats.h"
+
+#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
+#include "../smpboot.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -36,6 +100,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -1024,13 +1089,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -1046,7 +1111,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
cpu = default_cpu;
unlock:
rcu_read_unlock();
@@ -4279,7 +4344,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -4287,13 +4354,22 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4303,8 +4379,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
@@ -4834,7 +4912,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -5379,7 +5457,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5400,7 +5478,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -6298,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -6353,8 +6431,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_flush_plug(tsk->plug, true);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6491,17 +6568,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_dynamic_enabled
+#define preempt_schedule_dynamic_enabled preempt_schedule
+#define preempt_schedule_dynamic_disabled NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return;
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif
#endif
-
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -6556,147 +6647,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_notrace_dynamic_enabled
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled NULL
#endif
-
-#endif /* CONFIG_PREEMPTION */
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-#include <linux/entry-common.h>
-
-/*
- * SC:cond_resched
- * SC:might_resched
- * SC:preempt_schedule
- * SC:preempt_schedule_notrace
- * SC:irqentry_exit_cond_resched
- *
- *
- * NONE:
- * cond_resched <- __cond_resched
- * might_resched <- RET0
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * VOLUNTARY:
- * cond_resched <- __cond_resched
- * might_resched <- __cond_resched
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * FULL:
- * cond_resched <- RET0
- * might_resched <- RET0
- * preempt_schedule <- preempt_schedule
- * preempt_schedule_notrace <- preempt_schedule_notrace
- * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
- */
-
-enum {
- preempt_dynamic_undefined = -1,
- preempt_dynamic_none,
- preempt_dynamic_voluntary,
- preempt_dynamic_full,
-};
-
-int preempt_dynamic_mode = preempt_dynamic_undefined;
-
-int sched_dynamic_mode(const char *str)
-{
- if (!strcmp(str, "none"))
- return preempt_dynamic_none;
-
- if (!strcmp(str, "voluntary"))
- return preempt_dynamic_voluntary;
-
- if (!strcmp(str, "full"))
- return preempt_dynamic_full;
-
- return -EINVAL;
-}
-
-void sched_dynamic_update(int mode)
-{
- /*
- * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
- * the ZERO state, which is invalid.
- */
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-
- switch (mode) {
- case preempt_dynamic_none:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: none\n");
- break;
-
- case preempt_dynamic_voluntary:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: voluntary\n");
- break;
-
- case preempt_dynamic_full:
- static_call_update(cond_resched, (void *)&__static_call_return0);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
- break;
- }
-
- preempt_dynamic_mode = mode;
-}
-
-static int __init setup_preempt_mode(char *str)
-{
- int mode = sched_dynamic_mode(str);
- if (mode < 0) {
- pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
- return 0;
- }
-
- sched_dynamic_update(mode);
- return 1;
-}
-__setup("preempt=", setup_preempt_mode);
-
-static void __init preempt_dynamic_init(void)
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
{
- if (preempt_dynamic_mode == preempt_dynamic_undefined) {
- if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
- sched_dynamic_update(preempt_dynamic_none);
- } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
- sched_dynamic_update(preempt_dynamic_voluntary);
- } else {
- /* Default static call setting, nothing to do */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
- preempt_dynamic_mode = preempt_dynamic_full;
- pr_info("Dynamic Preempt: full\n");
- }
- }
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return;
+ preempt_schedule_notrace();
}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif
+#endif
-#else /* !CONFIG_PREEMPT_DYNAMIC */
-
-static inline void preempt_dynamic_init(void) { }
-
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+#endif /* CONFIG_PREEMPTION */
/*
* This is the entry point to schedule() from kernel preemption
@@ -8203,11 +8174,35 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define cond_resched_dynamic_enabled __cond_resched
+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
+#define might_resched_dynamic_enabled __cond_resched
+#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_cond_resched);
+
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_might_resched);
+#endif
#endif
/*
@@ -8272,6 +8267,154 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#ifdef CONFIG_GENERIC_ENTRY
+#include <linux/entry-common.h>
+#endif
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+int preempt_dynamic_mode = preempt_dynamic_undefined;
+
+int sched_dynamic_mode(const char *str)
+{
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
+
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
+
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
+
+ return -EINVAL;
+}
+
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
+
+void sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+static int __init setup_preempt_mode(char *str)
+{
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
+ }
+
+ sched_dynamic_update(mode);
+ return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+static void __init preempt_dynamic_init(void)
+{
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
+}
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+
/**
* yield - yield the current processor to other threads.
*
@@ -8380,9 +8523,7 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- if (current->plug)
- blk_flush_plug(current->plug, true);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -8709,7 +8850,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
{
int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
ret = dl_cpuset_cpumask_can_shrink(cur, trial);
@@ -8737,8 +8878,11 @@ int task_can_attach(struct task_struct *p,
}
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed))
- ret = dl_task_can_attach(p, cs_cpus_allowed);
+ cs_cpus_allowed)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
+ ret = dl_cpu_busy(cpu, p);
+ }
out:
return ret;
@@ -9022,8 +9166,10 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- if (dl_cpu_busy(cpu))
- return -EBUSY;
+ int ret = dl_cpu_busy(cpu, NULL);
+
+ if (ret)
+ return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
@@ -9053,6 +9199,7 @@ int sched_cpu_activate(unsigned int cpu)
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
@@ -9131,10 +9278,12 @@ int sched_cpu_deactivate(unsigned int cpu)
if (!sched_smp_initialized)
return 0;
+ sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
+ sched_update_numa(cpu, true);
return ret;
}
sched_domains_numa_masks_clear(cpu);
@@ -9237,7 +9386,7 @@ int sched_cpu_dying(unsigned int cpu)
void __init sched_init_smp(void)
{
- sched_init_numa();
+ sched_init_numa(NUMA_NO_NODE);
/*
* There's no userspace yet to cause hotplug operations; hence all the
@@ -9249,7 +9398,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
@@ -9349,7 +9498,6 @@ void __init sched_init(void)
#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index c8746a9a7ada..38a2cec21014 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,8 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/prctl.h>
-#include "sched.h"
-
/*
* A simple wrapper around refcount. An allocated sched_core_cookie's
* address is used to compute the cookie of the task.
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 3d06c5e4220d..0de9dda09949 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
-#include <asm/irq_regs.h>
-#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
@@ -334,14 +333,13 @@ static struct cftype files[] = {
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
+ unsigned int cpu = task_cpu(tsk);
struct cpuacct *ca;
- rcu_read_lock();
+ lockdep_assert_rq_held(cpu_rq(cpu));
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(*ca->cpuusage, cputime);
-
- rcu_read_unlock();
+ *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
}
/*
@@ -353,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
struct cpuacct *ca;
- rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
__this_cpu_add(ca->cpustat->cpustat[index], val);
- rcu_read_unlock();
}
struct cgroup_subsys cpuacct_cgrp_subsys = {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ceb03d76c0cc..02d970a879ed 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * kernel/sched/cpudl.c
+ * kernel/sched/cpudeadline.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
-#include "sched.h"
static inline int parent(int i)
{
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 7c2fe50fd76d..5252fb191fae 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,9 +5,6 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#include <linux/cpufreq.h>
-
-#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 26778884d9ab..3dbf351d12d5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,13 +6,6 @@
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "sched.h"
-
-#include <linux/sched/cpufreq.h>
-#include <trace/events/power.h>
-
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
struct sugov_tunables {
@@ -289,6 +282,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* into the same scale so we can compare.
*/
boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
}
@@ -348,8 +342,11 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */
@@ -395,8 +392,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
/*
* Do not reduce the target performance level if the CPU has not been
* idle recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
@@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov);
static void sugov_tunables_free(struct kobject *kobj)
{
- struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+ struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
kfree(to_sugov_tunables(attr_set));
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d583f2aa744e..fa9ce9d83683 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,7 +22,6 @@
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
-#include "sched.h"
/*
* p->rt_priority p->prio newpri cpupri
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b7ec42732b28..78a233d43757 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,7 +2,6 @@
/*
* Simple CPU accounting cgroup controller
*/
-#include "sched.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d2c072b0ef01..fb4255ae0b2c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,10 +15,6 @@
* Michael Trimarchi <michael@amarulasolutions.com>,
* Fabio Checconi <fchecconi@gmail.com>
*/
-#include "sched.h"
-#include "pelt.h"
-
-struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
@@ -130,6 +126,21 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
rd->visit_gen = gen;
return false;
}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -150,9 +161,38 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
{
return false;
}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
#endif
static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+static inline
void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -408,7 +448,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+ return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
@@ -423,12 +463,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
- raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_rt_runtime() == RUNTIME_INF)
dl_b->bw = -1;
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
- raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
@@ -683,15 +721,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_dl_task(struct rq *rq)
-{
-}
-
static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
@@ -1393,6 +1422,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
timer->function = inactive_task_timer;
}
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -1422,10 +1454,9 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
- struct rb_node *leftmost = dl_rq->root.rb_leftmost;
- struct sched_dl_entity *entry;
+ struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+ struct sched_dl_entity *entry = __node_2_dle(leftmost);
- entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
@@ -1466,9 +1497,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
dec_dl_migration(dl_se, dl_rq);
}
-#define __node_2_dle(node) \
- rb_entry((node), struct sched_dl_entity, rb_node)
-
static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
{
return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
@@ -1931,15 +1959,14 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
deadline_queue_push_tasks(rq);
}
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- struct dl_rq *dl_rq)
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
{
struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
- return rb_entry(left, struct sched_dl_entity, rb_node);
+ return __node_2_dle(left);
}
static struct task_struct *pick_task_dl(struct rq *rq)
@@ -1951,7 +1978,7 @@ static struct task_struct *pick_task_dl(struct rq *rq)
if (!sched_dl_runnable(rq))
return NULL;
- dl_se = pick_next_dl_entity(rq, dl_rq);
+ dl_se = pick_next_dl_entity(dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
@@ -2034,15 +2061,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+
next_node:
if (next_node) {
- p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(next_node);
if (pick_dl_task(rq, p, cpu))
return p;
@@ -2208,8 +2237,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
- struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
@@ -2240,12 +2268,6 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
- if (is_migration_disabled(next_task))
- return 0;
-
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* If next_task preempts rq->curr, and rq->curr
* can move away, it makes sense to just reschedule
@@ -2258,6 +2280,12 @@ retry:
return 0;
}
+ if (is_migration_disabled(next_task))
+ return 0;
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -2731,9 +2759,6 @@ void sched_dl_do_global(void)
int cpu;
unsigned long flags;
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
@@ -2955,41 +2980,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
}
#ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
- unsigned long flags, cap;
- unsigned int dest_cpu;
- struct dl_bw *dl_b;
- bool overflow;
- int ret;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(dest_cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
- if (overflow) {
- ret = -EBUSY;
- } else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- int cpus = dl_bw_cpus(dest_cpu);
-
- __dl_add(dl_b, p->dl.dl_bw, cpus);
- ret = 0;
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
-
- return ret;
-}
-
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -3011,7 +3001,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-bool dl_cpu_busy(unsigned int cpu)
+int dl_cpu_busy(int cpu, struct task_struct *p)
{
unsigned long flags, cap;
struct dl_bw *dl_b;
@@ -3021,11 +3011,22 @@ bool dl_cpu_busy(unsigned int cpu)
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, 0);
+ overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
+
+ if (!overflow && p) {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ }
+
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- return overflow;
+ return overflow ? -EBUSY : 0;
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..bb3d63bdf4ae 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,7 +6,6 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
-#include "sched.h"
/*
* This allows printing both to /proc/sched_debug and
@@ -931,25 +930,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
P(numa_pages_migrated);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
- mpol_put(pol);
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5146163bfabb..ee0664c9d291 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,38 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+
+#include <asm/switch_to.h>
+
+#include <linux/sched/cond_resched.h>
+
#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -1259,10 +1290,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -1271,6 +1302,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1316,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -1293,8 +1326,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist >= maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1312,8 +1344,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1489,6 +1521,7 @@ struct task_numa_env {
int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
@@ -1503,7 +1536,7 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight);
+ int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1917,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
- env->dst_stats.weight);
+ env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1949,8 +1982,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -1985,7 +2020,7 @@ static int task_numa_migrate(struct task_struct *p)
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -2083,13 +2118,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
@@ -2243,7 +2278,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -2262,7 +2297,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -2401,6 +2436,21 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ /* Cannot migrate task to CPU-less node */
+ if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
+ int near_nid = max_nid;
+ int distance, near_distance = INT_MAX;
+
+ for_each_node_state(nid, N_CPU) {
+ distance = node_distance(max_nid, nid);
+ if (distance < near_distance) {
+ near_nid = nid;
+ near_distance = distance;
+ }
+ }
+ max_nid = near_nid;
+ }
+
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
@@ -2825,6 +2875,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -9040,9 +9092,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
* This is an approximation as the number of running tasks may not be
* related to the number of busy CPUs due to sched_setaffinity.
*/
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
{
- return (dst_running < (dst_weight >> 2));
+ return running <= imb_numa_nr;
}
/*
@@ -9176,12 +9228,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
#endif
/*
- * Otherwise, keep the task on this node to stay close
- * its wakeup source and improve locality. If there is
- * a real need of migration, periodic load balance will
- * take care of it.
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below threshold where an imbalance is
+ * allowed. If there is a real need of migration,
+ * periodic load balance will take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+ if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
return NULL;
}
@@ -9273,9 +9326,9 @@ next_group:
#define NUMA_IMBALANCE_MIN 2
static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight)
+ int dst_running, int imb_numa_nr)
{
- if (!allow_numa_imbalance(dst_running, dst_weight))
+ if (!allow_numa_imbalance(dst_running, imb_numa_nr))
return imbalance;
/*
@@ -9387,7 +9440,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running, busiest->group_weight);
+ local->sum_nr_running + 1, env->sd->imb_numa_nr);
}
return;
@@ -10351,7 +10404,7 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
* anywhere yet.
*/
@@ -10360,7 +10413,7 @@ static inline int find_new_ilb(void)
int ilb;
const struct cpumask *hk_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
@@ -10376,7 +10429,7 @@ static inline int find_new_ilb(void)
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -10589,7 +10642,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
return;
/*
@@ -10805,7 +10858,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d17b0a5ce6ac..8f8b5020e76a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,9 +6,6 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
-#include "sched.h"
-
-#include <trace/events/power.h>
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 7f06eaf12818..373d42c707bc 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,136 +7,179 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
-#include "sched.h"
+
+enum hk_flags {
+ HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
+ HK_FLAG_RCU = BIT(HK_TYPE_RCU),
+ HK_FLAG_MISC = BIT(HK_TYPE_MISC),
+ HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
+ HK_FLAG_TICK = BIT(HK_TYPE_TICK),
+ HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
+ HK_FLAG_WQ = BIT(HK_TYPE_WQ),
+ HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
+ HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
-static cpumask_var_t housekeeping_mask;
-static unsigned int housekeeping_flags;
-bool housekeeping_enabled(enum hk_flags flags)
+struct housekeeping {
+ cpumask_var_t cpumasks[HK_TYPE_MAX];
+ unsigned long flags;
+};
+
+static struct housekeeping housekeeping;
+
+bool housekeeping_enabled(enum hk_type type)
{
- return !!(housekeeping_flags & flags);
+ return !!(housekeeping.flags & BIT(type));
}
EXPORT_SYMBOL_GPL(housekeeping_enabled);
-int housekeeping_any_cpu(enum hk_flags flags)
+int housekeeping_any_cpu(enum hk_type type)
{
int cpu;
if (static_branch_unlikely(&housekeeping_overridden)) {
- if (housekeeping_flags & flags) {
- cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (housekeeping.flags & BIT(type)) {
+ cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
if (cpu < nr_cpu_ids)
return cpu;
- return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
}
}
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
-const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return housekeeping_mask;
+ if (housekeeping.flags & BIT(type))
+ return housekeeping.cpumasks[type];
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+void housekeeping_affine(struct task_struct *t, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- set_cpus_allowed_ptr(t, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
-bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return cpumask_test_cpu(cpu, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
- if (!housekeeping_flags)
+ enum hk_type type;
+
+ if (!housekeeping.flags)
return;
static_branch_enable(&housekeeping_overridden);
- if (housekeeping_flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_TICK)
sched_tick_offload_init();
- /* We need at least one CPU to handle housekeeping work */
- WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ }
}
-static int __init housekeeping_setup(char *str, enum hk_flags flags)
+static void __init housekeeping_setup_type(enum hk_type type,
+ cpumask_var_t housekeeping_staging)
{
- cpumask_var_t non_housekeeping_mask;
- cpumask_var_t tmp;
+
+ alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
+ cpumask_copy(housekeeping.cpumasks[type],
+ housekeeping_staging);
+}
+
+static int __init housekeeping_setup(char *str, unsigned long flags)
+{
+ cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+ int err = 0;
+
+ if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ return 0;
+ }
+ }
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
if (cpulist_parse(str, non_housekeeping_mask) < 0) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ goto free_non_housekeeping_mask;
}
- alloc_bootmem_cpumask_var(&tmp);
- if (!housekeeping_flags) {
- alloc_bootmem_cpumask_var(&housekeeping_mask);
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, non_housekeeping_mask);
+ alloc_bootmem_cpumask_var(&housekeeping_staging);
+ cpumask_andnot(housekeeping_staging,
+ cpu_possible_mask, non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp)) {
+ if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ if (!housekeeping.flags) {
pr_warn("Housekeeping: must include one present CPU, "
"using boot CPU:%d\n", smp_processor_id());
- __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- }
- } else {
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp))
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
- if (!cpumask_equal(tmp, housekeeping_mask)) {
- pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
- free_bootmem_cpumask_var(tmp);
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
}
}
- free_bootmem_cpumask_var(tmp);
- if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
- if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- tick_nohz_full_setup(non_housekeeping_mask);
- } else {
- pr_warn("Housekeeping: nohz unsupported."
- " Build with CONFIG_NO_HZ_FULL\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ if (!housekeeping.flags) {
+ /* First setup call ("nohz_full=" or "isolcpus=") */
+ enum hk_type type;
+
+ for_each_set_bit(type, &flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
+ } else {
+ /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
+ enum hk_type type;
+ unsigned long iter_flags = flags & housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
+ if (!cpumask_equal(housekeeping_staging,
+ housekeeping.cpumasks[type])) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ goto free_housekeeping_staging;
+ }
}
+
+ iter_flags = flags & ~housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
}
- housekeeping_flags |= flags;
+ if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ tick_nohz_full_setup(non_housekeeping_mask);
+
+ housekeeping.flags |= flags;
+ err = 1;
+free_housekeeping_staging:
+ free_bootmem_cpumask_var(housekeeping_staging);
+free_non_housekeeping_mask:
free_bootmem_cpumask_var(non_housekeeping_mask);
- return 1;
+ return err;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
- unsigned int flags;
+ unsigned long flags;
flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
HK_FLAG_MISC | HK_FLAG_KTHREAD;
@@ -147,7 +190,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
- unsigned int flags = 0;
+ unsigned long flags = 0;
bool illegal = false;
char *par;
int len;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 954b229868d9..52c8f8226b0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,7 +6,6 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
-#include "sched.h"
/*
* Global load-average calculations
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 3d2825408e3a..0c5be7ebb1dc 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,7 +4,6 @@
*
* membarrier system call
*/
-#include "sched.h"
/*
* For documentation purposes, here are some membarrier ordering
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a554e3bbab2b..0f310768260c 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -24,10 +24,6 @@
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
-#include <linux/sched.h>
-#include "sched.h"
-#include "pelt.h"
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e14358178849..a4fa3aadfcba 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -137,21 +137,6 @@
* sampling of the aggregate task states would be.
*/
-#include "../workqueue_internal.h"
-#include <linux/sched/loadavg.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/seqlock.h>
-#include <linux/uaccess.h>
-#include <linux/cgroup.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/ctype.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/psi.h>
-#include "sched.h"
-
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -523,7 +508,7 @@ static void init_triggers(struct psi_group *group, u64 now)
static u64 update_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
- bool new_stall = false;
+ bool update_total = false;
u64 *total = group->total[PSI_POLL];
/*
@@ -532,24 +517,35 @@ static u64 update_triggers(struct psi_group *group, u64 now)
*/
list_for_each_entry(t, &group->triggers, node) {
u64 growth;
+ bool new_stall;
- /* Check for stall activity */
- if (group->polling_total[t->state] == total[t->state])
- continue;
+ new_stall = group->polling_total[t->state] != total[t->state];
+ /* Check for stall activity or a previous threshold breach */
+ if (!new_stall && !t->pending_event)
+ continue;
/*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
+ * Check for new stall activity, as well as deferred
+ * events that occurred in the last window after the
+ * trigger had already fired (we want to ratelimit
+ * events without dropping any).
*/
- new_stall = true;
-
- /* Calculate growth since last update */
- growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
-
+ if (new_stall) {
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ update_total = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ t->pending_event = true;
+ }
/* Limit event signaling to once per window */
if (now < t->last_event_time + t->win.size)
continue;
@@ -558,9 +554,11 @@ static u64 update_triggers(struct psi_group *group, u64 now)
if (cmpxchg(&t->event, 0, 1) == 0)
wake_up_interruptible(&t->event_wait);
t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->pending_event = false;
}
- if (new_stall)
+ if (update_total)
memcpy(group->polling_total, total,
sizeof(group->polling_total));
@@ -1124,6 +1122,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
+ t->pending_event = false;
mutex_lock(&group->trigger_lock);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7b4f4fbbb404..a32c46889af8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,9 +3,6 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
-#include "sched.h"
-
-#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
@@ -271,8 +268,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static void pull_rt_task(struct rq *this_rq);
-
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
@@ -429,15 +424,6 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
-static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_rt_task(struct rq *this_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -1730,8 +1716,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
rt_queue_push_tasks(rq);
}
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
@@ -1753,7 +1738,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;
do {
- rt_se = pick_next_rt_entity(rq, rt_rq);
+ rt_se = pick_next_rt_entity(rt_rq);
BUG_ON(!rt_se);
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
@@ -2026,6 +2011,16 @@ static int push_rt_task(struct rq *rq, bool pull)
return 0;
retry:
+ /*
+ * It's possible that the next_task slipped in of
+ * higher priority than current. If that's the case
+ * just reschedule current.
+ */
+ if (unlikely(next_task->prio < rq->curr->prio)) {
+ resched_curr(rq);
+ return 0;
+ }
+
if (is_migration_disabled(next_task)) {
struct task_struct *push_task = NULL;
int cpu;
@@ -2033,6 +2028,18 @@ retry:
if (!pull || rq->push_busy)
return 0;
+ /*
+ * Invoking find_lowest_rq() on anything but an RT task doesn't
+ * make sense. Per the above priority check, curr has to
+ * be of higher priority than next_task, so no need to
+ * reschedule when bailing out.
+ *
+ * Note that the stoppers are masqueraded as SCHED_FIFO
+ * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ return 0;
+
cpu = find_lowest_rq(rq->curr);
if (cpu == -1 || cpu == rq->cpu)
return 0;
@@ -2057,16 +2064,6 @@ retry:
if (WARN_ON(next_task == rq->curr))
return 0;
- /*
- * It's possible that the next_task slipped in of
- * higher priority than current. If that's the case
- * just reschedule current.
- */
- if (unlikely(next_task->prio < rq->curr->prio)) {
- resched_curr(rq);
- return 0;
- }
-
/* We might release rq lock */
get_task_struct(next_task);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..58263f90c559 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,86 +2,98 @@
/*
* Scheduler internal types and methods:
*/
-#include <linux/sched.h>
+#ifndef _KERNEL_SCHED_SCHED_H
+#define _KERNEL_SCHED_SCHED_H
+#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
-#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/idle.h>
-#include <linux/sched/init.h>
-#include <linux/sched/isolation.h>
-#include <linux/sched/jobctl.h>
+#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
-#include <linux/sched/nohz.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/prio.h>
-#include <linux/sched/rt.h>
+#include <linux/sched/rseq_api.h>
#include <linux/sched/signal.h>
#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
-#include <linux/sched/user.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/xacct.h>
-#include <uapi/linux/sched/types.h>
-
-#include <linux/binfmts.h>
-#include <linux/bitops.h>
-#include <linux/compat.h>
-#include <linux/context_tracking.h>
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/capability.h>
+#include <linux/cgroup_api.h>
+#include <linux/cgroup.h>
#include <linux/cpufreq.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuset.h>
+#include <linux/cpumask_api.h>
#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/delayacct.h>
-#include <linux/energy_model.h>
-#include <linux/init_task.h>
-#include <linux/kprobes.h>
+#include <linux/file.h>
+#include <linux/fs_api.h>
+#include <linux/hrtimer_api.h>
+#include <linux/interrupt.h>
+#include <linux/irq_work.h>
+#include <linux/jiffies.h>
+#include <linux/kref_api.h>
#include <linux/kthread.h>
-#include <linux/membarrier.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/nmi.h>
+#include <linux/ktime_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex_api.h>
+#include <linux/plist.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
-#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/security.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/softirq.h>
+#include <linux/spinlock_api.h>
+#include <linux/static_key.h>
#include <linux/stop_machine.h>
-#include <linux/suspend.h>
-#include <linux/swait.h>
+#include <linux/syscalls_api.h>
#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/tsacct_kern.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+#include <linux/u64_stats_sync_api.h>
+#include <linux/uaccess.h>
+#include <linux/wait_api.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue_api.h>
+
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
+#include "../workqueue_internal.h"
+
+#ifdef CONFIG_CGROUP_SCHED
+#include <linux/cgroup.h>
+#include <linux/psi.h>
+#endif
-#include <asm/tlb.h>
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/static_key.h>
+#endif
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
+# include <asm/paravirt_api_clock.h>
#endif
#include "cpupri.h"
#include "cpudeadline.h"
-#include <trace/events/sched.h>
-
#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
struct rq;
@@ -301,29 +313,6 @@ struct dl_bw {
u64 total_bw;
};
-static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
-
-static inline
-void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw -= tsk_bw;
- __dl_update(dl_b, (s32)tsk_bw / cpus);
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw += tsk_bw;
- __dl_update(dl_b, -((s32)tsk_bw / cpus));
-}
-
-static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
- u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* Verify the fitness of task @p to run on @cpu taking into account the
* CPU original capacity and the runtime/deadline ratio of the task.
@@ -347,15 +336,11 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern bool dl_cpu_busy(unsigned int cpu);
+extern int dl_cpu_busy(int cpu, struct task_struct *p);
#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-
struct cfs_rq;
struct rt_rq;
@@ -1662,12 +1647,14 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-extern void sched_init_numa(void);
+extern void sched_init_numa(int offline_node);
+extern void sched_update_numa(int cpu, bool online);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
-static inline void sched_init_numa(void) { }
+static inline void sched_init_numa(int offline_node) { }
+static inline void sched_update_numa(int cpu, bool online) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
@@ -1854,7 +1841,6 @@ static inline void flush_smp_call_function_from_idle(void) { }
#endif
#include "stats.h"
-#include "autogroup.h"
#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
@@ -1950,7 +1936,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
@@ -2331,7 +2316,6 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
@@ -2747,32 +2731,6 @@ extern void nohz_run_idle_balance(int cpu);
static inline void nohz_run_idle_balance(int cpu) { }
#endif
-#ifdef CONFIG_SMP
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
- int i;
-
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask) {
- struct rq *rq = cpu_rq(i);
-
- rq->dl.extra_bw += bw;
- }
-}
-#else
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
- dl->extra_bw += bw;
-}
-#endif
-
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 total;
@@ -2841,88 +2799,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
-#ifdef CONFIG_UCLAMP_TASK
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
-
-/**
- * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
- * @rq: The rq to clamp against. Must not be NULL.
- * @util: The util value to clamp.
- * @p: The task to clamp against. Can be NULL if you want to clamp
- * against @rq only.
- *
- * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
- *
- * If sched_uclamp_used static key is disabled, then just return the util
- * without any clamping since uclamp aggregation at the rq level in the fast
- * path is disabled, rendering this operation a NOP.
- *
- * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
- * will return the correct effective uclamp value of the task even if the
- * static key is disabled.
- */
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- unsigned long min_util = 0;
- unsigned long max_util = 0;
-
- if (!static_branch_likely(&sched_uclamp_used))
- return util;
-
- if (p) {
- min_util = uclamp_eff_value(p, UCLAMP_MIN);
- max_util = uclamp_eff_value(p, UCLAMP_MAX);
-
- /*
- * Ignore last runnable task's max clamp, as this task will
- * reset it. Similarly, no need to read the rq's min clamp.
- */
- if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
- goto out;
- }
-
- min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
- max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
-out:
- /*
- * Since CPU's {min,max}_util clamps are MAX aggregated considering
- * RUNNABLE tasks with _different_ clamps, we can end up with an
- * inversion. Fix it now when the clamps are applied.
- */
- if (unlikely(min_util >= max_util))
- return min_util;
-
- return clamp(util, min_util, max_util);
-}
-
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
- return static_branch_likely(&sched_uclamp_used);
-}
-#else /* CONFIG_UCLAMP_TASK */
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- return util;
-}
-
-static inline bool uclamp_is_used(void)
-{
- return false;
-}
-#endif /* CONFIG_UCLAMP_TASK */
-
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -3020,6 +2896,105 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
}
#endif
+#ifdef CONFIG_UCLAMP_TASK
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+
+/**
+ * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
+ * @rq: The rq to clamp against. Must not be NULL.
+ * @util: The util value to clamp.
+ * @p: The task to clamp against. Can be NULL if you want to clamp
+ * against @rq only.
+ *
+ * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
+ *
+ * If sched_uclamp_used static key is disabled, then just return the util
+ * without any clamping since uclamp aggregation at the rq level in the fast
+ * path is disabled, rendering this operation a NOP.
+ *
+ * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
+ * will return the correct effective uclamp value of the task even if the
+ * static key is disabled.
+ */
+static __always_inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
+{
+ unsigned long min_util = 0;
+ unsigned long max_util = 0;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return util;
+
+ if (p) {
+ min_util = uclamp_eff_value(p, UCLAMP_MIN);
+ max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+ /*
+ * Ignore last runnable task's max clamp, as this task will
+ * reset it. Similarly, no need to read the rq's min clamp.
+ */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ goto out;
+ }
+
+ min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+ max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
+ /*
+ * Since CPU's {min,max}_util clamps are MAX aggregated considering
+ * RUNNABLE tasks with _different_ clamps, we can end up with an
+ * inversion. Fix it now when the clamps are applied.
+ */
+ if (unlikely(min_util >= max_util))
+ return min_util;
+
+ return clamp(util, min_util, max_util);
+}
+
+/* Is the rq being capped/throttled by uclamp_max? */
+static inline bool uclamp_rq_is_capped(struct rq *rq)
+{
+ unsigned long rq_util;
+ unsigned long max_util;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return false;
+
+ rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
+{
+ return util;
+}
+
+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
@@ -3118,3 +3093,4 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 07dde2928c79..857f837f52cb 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,7 +2,6 @@
/*
* /proc/schedstat implementation
*/
-#include "sched.h"
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 3a3c826dd83a..baa839c1ba96 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
#ifdef CONFIG_SCHEDSTATS
@@ -298,3 +300,5 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n
# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
+
+#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 0b165a25f22f..d04073a93eb4 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,7 +7,6 @@
*
* See kernel/stop_machine.c
*/
-#include "sched.h"
#ifdef CONFIG_SMP
static int
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e1c655f928c7..76b9b796e695 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,7 +2,6 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
-#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..810750e62118 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,7 +2,6 @@
/*
* Scheduler topology setup/handling methods
*/
-#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -74,7 +73,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_span(group))) {
+ if (cpumask_empty(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
@@ -1366,7 +1365,7 @@ static void asym_cpu_capacity_scan(void)
list_for_each_entry(entry, &asym_cap_list, link)
cpumask_clear(cpu_capacity_span(entry));
- for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
+ for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
asym_cpu_capacity_update_data(cpu);
list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
@@ -1492,8 +1491,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-
-static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
/*
@@ -1651,6 +1648,7 @@ static struct sched_domain_topology_level default_topology[] = {
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
+static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
@@ -1661,6 +1659,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
return;
sched_domain_topology = tl;
+ sched_domain_topology_saved = NULL;
}
#ifdef CONFIG_NUMA
@@ -1684,8 +1683,12 @@ static void sched_numa_warn(const char *str)
for (i = 0; i < nr_node_ids; i++) {
printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
+ printk(KERN_CONT "(%02d) ", node_distance(i,j));
+ else
+ printk(KERN_CONT " %02d ", node_distance(i,j));
+ }
printk(KERN_CONT "\n");
}
printk(KERN_WARNING "\n");
@@ -1693,19 +1696,34 @@ static void sched_numa_warn(const char *str)
bool find_numa_distance(int distance)
{
- int i;
+ bool found = false;
+ int i, *distances;
if (distance == node_distance(0, 0))
return true;
+ rcu_read_lock();
+ distances = rcu_dereference(sched_domains_numa_distance);
+ if (!distances)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
+ if (distances[i] == distance) {
+ found = true;
+ break;
+ }
}
+unlock:
+ rcu_read_unlock();
- return false;
+ return found;
}
+#define for_each_cpu_node_but(n, nbut) \
+ for_each_node_state(n, N_CPU) \
+ if (n == nbut) \
+ continue; \
+ else
+
/*
* A system can have three types of NUMA topology:
* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
@@ -1725,7 +1743,7 @@ bool find_numa_distance(int distance)
* there is an intermediary node C, which is < N hops away from both
* nodes A and B, the system is a glueless mesh.
*/
-static void init_numa_topology_type(void)
+static void init_numa_topology_type(int offline_node)
{
int a, b, c, n;
@@ -1736,14 +1754,14 @@ static void init_numa_topology_type(void)
return;
}
- for_each_online_node(a) {
- for_each_online_node(b) {
+ for_each_cpu_node_but(a, offline_node) {
+ for_each_cpu_node_but(b, offline_node) {
/* Find two nodes furthest removed from each other. */
if (node_distance(a, b) < n)
continue;
/* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
+ for_each_cpu_node_but(c, offline_node) {
if (node_distance(a, c) < n &&
node_distance(b, c) < n) {
sched_numa_topology_type =
@@ -1756,17 +1774,22 @@ static void init_numa_topology_type(void)
return;
}
}
+
+ pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
+ sched_numa_topology_type = NUMA_DIRECT;
}
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(void)
+void sched_init_numa(int offline_node)
{
struct sched_domain_topology_level *tl;
unsigned long *distance_map;
int nr_levels = 0;
int i, j;
+ int *distances;
+ struct cpumask ***masks;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
@@ -1777,12 +1800,13 @@ void sched_init_numa(void)
return;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
+ for_each_cpu_node_but(i, offline_node) {
+ for_each_cpu_node_but(j, offline_node) {
int distance = node_distance(i, j);
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
+ bitmap_free(distance_map);
return;
}
@@ -1795,16 +1819,17 @@ void sched_init_numa(void)
*/
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!sched_domains_numa_distance) {
+ distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!distances) {
bitmap_free(distance_map);
return;
}
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
- sched_domains_numa_distance[i] = j;
+ distances[i] = j;
}
+ rcu_assign_pointer(sched_domains_numa_distance, distances);
bitmap_free(distance_map);
@@ -1826,8 +1851,8 @@ void sched_init_numa(void)
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
- if (!sched_domains_numa_masks)
+ masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!masks)
return;
/*
@@ -1835,31 +1860,20 @@ void sched_init_numa(void)
* CPUs of nodes that are that many hops away from us.
*/
for (i = 0; i < nr_levels; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
+ masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!masks[i])
return;
- for (j = 0; j < nr_node_ids; j++) {
+ for_each_cpu_node_but(j, offline_node) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
int k;
if (!mask)
return;
- sched_domains_numa_masks[i][j] = mask;
-
- for_each_node(k) {
- /*
- * Distance information can be unreliable for
- * offline nodes, defer building the node
- * masks to its bringup.
- * This relies on all unique distance values
- * still being visible at init time.
- */
- if (!node_online(j))
- continue;
+ masks[i][j] = mask;
+ for_each_cpu_node_but(k, offline_node) {
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
@@ -1870,6 +1884,7 @@ void sched_init_numa(void)
}
}
}
+ rcu_assign_pointer(sched_domains_numa_masks, masks);
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
@@ -1907,59 +1922,67 @@ void sched_init_numa(void)
};
}
+ sched_domain_topology_saved = sched_domain_topology;
sched_domain_topology = tl;
sched_domains_numa_levels = nr_levels;
- sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
+ WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
- init_numa_topology_type();
-
- sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
- if (!sched_numa_onlined_nodes)
- return;
-
- bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
- for_each_online_node(i)
- bitmap_set(sched_numa_onlined_nodes, i, 1);
+ init_numa_topology_type(offline_node);
}
-static void __sched_domains_numa_masks_set(unsigned int node)
-{
- int i, j;
-
- /*
- * NUMA masks are not built for offline nodes in sched_init_numa().
- * Thus, when a CPU of a never-onlined-before node gets plugged in,
- * adding that new CPU to the right NUMA masks is not sufficient: the
- * masks of that CPU's node must also be updated.
- */
- if (test_bit(node, sched_numa_onlined_nodes))
- return;
- bitmap_set(sched_numa_onlined_nodes, node, 1);
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- if (!node_online(j) || node == j)
- continue;
+static void sched_reset_numa(void)
+{
+ int nr_levels, *distances;
+ struct cpumask ***masks;
- if (node_distance(j, node) > sched_domains_numa_distance[i])
+ nr_levels = sched_domains_numa_levels;
+ sched_domains_numa_levels = 0;
+ sched_max_numa_distance = 0;
+ sched_numa_topology_type = NUMA_DIRECT;
+ distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_domains_numa_distance, NULL);
+ masks = sched_domains_numa_masks;
+ rcu_assign_pointer(sched_domains_numa_masks, NULL);
+ if (distances || masks) {
+ int i, j;
+
+ synchronize_rcu();
+ kfree(distances);
+ for (i = 0; i < nr_levels && masks; i++) {
+ if (!masks[i])
continue;
-
- /* Add remote nodes in our masks */
- cpumask_or(sched_domains_numa_masks[i][node],
- sched_domains_numa_masks[i][node],
- sched_domains_numa_masks[0][j]);
+ for_each_node(j)
+ kfree(masks[i][j]);
+ kfree(masks[i]);
}
+ kfree(masks);
+ }
+ if (sched_domain_topology_saved) {
+ kfree(sched_domain_topology);
+ sched_domain_topology = sched_domain_topology_saved;
+ sched_domain_topology_saved = NULL;
}
+}
+/*
+ * Call with hotplug lock held
+ */
+void sched_update_numa(int cpu, bool online)
+{
+ int node;
+
+ node = cpu_to_node(cpu);
/*
- * A new node has been brought up, potentially changing the topology
- * classification.
- *
- * Note that this is racy vs any use of sched_numa_topology_type :/
+ * Scheduler NUMA topology is updated when the first CPU of a
+ * node is onlined or the last CPU of a node is offlined.
*/
- init_numa_topology_type();
+ if (cpumask_weight(cpumask_of_node(node)) != 1)
+ return;
+
+ sched_reset_numa();
+ sched_init_numa(online ? NUMA_NO_NODE : node);
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1967,11 +1990,9 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
- __sched_domains_numa_masks_set(node);
-
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
- if (!node_online(j))
+ if (!node_state(j, N_CPU))
continue;
/* Set ourselves in the remote node's masks */
@@ -1986,8 +2007,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ for (j = 0; j < nr_node_ids; j++) {
+ if (sched_domains_numa_masks[i][j])
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
}
}
@@ -2001,14 +2024,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
*/
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
- int i, j = cpu_to_node(cpu);
+ int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
+ struct cpumask ***masks;
+ rcu_read_lock();
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
- if (cpu < nr_cpu_ids)
- return cpu;
+ if (!masks[i][j])
+ break;
+ cpu = cpumask_any_and(cpus, masks[i][j]);
+ if (cpu < nr_cpu_ids) {
+ found = cpu;
+ break;
+ }
}
- return nr_cpu_ids;
+unlock:
+ rcu_read_unlock();
+
+ return found;
}
#endif /* CONFIG_NUMA */
@@ -2242,6 +2277,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ /*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+ for_each_cpu(i, cpu_map) {
+ unsigned int imb = 0;
+ unsigned int imb_span = 1;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ struct sched_domain *child = sd->child;
+
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+ (child->flags & SD_SHARE_PKG_RESOURCES)) {
+ struct sched_domain __rcu *top_p;
+ unsigned int nr_llcs;
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 25% of the node. This is an
+ * arbitrary cutoff based on SMT-2 to balance
+ * between memory bandwidth and avoiding
+ * premature sharing of HT resources and SMT-4
+ * or SMT-8 *may* benefit from a different
+ * cutoff.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle.
+ */
+ nr_llcs = sd->span_weight / child->span_weight;
+ if (nr_llcs == 1)
+ imb = sd->span_weight >> 2;
+ else
+ imb = nr_llcs;
+ sd->imb_numa_nr = imb;
+
+ /* Set span based on the first NUMA domain. */
+ top_p = sd->parent;
+ while (top_p && !(top_p->flags & SD_NUMA)) {
+ top_p = top_p->parent;
+ }
+ imb_span = top_p ? top_p->span_weight : sd->span_weight;
+ } else {
+ int factor = max(1U, (sd->span_weight / imb_span));
+
+ sd->imb_numa_nr = imb * factor;
+ }
+ }
+ }
+
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
@@ -2351,7 +2437,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
return err;
@@ -2440,7 +2526,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
if (doms_new) {
n = 1;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
} else {
n = ndoms_new;
@@ -2475,7 +2561,7 @@ match1:
n = 0;
doms_new = &fallback_doms;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
/* Build new domains: */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index eca38107b32f..9860bb9a847c 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,7 +4,6 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..d4788f810b55 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
-#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b04631acde8..e93de6daa188 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1308,6 +1308,43 @@ enum sig_handler {
};
/*
+ * On some archictectures, PREEMPT_RT has to delay sending a signal from a
+ * trap since it cannot enable preemption, and the signal code's
+ * spin_locks turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME
+ * which will send the signal on exit of the trap.
+ */
+#ifdef CONFIG_RT_DELAYED_SIGNALS
+static inline bool force_sig_delayed(struct kernel_siginfo *info,
+ struct task_struct *t)
+{
+ if (!in_atomic())
+ return false;
+
+ if (WARN_ON_ONCE(t->forced_info.si_signo))
+ return true;
+
+ if (is_si_special(info)) {
+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
+ t->forced_info.si_signo = info->si_signo;
+ t->forced_info.si_errno = 0;
+ t->forced_info.si_code = SI_KERNEL;
+ t->forced_info.si_pid = 0;
+ t->forced_info.si_uid = 0;
+ } else {
+ t->forced_info = *info;
+ }
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ return true;
+}
+#else
+static inline bool force_sig_delayed(struct kernel_siginfo *info,
+ struct task_struct *t)
+{
+ return false;
+}
+#endif
+
+/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
*
@@ -1327,6 +1364,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
struct k_sigaction *action;
int sig = info->si_signo;
+ if (force_sig_delayed(info, t))
+ return 0;
+
spin_lock_irqsave(&t->sighand->siglock, flags);
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 41f470929e99..fac801815554 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -222,7 +222,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
u32 pending;
int curcnt;
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
local_irq_save(flags);
@@ -305,7 +305,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
raw_local_irq_save(flags);
/*
@@ -352,14 +352,14 @@ static void __local_bh_enable(unsigned int cnt)
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
@@ -618,7 +618,7 @@ static inline void tick_irq_exit(void)
/* Make sure that timer wheel updates are propagated */
if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
- if (!in_irq())
+ if (!in_hardirq())
tick_nohz_irq_exit();
}
#endif
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c625257023d..9ed5ce989415 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
.store = store,
.size = size,
};
- mm_segment_t fs;
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
- fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ae443b2882e..830aaf8ca08e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
return ret;
}
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
static int bpf_unpriv_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
return -EPERM;
*(int *)table->data = unpriv_enable;
}
+
+ unpriv_ebpf_notify(unpriv_enable);
+
return ret;
}
#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
@@ -1689,7 +1696,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
{
@@ -1750,17 +1757,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif
-#ifdef CONFIG_SCHED_AUTOGROUP
- {
- .procname = "sched_autogroup_enabled",
- .data = &sysctl_sched_autogroup_enabled,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 04bfd62f5e5c..27b7868b5c30 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -181,5 +181,14 @@ config HIGH_RES_TIMERS
hardware is not capable then this option only increases
the size of the kernel image.
+config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+ int "Clocksource watchdog maximum allowable skew (in μs)"
+ depends on CLOCKSOURCE_WATCHDOG
+ range 50 1000
+ default 100
+ help
+ Specify the maximum amount of allowable watchdog skew in
+ microseconds before reporting the clocksource to be unstable.
+
endmenu
endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1cf73807b450..95d7ca35bdf2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,7 +107,13 @@ static u64 suspend_start;
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
*/
-#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#else
+#define MAX_SKEW_USEC 100
+#endif
+
+#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 17a283ce2b20..2d76c91b85de 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -169,6 +169,8 @@ static ktime_t tick_init_jiffy_update(void)
return period;
}
+#define MAX_STALLED_JIFFIES 5
+
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
@@ -196,6 +198,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
+ /*
+ * If jiffies update stalled for too long (timekeeper in stop_machine()
+ * or VMEXIT'ed for several msecs), force an update.
+ */
+ if (ts->last_tick_jiffies != jiffies) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ } else {
+ if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
+ tick_do_update_jiffies64(now);
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
+ }
+
if (ts->inidle)
ts->got_idle_tick = 1;
}
@@ -768,7 +785,7 @@ static inline bool local_timer_softirq_pending(void)
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
- u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+ u64 basemono, next_tick, delta, expires;
unsigned long basejiff;
unsigned int seq;
@@ -791,7 +808,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* minimal delta which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
- if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ if (rcu_needs_cpu() || arch_needs_cpu() ||
irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
@@ -802,10 +819,8 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* disabled this also looks at the next expiring
* hrtimer.
*/
- next_tmr = get_next_timer_interrupt(basejiff, basemono);
- ts->next_timer = next_tmr;
- /* Take the next rcu event into account */
- next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
+ next_tick = get_next_timer_interrupt(basejiff, basemono);
+ ts->next_timer = next_tick;
}
/*
@@ -984,6 +999,45 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
__tick_nohz_full_update_tick(ts, ktime_get());
}
+/*
+ * A pending softirq outside an IRQ (or softirq disabled section) context
+ * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
+ * reach here due to the need_resched() early check in can_stop_idle_tick().
+ *
+ * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
+ * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
+ * triggering the below since wakep_softirqd() is ignored.
+ *
+ */
+static bool report_idle_softirq(void)
+{
+ static int ratelimit;
+ unsigned int pending = local_softirq_pending();
+
+ if (likely(!pending))
+ return false;
+
+ /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
+ if (!cpu_active(smp_processor_id())) {
+ pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
+ if (!pending)
+ return false;
+ }
+
+ if (ratelimit < 10)
+ return false;
+
+ /* On RT, softirqs handling may be waiting on some lock */
+ if (!local_bh_blocked())
+ return false;
+
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+
+ return true;
+}
+
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
@@ -1010,17 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (need_resched())
return false;
- if (unlikely(local_softirq_pending())) {
- static int ratelimit;
-
- if (ratelimit < 10 && !local_bh_blocked() &&
- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
- pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
+ if (unlikely(report_idle_softirq()))
return false;
- }
if (tick_nohz_full_enabled()) {
/*
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index d952ae393423..504649513399 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -49,6 +49,8 @@ enum tick_nohz_mode {
* @timer_expires_base: Base time clock monotonic for @timer_expires
* @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
*/
struct tick_sched {
struct hrtimer sched_timer;
@@ -77,6 +79,8 @@ struct tick_sched {
u64 next_timer;
ktime_t idle_expires;
atomic_t tick_dep_mask;
+ unsigned long last_tick_jiffies;
+ unsigned int stalled_jiffies;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/torture.c b/kernel/torture.c
index ef27a6c82451..789aeb0e1159 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -911,7 +911,7 @@ void torture_kthread_stopping(char *title)
{
char buf[128];
- snprintf(buf, sizeof(buf), "Stopping %s", title);
+ snprintf(buf, sizeof(buf), "%s is stopping", title);
VERBOSE_TOROUT_STRING(buf);
while (!kthread_should_stop()) {
torture_shutdown_absorb(title);
@@ -931,12 +931,14 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
int ret = 0;
VERBOSE_TOROUT_STRING(m);
- *tp = kthread_run(fn, arg, "%s", s);
+ *tp = kthread_create(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
TOROUT_ERRSTRING(f);
*tp = NULL;
+ return ret;
}
+ wake_up_process(*tp); // Process is sleeping, so ordering provided.
torture_shuffle_task_register(*tp);
return ret;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5eb5e7fd624..16a52a71732d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -737,6 +737,20 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by memory mapping a tracefs file and checking for
+ an associated byte being non-zero.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..19ef3758da95 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..8115fff17018 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -332,8 +332,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -835,8 +833,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..19028e072cdb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a4b462b6f944..e59d80c9afd7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7096,6 +7096,8 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
@@ -7346,7 +7348,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
@@ -7790,7 +7794,7 @@ int ftrace_is_dead(void)
/**
* register_ftrace_function - register a function for profiling
- * @ops - ops structure that holds the function for profiling.
+ * @ops: ops structure that holds the function for profiling.
*
* Register a function to be called by all functions in the
* kernel.
@@ -7817,7 +7821,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_function);
/**
* unregister_ftrace_function - unregister a function for profiling.
- * @ops - ops structure that holds the function to unregister
+ * @ops: ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.
*/
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eb44418574f9..08ea781540b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
static int __init set_cmdline_ftrace(char *str)
{
@@ -230,6 +231,15 @@ static int __init boot_alloc_snapshot(char *str)
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
@@ -7725,7 +7735,7 @@ const struct file_operations trace_min_max_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7733,26 +7743,52 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
+{
+ struct tracing_log_err *err;
+
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
{
struct tracing_log_err *err;
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- else
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
tr->n_err_log_entries++;
return err;
}
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd)
+ return ERR_PTR(-ENOMEM);
list_del(&err->list);
return err;
@@ -7813,22 +7849,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7846,7 +7885,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7874,9 +7913,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -10122,6 +10161,14 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+ if (snapshot_at_boot) {
+ tracing_snapshot();
+ internal_trace_puts("** Boot snapshot taken **\n");
+ }
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5b09c31e077..07d990270e2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1877,7 +1877,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..e11e167b7809 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
static bool eventdir_initialized;
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
+
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
static struct kmem_cache *field_cachep;
@@ -384,6 +392,12 @@ static void test_event_printk(struct trace_event_call *call)
if (!(dereference_flags & (1ULL << arg)))
goto next_arg;
+ /* Check for __get_sockaddr */;
+ if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
+ dereference_flags &= ~(1ULL << arg);
+ goto next_arg;
+ }
+
/* Find the REC-> in the argument */
c = strchr(fmt + i, ',');
r = strstr(fmt + i, "REC->");
@@ -759,7 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -2633,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2668,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
}
@@ -2758,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2819,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2853,6 +2944,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2861,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index dc7f733b4cb3..44db5ba9cabb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -739,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
+ int len;
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ /* sizeof() contains the nul byte */
+ len = sizeof(HIST_PREFIX) + strlen(str);
+ kfree(last_cmd);
+ last_cmd = kzalloc(len, GFP_KERNEL);
+ if (!last_cmd)
+ return;
+
+ strcpy(last_cmd, HIST_PREFIX);
+ /* Again, sizeof() contains the nul byte */
+ len -= sizeof(HIST_PREFIX);
+ strncat(last_cmd, str, len);
if (file) {
call = file->event_call;
@@ -757,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -5610,7 +5629,7 @@ static int event_hist_trigger_print(struct seq_file *m,
bool have_var = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..5e8c07aef071 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -42,10 +42,13 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
static int errpos(const char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -54,11 +57,16 @@ static void last_cmd_set(const char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(last_cmd);
+
+ last_cmd = kstrdup(str, GFP_KERNEL);
}
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..8b3d241a31c2
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1690 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/user_events.h>
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be a power of two of PAGE_SIZE.
+ */
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
+#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+#define MAX_FIELD_ARG_NAME 256
+
+#define MAX_BPF_COPY_SIZE PAGE_SIZE
+#define MAX_STACK_BPF_DATA 512
+
+static char *register_page_data;
+
+static DEFINE_MUTEX(reg_mutex);
+static DEFINE_HASHTABLE(register_table, 4);
+static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches zero.
+ */
+struct user_event {
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ atomic_t refcnt;
+ int index;
+ int flags;
+ int min_size;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head link;
+ int offset;
+ int flags;
+};
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, link) {
+ list_del(&validator->link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != 0)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static void user_event_parse_flags(struct user_event *user, char *flags)
+{
+ char *flag;
+
+ if (flags == NULL)
+ return;
+
+ while ((flag = strsep(&flags, ",")) != NULL) {
+ if (strcmp(flag, "BPF_ITER") == 0)
+ user->flags |= FLAG_BPF_ITER;
+ }
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != 0)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ /* Unsafe to try to decode user provided print_fmt, use hex */
+ trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+ 1, iter->ent, iter->ent_size, true);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+
+ register_page_data[user->index] = 0;
+ clear_bit(user->index, page_bitmap);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ return ret;
+}
+
+static struct user_event *find_user_event(char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name)) {
+ atomic_inc(&user->refcnt);
+ return user;
+ }
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+static void user_event_bpf(struct user_event *user, struct iov_iter *i)
+{
+ struct user_bpf_context context;
+ struct user_bpf_iter bpf_i;
+ char fast_data[MAX_STACK_BPF_DATA];
+ void *temp = NULL;
+
+ if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
+ /* Raw iterator */
+ context.data_type = USER_BPF_DATA_ITER;
+ context.data_len = i->count;
+ context.iter = &bpf_i;
+
+ bpf_i.iov_offset = i->iov_offset;
+ bpf_i.iov = i->iov;
+ bpf_i.nr_segs = i->nr_segs;
+ } else if (i->nr_segs == 1 && iter_is_iovec(i)) {
+ /* Single buffer from user */
+ context.data_type = USER_BPF_DATA_USER;
+ context.data_len = i->count;
+ context.udata = i->iov->iov_base + i->iov_offset;
+ } else {
+ /* Multi buffer from user */
+ struct iov_iter copy = *i;
+ size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
+
+ context.data_type = USER_BPF_DATA_KERNEL;
+ context.kdata = fast_data;
+
+ if (unlikely(copy_size > sizeof(fast_data))) {
+ temp = kmalloc(copy_size, GFP_NOWAIT);
+
+ if (temp)
+ context.kdata = temp;
+ else
+ copy_size = sizeof(fast_data);
+ }
+
+ context.data_len = copy_nofault(context.kdata,
+ copy_size, &copy);
+ }
+
+ trace_call_bpf(&user->call, &context);
+
+ kfree(temp);
+}
+
+/*
+ * Writes the user supplied payload out to perf ring buffer or eBPF program.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ if (bpf_prog_array_valid(&user->call))
+ user_event_bpf(user, i);
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ register_page_data[user->index] = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ atomic_inc(&user->refcnt);
+ update_reg_page_for(user);
+ return 0;
+dec:
+ update_reg_page_for(user);
+ atomic_dec(&user->refcnt);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL);
+
+ if (!name)
+ return -ENOMEM;
+
+ mutex_lock(&reg_mutex);
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (!ret)
+ atomic_dec(&user->refcnt);
+
+ mutex_unlock(&reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (atomic_read(&user->refcnt) != 0)
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name, *arg_name;
+ int len, pos, i = *iout;
+ bool colon = false, match = false;
+
+ if (i >= argc)
+ return false;
+
+ len = MAX_FIELD_ARG_NAME;
+ field_name = kmalloc(len, GFP_KERNEL);
+ arg_name = kmalloc(len, GFP_KERNEL);
+
+ if (!arg_name || !field_name)
+ goto out;
+
+ pos = 0;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(arg_name + pos, len - pos, " ");
+
+ pos += snprintf(arg_name + pos, len - pos, argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ colon = true;
+ break;
+ }
+ }
+
+ pos = 0;
+
+ pos += snprintf(field_name + pos, len - pos, field->type);
+ pos += snprintf(field_name + pos, len - pos, " ");
+ pos += snprintf(field_name + pos, len - pos, field->name);
+
+ if (colon)
+ pos += snprintf(field_name + pos, len - pos, ";");
+
+ *iout = i;
+
+ match = strcmp(arg_name, field_name) == 0;
+out:
+ kfree(arg_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_safe_reverse(field, next, head, link)
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser)
+{
+ int ret;
+ int index;
+ u32 key;
+ struct user_event *user;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ return 0;
+ }
+
+ index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+
+ if (index == MAX_EVENTS)
+ return -EMFILE;
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->tracepoint.name = name;
+
+ user_event_parse_flags(user, flags);
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+
+ user->class.system = USER_EVENTS_SYSTEM;
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+ ret = user_event_trace_register(user);
+ mutex_unlock(&event_mutex);
+
+ if (ret)
+ goto put_user;
+
+ user->index = index;
+
+ /* Ensure we track ref */
+ atomic_inc(&user->refcnt);
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ set_bit(user->index, page_bitmap);
+ hash_add(register_table, &user->node, key);
+
+ *newuser = user;
+ return 0;
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(char *name)
+{
+ u32 key;
+ int ret;
+ struct user_event *user = find_user_event(name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ /* Ensure we are the last ref */
+ if (atomic_read(&user->refcnt) != 1) {
+ ret = -EBUSY;
+ goto put_ref;
+ }
+
+ ret = destroy_user_event(user);
+
+ if (ret)
+ goto put_ref;
+
+ return ret;
+put_ref:
+ /* No longer have this ref */
+ atomic_dec(&user->refcnt);
+
+ return ret;
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(file->private_data);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(file->private_data,
+ lockdep_is_held(&reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user;
+
+ atomic_inc(&user->refcnt);
+
+ rcu_assign_pointer(file->private_data, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ char *name;
+ long ret;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(file, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ atomic_dec(&user->refcnt);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ put_user((u32)ret, &ureg->write_index);
+ put_user(user->index, &ureg->status_index);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_reg(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_del(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_refs *refs;
+ struct user_event *user;
+ int i;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&reg_mutex);
+
+ refs = file->private_data;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i) {
+ user = refs->events[i];
+
+ if (user)
+ atomic_dec(&user->refcnt);
+ }
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&reg_mutex);
+
+ kfree(refs);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (size != MAX_EVENTS)
+ return -EINVAL;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(register_page_data) >> PAGE_SHIFT,
+ size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0, flags;
+
+ mutex_lock(&reg_mutex);
+
+ hash_for_each(register_table, i, user, node) {
+ status = register_page_data[user->index];
+ flags = user->flags;
+
+ seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+ if (flags != 0 || status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ if (flags & FLAG_BPF_ITER)
+ seq_puts(m, " FLAG:BPF_ITER");
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+ seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .mmap = user_status_mmap,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ /* mmap with MAP_SHARED requires writable fd */
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = register_page_data + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static int __init trace_events_user_init(void)
+{
+ struct page *pages;
+ int ret;
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(page_bitmap, MAX_EVENTS);
+ set_bit(0, page_bitmap);
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+ if (!pages)
+ return -ENOMEM;
+ register_page_data = page_address(pages);
+
+ set_page_reservations(true);
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ set_page_reservations(false);
+ __free_pages(pages, MAX_PAGE_ORDER);
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index cfddb30e65ab..e9ae1f33a7f0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p,
struct task_struct *n)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
@@ -1387,6 +1389,26 @@ static int run_osnoise(void)
}
/*
+ * In some cases, notably when running on a nohz_full CPU with
+ * a stopped tick PREEMPT_RCU has no way to account for QSs.
+ * This will eventually cause unwarranted noise as PREEMPT_RCU
+ * will force preemption as the means of ending the current
+ * grace period. We avoid this problem by calling
+ * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * EQS allowing PREEMPT_RCU to end the current grace period.
+ * This call shouldn't be wrapped inside an RCU critical
+ * section.
+ *
+ * Note that in non PREEMPT_RCU kernels QSs are handled through
+ * cond_resched()
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ local_irq_disable();
+ rcu_momentary_dyntick_idle();
+ local_irq_enable();
+ }
+
+ /*
* For the non-preemptive kernel config: let threads runs, if
* they so wish.
*/
@@ -2200,6 +2222,17 @@ static void osnoise_workload_stop(void)
if (osnoise_has_registered_instances())
return;
+ /*
+ * If callbacks were already disabled in a previous stop
+ * call, there is no need to disable then again.
+ *
+ * For instance, this happens when tracing is stopped via:
+ * echo 0 > tracing_on
+ * echo nop > current_tracer.
+ */
+ if (!trace_osnoise_callback_enabled)
+ return;
+
trace_osnoise_callback_enabled = false;
/*
* Make sure that ftrace_nmi_enter/exit() see
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..45796d8bd4b2 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
@@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..46429f9a96fa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 9c9eb20dd2c5..3990e4df3d7b 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit += page->index;
set_bit(bit, wqueue->notes_bitmap);
+ generic_pipe_buf_release(pipe, buf);
}
// No try_steal function => no stealing
@@ -112,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
buf->offset = offset;
buf->len = len;
buf->flags = PIPE_BUF_FLAG_WHOLE;
- pipe->head = head + 1;
+ smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */
if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
spin_unlock_irq(&pipe->rd_wait.lock);
@@ -219,7 +220,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
struct page **pages;
unsigned long *bitmap;
unsigned long user_bufs;
- unsigned int bmsize;
int ret, i, nr_pages;
if (!wqueue)
@@ -243,7 +243,8 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
goto error;
}
- ret = pipe_resize_ring(pipe, nr_notes);
+ nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
if (ret < 0)
goto error;
@@ -258,21 +259,19 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
}
- bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
- bmsize *= sizeof(unsigned long);
- bitmap = kmalloc(bmsize, GFP_KERNEL);
+ bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
if (!bitmap)
goto error_p;
- memset(bitmap, 0xff, bmsize);
+ bitmap_fill(bitmap, nr_notes);
wqueue->notes = pages;
wqueue->notes_bitmap = bitmap;
wqueue->nr_pages = nr_pages;
- wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ wqueue->nr_notes = nr_notes;
return 0;
error_p:
- for (i = 0; i < nr_pages; i++)
+ while (--i >= 0)
__free_page(pages[i]);
kfree(pages);
error:
@@ -320,7 +319,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
tf[i].info_mask & WATCH_INFO_LENGTH)
goto err_filter;
/* Ignore any unknown types */
- if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
nr_filter++;
}
@@ -336,7 +335,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
q = wfilter->filters;
for (i = 0; i < filter.nr_filters; i++) {
- if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
q->type = tf[i].type;
@@ -371,6 +370,7 @@ static void __put_watch_queue(struct kref *kref)
for (i = 0; i < wqueue->nr_pages; i++)
__free_page(wqueue->notes[i]);
+ bitmap_free(wqueue->notes_bitmap);
wfilter = rcu_access_pointer(wqueue->filter);
if (wfilter)
@@ -395,6 +395,7 @@ static void free_watch(struct rcu_head *rcu)
put_watch_queue(rcu_access_pointer(watch->queue));
atomic_dec(&watch->cred->user->nr_watches);
put_cred(watch->cred);
+ kfree(watch);
}
static void __put_watch(struct kref *kref)
@@ -566,7 +567,7 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new additions and prevent notifications from happening */
+ /* Prevent new notifications from being stored. */
wqueue->defunct = true;
while (!hlist_empty(&wqueue->watches)) {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 99afb88d2e85..9166220457bc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -848,7 +848,7 @@ void __init lockup_detector_init(void)
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask,
- housekeeping_cpumask(HK_FLAG_TIMER));
+ housekeeping_cpumask(HK_TYPE_TIMER));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33f1106b4f99..0d2514b4ff0d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -154,15 +154,20 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
- /* The current concurrency level. */
- atomic_t nr_running;
+ /*
+ * The counter is incremented in a process context on the associated CPU
+ * w/ preemption disabled, and decremented or reset in the same context
+ * but w/ pool->lock held. The readers grab pool->lock and are
+ * guaranteed to see if the counter reached zero.
+ */
+ int nr_running;
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
int nr_idle; /* L: currently idle workers */
- struct list_head idle_list; /* X: list of idle workers */
+ struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
@@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work)
static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(&pool->nr_running);
+ return !pool->nr_running;
}
/*
@@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool)
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) &&
- atomic_read(&pool->nr_running) <= 1;
+ return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}
/* Do we need a new worker? Called from manager. */
@@ -826,7 +830,7 @@ static bool too_many_workers(struct worker_pool *pool)
* Wake up functions.
*/
-/* Return the first idle worker. Safe with preemption disabled */
+/* Return the first idle worker. Called with pool->lock held. */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
if (unlikely(list_empty(&pool->idle_list)))
@@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task)
*/
preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&worker->pool->nr_running);
+ worker->pool->nr_running++;
preempt_enable();
worker->sleeping = 0;
}
@@ -887,7 +891,7 @@ void wq_worker_running(struct task_struct *task)
*/
void wq_worker_sleeping(struct task_struct *task)
{
- struct worker *next, *worker = kthread_data(task);
+ struct worker *worker = kthread_data(task);
struct worker_pool *pool;
/*
@@ -917,23 +921,9 @@ void wq_worker_sleeping(struct task_struct *task)
return;
}
- /*
- * The counterpart of the following dec_and_test, implied mb,
- * worklist not empty test sequence is in insert_work().
- * Please read comment there.
- *
- * NOT_RUNNING is clear. This means that we're bound to and
- * running on the local cpu w/ rq lock held and preemption
- * disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without pool
- * lock is safe.
- */
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist)) {
- next = first_idle_worker(pool);
- if (next)
- wake_up_process(next->task);
- }
+ pool->nr_running--;
+ if (need_more_worker(pool))
+ wake_up_worker(pool);
raw_spin_unlock_irq(&pool->lock);
}
@@ -987,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_dec(&pool->nr_running);
+ pool->nr_running--;
}
worker->flags |= flags;
@@ -1019,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&pool->nr_running);
+ pool->nr_running++;
}
/**
@@ -1372,13 +1362,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
list_add_tail(&work->entry, head);
get_pwq(pwq);
- /*
- * Ensure either wq_worker_sleeping() sees the above
- * list_add_tail() or we see zero nr_running to avoid workers lying
- * around lazily while there are works to be processed.
- */
- smp_mb();
-
if (__need_more_worker(pool))
wake_up_worker(pool);
}
@@ -1827,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/* Sanity check nr_running. */
- WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
- atomic_read(&pool->nr_running));
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}
/**
@@ -5006,7 +4988,7 @@ static void unbind_workers(int cpu)
* an unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
- atomic_set(&pool->nr_running, 0);
+ pool->nr_running = 0;
/*
* With concurrency management just turned off, a busy
@@ -6006,13 +5988,13 @@ static void __init wq_numa_init(void)
void __init workqueue_init_early(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
- int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);