Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit.c | 59
-rw-r--r--  kernel/audit.h | 6
-rw-r--r--  kernel/audit_tree.c | 12
-rw-r--r--  kernel/audit_watch.c | 8
-rw-r--r--  kernel/auditfilter.c | 147
-rw-r--r--  kernel/auditsc.c | 370
-rw-r--r--  kernel/bpf/arraymap.c | 163
-rw-r--r--  kernel/bpf/core.c | 17
-rw-r--r--  kernel/bpf/hashtab.c | 84
-rw-r--r--  kernel/bpf/helpers.c | 57
-rw-r--r--  kernel/bpf/inode.c | 44
-rw-r--r--  kernel/bpf/stackmap.c | 10
-rw-r--r--  kernel/bpf/syscall.c | 66
-rw-r--r--  kernel/bpf/verifier.c | 999
-rw-r--r--  kernel/capability.c | 46
-rw-r--r--  kernel/cgroup.c | 277
-rw-r--r--  kernel/cgroup_pids.c | 34
-rw-r--r--  kernel/configs/android-base.config | 152
-rw-r--r--  kernel/configs/android-recommended.config | 121
-rw-r--r--  kernel/configs/kvm_guest.config | 32
-rw-r--r--  kernel/configs/tiny.config | 8
-rw-r--r--  kernel/cpu.c | 524
-rw-r--r--  kernel/cpuset.c | 63
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/events/callchain.c | 48
-rw-r--r--  kernel/events/core.c | 710
-rw-r--r--  kernel/events/internal.h | 25
-rw-r--r--  kernel/events/ring_buffer.c | 15
-rw-r--r--  kernel/events/uprobes.c | 49
-rw-r--r--  kernel/exit.c | 153
-rw-r--r--  kernel/fork.c | 324
-rw-r--r--  kernel/freezer.c | 2
-rw-r--r--  kernel/futex.c | 54
-rw-r--r--  kernel/gcov/Kconfig | 1
-rw-r--r--  kernel/gcov/gcc_4_7.c | 2
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/affinity.c | 154
-rw-r--r--  kernel/irq/chip.c | 119
-rw-r--r--  kernel/irq/generic-chip.c | 72
-rw-r--r--  kernel/irq/handle.c | 18
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/ipi.c | 6
-rw-r--r--  kernel/irq/irqdesc.c | 253
-rw-r--r--  kernel/irq/irqdomain.c | 113
-rw-r--r--  kernel/irq/manage.c | 82
-rw-r--r--  kernel/irq/msi.c | 49
-rw-r--r--  kernel/irq/proc.c | 11
-rw-r--r--  kernel/jump_label.c | 99
-rw-r--r--  kernel/kcov.c | 7
-rw-r--r--  kernel/kexec.c | 112
-rw-r--r--  kernel/kexec_core.c | 81
-rw-r--r--  kernel/kexec_file.c | 11
-rw-r--r--  kernel/ksysfs.c | 6
-rw-r--r--  kernel/kthread.c | 8
-rw-r--r--  kernel/livepatch/core.c | 206
-rw-r--r--  kernel/locking/Makefile | 1
-rw-r--r--  kernel/locking/lglock.c | 111
-rw-r--r--  kernel/locking/lockdep.c | 13
-rw-r--r--  kernel/locking/mutex-debug.c | 12
-rw-r--r--  kernel/locking/mutex-debug.h | 8
-rw-r--r--  kernel/locking/mutex.c | 15
-rw-r--r--  kernel/locking/mutex.h | 12
-rw-r--r--  kernel/locking/percpu-rwsem.c | 229
-rw-r--r--  kernel/locking/qrwlock.c | 2
-rw-r--r--  kernel/locking/qspinlock.c | 146
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 32
-rw-r--r--  kernel/locking/qspinlock_stat.h | 5
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 266
-rw-r--r--  kernel/locking/rwsem.c | 24
-rw-r--r--  kernel/locking/rwsem.h | 52
-rw-r--r--  kernel/memremap.c | 34
-rw-r--r--  kernel/module.c | 252
-rw-r--r--  kernel/module_signing.c | 7
-rw-r--r--  kernel/padata.c | 226
-rw-r--r--  kernel/panic.c | 17
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/pid_namespace.c | 50
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/console.c | 8
-rw-r--r--  kernel/power/hibernate.c | 132
-rw-r--r--  kernel/power/main.c | 12
-rw-r--r--  kernel/power/power.h | 13
-rw-r--r--  kernel/power/process.c | 15
-rw-r--r--  kernel/power/qos.c | 11
-rw-r--r--  kernel/power/snapshot.c | 982
-rw-r--r--  kernel/power/suspend.c | 24
-rw-r--r--  kernel/power/swap.c | 39
-rw-r--r--  kernel/power/user.c | 14
-rw-r--r--  kernel/printk/Makefile | 1
-rw-r--r--  kernel/printk/braille.c | 4
-rw-r--r--  kernel/printk/internal.h | 57
-rw-r--r--  kernel/printk/nmi.c | 268
-rw-r--r--  kernel/printk/printk.c | 206
-rw-r--r--  kernel/profile.c | 181
-rw-r--r--  kernel/ptrace.c | 4
-rw-r--r--  kernel/rcu/rcuperf.c | 32
-rw-r--r--  kernel/rcu/rcutorture.c | 71
-rw-r--r--  kernel/rcu/sync.c | 14
-rw-r--r--  kernel/rcu/tree.c | 735
-rw-r--r--  kernel/rcu/tree.h | 16
-rw-r--r--  kernel/rcu/tree_exp.h | 685
-rw-r--r--  kernel/rcu/tree_plugin.h | 96
-rw-r--r--  kernel/rcu/tree_trace.c | 7
-rw-r--r--  kernel/rcu/update.c | 36
-rw-r--r--  kernel/relay.c | 153
-rw-r--r--  kernel/sched/core.c | 561
-rw-r--r--  kernel/sched/cpuacct.c | 114
-rw-r--r--  kernel/sched/cpudeadline.c | 153
-rw-r--r--  kernel/sched/cpudeadline.h | 3
-rw-r--r--  kernel/sched/cpufreq.c | 2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 196
-rw-r--r--  kernel/sched/cputime.c | 283
-rw-r--r--  kernel/sched/deadline.c | 80
-rw-r--r--  kernel/sched/debug.c | 114
-rw-r--r--  kernel/sched/fair.c | 1062
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/idle_task.c | 4
-rw-r--r--  kernel/sched/loadavg.c | 8
-rw-r--r--  kernel/sched/rt.c | 5
-rw-r--r--  kernel/sched/sched.h | 159
-rw-r--r--  kernel/sched/stats.h | 23
-rw-r--r--  kernel/sched/wait.c | 113
-rw-r--r--  kernel/seccomp.c | 169
-rw-r--r--  kernel/signal.c | 41
-rw-r--r--  kernel/smp.c | 133
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 43
-rw-r--r--  kernel/stop_machine.c | 55
-rw-r--r--  kernel/sys.c | 3
-rw-r--r--  kernel/sysctl.c | 94
-rw-r--r--  kernel/sysctl_binary.c | 23
-rw-r--r--  kernel/task_work.c | 11
-rw-r--r--  kernel/time/alarmtimer.c | 1
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 23
-rw-r--r--  kernel/time/hrtimer.c | 72
-rw-r--r--  kernel/time/posix-cpu-timers.c | 1
-rw-r--r--  kernel/time/test_udelay.c | 16
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-sched.c | 102
-rw-r--r--  kernel/time/time.c | 21
-rw-r--r--  kernel/time/timeconv.c | 11
-rw-r--r--  kernel/time/timekeeping.c | 16
-rw-r--r--  kernel/time/timekeeping_debug.c | 11
-rw-r--r--  kernel/time/timer.c | 1196
-rw-r--r--  kernel/time/timer_stats.c | 6
-rw-r--r--  kernel/torture.c | 203
-rw-r--r--  kernel/trace/Kconfig | 67
-rw-r--r--  kernel/trace/Makefile | 7
-rw-r--r--  kernel/trace/blktrace.c | 83
-rw-r--r--  kernel/trace/bpf_trace.c | 298
-rw-r--r--  kernel/trace/ftrace.c | 350
-rw-r--r--  kernel/trace/trace.c | 692
-rw-r--r--  kernel/trace/trace.h | 243
-rw-r--r--  kernel/trace/trace_entries.h | 31
-rw-r--r--  kernel/trace/trace_events.c | 286
-rw-r--r--  kernel/trace/trace_events_filter.c | 77
-rw-r--r--  kernel/trace/trace_events_hist.c | 1755
-rw-r--r--  kernel/trace/trace_events_trigger.c | 216
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 92
-rw-r--r--  kernel/trace/trace_hwlat.c | 633
-rw-r--r--  kernel/trace/trace_kprobe.c | 5
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 10
-rw-r--r--  kernel/trace/trace_output.c | 66
-rw-r--r--  kernel/trace/trace_printk.c | 7
-rw-r--r--  kernel/trace/trace_probe.c | 63
-rw-r--r--  kernel/trace/trace_probe.h | 21
-rw-r--r--  kernel/trace/trace_uprobe.c | 8
-rw-r--r--  kernel/trace/tracing_map.c | 1062
-rw-r--r--  kernel/trace/tracing_map.h | 283
-rw-r--r--  kernel/ucount.c | 235
-rw-r--r--  kernel/up.c | 18
-rw-r--r--  kernel/user_namespace.c | 113
-rw-r--r--  kernel/utsname.c | 40
-rw-r--r--  kernel/workqueue.c | 170
181 files changed, 16789 insertions, 6731 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f0c40bf49d9f..eb26e12c6c2a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o
+ async.o range.o smpboot.o ucount.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -91,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
+obj-$(CONFIG_ELFCORE) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
diff --git a/kernel/audit.c b/kernel/audit.c
index 678c3f000191..f1ca11613379 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -64,7 +64,6 @@
#include <linux/security.h>
#endif
#include <linux/freezer.h>
-#include <linux/tty.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -430,7 +429,6 @@ restart:
attempts, audit_pid);
set_current_state(TASK_INTERRUPTIBLE);
schedule();
- __set_current_state(TASK_RUNNING);
goto restart;
}
}
@@ -879,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
+ /* NOTE: we are using task_tgid_vnr() below because
+ * the s.pid value is relative to the namespace
+ * of the caller; at present this doesn't matter
+ * much since you can really only run auditd
+ * from the initial pid namespace, but something
+ * to keep in mind if this changes */
int new_pid = s.pid;
pid_t requesting_pid = task_tgid_vnr(current);
@@ -934,7 +938,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(msg_type);
+ err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
@@ -1341,15 +1345,14 @@ static inline void audit_get_stamp(struct audit_context *ctx,
static long wait_for_auditd(long sleep_time)
{
DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue_exclusive(&audit_backlog_wait, &wait);
if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
sleep_time = schedule_timeout(sleep_time);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
return sleep_time;
}
@@ -1382,7 +1385,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
- if (unlikely(audit_filter_type(type)))
+ if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
return NULL;
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
@@ -1886,31 +1889,41 @@ out_null:
audit_log_format(ab, " exe=(null)");
}
+struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+ struct tty_struct *tty = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ if (tsk->signal)
+ tty = tty_kref_get(tsk->signal->tty);
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+ return tty;
+}
+
+void audit_put_tty(struct tty_struct *tty)
+{
+ tty_kref_put(tty);
+}
+
void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
{
const struct cred *cred;
char comm[sizeof(tsk->comm)];
- char *tty;
+ struct tty_struct *tty;
if (!ab)
return;
/* tsk == current */
cred = current_cred();
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
+ tty = audit_get_tty(tsk);
audit_log_format(ab,
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(tsk),
- task_pid_nr(tsk),
+ task_tgid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
@@ -1920,11 +1933,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
from_kgid(&init_user_ns, cred->egid),
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid),
- tty, audit_get_sessionid(tsk));
-
+ tty ? tty_name(tty) : "(none)",
+ audit_get_sessionid(tsk));
+ audit_put_tty(tty);
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
-
audit_log_d_path_exe(ab, tsk->mm);
audit_log_task_context(ab);
}
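The audit_get_tty()/audit_put_tty() helpers added above are a take/use/put pair; a minimal sketch of a hypothetical caller (not part of this patch) looks like this:

	/* illustrative only: log the caller's controlling tty, if any */
	static void demo_log_tty(struct audit_buffer *ab)
	{
		struct tty_struct *tty = audit_get_tty(current);

		/* tty is NULL when the task has no controlling tty */
		audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
		audit_put_tty(tty);	/* tty_kref_put() accepts NULL */
	}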
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..431444c3708b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -23,6 +23,7 @@
#include <linux/audit.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
+#include <linux/tty.h>
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm);
+extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern void audit_put_tty(struct tty_struct *tty);
+
/* audit watch functions */
#ifdef CONFIG_AUDIT_WATCH
extern void audit_put_watch(struct audit_watch *watch);
@@ -327,6 +331,8 @@ extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
+extern int audit_filter(int msgtype, unsigned int listtype);
+
#ifdef CONFIG_AUDITSYSCALL
extern int __audit_signal_info(int sig, struct task_struct *t);
static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5efe9b299a12..25772476fa4a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg)
static int prune_tree_thread(void *unused)
{
for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (list_empty(&prune_list))
+ if (list_empty(&prune_list)) {
+ set_current_state(TASK_INTERRUPTIBLE);
schedule();
- __set_current_state(TASK_RUNNING);
+ }
mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
@@ -693,16 +693,14 @@ static int audit_launch_prune(void)
{
if (prune_thread)
return 0;
- prune_thread = kthread_create(prune_tree_thread, NULL,
+ prune_thread = kthread_run(prune_tree_thread, NULL,
"audit_prune_tree");
if (IS_ERR(prune_thread)) {
pr_err("cannot start thread audit_prune_tree");
prune_thread = NULL;
return -ENOMEM;
- } else {
- wake_up_process(prune_thread);
- return 0;
}
+ return 0;
}
/* called with audit_filter_mutex */
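For reference, kthread_run() used above is not new behaviour: it is roughly kthread_create() plus an immediate wake_up_process(), which is why the explicit wake-up and the else branch could be dropped:

	/* roughly what kthread_run(fn, data, namefmt) expands to */
	struct task_struct *k = kthread_create(fn, data, namefmt);

	if (!IS_ERR(k))
		wake_up_process(k);	/* start the thread right away */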
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d6709eb70970..0d302a87f21b 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -19,6 +19,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- rcu_read_lock();
- exe_file = rcu_dereference(tsk->mm->exe_file);
+ exe_file = get_task_exe_file(tsk);
+ if (!exe_file)
+ return 0;
ino = exe_file->f_inode->i_ino;
dev = exe_file->f_inode->i_sb->s_dev;
- rcu_read_unlock();
+ fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 94ca7b1e5e7e..85d9cac497e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1290,113 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct audit_krule *rule, int type,
- enum audit_state *state)
+int audit_filter(int msgtype, unsigned int listtype)
{
- int i;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &rule->fields[i];
- pid_t pid;
- int result = 0;
- u32 sid;
-
- switch (f->type) {
- case AUDIT_PID:
- pid = task_pid_nr(current);
- result = audit_comparator(pid, f->op, f->val);
- break;
- case AUDIT_UID:
- result = audit_uid_comparator(current_uid(), f->op, f->uid);
- break;
- case AUDIT_GID:
- result = audit_gid_comparator(current_gid(), f->op, f->gid);
- break;
- case AUDIT_LOGINUID:
- result = audit_uid_comparator(audit_get_loginuid(current),
- f->op, f->uid);
- break;
- case AUDIT_LOGINUID_SET:
- result = audit_comparator(audit_loginuid_set(current),
- f->op, f->val);
- break;
- case AUDIT_MSGTYPE:
- result = audit_comparator(type, f->op, f->val);
- break;
- case AUDIT_SUBJ_USER:
- case AUDIT_SUBJ_ROLE:
- case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
- if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
- result = security_audit_rule_match(sid,
- f->type,
- f->op,
- f->lsm_rule,
- NULL);
- }
- break;
- }
-
- if (!result)
- return 0;
- }
- switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
- }
- return 1;
-}
-
-int audit_filter_user(int type)
-{
- enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
- int rc, ret;
-
- ret = 1; /* Audit by default */
-
- rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- rc = audit_filter_user_rules(&e->rule, type, &state);
- if (rc) {
- if (rc > 0 && state == AUDIT_DISABLED)
- ret = 0;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-int audit_filter_type(int type)
-{
- struct audit_entry *e;
- int result = 0;
+ int ret = 1; /* Audit by default */
rcu_read_lock();
- if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+ if (list_empty(&audit_filter_list[listtype]))
goto unlock_and_return;
+ list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
+ int i, result = 0;
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
- list) {
- int i;
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
- if (f->type == AUDIT_MSGTYPE) {
- result = audit_comparator(type, f->op, f->val);
- if (!result)
- break;
+ pid_t pid;
+ u32 sid;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
+ break;
+ case AUDIT_UID:
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
+ break;
+ case AUDIT_GID:
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
+ break;
+ case AUDIT_LOGINUID:
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(msgtype, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
+ f->type, f->op, f->lsm_rule, NULL);
+ }
+ break;
+ default:
+ goto unlock_and_return;
}
+ if (result < 0) /* error */
+ goto unlock_and_return;
+ if (!result)
+ break;
+ }
+ if (result > 0) {
+ if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE)
+ ret = 0;
+ break;
}
- if (result)
- goto unlock_and_return;
}
unlock_and_return:
rcu_read_unlock();
- return result;
+ return ret;
}
static int update_lsm_rule(struct audit_krule *r)
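The consolidated audit_filter() keeps the semantics of both helpers it replaces: it returns 1 ("audit by default") unless a rule on the given list matches, and on the AUDIT_FILTER_TYPE list a match means the record type is excluded, hence the negated call in audit_log_start() above. A minimal sketch of a hypothetical caller on the user-message path:

	/* illustrative only: mirrors the audit_receive_msg() hunk above */
	static bool demo_user_msg_wanted(int msg_type)
	{
		/* 1 = log it (the default), 0 = a matching AUDIT_NEVER rule
		 * on the AUDIT_FILTER_USER list dropped it */
		return audit_filter(msg_type, AUDIT_FILTER_USER) == 1;
	}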
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d0e3cf8abe1..2cd5256dbff7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -63,7 +63,6 @@
#include <asm/unistd.h>
#include <linux/security.h>
#include <linux/list.h>
-#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
@@ -73,6 +72,7 @@
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <uapi/linux/limits.h>
#include "audit.h"
@@ -82,7 +82,8 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
@@ -456,7 +457,7 @@ static int audit_filter_rules(struct task_struct *tsk,
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(tsk);
+ pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
@@ -695,8 +696,12 @@ static int audit_filter_rules(struct task_struct *tsk,
ctx->prio = rule->prio;
}
switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ case AUDIT_NEVER:
+ *state = AUDIT_DISABLED;
+ break;
+ case AUDIT_ALWAYS:
+ *state = AUDIT_RECORD_CONTEXT;
+ break;
}
return 1;
}
@@ -988,184 +993,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
return rc;
}
-/*
- * to_send and len_sent accounting are very loose estimates. We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
- struct audit_buffer **ab,
- int arg_num,
- size_t *len_sent,
- const char __user *p,
- char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
{
- char arg_num_len_buf[12];
- const char __user *tmp_p = p;
- /* how many digits are in arg_num? 5 is the length of ' a=""' */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
- size_t len, len_left, to_send;
- size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
- unsigned int i, has_cntl = 0, too_long = 0;
- int ret;
-
- /* strnlen_user includes the null we don't want to send */
- len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
- /*
- * We just created this mm, if we can't find the strings
- * we just copied into it something is _very_ wrong. Similar
- * for strings that are too long, we should not have created
- * any.
- */
- if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
- send_sig(SIGKILL, current, 0);
- return -1;
+ long len_max;
+ long len_rem;
+ long len_full;
+ long len_buf;
+ long len_abuf;
+ long len_tmp;
+ bool require_data;
+ bool encode;
+ unsigned int iter;
+ unsigned int arg;
+ char *buf_head;
+ char *buf;
+ const char __user *p = (const char __user *)current->mm->arg_start;
+
+ /* NOTE: this buffer needs to be large enough to hold all the non-arg
+ * data we put in the audit record for this argument (see the
+ * code below) ... at this point in time 96 is plenty */
+ char abuf[96];
+
+ /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+ * current value of 7500 is not as important as the fact that it
+ * is less than 8k, a setting of 7500 gives us plenty of wiggle
+ * room if we go over a little bit in the logging below */
+ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+ len_max = MAX_EXECVE_AUDIT_LEN;
+
+ /* scratch buffer to hold the userspace args */
+ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf_head) {
+ audit_panic("out of memory for argv string");
+ return;
}
+ buf = buf_head;
+
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
- /* walk the whole argument looking for non-ascii chars */
+ len_rem = len_max;
+ len_buf = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ iter = 0;
+ arg = 0;
do {
- if (len_left > MAX_EXECVE_AUDIT_LEN)
- to_send = MAX_EXECVE_AUDIT_LEN;
- else
- to_send = len_left;
- ret = copy_from_user(buf, tmp_p, to_send);
- /*
- * There is no reason for this copy to be short. We just
- * copied them here, and the mm hasn't been exposed to user-
- * space yet.
- */
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
- }
- buf[to_send] = '\0';
- has_cntl = audit_string_contains_control(buf, to_send);
- if (has_cntl) {
- /*
- * hex messages get logged as 2 bytes, so we can only
- * send half as much in each message
- */
- max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
- break;
- }
- len_left -= to_send;
- tmp_p += to_send;
- } while (len_left > 0);
-
- len_left = len;
-
- if (len > max_execve_audit_len)
- too_long = 1;
-
- /* rewalk the argument actually logging the message */
- for (i = 0; len_left > 0; i++) {
- int room_left;
-
- if (len_left > max_execve_audit_len)
- to_send = max_execve_audit_len;
- else
- to_send = len_left;
-
- /* do we have space left to send this argument in this ab? */
- room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
- if (has_cntl)
- room_left -= (to_send * 2);
- else
- room_left -= to_send;
- if (room_left < 0) {
- *len_sent = 0;
- audit_log_end(*ab);
- *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
- if (!*ab)
- return 0;
- }
+ /* NOTE: we don't ever want to trust this value for anything
+ * serious, but the audit record format insists we
+ * provide an argument length for really long arguments,
+ * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+ * to use strncpy_from_user() to obtain this value for
+ * recording in the log, although we don't use it
+ * anywhere here to avoid a double-fetch problem */
+ if (len_full == 0)
+ len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /* read more data from userspace */
+ if (require_data) {
+ /* can we make more room in the buffer? */
+ if (buf != buf_head) {
+ memmove(buf_head, buf, len_buf);
+ buf = buf_head;
+ }
+
+ /* fetch as much as we can of the argument */
+ len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+ len_max - len_buf);
+ if (len_tmp == -EFAULT) {
+ /* unable to copy from userspace */
+ send_sig(SIGKILL, current, 0);
+ goto out;
+ } else if (len_tmp == (len_max - len_buf)) {
+ /* buffer is not large enough */
+ require_data = true;
+ /* NOTE: if we are going to span multiple
+ * buffers force the encoding so we stand
+ * a chance at a sane len_full value and
+ * consistent record encoding */
+ encode = true;
+ len_full = len_full * 2;
+ p += len_tmp;
+ } else {
+ require_data = false;
+ if (!encode)
+ encode = audit_string_contains_control(
+ buf, len_tmp);
+ /* try to use a trusted value for len_full */
+ if (len_full < len_max)
+ len_full = (encode ?
+ len_tmp * 2 : len_tmp);
+ p += len_tmp + 1;
+ }
+ len_buf += len_tmp;
+ buf_head[len_buf] = '\0';
- /*
- * first record needs to say how long the original string was
- * so we can be sure nothing was lost.
- */
- if ((i == 0) && (too_long))
- audit_log_format(*ab, " a%d_len=%zu", arg_num,
- has_cntl ? 2*len : len);
-
- /*
- * normally arguments are small enough to fit and we already
- * filled buf above when we checked for control characters
- * so don't bother with another copy_from_user
- */
- if (len >= max_execve_audit_len)
- ret = copy_from_user(buf, p, to_send);
- else
- ret = 0;
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+ /* length of the buffer in the audit record? */
+ len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
- buf[to_send] = '\0';
-
- /* actually log it */
- audit_log_format(*ab, " a%d", arg_num);
- if (too_long)
- audit_log_format(*ab, "[%d]", i);
- audit_log_format(*ab, "=");
- if (has_cntl)
- audit_log_n_hex(*ab, buf, to_send);
- else
- audit_log_string(*ab, buf);
-
- p += to_send;
- len_left -= to_send;
- *len_sent += arg_num_len;
- if (has_cntl)
- *len_sent += to_send * 2;
- else
- *len_sent += to_send;
- }
- /* include the null we didn't log */
- return len + 1;
-}
-static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
-{
- int i, len;
- size_t len_sent = 0;
- const char __user *p;
- char *buf;
+ /* write as much as we can to the audit log */
+ if (len_buf > 0) {
+ /* NOTE: some magic numbers here - basically if we
+ * can't fit a reasonable amount of data into the
+ * existing audit buffer, flush it and start with
+ * a new buffer */
+ if ((sizeof(abuf) + 8) > len_rem) {
+ len_rem = len_max;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context,
+ GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ goto out;
+ }
- p = (const char __user *)current->mm->arg_start;
+ /* create the non-arg portion of the arg record */
+ len_tmp = 0;
+ if (require_data || (iter > 0) ||
+ ((len_abuf + sizeof(abuf)) > len_rem)) {
+ if (iter == 0) {
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d_len=%lu",
+ arg, len_full);
+ }
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d[%d]=", arg, iter++);
+ } else
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d=", arg);
+ WARN_ON(len_tmp >= sizeof(abuf));
+ abuf[sizeof(abuf) - 1] = '\0';
+
+ /* log the arg in the audit record */
+ audit_log_format(*ab, "%s", abuf);
+ len_rem -= len_tmp;
+ len_tmp = len_buf;
+ if (encode) {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem / 2; /* encoding */
+ audit_log_n_hex(*ab, buf, len_tmp);
+ len_rem -= len_tmp * 2;
+ len_abuf -= len_tmp * 2;
+ } else {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem - 2; /* quotes */
+ audit_log_n_string(*ab, buf, len_tmp);
+ len_rem -= len_tmp + 2;
+ /* don't subtract the "2" because we still need
+ * to add quotes to the remaining string */
+ len_abuf -= len_tmp;
+ }
+ len_buf -= len_tmp;
+ buf += len_tmp;
+ }
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+ /* ready to move to the next argument? */
+ if ((len_buf == 0) && !require_data) {
+ arg++;
+ iter = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ }
+ } while (arg < context->execve.argc);
- /*
- * we need some kernel buffer to hold the userspace args. Just
- * allocate one big one rather than allocating one of the right size
- * for every single argument inside audit_log_single_execve_arg()
- * should be <8k allocation so should be pretty safe.
- */
- buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
- if (!buf) {
- audit_panic("out of memory for argv string");
- return;
- }
+ /* NOTE: the caller handles the final audit_log_end() call */
- for (i = 0; i < context->execve.argc; i++) {
- len = audit_log_single_execve_arg(context, ab, i,
- &len_sent, p, buf);
- if (len <= 0)
- break;
- p += len;
- }
- kfree(buf);
+out:
+ kfree(buf_head);
}
static void show_special(struct audit_context *context, int *call_panic)
@@ -1426,7 +1425,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, " cwd=", &context->pwd);
+ audit_log_d_path(ab, "cwd=", &context->pwd);
audit_log_end(ab);
}
}
@@ -1980,21 +1979,26 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
{
struct audit_buffer *ab;
uid_t uid, oldloginuid, loginuid;
+ struct tty_struct *tty;
if (!audit_enabled)
return;
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
loginuid = from_kuid(&init_user_ns, kloginuid),
+ tty = audit_get_tty(current);
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (!ab)
- return;
- audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
- audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
- oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+ audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+ oldsessionid, sessionid, !rc);
+ audit_put_tty(tty);
audit_log_end(ab);
}
@@ -2216,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = task_pid_nr(t);
+ context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2241,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_pid_nr(tsk);
+ audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2341,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = task_pid_nr(current);
+ context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2373,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
}
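The rewritten audit_log_execve_info() above streams each argument through one fixed-size kernel buffer with strncpy_from_user(), rather than sizing a second copy from an earlier strnlen_user() pass, which is what closes the double-fetch window mentioned in the comments. A minimal sketch of the return-value handling the loop depends on (hypothetical helper, not part of the patch):

	/* illustrative only: fetch one chunk of a user string into buf[] */
	static long demo_fetch_chunk(char *buf, long buf_len, const char __user *p)
	{
		long n = strncpy_from_user(buf, p, buf_len);

		if (n == -EFAULT)
			return n;	/* userspace mapping went away: abort */
		if (n == buf_len)
			return -E2BIG;	/* truncated: the caller must loop for the rest */
		return n;		/* fits: buf is NUL-terminated, n excludes the NUL */
	}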
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..a2ac051c342f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
}
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
+
if (IS_ERR(prog))
return prog;
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+
return prog;
}
static void prog_fd_array_put_ptr(void *ptr)
{
- struct bpf_prog *prog = ptr;
-
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(ptr);
}
/* decrement refcnt of all bpf_progs that are stored in this map */
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
}
late_initcall(register_prog_array_map);
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+ struct file *map_file)
{
- bpf_fd_array_map_clear(map);
- fd_array_map_free(map);
+ struct bpf_event_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ if (ee) {
+ ee->event = perf_file->private_data;
+ ee->perf_file = perf_file;
+ ee->map_file = map_file;
+ }
+
+ return ee;
}
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
{
- struct perf_event *event;
- const struct perf_event_attr *attr;
- struct file *file;
+ struct bpf_event_entry *ee;
- file = perf_event_get(fd);
- if (IS_ERR(file))
- return file;
+ ee = container_of(rcu, struct bpf_event_entry, rcu);
+ fput(ee->perf_file);
+ kfree(ee);
+}
- event = file->private_data;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+ call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
- attr = perf_event_attrs(event);
- if (IS_ERR(attr))
- goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ const struct perf_event_attr *attr;
+ struct bpf_event_entry *ee;
+ struct perf_event *event;
+ struct file *perf_file;
- if (attr->inherit)
- goto err;
+ perf_file = perf_event_get(fd);
+ if (IS_ERR(perf_file))
+ return perf_file;
- if (attr->type == PERF_TYPE_RAW)
- return file;
+ event = perf_file->private_data;
+ ee = ERR_PTR(-EINVAL);
- if (attr->type == PERF_TYPE_HARDWARE)
- return file;
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr) || attr->inherit)
+ goto err_out;
+
+ switch (attr->type) {
+ case PERF_TYPE_SOFTWARE:
+ if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+ goto err_out;
+ /* fall-through */
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
+ /* fall-through */
+ default:
+ break;
+ }
- if (attr->type == PERF_TYPE_SOFTWARE &&
- attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return file;
-err:
- fput(file);
- return ERR_PTR(-EINVAL);
+err_out:
+ fput(perf_file);
+ return ee;
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- fput((struct file *)ptr);
+ bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+ struct file *map_file)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_event_entry *ee;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < array->map.max_entries; i++) {
+ ee = READ_ONCE(array->ptrs[i]);
+ if (ee && ee->map_file == map_file)
+ fd_array_map_delete_elem(map, &i);
+ }
+ rcu_read_unlock();
}
static const struct bpf_map_ops perf_event_array_ops = {
.map_alloc = fd_array_map_alloc,
- .map_free = perf_event_array_map_free,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+ .map_release = perf_event_fd_array_release,
};
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
return 0;
}
late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_CGROUPS
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int fd)
+{
+ return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+ /* cgroup_put free cgrp after a rcu grace period */
+ cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = cgroup_fd_array_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+ .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+ .ops = &cgroup_array_ops,
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+ bpf_register_map_type(&cgroup_array_type);
+ return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
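The bpf_event_entry objects above are freed via call_rcu() so that a BPF program still holding a READ_ONCE()'d pointer under rcu_read_lock() never sees the perf file reference dropped underneath it. A generic sketch of that deferred-free pattern, using a hypothetical structure rather than the patch's types:

	struct demo_entry {
		struct file *filp;
		struct rcu_head rcu;
	};

	static void __demo_entry_free(struct rcu_head *rcu)
	{
		struct demo_entry *e = container_of(rcu, struct demo_entry, rcu);

		fput(e->filp);		/* drop the file ref only after the grace period */
		kfree(e);
	}

	static void demo_entry_free_rcu(struct demo_entry *e)
	{
		/* safe against concurrent readers under rcu_read_lock() */
		call_rcu(&e->rcu, __demo_entry_free);
	}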
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f1e8a0def99b..aa6d98154106 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->pages = size / PAGE_SIZE;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (prandom_u32() % hole) & ~(alignment - 1);
+ start = (get_random_int() % hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
struct bpf_insn *to_buff)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = prandom_u32();
+ u32 imm_rnd = get_random_int();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -719,14 +719,13 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
-
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
prog = READ_ONCE(array->ptrs[index]);
- if (unlikely(!prog))
+ if (!prog)
goto out;
/* ARG1 at this point is guaranteed to point to CTX from
@@ -1019,7 +1018,7 @@ void bpf_user_rnd_init_once(void)
prandom_init_once(&bpf_user_rnd_state);
}
-u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_user_rnd_u32)
{
/* Should someone ever have the rather unwise idea to use some
* of the registers passed into this function, then note that
@@ -1032,7 +1031,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
state = &get_cpu_var(bpf_user_rnd_state);
res = prandom_u32_state(state);
- put_cpu_var(state);
+ put_cpu_var(bpf_user_rnd_state);
return res;
}
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- return NULL;
+ return -ENOTSUPP;
}
/* Always built-in helper functions. */
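On the put_cpu_var() change in bpf_user_rnd_u32() above: put_cpu_var() only evaluates its argument for annotation purposes and then re-enables preemption, so the old call happened to work, but the argument is meant to be the per-cpu variable itself, mirroring get_cpu_var(). A small sketch of the intended pairing, with a hypothetical per-cpu variable (seeding omitted):

	#include <linux/percpu.h>
	#include <linux/random.h>

	static DEFINE_PER_CPU(struct rnd_state, demo_rnd_state);

	static u32 demo_random_u32(void)
	{
		/* get_cpu_var() disables preemption and yields this CPU's copy */
		struct rnd_state *state = &get_cpu_var(demo_rnd_state);
		u32 res = prandom_u32_state(state);

		put_cpu_var(demo_rnd_state);	/* name the variable, not the local pointer */
		return res;
	}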
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
struct bucket *buckets;
void *elems;
struct pcpu_freelist freelist;
+ void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
+enum extra_elem_state {
+ HTAB_NOT_AN_EXTRA_ELEM = 0,
+ HTAB_EXTRA_ELEM_FREE,
+ HTAB_EXTRA_ELEM_USED
+};
+
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
@@ -38,7 +45,10 @@ struct htab_elem {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
};
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ enum extra_elem_state state;
+ };
u32 hash;
char key[0] __aligned(8);
};
@@ -113,6 +123,23 @@ free_elems:
return err;
}
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+ void __percpu *pptr;
+ int cpu;
+
+ pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+ HTAB_EXTRA_ELEM_FREE;
+ }
+ htab->extra_elems = pptr;
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (percpu)
cost += (u64) round_up(htab->map.value_size, 8) *
num_possible_cpus() * htab->map.max_entries;
+ else
+ cost += (u64) htab->elem_size * num_possible_cpus();
if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
+ if (!percpu) {
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_buckets;
+ }
+
if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
err = prealloc_elems_and_freelist(htab);
if (err)
- goto free_buckets;
+ goto free_extra_elems;
}
return &htab->map;
+free_extra_elems:
+ free_percpu(htab->extra_elems);
free_buckets:
kvfree(htab->buckets);
free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
kfree(l);
-
}
static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
+ if (l->state == HTAB_EXTRA_ELEM_USED) {
+ l->state = HTAB_EXTRA_ELEM_FREE;
+ return;
+ }
+
if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
- bool percpu, bool onallcpus)
+ bool percpu, bool onallcpus,
+ bool old_elem_exists)
{
u32 size = htab->map.value_size;
bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
struct htab_elem *l_new;
void __percpu *pptr;
+ int err = 0;
if (prealloc) {
l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
if (!l_new)
- return ERR_PTR(-E2BIG);
+ err = -E2BIG;
} else {
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
atomic_dec(&htab->count);
- return ERR_PTR(-E2BIG);
+ err = -E2BIG;
+ } else {
+ l_new = kmalloc(htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
+ }
+
+ if (err) {
+ if (!old_elem_exists)
+ return ERR_PTR(err);
+
+ /* if we're updating the existing element and the hash table
+ * is full, use per-cpu extra elems
+ */
+ l_new = this_cpu_ptr(htab->extra_elems);
+ if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+ return ERR_PTR(-E2BIG);
+ l_new->state = HTAB_EXTRA_ELEM_USED;
+ } else {
+ l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
}
memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
- l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+ !!l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
}
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus);
+ hash, true, onallcpus, false);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
htab_free_elems(htab);
pcpu_freelist_destroy(&htab->freelist);
}
+ free_percpu(htab->extra_elems);
kvfree(htab->buckets);
kfree(htab);
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
+#include <linux/filter.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -26,48 +27,32 @@
* if program is allowed to access maps, so check rcu_read_lock_held in
* all three functions.
*/
-static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- /* verifier checked that R1 contains a valid pointer to bpf_map
- * and R2 points to a program stack and map->key_size bytes were
- * initialized
- */
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
- value = map->ops->map_lookup_elem(map, key);
-
- /* lookup() returns either pointer to element value or NULL
- * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
- */
- return (unsigned long) value;
+ return (unsigned long) map->ops->map_lookup_elem(map, key);
}
const struct bpf_func_proto bpf_map_lookup_elem_proto = {
.func = bpf_map_lookup_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
-static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
+ void *, value, u64, flags)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value = (void *) (unsigned long) r3;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
- return map->ops->map_update_elem(map, key, value, r4);
+ return map->ops->map_update_elem(map, key, value, flags);
}
const struct bpf_func_proto bpf_map_update_elem_proto = {
.func = bpf_map_update_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
return map->ops->map_delete_elem(map, key);
}
const struct bpf_func_proto bpf_map_delete_elem_proto = {
.func = bpf_map_delete_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -99,9 +81,9 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_smp_processor_id)
{
- return raw_smp_processor_id();
+ return smp_processor_id();
}
const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
@@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_ktime_get_ns)
{
/* NMI safe access to clock monotonic */
return ktime_get_mono_fast_ns();
@@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
- if (!task)
+ if (unlikely(!task))
return -EINVAL;
return (u64) task->tgid << 32 | task->pid;
@@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_uid_gid)
{
struct task_struct *task = current;
kuid_t uid;
kgid_t gid;
- if (!task)
+ if (unlikely(!task))
return -EINVAL;
current_uid_gid(&uid, &gid);
return (u64) from_kgid(&init_user_ns, gid) << 32 |
- from_kuid(&init_user_ns, uid);
+ from_kuid(&init_user_ns, uid);
}
const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
@@ -158,10 +140,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
{
struct task_struct *task = current;
- char *buf = (char *) (long) r1;
if (unlikely(!task))
goto err_clear;
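The BPF_CALL_n() macros used throughout this file generate the five-u64 wrapper that BPF call sites expect, so the helper body can be written with typed parameters instead of hand-casting r1..r5. A minimal sketch of declaring a made-up helper in the new style (the name and proto below are hypothetical):

	BPF_CALL_2(bpf_demo_lookup, struct bpf_map *, map, void *, key)
	{
		WARN_ON_ONCE(!rcu_read_lock_held());
		return (unsigned long) map->ops->map_lookup_elem(map, key);
	}

	static const struct bpf_func_proto bpf_demo_lookup_proto = {
		.func		= bpf_demo_lookup,
		.gpl_only	= false,
		.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
		.arg1_type	= ARG_CONST_MAP_PTR,
		.arg2_type	= ARG_PTR_TO_MAP_KEY,
	};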
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 8f94ca1860cf..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
* version 2 as published by the Free Software Foundation.
*/
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/mount.h>
@@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
return 0;
}
-static bool bpf_dname_reserved(const struct dentry *dentry)
-{
- return strchr(dentry->d_name.name, '.');
-}
-
static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- if (bpf_dname_reserved(dentry))
- return -EPERM;
-
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
{
struct inode *inode;
- if (bpf_dname_reserved(dentry))
- return -EPERM;
-
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
}
}
-static int bpf_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *new_dentry)
+static struct dentry *
+bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
- if (bpf_dname_reserved(new_dentry))
- return -EPERM;
-
- return simple_link(old_dentry, dir, new_dentry);
-}
-
-static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- if (bpf_dname_reserved(new_dentry))
- return -EPERM;
-
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (strchr(dentry->d_name.name, '.'))
+ return ERR_PTR(-EPERM);
+ return simple_lookup(dir, dentry, flags);
}
static const struct inode_operations bpf_dir_iops = {
- .lookup = simple_lookup,
+ .lookup = bpf_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
- .rename = bpf_rename,
- .link = bpf_link,
+ .rename = simple_rename,
+ .link = simple_link,
.unlink = simple_unlink,
};
@@ -378,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
static struct dentry *bpf_mount(struct file_system_type *type, int flags,
const char *dev_name, void *data)
{
- return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+ return mount_nodev(type, flags, data, bpf_fill_super);
}
static struct file_system_type bpf_fs_type = {
@@ -386,11 +365,8 @@ static struct file_system_type bpf_fs_type = {
.name = "bpf",
.mount = bpf_mount,
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
};
-MODULE_ALIAS_FS("bpf");
-
static int __init bpf_init(void)
{
int ret;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index c8ee35287bfe..732ae16d12b7 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (err)
goto free_smap;
- err = get_callchain_buffers();
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
goto free_smap;
@@ -116,10 +116,9 @@ free_smap:
return ERR_PTR(err);
}
-u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
@@ -136,7 +135,8 @@ u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
static int bpf_map_release(struct inode *inode, struct file *filp)
{
- bpf_map_put_with_uref(filp->private_data);
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_release)
+ map->ops->map_release(map, filp);
+
+ bpf_map_put_with_uref(map);
return 0;
}
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
- if (atomic_dec_and_test(&prog->aux->refcnt))
- call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt))
- __prog_put_common(&prog->aux->rcu);
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
return 0;
}
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&prog->aux->refcnt);
+ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ return bpf_prog_add(prog, 1);
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = __bpf_prog_get(f);
+ prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
+ if (type && prog->type != *type) {
+ prog = ERR_PTR(-EINVAL);
+ goto out;
+ }
prog = bpf_prog_inc(prog);
+out:
fdput(f);
-
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+ return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
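A hedged usage sketch (hypothetical call site, not in this patch) of the new type-checked program lookup; callers that accept any program type keep using bpf_prog_get():

	struct bpf_prog *prog;

	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(prog))
		return PTR_ERR(prog);
	/* ... attach the program ... */
	bpf_prog_put(prog);	/* drops the reference taken above */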
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a08d66215245..99a7e5b388f2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
@@ -126,100 +127,16 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/
-/* types of values stored in eBPF registers */
-enum bpf_reg_type {
- NOT_INIT = 0, /* nothing was written into register */
- UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
- PTR_TO_CTX, /* reg points to bpf_context */
- CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
- PTR_TO_MAP_VALUE, /* reg points to map element value */
- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
- FRAME_PTR, /* reg == frame_pointer */
- PTR_TO_STACK, /* reg == frame_pointer + imm */
- CONST_IMM, /* constant integer value */
-
- /* PTR_TO_PACKET represents:
- * skb->data
- * skb->data + imm
- * skb->data + (u16) var
- * skb->data + (u16) var + imm
- * if (range > 0) then [ptr, ptr + range - off) is safe to access
- * if (id > 0) means that some 'var' was added
- * if (off > 0) means that 'imm' was added
- */
- PTR_TO_PACKET,
- PTR_TO_PACKET_END, /* skb->data + headlen */
-};
-
-struct reg_state {
- enum bpf_reg_type type;
- union {
- /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
- s64 imm;
-
- /* valid when type == PTR_TO_PACKET* */
- struct {
- u32 id;
- u16 off;
- u16 range;
- };
-
- /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
- * PTR_TO_MAP_VALUE_OR_NULL
- */
- struct bpf_map *map_ptr;
- };
-};
-
-enum bpf_stack_slot_type {
- STACK_INVALID, /* nothing was stored in this stack slot */
- STACK_SPILL, /* register spilled into stack */
- STACK_MISC /* BPF program wrote some data into this slot */
-};
-
-#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
-
-/* state of the program:
- * type of all registers and stack info
- */
-struct verifier_state {
- struct reg_state regs[MAX_BPF_REG];
- u8 stack_slot_type[MAX_BPF_STACK];
- struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
-};
-
-/* linked list of verifier states used to prune search */
-struct verifier_state_list {
- struct verifier_state state;
- struct verifier_state_list *next;
-};
-
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
-struct verifier_stack_elem {
+struct bpf_verifier_stack_elem {
/* verifier state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
- struct verifier_state st;
+ struct bpf_verifier_state st;
int insn_idx;
int prev_insn_idx;
- struct verifier_stack_elem *next;
-};
-
-#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
-
-/* single container for all structs
- * one verifier_env per bpf_check() call
- */
-struct verifier_env {
- struct bpf_prog *prog; /* eBPF program being verified */
- struct verifier_stack_elem *head; /* stack of verifier states to be processed */
- int stack_size; /* number of states to be processed */
- struct verifier_state cur_state; /* current verifier state */
- struct verifier_state_list **explored_states; /* search pruning optimization */
- struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
- u32 used_map_cnt; /* number of used maps */
- bool allow_ptr_leaks;
+ struct bpf_verifier_stack_elem *next;
};
#define BPF_COMPLEXITY_LIMIT_INSNS 65536
@@ -228,6 +145,7 @@ struct verifier_env {
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
+ bool pkt_access;
int regno;
int access_size;
};
@@ -264,6 +182,7 @@ static const char * const reg_type_str[] = {
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
[FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
[CONST_IMM] = "imm",
@@ -271,9 +190,9 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
};
-static void print_verifier_state(struct verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_state *state)
{
- struct reg_state *reg;
+ struct bpf_reg_state *reg;
enum bpf_reg_type t;
int i;
@@ -291,10 +210,17 @@ static void print_verifier_state(struct verifier_state *state)
else if (t == UNKNOWN_VALUE && reg->imm)
verbose("%lld", reg->imm);
else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
+ t == PTR_TO_MAP_VALUE_OR_NULL ||
+ t == PTR_TO_MAP_VALUE_ADJ)
verbose("(ks=%d,vs=%d)",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
+ if (reg->min_value != BPF_REGISTER_MIN_RANGE)
+ verbose(",min_value=%llu",
+ (unsigned long long)reg->min_value);
+ if (reg->max_value != BPF_REGISTER_MAX_RANGE)
+ verbose(",max_value=%llu",
+ (unsigned long long)reg->max_value);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -449,9 +375,9 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}
-static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
{
- struct verifier_stack_elem *elem;
+ struct bpf_verifier_stack_elem *elem;
int insn_idx;
if (env->head == NULL)
@@ -468,12 +394,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
return insn_idx;
}
-static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
- int prev_insn_idx)
+static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
{
- struct verifier_stack_elem *elem;
+ struct bpf_verifier_stack_elem *elem;
- elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
@@ -499,13 +425,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void init_reg_state(struct reg_state *regs)
+static void init_reg_state(struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
regs[i].type = NOT_INIT;
regs[i].imm = 0;
+ regs[i].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[i].max_value = BPF_REGISTER_MAX_RANGE;
}
/* frame pointer */
@@ -515,20 +443,26 @@ static void init_reg_state(struct reg_state *regs)
regs[BPF_REG_1].type = PTR_TO_CTX;
}
-static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
{
BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
regs[regno].imm = 0;
}
+static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+{
+ regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct reg_state *regs, u32 regno,
+static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
enum reg_arg_type t)
{
if (regno >= MAX_BPF_REG) {
@@ -588,8 +522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct verifier_state *state, int off, int size,
- int value_regno)
+static int check_stack_write(struct bpf_verifier_state *state, int off,
+ int size, int value_regno)
{
int i;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -614,7 +548,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
} else {
/* regular write of data into stack */
state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct reg_state) {};
+ (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -622,7 +556,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
return 0;
}
-static int check_stack_read(struct verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
@@ -663,7 +597,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct verifier_env *env, u32 regno, int off,
+static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -678,31 +612,48 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
-static int check_packet_access(struct verifier_env *env, u32 regno, int off,
- int size)
+static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *reg = &regs[regno];
- int linear_size = (int) reg->range - (int) reg->off;
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ if (meta)
+ return meta->pkt_access;
- if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
- verbose("verifier bug\n");
- return -EFAULT;
+ env->seen_direct_write = true;
+ return true;
+ default:
+ return false;
}
- if (off < 0 || off + size > linear_size) {
- verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
- off, size, linear_size);
+}
+
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *reg = &regs[regno];
+
+ off += reg->off;
+ if (off < 0 || size <= 0 || off + size > reg->range) {
+ verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
return 0;
}
/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct verifier_env *env, int off, int size,
- enum bpf_access_type t)
+static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+ enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ /* for analyzer ctx accesses are already validated and converted */
+ if (env->analyzer_ops)
+ return 0;
+
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t)) {
+ env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -713,7 +664,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
return -EACCES;
}
-static bool is_pointer_value(struct verifier_env *env, int regno)
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
if (env->allow_ptr_leaks)
return false;
@@ -727,27 +678,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
-static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
- int off, int size)
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int off, int size)
{
- if (reg->type != PTR_TO_PACKET) {
+ if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
if (off % size != 0) {
- verbose("misaligned access off %d size %d\n", off, size);
+ verbose("misaligned access off %d size %d\n",
+ off, size);
return -EACCES;
} else {
return 0;
}
}
- switch (env->prog->type) {
- case BPF_PROG_TYPE_SCHED_CLS:
- case BPF_PROG_TYPE_SCHED_ACT:
- break;
- default:
- verbose("verifier is misconfigured\n");
- return -EACCES;
- }
-
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
/* misaligned access to packet is ok on x86,arm,arm64 */
return 0;
@@ -758,7 +701,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
}
/* skb->data is NET_IP_ALIGN-ed */
- if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
+ if (reg->type == PTR_TO_PACKET &&
+ (NET_IP_ALIGN + reg->off + off) % size != 0) {
verbose("misaligned packet access off %d+%d+%d size %d\n",
NET_IP_ALIGN, reg->off, off, size);
return -EACCES;
@@ -772,12 +716,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
if (reg->type == PTR_TO_STACK)
@@ -791,32 +735,69 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_VALUE ||
+ reg->type == PTR_TO_MAP_VALUE_ADJ) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
+
+ /* If we adjusted the register to this map value at all then we
+ * need to change off and size to min_value and max_value
+ * respectively to make sure our theoretical access will be
+ * safe.
+ */
+ if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ if (log_level)
+ print_verifier_state(state);
+ env->varlen_map_value_access = true;
+ /* The minimum value is only important with signed
+ * comparisons where we can't assume the floor of a
+ * value is 0. If we are using signed variables for our
+ * indexes we need to make sure that whatever we use
+ * will have a set floor within our range.
+ */
+ if ((s64)reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_map_access(env, regno, reg->min_value + off,
+ size);
+ if (err) {
+ verbose("R%d min value is outside of the array range\n",
+ regno);
+ return err;
+ }
+
+ /* If we haven't set a max value then we need to bail
+ * since we can't be sure we won't do bad things.
+ */
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ regno);
+ return -EACCES;
+ }
+ off += reg->max_value;
+ }
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
+ enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- err = check_ctx_access(env, off, size, t);
+ err = check_ctx_access(env, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value(state->regs, value_regno);
- if (off == offsetof(struct __sk_buff, data) &&
- env->allow_ptr_leaks)
- /* note that reg.[id|off|range] == 0 */
- state->regs[value_regno].type = PTR_TO_PACKET;
- else if (off == offsetof(struct __sk_buff, data_end) &&
- env->allow_ptr_leaks)
- state->regs[value_regno].type = PTR_TO_PACKET_END;
+ /* note that reg.[id|off|range] == 0 */
+ state->regs[value_regno].type = reg_type;
}
} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -836,10 +817,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
verbose("cannot write into packet\n");
return -EACCES;
}
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into packet\n", value_regno);
+ return -EACCES;
+ }
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -860,9 +846,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
return err;
}
-static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -896,12 +882,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized
*/
-static int check_stack_boundary(struct verifier_env *env, int regno,
+static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = state->regs;
int off, i;
if (regs[regno].type != PTR_TO_STACK) {
@@ -940,18 +926,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
return 0;
}
-static int check_func_arg(struct verifier_env *env, u32 regno,
+static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct reg_state *reg = env->cur_state.regs + regno;
- enum bpf_reg_type expected_type;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
if (arg_type == ARG_DONTCARE)
return 0;
- if (reg->type == NOT_INIT) {
+ if (type == NOT_INIT) {
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
@@ -964,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return 0;
}
+ if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+ verbose("helper access to the packet is not allowed\n");
+ return -EACCES;
+ }
+
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
+ if (type != PTR_TO_PACKET && type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_CONST_STACK_SIZE ||
arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_CTX) {
expected_type = PTR_TO_CTX;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_STACK ||
arg_type == ARG_PTR_TO_RAW_STACK) {
expected_type = PTR_TO_STACK;
@@ -981,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
* passed in as argument, it's a CONST_IMM type. Final test
* happens during stack boundary checking.
*/
- if (reg->type == CONST_IMM && reg->imm == 0)
- expected_type = CONST_IMM;
+ if (type == CONST_IMM && reg->imm == 0)
+ /* final test in check_stack_boundary() */;
+ else if (type != PTR_TO_PACKET && type != expected_type)
+ goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
- if (reg->type != expected_type) {
- verbose("R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type], reg_type_str[expected_type]);
- return -EACCES;
- }
-
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
@@ -1012,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
- false, NULL);
+ if (type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno, 0,
+ meta->map_ptr->key_size);
+ else
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->key_size,
+ false, NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -1023,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno,
- meta->map_ptr->value_size,
- false, NULL);
+ if (type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno, 0,
+ meta->map_ptr->value_size);
+ else
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->value_size,
+ false, NULL);
} else if (arg_type == ARG_CONST_STACK_SIZE ||
arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -1039,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
+ if (regs[regno - 1].type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno - 1, 0, reg->imm);
+ else
+ err = check_stack_boundary(env, regno - 1, reg->imm,
+ zero_size_allowed, meta);
}
return err;
+err_type:
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[type], reg_type_str[expected_type]);
+ return -EACCES;
}
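The relaxed argument checks above permit, roughly, the following BPF C pattern (a sketch with assumed map and program names): passing bounds-checked packet bytes straight to a helper instead of copying them to the stack first:

	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	long *value;

	if (data + 8 > data_end)
		return TC_ACT_OK;
	/* the first 8 packet bytes serve directly as the map key */
	value = bpf_map_lookup_elem(&my_map, data);
	if (!value)
		return TC_ACT_OK;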
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
@@ -1066,6 +1077,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_get_stackid)
goto error;
break;
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ if (func_id != BPF_FUNC_skb_under_cgroup &&
+ func_id != BPF_FUNC_current_task_under_cgroup)
+ goto error;
+ break;
default:
break;
}
@@ -1085,6 +1101,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+ case BPF_FUNC_current_task_under_cgroup:
+ case BPF_FUNC_skb_under_cgroup:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+ goto error;
+ break;
default:
break;
}
@@ -1114,10 +1135,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
-static void clear_all_pkt_pointers(struct verifier_env *env)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs, *reg;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
@@ -1137,12 +1158,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env)
}
}
-static int check_call(struct verifier_env *env, int func_id)
+static int check_call(struct bpf_verifier_env *env, int func_id)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct reg_state *regs = state->regs;
- struct reg_state *reg;
+ struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *reg;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
@@ -1170,6 +1191,7 @@ static int check_call(struct verifier_env *env, int func_id)
changes_data = bpf_helper_changes_skb_data(fn->func);
memset(&meta, 0, sizeof(meta));
+ meta.pkt_access = fn->pkt_access;
/* We only support one arg being in raw mode at the moment, which
* is sufficient for the helper functions we have right now.
@@ -1220,6 +1242,7 @@ static int check_call(struct verifier_env *env, int func_id)
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -1244,11 +1267,13 @@ static int check_call(struct verifier_env *env, int func_id)
return 0;
}
-static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
+static int check_packet_ptr_add(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *dst_reg = &regs[insn->dst_reg];
- struct reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state tmp_reg;
s32 imm;
if (BPF_SRC(insn->code) == BPF_K) {
@@ -1271,6 +1296,19 @@ add_imm:
*/
dst_reg->off += imm;
} else {
+ if (src_reg->type == PTR_TO_PACKET) {
+ /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
+ tmp_reg = *dst_reg; /* save r7 state */
+ *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
+ src_reg = &tmp_reg; /* pretend it's src_reg state */
+ /* if the checks below reject it, the copy won't matter,
+ * since we're rejecting the whole program. If all ok,
+ * then imm22 state will be added to r7
+ * and r7 will be pkt(id=0,off=22,r=62) while
+ * r6 will stay as pkt(id=0,off=0,r=62)
+ */
+ }
+
if (src_reg->type == CONST_IMM) {
/* pkt_ptr += reg where reg is known constant */
imm = src_reg->imm;
@@ -1294,7 +1332,7 @@ add_imm:
/* dst_reg stays as pkt_ptr type and since some positive
* integer value was added to the pointer, increment its 'id'
*/
- dst_reg->id++;
+ dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range and off to zero */
dst_reg->off = 0;
@@ -1303,10 +1341,10 @@ add_imm:
return 0;
}
-static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
+static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
u8 opcode = BPF_OP(insn->code);
s64 imm_log2;
@@ -1316,7 +1354,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
*/
if (BPF_SRC(insn->code) == BPF_X) {
- struct reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
dst_reg->imm && opcode == BPF_ADD) {
@@ -1405,11 +1443,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
+static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *dst_reg = &regs[insn->dst_reg];
- struct reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
@@ -1425,10 +1464,110 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static void check_reg_overflow(struct bpf_reg_state *reg)
+{
+ if (reg->max_value > BPF_REGISTER_MAX_RANGE)
+ reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE)
+ reg->min_value = BPF_REGISTER_MIN_RANGE;
+}
+
+static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE;
+ bool min_set = false, max_set = false;
+ u8 opcode = BPF_OP(insn->code);
+
+ dst_reg = &regs[insn->dst_reg];
+ if (BPF_SRC(insn->code) == BPF_X) {
+ check_reg_overflow(&regs[insn->src_reg]);
+ min_val = regs[insn->src_reg].min_value;
+ max_val = regs[insn->src_reg].max_value;
+
+ /* If the source register is a random pointer then the
+ * min_value/max_value values represent the range of the known
+ * accesses into that value, not the actual min/max value of the
+ * register itself. In this case we have to reset the reg range
+ * values so we know it is not safe to look at.
+ */
+ if (regs[insn->src_reg].type != CONST_IMM &&
+ regs[insn->src_reg].type != UNKNOWN_VALUE) {
+ min_val = BPF_REGISTER_MIN_RANGE;
+ max_val = BPF_REGISTER_MAX_RANGE;
+ }
+ } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
+ (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
+ min_val = max_val = insn->imm;
+ min_set = max_set = true;
+ }
+
+ /* We don't know anything about what was done to this register, mark it
+ * as unknown.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) {
+ reset_reg_range_values(regs, insn->dst_reg);
+ return;
+ }
+
+ switch (opcode) {
+ case BPF_ADD:
+ dst_reg->min_value += min_val;
+ dst_reg->max_value += max_val;
+ break;
+ case BPF_SUB:
+ dst_reg->min_value -= min_val;
+ dst_reg->max_value -= max_val;
+ break;
+ case BPF_MUL:
+ dst_reg->min_value *= min_val;
+ dst_reg->max_value *= max_val;
+ break;
+ case BPF_AND:
+ /* & is special since it could end up with 0 bits set. */
+ dst_reg->min_value &= min_val;
+ dst_reg->max_value = max_val;
+ break;
+ case BPF_LSH:
+ /* Gotta have special overflow logic here, if we're shifting
+ * more than MAX_RANGE then just assume we have an invalid
+ * range.
+ */
+ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else
+ dst_reg->min_value <<= min_val;
+
+ if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ else
+ dst_reg->max_value <<= max_val;
+ break;
+ case BPF_RSH:
+ dst_reg->min_value >>= min_val;
+ dst_reg->max_value >>= max_val;
+ break;
+ case BPF_MOD:
+ /* % is special since it is an unsigned modulus, so the floor
+ * will always be 0.
+ */
+ dst_reg->min_value = 0;
+ dst_reg->max_value = max_val - 1;
+ break;
+ default:
+ reset_reg_range_values(regs, insn->dst_reg);
+ break;
+ }
+
+ check_reg_overflow(dst_reg);
+}
+
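A sketch of the BPF C fragment (hypothetical map value layout) that this min/max tracking is intended to accept: a variable but bounds-checked index into a map value.

	struct my_value {
		__u32 idx;
		__u8  buf[64];
	};

	struct my_value *val = bpf_map_lookup_elem(&my_map, &key);

	if (!val)
		return 0;
	if (val->idx >= sizeof(val->buf))	/* records max_value for idx */
		return 0;
	return val->buf[val->idx];		/* becomes PTR_TO_MAP_VALUE_ADJ */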
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
+static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1488,6 +1627,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ /* we are setting our register to something new, we need to
+ * reset its range values.
+ */
+ reset_reg_range_values(regs, insn->dst_reg);
+
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -1509,6 +1653,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm;
+ regs[insn->dst_reg].max_value = insn->imm;
+ regs[insn->dst_reg].min_value = insn->imm;
}
} else if (opcode > BPF_END) {
@@ -1561,6 +1707,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
dst_reg = &regs[insn->dst_reg];
+ /* first we want to adjust our ranges. */
+ adjust_reg_min_max_vals(env, insn);
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
@@ -1569,7 +1718,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return 0;
} else if (opcode == BPF_ADD &&
BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == PTR_TO_PACKET) {
+ (dst_reg->type == PTR_TO_PACKET ||
+ (BPF_SRC(insn->code) == BPF_X &&
+ regs[insn->src_reg].type == PTR_TO_PACKET))) {
/* ptr_to_packet += K|X */
return check_packet_ptr_add(env, insn);
} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
@@ -1593,28 +1744,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
- /* mark dest operand */
- mark_reg_unknown_value(regs, insn->dst_reg);
+ /* If we did pointer math on a map value then just set it to our
+ * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
+ * loads to this register appropriately, otherwise just mark the
+ * register as unknown.
+ */
+ if (env->allow_ptr_leaks &&
+ (dst_reg->type == PTR_TO_MAP_VALUE ||
+ dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
+ dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
+ else
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
return 0;
}
-static void find_good_pkt_pointers(struct verifier_env *env,
- struct reg_state *dst_reg)
+static void find_good_pkt_pointers(struct bpf_verifier_state *state,
+ struct bpf_reg_state *dst_reg)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs, *reg;
+ struct bpf_reg_state *regs = state->regs, *reg;
int i;
- /* r2 = r3;
- * r2 += 8
- * if (r2 > pkt_end) goto somewhere
- * r2 == dst_reg, pkt_end == src_reg,
- * r2=pkt(id=n,off=8,r=0)
- * r3=pkt(id=n,off=0,r=0)
- * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
- * so that range of bytes [r3, r3 + 8) is safe to access
+
+ /* LLVM can generate two kind of checks:
+ *
+ * Type 1:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 > pkt_end) goto <handle exception>
+ * <access okay>
+ *
+ * Where:
+ * r2 == dst_reg, pkt_end == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Type 2:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end >= r2) goto <access okay>
+ * <handle exception>
+ *
+ * Where:
+ * pkt_end == dst_reg, r2 == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+ * so that range of bytes [r3, r3 + 8) is safe to access.
*/
+
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
regs[i].range = dst_reg->off;
@@ -1628,11 +1809,109 @@ static void find_good_pkt_pointers(struct verifier_env *env,
}
}
-static int check_cond_jmp_op(struct verifier_env *env,
+/* Adjusts the register min/max values in the case that the dst_reg is the
+ * variable register that we are working on, and src_reg is a constant or we're
+ * simply doing a BPF_K check.
+ */
+static void reg_set_min_max(struct bpf_reg_state *true_reg,
+ struct bpf_reg_state *false_reg, u64 val,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ /* If this is false then we know nothing Jon Snow, but if it is
+ * true then we know for sure.
+ */
+ true_reg->max_value = true_reg->min_value = val;
+ break;
+ case BPF_JNE:
+ /* If this is true we know nothing Jon Snow, but if it is false
+ * we know the value for sure;
+ */
+ false_reg->max_value = false_reg->min_value = val;
+ break;
+ case BPF_JGT:
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ case BPF_JSGT:
+ /* If this is false then we know the maximum val is val,
+ * otherwise we know the min val is val+1.
+ */
+ false_reg->max_value = val;
+ true_reg->min_value = val + 1;
+ break;
+ case BPF_JGE:
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ case BPF_JSGE:
+ /* If this is false then we know the maximum value is val - 1,
+ * otherwise we know the minimum value is val.
+ */
+ false_reg->max_value = val - 1;
+ true_reg->min_value = val;
+ break;
+ default:
+ break;
+ }
+
+ check_reg_overflow(false_reg);
+ check_reg_overflow(true_reg);
+}
+
+/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
+ * is the variable reg.
+ */
+static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
+ struct bpf_reg_state *false_reg, u64 val,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ /* If this is false then we know nothing Jon Snow, but if it is
+ * true then we know for sure.
+ */
+ true_reg->max_value = true_reg->min_value = val;
+ break;
+ case BPF_JNE:
+ /* If this is true we know nothing Jon Snow, but if it is false
+ * we know the value for sure;
+ */
+ false_reg->max_value = false_reg->min_value = val;
+ break;
+ case BPF_JGT:
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ case BPF_JSGT:
+ /*
+ * If this is false, then the val is <= the register; if it is
+ * true, the register is < the val.
+ */
+ false_reg->min_value = val;
+ true_reg->max_value = val - 1;
+ break;
+ case BPF_JGE:
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ case BPF_JSGE:
+ /* If this is false then the constant < the register, if it is true
+ * then the register <= the constant.
+ */
+ false_reg->min_value = val + 1;
+ true_reg->max_value = val;
+ break;
+ default:
+ break;
+ }
+
+ check_reg_overflow(false_reg);
+ check_reg_overflow(true_reg);
+}
+
+static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct reg_state *regs = env->cur_state.regs, *dst_reg;
- struct verifier_state *other_branch;
+ struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1694,7 +1973,24 @@ static int check_cond_jmp_op(struct verifier_env *env,
if (!other_branch)
return -EFAULT;
- /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ /* detect if we are comparing against a constant value so we can adjust
+ * our min/max values for our dst register.
+ */
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (regs[insn->src_reg].type == CONST_IMM)
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].imm,
+ opcode);
+ else if (dst_reg->type == CONST_IMM)
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg], dst_reg->imm,
+ opcode);
+ } else {
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, insn->imm, opcode);
+ }
+
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
if (BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1713,13 +2009,17 @@ static int check_cond_jmp_op(struct verifier_env *env,
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(env, dst_reg);
+ find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
}
if (log_level)
- print_verifier_state(&env->cur_state);
+ print_verifier_state(this_branch);
return 0;
}
@@ -1732,9 +2032,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
}
/* verify BPF_LD_IMM64 instruction */
-static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -1750,9 +2050,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
- if (insn->src_reg == 0)
- /* generic move 64-bit immediate into a register */
+ if (insn->src_reg == 0) {
+ /* generic move 64-bit immediate into a register,
+ * only analyzer needs to collect the ld_imm value.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+
+ if (!env->analyzer_ops)
+ return 0;
+
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = imm;
return 0;
+ }
/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
@@ -1789,11 +2099,11 @@ static bool may_access_skb(enum bpf_prog_type type)
* Output:
* R0 - 8/16/32-bit skb data converted to cpu endianness
*/
-static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 mode = BPF_MODE(insn->code);
- struct reg_state *reg;
+ struct bpf_reg_state *reg;
int i, err;
if (!may_access_skb(env->prog->type)) {
@@ -1879,7 +2189,7 @@ enum {
BRANCH = 2,
};
-#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
static int *insn_stack; /* stack of insns to process */
static int cur_stack; /* current stack index */
@@ -1890,7 +2200,7 @@ static int *insn_state;
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
@@ -1931,7 +2241,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env)
/* non-recursive depth-first-search to detect loops in BPF program
* loop == back-edge in directed graph
*/
-static int check_cfg(struct verifier_env *env)
+static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -2040,7 +2350,8 @@ err_free:
/* the following conditions reduce the number of explored insns
* from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
*/
-static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
+static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
{
if (old->id != cur->id)
return false;
@@ -2115,9 +2426,11 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+static bool states_equal(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
{
- struct reg_state *rold, *rcur;
+ struct bpf_reg_state *rold, *rcur;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
@@ -2127,6 +2440,13 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
if (memcmp(rold, rcur, sizeof(*rold)) == 0)
continue;
+ /* If the ranges were not the same, but everything else was and
+ * we didn't do a variable access into a map then we are a-ok.
+ */
+ if (!env->varlen_map_value_access &&
+ rold->type == rcur->type && rold->imm == rcur->imm)
+ continue;
+
if (rold->type == NOT_INIT ||
(rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
continue;
@@ -2157,9 +2477,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
* the same, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
* but current path has stored:
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
@@ -2170,10 +2490,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
return true;
}
-static int is_state_visited(struct verifier_env *env, int insn_idx)
+static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
- struct verifier_state_list *new_sl;
- struct verifier_state_list *sl;
+ struct bpf_verifier_state_list *new_sl;
+ struct bpf_verifier_state_list *sl;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -2183,7 +2503,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(&sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state))
/* reached equivalent register/stack state,
* prune the search
*/
@@ -2197,7 +2517,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
if (!new_sl)
return -ENOMEM;
@@ -2208,11 +2528,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
return 0;
}
-static int do_check(struct verifier_env *env)
+static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
+{
+ if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
+ return 0;
+
+ return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+}
+
+static int do_check(struct bpf_verifier_env *env)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = &env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
- struct reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
@@ -2220,6 +2549,7 @@ static int do_check(struct verifier_env *env)
init_reg_state(regs);
insn_idx = 0;
+ env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -2266,13 +2596,17 @@ static int do_check(struct verifier_env *env)
print_bpf_insn(insn);
}
+ err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
+ if (err)
+ return err;
+
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
+ enum bpf_reg_type *prev_src_type, src_reg_type;
/* check for reserved fields is already done */
@@ -2296,21 +2630,25 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W) {
+ reset_reg_range_values(regs, insn->dst_reg);
+ if (BPF_SIZE(insn->code) != BPF_W &&
+ BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++;
continue;
}
- if (insn->imm == 0) {
+ prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
* dst_reg = *(u32 *)(src_reg + off)
- * use reserved 'imm' field to mark this insn
+ * save type to validate intersecting paths
*/
- insn->imm = src_reg_type;
+ *prev_src_type = src_reg_type;
- } else if (src_reg_type != insn->imm &&
+ } else if (src_reg_type != *prev_src_type &&
(src_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_src_type == PTR_TO_CTX)) {
/* Abuser program is trying to use the same insn
* dst_reg = *(u32*) (src_reg + off)
* with different pointer types:
@@ -2323,7 +2661,7 @@ static int do_check(struct verifier_env *env)
}
} else if (class == BPF_STX) {
- enum bpf_reg_type dst_reg_type;
+ enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
@@ -2351,11 +2689,13 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
- if (insn->imm == 0) {
- insn->imm = dst_reg_type;
- } else if (dst_reg_type != insn->imm &&
+ prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_dst_type == NOT_INIT) {
+ *prev_dst_type = dst_reg_type;
+ } else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_dst_type == PTR_TO_CTX)) {
verbose("same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -2461,6 +2801,7 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
+ reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
@@ -2473,14 +2814,28 @@ process_bpf_exit:
return 0;
}
+static int check_map_prog_compatibility(struct bpf_map *map,
+ struct bpf_prog *prog)
+
+{
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
+ (map->map_flags & BPF_F_NO_PREALLOC)) {
+ verbose("perf_event programs can only use preallocated hash map\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
-static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j;
+ int i, j, err;
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
@@ -2524,6 +2879,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return PTR_ERR(map);
}
+ err = check_map_prog_compatibility(map, env->prog);
+ if (err) {
+ fdput(f);
+ return err;
+ }
+
/* store map pointer inside BPF_LD_IMM64 instruction */
insn[0].imm = (u32) (unsigned long) map;
insn[1].imm = ((u64) (unsigned long) map) >> 32;
@@ -2567,7 +2928,7 @@ next_insn:
}
/* drop refcnt of maps used by the rejected program */
-static void release_maps(struct verifier_env *env)
+static void release_maps(struct bpf_verifier_env *env)
{
int i;
@@ -2576,7 +2937,7 @@ static void release_maps(struct verifier_env *env)
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
-static void convert_pseudo_ld_imm64(struct verifier_env *env)
+static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -2590,62 +2951,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
-static int convert_ctx_accesses(struct verifier_env *env)
+static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16];
+ const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i;
+ int i, cnt, delta = 0;
- if (!env->prog->aux->ops->convert_ctx_access)
+ if (ops->gen_prologue) {
+ cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+ env->prog);
+ if (cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ } else if (cnt) {
+ new_prog = bpf_patch_insn_single(env->prog, 0,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+ }
+ }
+
+ if (!ops->convert_ctx_access)
return 0;
- for (i = 0; i < insn_cnt; i++, insn++) {
- u32 insn_delta, cnt;
+ insn = env->prog->insnsi + delta;
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW))
type = BPF_WRITE;
else
continue;
- if (insn->imm != PTR_TO_CTX) {
- /* clear internal mark */
- insn->imm = 0;
+ if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
continue;
- }
- cnt = env->prog->aux->ops->
- convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf, env->prog);
+ cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
}
- new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
+ new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
+ cnt);
if (!new_prog)
return -ENOMEM;
- insn_delta = cnt - 1;
+ delta += cnt - 1;
/* keep walking new program and skip insns we just inserted */
env->prog = new_prog;
- insn = new_prog->insnsi + i + insn_delta;
-
- insn_cnt += insn_delta;
- i += insn_delta;
+ insn = new_prog->insnsi + i + delta;
}
return 0;
}
-static void free_states(struct verifier_env *env)
+static void free_states(struct bpf_verifier_env *env)
{
- struct verifier_state_list *sl, *sln;
+ struct bpf_verifier_state_list *sl, *sln;
int i;
if (!env->explored_states)
@@ -2668,19 +3041,24 @@ static void free_states(struct verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
- struct verifier_env *env;
+ struct bpf_verifier_env *env;
int ret = -EINVAL;
if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
return -E2BIG;
- /* 'struct verifier_env' can be global, but since it's not small,
+ /* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ (*prog)->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
env->prog = *prog;
/* grab the mutex to protect few globals used by verifier */
@@ -2699,12 +3077,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* log_* values have to be sane */
if (log_size < 128 || log_size > UINT_MAX >> 8 ||
log_level == 0 || log_ubuf == NULL)
- goto free_env;
+ goto err_unlock;
ret = -ENOMEM;
log_buf = vmalloc(log_size);
if (!log_buf)
- goto free_env;
+ goto err_unlock;
} else {
log_level = 0;
}
@@ -2714,7 +3092,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
goto skip_full_check;
env->explored_states = kcalloc(env->prog->len,
- sizeof(struct verifier_state_list *),
+ sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
@@ -2773,14 +3151,67 @@ skip_full_check:
free_log_buf:
if (log_level)
vfree(log_buf);
-free_env:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
*/
release_maps(env);
*prog = env->prog;
+err_unlock:
+ mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
kfree(env);
+ return ret;
+}
+
+int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
+ void *priv)
+{
+ struct bpf_verifier_env *env;
+ int ret;
+
+ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ prog->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
+ env->prog = prog;
+ env->analyzer_ops = ops;
+ env->analyzer_priv = priv;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ log_level = 0;
+
+ env->explored_states = kcalloc(env->prog->len,
+ sizeof(struct bpf_verifier_state_list *),
+ GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
+ kfree(env);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_analyzer);
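
bpf_analyzer() runs the same check_cfg()/do_check() pipeline on behalf of an external analyzer, without a verifier log and without patching the program. A minimal caller sketch, assuming only the signature exported above; the hook layout of struct bpf_ext_analyzer_ops is not shown in this hunk, so the zero-initialized ops and the offload-style usage are assumptions:

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/printk.h>

/* Hypothetical driver-side verification pass: hand the program to the
 * verifier core and only record whether it was accepted. Real users
 * (e.g. HW offload backends) would fill in per-instruction callbacks
 * in 'ops'; those hooks are not part of this hunk.
 */
static const struct bpf_ext_analyzer_ops dummy_analyzer_ops = { };

static int example_verify_for_offload(struct bpf_prog *prog, void *priv)
{
	int err;

	err = bpf_analyzer(prog, &dummy_analyzer_ops, priv);
	if (err)
		pr_warn("offload verification failed: %d\n", err);
	return err;
}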
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..00411c82dac5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+{
+ int capable;
+
+ if (unlikely(!cap_valid(cap))) {
+ pr_crit("capable() called with invalid cap=%u\n", cap);
+ BUG();
+ }
+
+ capable = audit ? security_capable(current_cred(), ns, cap) :
+ security_capable_noaudit(current_cred(), ns, cap);
+ if (capable == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return true;
+ }
+ return false;
+}
+
/**
* ns_capable - Determine if the current task has a superior capability in effect
 * @ns: The user namespace we want the capability in
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- if (unlikely(!cap_valid(cap))) {
- pr_crit("capable() called with invalid cap=%u\n", cap);
- BUG();
- }
-
- if (security_capable(current_cred(), ns, cap) == 0) {
- current->flags |= PF_SUPERPRIV;
- return true;
- }
- return false;
+ return ns_capable_common(ns, cap, true);
}
EXPORT_SYMBOL(ns_capable);
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns: The user namespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, false);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
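
The new ns_capable_noaudit() is intended for speculative permission probes where a denial is expected and should not generate audit records; the audited ns_capable() keeps the old behaviour. A hedged sketch of the intended call pattern (the surrounding helper is hypothetical):

#include <linux/capability.h>
#include <linux/user_namespace.h>

/* Hypothetical helper: relax a limit for privileged callers without
 * emitting an audit record when the check (expectedly) fails.
 */
static unsigned long example_effective_limit(struct user_namespace *ns,
					     unsigned long soft,
					     unsigned long hard)
{
	if (ns_capable_noaudit(ns, CAP_SYS_RESOURCE))
		return hard;	/* privileged: no audit noise on the probe */
	return soft;
}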
/**
* capable - Determine if the current task has a superior capability in effect
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..44066158f0d1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
-#include <linux/proc_ns.h>
+#include <linux/file.h>
#include <net/sock.h>
/*
@@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)
static void put_css_set(struct css_set *cset)
{
+ unsigned long flags;
+
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
@@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)
if (atomic_add_unless(&cset->refcount, -1, 1))
return;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, flags);
put_css_set_locked(cset);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, flags);
}
/*
@@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
/* First see if we already have a cgroup group that matches
* the desired set */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
if (cset)
return cset;
@@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
css_get(css);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return cset;
}
@@ -1158,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
{
lockdep_assert_held(&cgroup_mutex);
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
static void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
- /* hierarchy ID should already have been released */
- WARN_ON_ONCE(root->hierarchy_id);
-
idr_destroy(&root->cgroup_idr);
kfree(root);
}
@@ -1192,7 +1188,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
@@ -1200,7 +1196,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
kfree(link);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
@@ -1600,11 +1596,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
ss->root = dst_root;
css->cgroup = dcgrp;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
@@ -1640,10 +1636,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
if (!buf)
return -ENOMEM;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
if (len >= PATH_MAX)
len = -ERANGE;
@@ -1897,7 +1893,7 @@ static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
if (use_task_css_set_links)
goto out_unlock;
@@ -1922,8 +1918,12 @@ static void cgroup_enable_task_cg_lists(void)
* entry won't be deleted though the process has exited.
* Do it while holding siglock so that we don't end up
* racing against cgroup_exit().
+ *
+ * Interrupts were already disabled while acquiring
+	 * the css_set_lock, so we do not need to disable them
+ * again when acquiring the sighand->siglock here.
*/
- spin_lock_irq(&p->sighand->siglock);
+ spin_lock(&p->sighand->siglock);
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
@@ -1932,11 +1932,11 @@ static void cgroup_enable_task_cg_lists(void)
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
- spin_unlock_irq(&p->sighand->siglock);
+ spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -2043,13 +2043,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
- /*
- * We know this subsystem has not yet been bound. Users in a non-init
- * user namespace may only mount hierarchies with no bound subsystems,
- * i.e. 'none,name=user1'
- */
- if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+ /* Hierarchies may only be created in the initial cgroup namespace. */
+ if (ns != &init_cgroup_ns) {
ret = -EPERM;
goto out_unlock;
}
@@ -2256,11 +2252,11 @@ out_mount:
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ns->root_cset, root);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
@@ -2337,11 +2333,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
char *ret;
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return ret;
@@ -2369,7 +2365,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
char *path = NULL;
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@@ -2382,7 +2378,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
path = buf;
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return path;
}
@@ -2557,7 +2553,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
* the new cgroup. There are no failure cases after here, so this
* is the commit point.
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(cset, &tset->src_csets, mg_node) {
list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
struct css_set *from_cset = task_css_set(task);
@@ -2568,7 +2564,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
put_css_set_locked(from_cset);
}
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/*
* Migration is committed, all target tasks are now on dst_csets.
@@ -2597,13 +2593,13 @@ out_cancel_attach:
}
} while_each_subsys_mask();
out_release_tset:
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
list_del_init(&cset->mg_node);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return ret;
}
@@ -2634,7 +2630,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
lockdep_assert_held(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
@@ -2642,7 +2638,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
/**
@@ -2783,7 +2779,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
@@ -2792,7 +2788,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return cgroup_taskset_migrate(&tset, root);
}
@@ -2816,7 +2812,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
return -EBUSY;
/* look up all src csets */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
@@ -2826,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(&preloaded_csets);
@@ -2859,9 +2855,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *cgrp;
struct inode *inode;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
while (!cgroup_is_descendant(dst_cgrp, cgrp))
cgrp = cgroup_parent(cgrp);
@@ -2956,20 +2952,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
if (root == &cgrp_dfl_root)
continue;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
break;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -3080,7 +3078,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
@@ -3088,14 +3086,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
cgroup_migrate_add_src(link->cset, dsct,
&preloaded_csets);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
goto out_finish;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
struct task_struct *task, *ntask;
@@ -3107,7 +3105,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
cgroup_taskset_add(task, &tset);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
ret = cgroup_taskset_migrate(&tset, cgrp->root);
out_finish:
@@ -3448,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
- if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
- ret = -EBUSY;
- goto out_unlock;
+ if (enable && cgroup_parent(cgrp)) {
+ struct cgrp_cset_link *link;
+
+ /*
+ * Because namespaces pin csets too, @cgrp->cset_links
+ * might not be empty even when @cgrp is empty. Walk and
+ * verify each cset.
+ */
+ spin_lock_irq(&css_set_lock);
+
+ ret = 0;
+ list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+ if (css_set_populated(link->cset)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ spin_unlock_irq(&css_set_lock);
+
+ if (ret)
+ goto out_unlock;
}
/* save and update control masks and prepare csses */
@@ -3901,17 +3918,19 @@ void cgroup_file_notify(struct cgroup_file *cfile)
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
- * Return the number of tasks in the cgroup.
+ * Return the number of tasks in the cgroup. The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
*/
static int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return count;
}
@@ -4249,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
memset(it, 0, sizeof(*it));
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
it->ss = css->ss;
@@ -4262,7 +4281,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
css_task_iter_advance_css_set(it);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
/**
@@ -4280,7 +4299,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
it->cur_task = NULL;
}
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
@@ -4289,7 +4308,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
css_task_iter_advance(it);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return it->cur_task;
}
@@ -4303,10 +4322,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
void css_task_iter_end(struct css_task_iter *it)
{
if (it->cur_cset) {
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
if (it->cur_task)
@@ -4337,11 +4356,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* all tasks in @from are being moved, all csets are source */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
@@ -4365,6 +4386,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -5063,6 +5085,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
css->ss = ss;
+ css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
@@ -5139,6 +5162,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(parent_css);
+ if (!css)
+ css = ERR_PTR(-ENOMEM);
if (IS_ERR(css))
return css;
@@ -5150,7 +5175,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
if (err < 0)
- goto err_free_percpu_ref;
+ goto err_free_css;
css->id = err;
/* @css is ready to be brought online now, make it visible */
@@ -5174,9 +5199,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
- cgroup_idr_remove(&ss->css_idr, css->id);
-err_free_percpu_ref:
- percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
return ERR_PTR(err);
@@ -5451,10 +5473,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
cgrp->self.flags &= ~CSS_ONLINE;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
link->cset->dead = true;
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
@@ -5605,6 +5627,12 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ /*
+	 * The latency of synchronize_sched() is too high for cgroups, so
+	 * avoid it at the cost of forcing all readers into the slow path.
+ */
+ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@@ -5725,7 +5753,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
goto out;
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
for_each_root(root) {
struct cgroup_subsys *ss;
@@ -5778,7 +5806,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
kfree(buf);
out:
@@ -5923,13 +5951,13 @@ void cgroup_post_fork(struct task_struct *child)
if (use_task_css_set_links) {
struct css_set *cset;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
css_set_move_task(child, NULL, cset, false);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
/*
@@ -5974,9 +6002,9 @@ void cgroup_exit(struct task_struct *tsk)
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
@@ -6044,9 +6072,9 @@ static void cgroup_release_agent(struct work_struct *work)
if (!pathbuf || !agentbuf)
goto out;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
if (!path)
goto out;
@@ -6168,7 +6196,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+ return idr_find(&ss->css_idr, id);
}
/**
@@ -6205,6 +6233,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+/**
+ * cgroup_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup2_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory. Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found.
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *cgrp;
+ struct file *f;
+
+ f = fget_raw(fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ fput(f);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ cgrp = css->cgroup;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
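
cgroup_get_from_fd() turns a file descriptor on a cgroup2 directory into a referenced struct cgroup, rejecting v1 hierarchies with -EBADF. A minimal in-kernel caller sketch (the consumer function is illustrative); the reference taken here is dropped with cgroup_put() once the caller is done:

#include <linux/cgroup.h>
#include <linux/err.h>

/* Hypothetical consumer: resolve a user-supplied fd to a v2 cgroup,
 * use it, and drop the reference taken by cgroup_get_from_fd().
 */
static int example_attach_to_cgroup_fd(int fd)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_fd(fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	/* ... associate 'cgrp' with the object being configured ... */

	cgroup_put(cgrp);
	return 0;
}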
+
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
@@ -6235,6 +6297,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
if (cgroup_sk_alloc_disabled)
return;
+ /* Socket clone path */
+ if (skcd->val) {
+ cgroup_get(sock_cgroup_ptr(skcd));
+ return;
+ }
+
rcu_read_lock();
while (true) {
@@ -6260,6 +6328,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
/* cgroup namespaces */
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
struct cgroup_namespace *new_ns;
@@ -6281,6 +6359,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
@@ -6292,6 +6371,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
struct css_set *cset;
BUG_ON(!old_ns);
@@ -6305,22 +6385,25 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+ /* It is not safe to take cgroup_mutex here */
+ spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
-
- spin_unlock_bh(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ spin_unlock_irq(&css_set_lock);
new_ns = alloc_cgroup_ns();
if (IS_ERR(new_ns)) {
put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
return new_ns;
}
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
return new_ns;
@@ -6371,12 +6454,18 @@ static void cgroupns_put(struct ns_common *ns)
put_cgroup_ns(to_cg_ns(ns));
}
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+ return to_cg_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
.type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
+ .owner = cgroupns_owner,
};
static __init int cgroup_namespaces_init(void)
@@ -6435,7 +6524,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
if (!name_buf)
return -ENOMEM;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -6446,7 +6535,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
}
@@ -6457,7 +6546,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
@@ -6480,7 +6569,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
overflow:
seq_puts(seq, " ...\n");
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return 0;
}
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37429..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
*/
atomic64_t counter;
int64_t limit;
+
+ /* Handle for "pids.events" */
+ struct cgroup_file events_file;
+
+ /* Number of times fork failed because limit was hit. */
+ atomic64_t events_limit;
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
+ int err;
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- return pids_try_charge(pids, 1);
+ err = pids_try_charge(pids, 1);
+ if (err) {
+ /* Only log the first time events_limit is incremented. */
+ if (atomic64_inc_return(&pids->events_limit) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+ pr_cont("\n");
+ }
+ cgroup_file_notify(&pids->events_file);
+ }
+ return err;
}
static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ struct pids_cgroup *pids = css_pids(seq_css(sf));
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+ return 0;
+}
+
static struct cftype pids_files[] = {
{
.name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
.read_s64 = pids_current_read,
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
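
The new "pids.events" file exposes the limit-hit counter, and the cgroup_file handle lets cgroup_file_notify() wake pollers whenever a fork is rejected. A user-space sketch of reading it (the mount point and cgroup name are assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative only: read the "max <count>" line from a pids cgroup. */
int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	fd = open("/sys/fs/cgroup/mygroup/pids.events", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("pids.events: %s", buf);	/* e.g. "max 3\n" */
	}
	close(fd);
	return 0;
}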
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100644
index 000000000000..9f748ed7bea8
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,152 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_MODULES is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100644
index 000000000000..e3b953e966d2
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,121 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_UEVENT=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
new file mode 100644
index 000000000000..8d9643767142
--- /dev/null
+++ b/kernel/configs/kvm_guest.config
@@ -0,0 +1,32 @@
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_BLK_DEV=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_TTY=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_BINFMT_ELF=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_9P_FS=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2de56ab0fce..7fa0c4ae6394 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,4 +1,12 @@
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_KERNEL_GZIP is not set
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
+# CONFIG_KERNEL_LZO is not set
+# CONFIG_KERNEL_LZ4 is not set
CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_SLAB is not set
+# CONFIG_SLUB is not set
CONFIG_SLOB=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d948e44c471e..5df20d6d1520 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,8 @@
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/smpboot.h>
+#include <linux/relay.h>
+#include <linux/slab.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -37,8 +39,9 @@
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
- * @cb_stat: The state for a single callback (install/uninstall)
- * @cb: Single callback function (install/uninstall)
+ * @single: Single callback invocation
+ * @bringup: Single callback bringup or teardown selector
+ * @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
* @done: Signal completion to the issuer of the task
*/
@@ -49,8 +52,10 @@ struct cpuhp_cpu_state {
struct task_struct *thread;
bool should_run;
bool rollback;
+ bool single;
+ bool bringup;
+ struct hlist_node *node;
enum cpuhp_state cb_state;
- int (*cb)(unsigned int cpu);
int result;
struct completion done;
#endif
@@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
* @cant_stop: Bringup/teardown can't be stopped at this step
*/
struct cpuhp_step {
- const char *name;
- int (*startup)(unsigned int cpu);
- int (*teardown)(unsigned int cpu);
- bool skip_onerr;
- bool cant_stop;
+ const char *name;
+ union {
+ int (*single)(unsigned int cpu);
+ int (*multi)(unsigned int cpu,
+ struct hlist_node *node);
+ } startup;
+ union {
+ int (*single)(unsigned int cpu);
+ int (*multi)(unsigned int cpu,
+ struct hlist_node *node);
+ } teardown;
+ struct hlist_head list;
+ bool skip_onerr;
+ bool cant_stop;
+ bool multi_instance;
};
static DEFINE_MUTEX(cpuhp_state_mutex);
static struct cpuhp_step cpuhp_bp_states[];
static struct cpuhp_step cpuhp_ap_states[];
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+ /*
+ * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+ * purposes as that state is handled explicitly in cpu_down.
+ */
+ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
+static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
+{
+ struct cpuhp_step *sp;
+
+ sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
+ return sp + state;
+}
+
/**
 * cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @step: The step in the state machine
- * @cb: The callback function to invoke
+ * @bringup: True if the bringup callback should be invoked
*
- * Called from cpu hotplug and from the state register machinery
+ * Called from cpu hotplug and from the state register machinery.
*/
-static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
- int (*cb)(unsigned int))
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+ bool bringup, struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- int ret = 0;
-
- if (cb) {
- trace_cpuhp_enter(cpu, st->target, step, cb);
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ int (*cbm)(unsigned int cpu, struct hlist_node *node);
+ int (*cb)(unsigned int cpu);
+ int ret, cnt;
+
+ if (!step->multi_instance) {
+ cb = bringup ? step->startup.single : step->teardown.single;
+ if (!cb)
+ return 0;
+ trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
- trace_cpuhp_exit(cpu, st->state, step, ret);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ return ret;
+ }
+ cbm = bringup ? step->startup.multi : step->teardown.multi;
+ if (!cbm)
+ return 0;
+
+ /* Single invocation for instance add/remove */
+ if (node) {
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ return ret;
+ }
+
+ /* State transition. Invoke on all instances */
+ cnt = 0;
+ hlist_for_each(node, &step->list) {
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ if (ret)
+ goto err;
+ cnt++;
+ }
+ return 0;
+err:
+ /* Rollback the instances if one failed */
+ cbm = !bringup ? step->startup.multi : step->teardown.multi;
+ if (!cbm)
+ return ret;
+
+ hlist_for_each(node, &step->list) {
+ if (!cnt--)
+ break;
+ cbm(cpu, node);
}
return ret;
}
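
With multi_instance states, cpuhp_invoke_callback() either runs a single node's callback (instance add/remove) or iterates every registered node on a state transition, rolling the already-completed instances back if one fails. A hedged sketch of how a driver plugs into this, assuming the cpuhp_setup_state_multi()/cpuhp_state_add_instance() wrappers that accompany this series:

#include <linux/cpuhotplug.h>
#include <linux/list.h>

/* Hypothetical per-device context embedding the hlist_node the hotplug
 * core iterates over for every CPU transition.
 */
struct example_dev {
	struct hlist_node cpuhp_node;
	/* ... device state ... */
};

static enum cpuhp_state example_hp_state;

static int example_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct example_dev *dev = hlist_entry(node, struct example_dev,
					      cpuhp_node);
	/* per-CPU bringup work for this instance */
	(void)dev;
	return 0;
}

static int example_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	/* per-CPU teardown work for this instance */
	return 0;
}

static int example_register(struct example_dev *dev)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "example:online",
				      example_cpu_online, example_cpu_dead);
	if (ret < 0)
		return ret;
	example_hp_state = ret;	/* dynamically allocated state number */

	/* runs example_cpu_online() for 'dev' on all online CPUs */
	return cpuhp_state_add_instance(example_hp_state, &dev->cpuhp_node);
}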
@@ -260,10 +333,17 @@ void cpu_hotplug_disable(void)
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
+static void __cpu_hotplug_enable(void)
+{
+ if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
+ return;
+ cpu_hotplug_disabled--;
+}
+
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
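
__cpu_hotplug_enable() now warns and bails out instead of letting cpu_hotplug_disabled go negative, so every cpu_hotplug_enable() must pair with an earlier cpu_hotplug_disable(). A small sketch of the balanced usage (the function itself is hypothetical):

#include <linux/cpu.h>

/* Hypothetical critical section that must not race with CPU hotplug.
 * Each disable is matched by exactly one enable; an extra enable would
 * now trigger the "Unbalanced cpu hotplug enable" warning above.
 */
static void example_no_hotplug_section(void)
{
	cpu_hotplug_disable();

	/* ... work that assumes the set of online CPUs is stable ... */

	cpu_hotplug_enable();
}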
@@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu)
return 0;
}
-static int notify_starting(unsigned int cpu)
-{
- cpu_notify(CPU_STARTING, cpu);
- return 0;
-}
-
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu)
struct task_struct *idle = idle_thread_get(cpu);
int ret;
+ /*
+ * Some architectures have to walk the irq descriptors to
+	 * set up the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
+ irq_unlock_sparse();
if (ret) {
cpu_notify(CPU_UP_CANCELED, cpu);
return ret;
@@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps)
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
{
for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = steps + st->state;
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, step->startup);
+ cpuhp_invoke_callback(cpu, st->state, true, NULL);
}
}
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps, enum cpuhp_state target)
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
for (; st->state > target; st->state--) {
- struct cpuhp_step *step = steps + st->state;
-
- ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
if (ret) {
st->target = prev_state;
- undo_cpu_down(cpu, st, steps);
+ undo_cpu_down(cpu, st);
break;
}
}
return ret;
}
-static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps)
+static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
for (st->state--; st->state > st->target; st->state--) {
- struct cpuhp_step *step = steps + st->state;
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL);
}
}
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps, enum cpuhp_state target)
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
while (st->state < target) {
- struct cpuhp_step *step;
-
st->state++;
- step = steps + st->state;
- ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
if (ret) {
st->target = prev_state;
- undo_cpu_up(cpu, st, steps);
+ undo_cpu_up(cpu, st);
break;
}
}
@@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
{
enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
- return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
+ return cpuhp_down_callbacks(cpu, st, target);
}
/* Execute the online startup callbacks. Used to be CPU_ONLINE */
static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
{
- return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target);
+ return cpuhp_up_callbacks(cpu, st, st->target);
}
/*
@@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
/* Single callback invocation for [un]install ? */
- if (st->cb) {
+ if (st->single) {
if (st->cb_state < CPUHP_AP_ONLINE) {
local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+ ret = cpuhp_invoke_callback(cpu, st->cb_state,
+ st->bringup, st->node);
local_irq_enable();
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+ ret = cpuhp_invoke_callback(cpu, st->cb_state,
+ st->bringup, st->node);
}
} else if (st->rollback) {
BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
- undo_cpu_down(cpu, st, cpuhp_ap_states);
+ undo_cpu_down(cpu, st);
/*
* This is a momentary workaround to keep the notifier users
 * happy. Will go away once we get rid of the notifiers.
@@ -509,16 +586,27 @@ static void cpuhp_thread_fun(unsigned int cpu)
}
/* Invoke a single callback on a remote cpu */
-static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
- int (*cb)(unsigned int))
+static int
+cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
+ struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
if (!cpu_online(cpu))
return 0;
+ /*
+ * If we are up and running, use the hotplug thread. For early calls
+ * we invoke the thread function directly.
+ */
+ if (!st->thread)
+ return cpuhp_invoke_callback(cpu, state, bringup, node);
+
st->cb_state = state;
- st->cb = cb;
+ st->single = true;
+ st->bringup = bringup;
+ st->node = node;
+
/*
* Make sure the above stores are visible before should_run becomes
* true. Paired with the mb() above in cpuhp_thread_fun()
@@ -534,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
{
st->result = 0;
- st->cb = NULL;
+ st->single = false;
/*
* Make sure the above stores are visible before should_run becomes
* true. Paired with the mb() above in cpuhp_thread_fun()
@@ -667,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu)
return err;
}
-static int notify_dying(unsigned int cpu)
-{
- cpu_notify(CPU_DYING, cpu);
- return 0;
-}
-
/* Take this CPU down. */
static int take_cpu_down(void *_param)
{
@@ -685,12 +767,16 @@ static int take_cpu_down(void *_param)
if (err < 0)
return err;
+ /*
+ * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
+ * do this step again.
+ */
+ WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
+ st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--) {
- struct cpuhp_step *step = cpuhp_ap_states + st->state;
+ for (; st->state > target; st->state--)
+ cpuhp_invoke_callback(cpu, st->state, false, NULL);
- cpuhp_invoke_callback(cpu, st->state, step->teardown);
- }
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Park the stopper thread */
@@ -727,7 +813,7 @@ static int takedown_cpu(unsigned int cpu)
BUG_ON(cpu_online(cpu));
/*
- * The migration_call() CPU_DYING callback will have removed all
+ * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all
* runnable tasks from the cpu, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
@@ -780,7 +866,6 @@ void cpuhp_report_idle_dead(void)
#define notify_down_prepare NULL
#define takedown_cpu NULL
#define notify_dead NULL
-#define notify_dying NULL
#endif
#ifdef CONFIG_HOTPLUG_CPU
@@ -829,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
* to do the further cleanups.
*/
- ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+ ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
st->target = prev_state;
st->rollback = true;
@@ -870,10 +955,9 @@ EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
* @cpu: cpu that just started
*
- * This function calls the cpu_chain notifiers with CPU_STARTING.
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
@@ -882,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu)
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
- struct cpuhp_step *step;
-
st->state++;
- step = cpuhp_ap_states + st->state;
- cpuhp_invoke_callback(cpu, st->state, step->startup);
+ cpuhp_invoke_callback(cpu, st->state, true, NULL);
}
}
@@ -972,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
* responsible for bringing it up to the target state.
*/
target = min((int)target, CPUHP_BRINGUP_CPU);
- ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
+ ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpu_hotplug_done();
return ret;
@@ -1017,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-int disable_nonboot_cpus(void)
+int freeze_secondary_cpus(int primary)
{
- int cpu, first_cpu, error = 0;
+ int cpu, error = 0;
cpu_maps_update_begin();
- first_cpu = cpumask_first(cpu_online_mask);
+ if (!cpu_online(primary))
+ primary = cpumask_first(cpu_online_mask);
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -1031,7 +1114,7 @@ int disable_nonboot_cpus(void)
pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
- if (cpu == first_cpu)
+ if (cpu == primary)
continue;
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
@@ -1074,7 +1157,7 @@ void enable_nonboot_cpus(void)
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
if (cpumask_empty(frozen_cpus))
goto out;
@@ -1163,44 +1246,96 @@ core_initcall(cpu_hotplug_pm_sync_init);
static struct cpuhp_step cpuhp_bp_states[] = {
[CPUHP_OFFLINE] = {
.name = "offline",
- .startup = NULL,
- .teardown = NULL,
+ .startup.single = NULL,
+ .teardown.single = NULL,
},
#ifdef CONFIG_SMP
[CPUHP_CREATE_THREADS]= {
- .name = "threads:create",
- .startup = smpboot_create_threads,
- .teardown = NULL,
+ .name = "threads:prepare",
+ .startup.single = smpboot_create_threads,
+ .teardown.single = NULL,
.cant_stop = true,
},
+ [CPUHP_PERF_PREPARE] = {
+ .name = "perf:prepare",
+ .startup.single = perf_event_init_cpu,
+ .teardown.single = perf_event_exit_cpu,
+ },
+ [CPUHP_WORKQUEUE_PREP] = {
+ .name = "workqueue:prepare",
+ .startup.single = workqueue_prepare_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_HRTIMERS_PREPARE] = {
+ .name = "hrtimers:prepare",
+ .startup.single = hrtimers_prepare_cpu,
+ .teardown.single = hrtimers_dead_cpu,
+ },
+ [CPUHP_SMPCFD_PREPARE] = {
+ .name = "smpcfd:prepare",
+ .startup.single = smpcfd_prepare_cpu,
+ .teardown.single = smpcfd_dead_cpu,
+ },
+ [CPUHP_RELAY_PREPARE] = {
+ .name = "relay:prepare",
+ .startup.single = relay_prepare_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_SLAB_PREPARE] = {
+ .name = "slab:prepare",
+ .startup.single = slab_prepare_cpu,
+ .teardown.single = slab_dead_cpu,
+ },
+ [CPUHP_RCUTREE_PREP] = {
+ .name = "RCU/tree:prepare",
+ .startup.single = rcutree_prepare_cpu,
+ .teardown.single = rcutree_dead_cpu,
+ },
/*
* Preparatory and dead notifiers. Will be replaced once the notifiers
* are converted to states.
*/
[CPUHP_NOTIFY_PREPARE] = {
.name = "notify:prepare",
- .startup = notify_prepare,
- .teardown = notify_dead,
+ .startup.single = notify_prepare,
+ .teardown.single = notify_dead,
.skip_onerr = true,
.cant_stop = true,
},
+ /*
+ * On the tear-down path, timers_dead_cpu() must be invoked
+ * before blk_mq_queue_reinit_notify() from notify_dead(),
+	 * otherwise an RCU stall occurs.
+ */
+ [CPUHP_TIMERS_DEAD] = {
+ .name = "timers:dead",
+ .startup.single = NULL,
+ .teardown.single = timers_dead_cpu,
+ },
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
- .startup = bringup_cpu,
- .teardown = NULL,
+ .startup.single = bringup_cpu,
+ .teardown.single = NULL,
.cant_stop = true,
},
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .name = "smpcfd:dying",
+ .startup.single = NULL,
+ .teardown.single = smpcfd_dying_cpu,
+ },
/*
 * Handled on control processor until the plugged processor manages
* this itself.
*/
[CPUHP_TEARDOWN_CPU] = {
.name = "cpu:teardown",
- .startup = NULL,
- .teardown = takedown_cpu,
+ .startup.single = NULL,
+ .teardown.single = takedown_cpu,
.cant_stop = true,
},
+#else
+ [CPUHP_BRINGUP_CPU] = { },
#endif
};
@@ -1222,20 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = {
/* First state is scheduler control. Interrupts are disabled */
[CPUHP_AP_SCHED_STARTING] = {
.name = "sched:starting",
- .startup = sched_cpu_starting,
- .teardown = sched_cpu_dying,
+ .startup.single = sched_cpu_starting,
+ .teardown.single = sched_cpu_dying,
},
- /*
- * Low level startup/teardown notifiers. Run with interrupts
- * disabled. Will be removed once the notifiers are converted to
- * states.
- */
- [CPUHP_AP_NOTIFY_STARTING] = {
- .name = "notify:starting",
- .startup = notify_starting,
- .teardown = notify_dying,
- .skip_onerr = true,
- .cant_stop = true,
+ [CPUHP_AP_RCUTREE_DYING] = {
+ .name = "RCU/tree:dying",
+ .startup.single = NULL,
+ .teardown.single = rcutree_dying_cpu,
},
/* Entry state on starting. Interrupts enabled from here on. Transient
 * state for synchronization */
@@ -1244,18 +1372,34 @@ static struct cpuhp_step cpuhp_ap_states[] = {
},
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
- .name = "smpboot:threads",
- .startup = smpboot_unpark_threads,
- .teardown = NULL,
+ .name = "smpboot/threads:online",
+ .startup.single = smpboot_unpark_threads,
+ .teardown.single = NULL,
},
+ [CPUHP_AP_PERF_ONLINE] = {
+ .name = "perf:online",
+ .startup.single = perf_event_init_cpu,
+ .teardown.single = perf_event_exit_cpu,
+ },
+ [CPUHP_AP_WORKQUEUE_ONLINE] = {
+ .name = "workqueue:online",
+ .startup.single = workqueue_online_cpu,
+ .teardown.single = workqueue_offline_cpu,
+ },
+ [CPUHP_AP_RCUTREE_ONLINE] = {
+ .name = "RCU/tree:online",
+ .startup.single = rcutree_online_cpu,
+ .teardown.single = rcutree_offline_cpu,
+ },
+
/*
* Online/down_prepare notifiers. Will be removed once the notifiers
* are converted to states.
*/
[CPUHP_AP_NOTIFY_ONLINE] = {
.name = "notify:online",
- .startup = notify_online,
- .teardown = notify_down_prepare,
+ .startup.single = notify_online,
+ .teardown.single = notify_down_prepare,
.skip_onerr = true,
},
#endif
@@ -1267,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
/* Last state is scheduler control setting the cpu active */
[CPUHP_AP_ACTIVE] = {
.name = "sched:active",
- .startup = sched_cpu_activate,
- .teardown = sched_cpu_deactivate,
+ .startup.single = sched_cpu_activate,
+ .teardown.single = sched_cpu_deactivate,
},
#endif
/* CPU is fully up and running. */
[CPUHP_ONLINE] = {
.name = "online",
- .startup = NULL,
- .teardown = NULL,
+ .startup.single = NULL,
+ .teardown.single = NULL,
},
};
@@ -1288,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state)
return 0;
}
-static bool cpuhp_is_ap_state(enum cpuhp_state state)
-{
- /*
- * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
- * purposes as that state is handled explicitely in cpu_down.
- */
- return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
-}
-
-static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
-{
- struct cpuhp_step *sp;
-
- sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
- return sp + state;
-}
-
static void cpuhp_store_callbacks(enum cpuhp_state state,
const char *name,
int (*startup)(unsigned int cpu),
- int (*teardown)(unsigned int cpu))
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
/* (Un)Install the callbacks for further cpu hotplug operations */
struct cpuhp_step *sp;
mutex_lock(&cpuhp_state_mutex);
sp = cpuhp_get_step(state);
- sp->startup = startup;
- sp->teardown = teardown;
+ sp->startup.single = startup;
+ sp->teardown.single = teardown;
sp->name = name;
+ sp->multi_instance = multi_instance;
+ INIT_HLIST_HEAD(&sp->list);
mutex_unlock(&cpuhp_state_mutex);
}
static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
- return cpuhp_get_step(state)->teardown;
+ return cpuhp_get_step(state)->teardown.single;
}
/*
* Call the startup/teardown function for a step either on the AP or
* on the current CPU.
*/
-static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
- int (*cb)(unsigned int), bool bringup)
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
+ struct hlist_node *node)
{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
- if (!cb)
+ if ((bringup && !sp->startup.single) ||
+ (!bringup && !sp->teardown.single))
return 0;
/*
* The non AP bound callbacks can fail on bringup. On teardown
@@ -1343,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
*/
#ifdef CONFIG_SMP
if (cpuhp_is_ap_state(state))
- ret = cpuhp_invoke_ap_callback(cpu, state, cb);
+ ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, cb);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node);
#else
- ret = cpuhp_invoke_callback(cpu, state, cb);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1359,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
* Note: The teardown callbacks for rollback are not allowed to fail!
*/
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
- int (*teardown)(unsigned int cpu))
+ struct hlist_node *node)
{
int cpu;
- if (!teardown)
- return;
-
/* Roll back the already executed steps on the other cpus */
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -1376,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
/* Did we invoke the startup call on that cpu ? */
if (cpustate >= state)
- cpuhp_issue_call(cpu, state, teardown, false);
+ cpuhp_issue_call(cpu, state, false, node);
}
}
@@ -1403,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
return -ENOSPC;
}
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+ bool invoke)
+{
+ struct cpuhp_step *sp;
+ int cpu;
+ int ret;
+
+ sp = cpuhp_get_step(state);
+ if (sp->multi_instance == false)
+ return -EINVAL;
+
+ get_online_cpus();
+
+ if (!invoke || !sp->startup.multi)
+ goto add_node;
+
+ /*
+ * Try to call the startup callback for each present cpu
+ * depending on the hotplug state of the cpu.
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate < state)
+ continue;
+
+ ret = cpuhp_issue_call(cpu, state, true, node);
+ if (ret) {
+ if (sp->teardown.multi)
+ cpuhp_rollback_install(cpu, state, node);
+ goto err;
+ }
+ }
+add_node:
+ ret = 0;
+ mutex_lock(&cpuhp_state_mutex);
+ hlist_add_head(node, &sp->list);
+ mutex_unlock(&cpuhp_state_mutex);
+
+err:
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
+
/**
* __cpuhp_setup_state - Setup the callbacks for a hotplug machine state
* @state: The state to setup
@@ -1416,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
int __cpuhp_setup_state(enum cpuhp_state state,
const char *name, bool invoke,
int (*startup)(unsigned int cpu),
- int (*teardown)(unsigned int cpu))
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
int cpu, ret = 0;
int dyn_state = 0;
@@ -1435,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
state = ret;
}
- cpuhp_store_callbacks(state, name, startup, teardown);
+ cpuhp_store_callbacks(state, name, startup, teardown, multi_instance);
if (!invoke || !startup)
goto out;
@@ -1451,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state,
if (cpustate < state)
continue;
- ret = cpuhp_issue_call(cpu, state, startup, true);
+ ret = cpuhp_issue_call(cpu, state, true, NULL);
if (ret) {
- cpuhp_rollback_install(cpu, state, teardown);
- cpuhp_store_callbacks(state, NULL, NULL, NULL);
+ if (teardown)
+ cpuhp_rollback_install(cpu, state, NULL);
+ cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
goto out;
}
}
@@ -1466,6 +1643,42 @@ out:
}
EXPORT_SYMBOL(__cpuhp_setup_state);
+int __cpuhp_state_remove_instance(enum cpuhp_state state,
+ struct hlist_node *node, bool invoke)
+{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
+ int cpu;
+
+ BUG_ON(cpuhp_cb_check(state));
+
+ if (!sp->multi_instance)
+ return -EINVAL;
+
+ get_online_cpus();
+ if (!invoke || !cpuhp_get_teardown_cb(state))
+ goto remove;
+ /*
+ * Call the teardown callback for each present cpu depending
+ * on the hotplug state of the cpu. This function is not
+ * allowed to fail currently!
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, false, node);
+ }
+
+remove:
+ mutex_lock(&cpuhp_state_mutex);
+ hlist_del(node);
+ mutex_unlock(&cpuhp_state_mutex);
+ put_online_cpus();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
/**
* __cpuhp_remove_state - Remove the callbacks for a hotplug machine state
* @state: The state to remove
@@ -1477,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state);
*/
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
- int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
+ struct cpuhp_step *sp = cpuhp_get_step(state);
int cpu;
BUG_ON(cpuhp_cb_check(state));
get_online_cpus();
- if (!invoke || !teardown)
+ if (sp->multi_instance) {
+ WARN(!hlist_empty(&sp->list),
+ "Error: Removing state %d which has instances left.\n",
+ state);
+ goto remove;
+ }
+
+ if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
@@ -1497,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
int cpustate = st->state;
if (cpustate >= state)
- cpuhp_issue_call(cpu, state, teardown, false);
+ cpuhp_issue_call(cpu, state, false, NULL);
}
remove:
- cpuhp_store_callbacks(state, NULL, NULL, NULL);
+ cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
put_online_cpus();
}
EXPORT_SYMBOL(__cpuhp_remove_state);
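
The multi-instance machinery added above is easiest to see from a driver's point of view. Below is a minimal, hedged sketch (every my_* name is made up) of registering one dynamic hotplug state and then adding one instance per device. It assumes the cpuhp_setup_state_multi()/cpuhp_state_add_instance()/cpuhp_state_remove_instance() wrappers that the <linux/cpuhotplug.h> side of this series provides around the exported __cpuhp_* functions:

#include <linux/cpuhotplug.h>
#include <linux/kernel.h>
#include <linux/list.h>

struct my_dev {
	struct hlist_node node;	/* linked into the state's instance list */
	/* ... per-device data ... */
};

static enum cpuhp_state my_online_state;

/* multi-instance callbacks get the instance node in addition to the cpu */
static int my_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = hlist_entry(node, struct my_dev, node);

	pr_debug("my_driver: cpu %u online for dev %p\n", cpu, dev);
	return 0;
}

static int my_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = hlist_entry(node, struct my_dev, node);

	pr_debug("my_driver: cpu %u offline for dev %p\n", cpu, dev);
	return 0;
}

static int __init my_driver_init(void)
{
	int ret;

	/* reserve a dynamic state once, without touching any instance yet */
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "my_driver:online",
				      my_cpu_online, my_cpu_offline);
	if (ret < 0)
		return ret;
	my_online_state = ret;
	return 0;
}

static int my_driver_probe(struct my_dev *dev)
{
	/* runs my_cpu_online() on every CPU that already passed this state */
	return cpuhp_state_add_instance(my_online_state, &dev->node);
}

static void my_driver_remove(struct my_dev *dev)
{
	cpuhp_state_remove_instance(my_online_state, &dev->node);
}

The design point is that many device instances share a single hotplug state: cpuhp_state_add_instance() invokes the startup callback for the new instance on every CPU that has already reached the state, and the remove path mirrors that with the teardown callback, instead of each device burning its own dynamic state slot.
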
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956baba1..2b4c20ab5bbe 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpuset's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
@@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
@@ -1034,15 +1045,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return;
- if (current->flags & PF_EXITING) /* Let dying task have memory */
- return;
-
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
@@ -2078,6 +2080,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+static void cpuset_fork(struct task_struct *task)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
@@ -2088,6 +2104,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .fork = cpuset_fork,
.legacy_cftypes = files,
.early_init = true,
};
@@ -2528,27 +2545,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
- return 1;
+ return true;
if (node_isset(node, current->mems_allowed))
- return 1;
+ return true;
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
+ return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
- return 0;
+ return false;
if (current->flags & PF_EXITING) /* Let dying task have memory */
- return 1;
+ return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
@@ -2591,13 +2608,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
static int cpuset_spread_node(int *rotor)
{
- int node;
-
- node = next_node(*rotor, current->mems_allowed);
- if (node == MAX_NUMNODES)
- node = first_node(current->mems_allowed);
- *rotor = node;
- return node;
+ return *rotor = next_node_in(*rotor, current->mems_allowed);
}
int cpuset_mem_spread_node(void)
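
The cpuset_spread_node() simplification relies on next_node_in() wrapping around the node mask, which is exactly what the removed open-coded next_node()/first_node() pair did. A minimal sketch of the same rotor pattern, assuming only <linux/nodemask.h> (the example_* names are illustrative):

#include <linux/nodemask.h>
#include <linux/numa.h>

/* illustrative rotor; the real one lives in task_struct */
static int example_rotor = NUMA_NO_NODE;

static int example_spread_node(const nodemask_t *allowed)
{
	/* next_node_in() wraps back to the first set node of the mask */
	example_rotor = next_node_in(example_rotor, *allowed);
	return example_rotor;
}
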
diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a62285..5f264fb5737d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
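
Since set_create_files_as() can now fail when the inode carries an invalid uid or gid (e.g. an on-disk id that does not map into the relevant user namespace), callers have to check its result. A hedged sketch of the expected calling pattern; some_service_set_creds() is a made-up caller:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/fs.h>

static int some_service_set_creds(struct inode *inode)
{
	struct cred *new;
	int ret;

	new = prepare_kernel_cred(NULL);
	if (!new)
		return -ENOMEM;

	ret = set_create_files_as(new, inode);
	if (ret < 0) {
		abort_creds(new);	/* invalid i_uid/i_gid: -EINVAL */
		return ret;
	}

	/* hand the prepared credentials over; commit_creds() consumes them */
	return commit_creds(new);
}
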
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b9325e7dcba1..e9fdb5203de5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -19,11 +19,13 @@ struct callchain_cpus_entries {
};
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
static inline size_t perf_callchain_entry__sizeof(void)
{
return (sizeof(struct perf_callchain_entry) +
- sizeof(__u64) * sysctl_perf_event_max_stack);
+ sizeof(__u64) * (sysctl_perf_event_max_stack +
+ sysctl_perf_event_max_contexts_per_stack));
}
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
@@ -32,12 +34,12 @@ static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
}
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
}
@@ -102,7 +104,7 @@ fail:
return -ENOMEM;
}
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
{
int err = 0;
int count;
@@ -119,6 +121,15 @@ int get_callchain_buffers(void)
/* If the allocation failed, give up */
if (!callchain_cpus_entries)
err = -ENOMEM;
+ /*
+ * If the per-event request exceeds the global cap,
+ * return a different error to help userspace figure
+ * this out.
+ *
+ * And also do it here so that we have &callchain_mutex held.
+ */
+ if (event_max_stack > sysctl_perf_event_max_stack)
+ err = -EOVERFLOW;
goto exit;
}
@@ -172,18 +183,20 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ const u32 max_stack = event->attr.sample_max_stack;
if (!kernel && !user)
return NULL;
- return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+ return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
}
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
- bool crosstask, bool add_mark)
+ u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
+ struct perf_callchain_entry_ctx ctx;
int rctx;
entry = get_callchain_entry(&rctx);
@@ -193,12 +206,16 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (!entry)
goto exit_put;
- entry->nr = init_nr;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = init_nr;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
- perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
- perf_callchain_kernel(entry, regs);
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(&ctx, regs);
}
if (user) {
@@ -214,8 +231,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
goto exit_put;
if (add_mark)
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ perf_callchain_user(&ctx, regs);
}
}
@@ -225,10 +242,15 @@ exit_put:
return entry;
}
+/*
+ * Used for sysctl_perf_event_max_stack and
+ * sysctl_perf_event_max_contexts_per_stack.
+ */
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int new_value = sysctl_perf_event_max_stack, ret;
+ int *value = table->data;
+ int new_value = *value, ret;
struct ctl_table new_table = *table;
new_table.data = &new_value;
@@ -240,7 +262,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
if (atomic_read(&nr_callchain_events))
ret = -EBUSY;
else
- sysctl_perf_event_max_stack = new_value;
+ *value = new_value;
mutex_unlock(&callchain_mutex);
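
Taken together with the perf_event_open() changes further down (attr.sample_max_stack defaulting to the sysctl, and get_callchain_buffers() returning -EOVERFLOW when the request exceeds kernel.perf_event_max_stack), the per-event knob can be used from userspace roughly as follows. This is a hedged sketch; it assumes the perf_event_attr::sample_max_stack field added by the uapi side of this series:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

static int open_sampling_event(unsigned int depth)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN;
	attr.exclude_kernel = 1;
	attr.sample_max_stack = depth;	/* 0 means "use kernel.perf_event_max_stack" */

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

int main(void)
{
	int fd = open_sampling_event(512);

	if (fd < 0 && errno == EOVERFLOW)	/* above the global cap */
		fd = open_sampling_event(0);	/* retry with the default */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}
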
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450efea90..c6e47e97b33f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
return ret;
}
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
- struct event_function_struct efs = {
- .event = event,
- .func = func,
- .data = data,
- };
-
- int ret = event_function(&efs);
- WARN_ON_ONCE(ret);
-}
-
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
raw_spin_unlock_irq(&ctx->lock);
}
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct task_struct *task = READ_ONCE(ctx->task);
+ struct perf_event_context *task_ctx = NULL;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (task) {
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ task_ctx = ctx;
+ }
+
+ perf_ctx_lock(cpuctx, task_ctx);
+
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
+
+ if (task) {
+ /*
+ * We must be either inactive or active and the right task,
+ * otherwise we're screwed, since we cannot IPI to somewhere
+ * else.
+ */
+ if (ctx->is_active) {
+ if (WARN_ON_ONCE(task != current))
+ goto unlock;
+
+ if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+ goto unlock;
+ }
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ func(event, cpuctx, ctx, data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -335,6 +371,7 @@ static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +433,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ /*
+ * If throttling is disabled don't allow the write:
+ */
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0)
+ return -EINVAL;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -440,7 +484,7 @@ static u64 __report_allowed;
static void perf_duration_warn(struct irq_work *w)
{
- printk_ratelimited(KERN_WARNING
+ printk_ratelimited(KERN_INFO
"perf: interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
__report_avg, __report_allowed,
@@ -835,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
}
}
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ if (add && ctx->nr_cgroups++)
+ return;
+ else if (!add && --ctx->nr_cgroups)
+ return;
+ /*
+ * Because cgroup events are always per-cpu events,
+ * this will always be called from the right CPU.
+ */
+ cpuctx = __get_cpu_context(ctx);
+ cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
#else /* !CONFIG_CGROUP_PERF */
static inline bool
@@ -912,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+}
+
#endif
/*
@@ -1384,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1397,15 +1475,13 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event) {
struct list_head *list;
- if (is_software_event(event))
- event->group_flags |= PERF_GROUP_SOFTWARE;
+ event->group_caps = event->event_caps;
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
}
- if (is_cgroup_event(event))
- ctx->nr_cgroups++;
+ list_update_cgroup_event(event, ctx, true);
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
@@ -1553,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event)
WARN_ON_ONCE(group_leader->ctx != event->ctx);
- if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
- !is_software_event(event))
- group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+ group_leader->group_caps &= event->event_caps;
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
@@ -1573,8 +1647,6 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -1586,20 +1658,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- if (is_cgroup_event(event)) {
- ctx->nr_cgroups--;
- /*
- * Because cgroup events are always per-cpu events, this will
- * always be called from the right CPU.
- */
- cpuctx = __get_cpu_context(ctx);
- /*
- * If there are no more cgroup events then clear cgrp to avoid
- * stale pointer in update_cgrp_time_from_cpuctx().
- */
- if (!ctx->nr_cgroups)
- cpuctx->cgrp = NULL;
- }
+ list_update_cgroup_event(event, ctx, false);
ctx->nr_events--;
if (event->attr.inherit_stat)
@@ -1661,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event)
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
- sibling->group_flags = event->group_flags;
+ sibling->group_caps = event->group_caps;
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
@@ -1678,17 +1737,38 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int pmu_filter_match(struct perf_event *event)
+static inline int __pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
return pmu->filter_match ? pmu->filter_match(event) : 1;
}
+/*
+ * Check whether we should attempt to schedule an event group based on
+ * PMU-specific filtering. An event group can consist of HW and SW events,
+ * potentially with a SW leader, so we must check all the filters, to
+ * determine whether a group is schedulable:
+ */
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct perf_event *child;
+
+ if (!__pmu_filter_match(event))
+ return 0;
+
+ list_for_each_entry(child, &event->sibling_list, group_entry) {
+ if (!__pmu_filter_match(child))
+ return 0;
+ }
+
+ return 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
- return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event) && pmu_filter_match(event);
+ return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+ perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
@@ -1708,8 +1788,8 @@ event_sched_out(struct perf_event *event,
* maintained, otherwise bogus information is returned
* via read() for time_enabled, time_running:
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
+ if (event->state == PERF_EVENT_STATE_INACTIVE &&
+ !event_filter_match(event)) {
delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
@@ -1749,6 +1829,8 @@ group_sched_out(struct perf_event *group_event,
struct perf_event *event;
int state = group_event->state;
+ perf_pmu_disable(ctx->pmu);
+
event_sched_out(group_event, cpuctx, ctx);
/*
@@ -1757,6 +1839,8 @@ group_sched_out(struct perf_event *group_event,
list_for_each_entry(event, &group_event->sibling_list, group_entry)
event_sched_out(event, cpuctx, ctx);
+ perf_pmu_enable(ctx->pmu);
+
if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
@@ -2062,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event,
/*
* Groups consisting entirely of software events can always go on.
*/
- if (event->group_flags & PERF_GROUP_SOFTWARE)
+ if (event->group_caps & PERF_EV_CAP_SOFTWARE)
return 1;
/*
* If an exclusive group is already on, no other hardware
@@ -2207,10 +2291,15 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
- event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;
+ /*
+ * Ensures that if we can observe event->ctx, both the event and ctx
+ * will be 'complete'. See perf_iterate_sb_cpu().
+ */
+ smp_store_release(&event->ctx, ctx);
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -2403,16 +2492,16 @@ static int __perf_event_stop(void *info)
* while restarting.
*/
if (sd->restart)
- event->pmu->start(event, PERF_EF_START);
+ event->pmu->start(event, 0);
return 0;
}
-static int perf_event_restart(struct perf_event *event)
+static int perf_event_stop(struct perf_event *event, int restart)
{
struct stop_event_data sd = {
.event = event,
- .restart = 1,
+ .restart = restart,
};
int ret = 0;
@@ -2749,19 +2838,36 @@ unlock:
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
+
void perf_sched_cb_inc(struct pmu *pmu)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
this_cpu_inc(perf_sched_cb_usages);
}
/*
* This function provides the context switch callback to the lower code
* layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
*/
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
@@ -2769,34 +2875,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
- unsigned long flags;
if (prev == next)
return;
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (pmu->sched_task) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
- perf_pmu_disable(pmu);
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ continue;
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
- perf_pmu_enable(pmu);
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
-
- rcu_read_unlock();
-
- local_irq_restore(flags);
}
static void perf_event_switch(struct task_struct *task,
@@ -3328,6 +3424,22 @@ struct perf_read_data {
int ret;
};
+static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+{
+ int event_cpu = event->oncpu;
+ u16 local_pkg, event_pkg;
+
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
+ event_pkg = topology_physical_package_id(event_cpu);
+ local_pkg = topology_physical_package_id(local_cpu);
+
+ if (event_pkg == local_pkg)
+ return local_cpu;
+ }
+
+ return event_cpu;
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -3449,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int ret = 0, cpu_to_read, local_cpu;
/*
* If event is enabled and currently active on a CPU, update the
@@ -3461,8 +3573,22 @@ static int perf_event_read(struct perf_event *event, bool group)
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
+
+ local_cpu = get_cpu();
+ cpu_to_read = find_cpu_to_read(event, local_cpu);
+ put_cpu();
+
+ /*
+ * Purposely ignore the smp_call_function_single() return
+ * value.
+ *
+ * If event->oncpu isn't a valid CPU it means the event got
+ * scheduled out and that will have updated the event count.
+ *
+ * Therefore, either way, we'll have an up-to-date event count
+ * after this.
+ */
+ (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
@@ -3665,6 +3791,39 @@ static void free_event_rcu(struct rcu_head *head)
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
+static void detach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_del_rcu(&event->sb_list);
+ raw_spin_unlock(&pel->lock);
+}
+
+static bool is_sb_event(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (event->parent)
+ return false;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return false;
+
+ if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+ attr->comm || attr->comm_exec ||
+ attr->task ||
+ attr->context_switch)
+ return true;
+ return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ detach_sb_event(event);
+}
+
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -3728,6 +3887,8 @@ static void unaccount_event(struct perf_event *event)
}
unaccount_event_cpu(event, event->cpu);
+
+ unaccount_pmu_sb_event(event);
}
static void perf_sched_delayed(struct work_struct *work)
@@ -3797,7 +3958,7 @@ static void exclusive_event_destroy(struct perf_event *event)
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
- if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+ if ((e1->pmu == e2->pmu) &&
(e1->cpu == e2->cpu ||
e1->cpu == -1 ||
e2->cpu == -1))
@@ -3862,10 +4023,8 @@ static void _free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -4715,6 +4874,19 @@ static void ring_buffer_attach(struct perf_event *event,
spin_unlock_irqrestore(&rb->event_lock, flags);
}
+ /*
+ * Avoid racing with perf_mmap_close(AUX): stop the event
+ * before swizzling the event::rb pointer; if it's getting
+ * unmapped, its aux_mmap_count will be 0 and it won't
+ * restart. See the comment in __perf_pmu_output_stop().
+ *
+ * Data will inevitably be lost when set_output is done in
+ * mid-air, but then again, whoever does it like this is
+ * not in for the data anyway.
+ */
+ if (has_aux(event))
+ perf_event_stop(event, 0);
+
rcu_assign_pointer(event->rb, rb);
if (old_rb) {
@@ -5207,9 +5379,10 @@ perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
{
int bit;
+ DECLARE_BITMAP(_mask, 64);
- for_each_set_bit(bit, (const unsigned long *) &mask,
- sizeof(mask) * BITS_PER_BYTE) {
+ bitmap_from_u64(_mask, mask);
+ for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
u64 val;
val = perf_reg_value(regs, bit);
@@ -5555,16 +5728,26 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- u32 raw_size = data->raw->size;
- u32 real_size = round_up(raw_size + sizeof(u32),
- sizeof(u64)) - sizeof(u32);
- u64 zero = 0;
-
- perf_output_put(handle, real_size);
- __output_copy(handle, data->raw->data, raw_size);
- if (real_size - raw_size)
- __output_copy(handle, &zero, real_size - raw_size);
+ struct perf_raw_record *raw = data->raw;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+
+ perf_output_put(handle, raw->size);
+ do {
+ if (frag->copy) {
+ __output_custom(handle, frag->copy,
+ frag->data, frag->size);
+ } else {
+ __output_copy(handle, frag->data,
+ frag->size);
+ }
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+ if (frag->pad)
+ __output_skip(handle, NULL, frag->pad);
} else {
struct {
u32 size;
@@ -5689,14 +5872,28 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
-
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+ struct perf_raw_record *raw = data->raw;
+ int size;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+ u32 sum = 0;
+
+ do {
+ sum += frag->size;
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+
+ size = round_up(sum + sizeof(u32), sizeof(u64));
+ raw->size = size - sizeof(u32);
+ frag->pad = raw->size - sum;
+ } else {
+ size = sizeof(u64);
+ }
- header->size += round_up(size, sizeof(u64));
+ header->size += size;
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5856,11 +6053,11 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
- perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+ perf_iterate_f output,
void *data, bool all)
{
struct perf_event *event;
@@ -5877,52 +6074,63 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
}
}
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
- struct perf_event_context *task_ctx)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
- rcu_read_lock();
- preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data, false);
- preempt_enable();
- rcu_read_unlock();
+ struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ /*
+ * Skip events that are not fully formed yet; ensure that
+ * if we observe event->ctx, both event and ctx will be
+ * complete enough. See perf_install_in_context().
+ */
+ if (!smp_load_acquire(&event->ctx))
+ continue;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
}
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers; ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
- struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
int ctxn;
+ rcu_read_lock();
+ preempt_disable();
+
/*
- * If we have task_ctx != NULL we only notify
- * the task context itself. The task_ctx is set
- * only for EXIT events before releasing task
+ * If we have task_ctx != NULL we only notify the task context itself.
+ * The task_ctx is set only for EXIT events before releasing task
* context.
*/
if (task_ctx) {
- perf_event_aux_task_ctx(output, data, task_ctx);
- return;
+ perf_iterate_ctx(task_ctx, output, data, false);
+ goto done;
}
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ perf_iterate_sb_cpu(output, data);
+
+ for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data, false);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
+ perf_iterate_ctx(ctx, output, data, false);
}
+done:
+ preempt_enable();
rcu_read_unlock();
}
@@ -5955,7 +6163,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (restart)
- perf_event_restart(event);
+ perf_event_stop(event, 1);
}
void perf_event_exec(void)
@@ -5971,7 +6179,7 @@ void perf_event_exec(void)
perf_event_enable_on_exec(ctxn);
- perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
true);
}
rcu_read_unlock();
@@ -5999,7 +6207,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
/*
* In case of inheritance, it will be the parent that links to the
- * ring-buffer, but it will be the child that's actually using it:
+ * ring-buffer, but it will be the child that's actually using it.
+ *
+ * We are using event::rb to determine if the event should be stopped,
+ * however this may race with ring_buffer_attach() (through set_output),
+ * which will make us skip the event that actually needs to be stopped.
+ * So ring_buffer_attach() has to stop an aux event before re-assigning
+ * its rb pointer.
*/
if (rcu_dereference(parent->rb) == rb)
ro->err = __perf_event_stop(&sd);
@@ -6009,15 +6223,15 @@ static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
struct pmu *pmu = event->pmu;
- struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
rcu_read_lock();
- perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+ perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
if (cpuctx->task_ctx)
- perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
&ro, false);
rcu_read_unlock();
@@ -6146,7 +6360,7 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_aux(perf_event_task_output,
+ perf_iterate_sb(perf_event_task_output,
&task_event,
task_ctx);
}
@@ -6225,7 +6439,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- perf_event_aux(perf_event_comm_output,
+ perf_iterate_sb(perf_event_comm_output,
comm_event,
NULL);
}
@@ -6456,7 +6670,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- perf_event_aux(perf_event_mmap_output,
+ perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
@@ -6464,15 +6678,6 @@ got_name:
}
/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
- return filter->filter && filter->inode;
-}
-
-/*
* Check whether inode and address range match filter criteria.
*/
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6522,7 +6727,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (restart)
- perf_event_restart(event);
+ perf_event_stop(event, 1);
}
/*
@@ -6533,13 +6738,20 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * Data tracing isn't supported yet and as such there is no need
+ * to keep track of anything that isn't related to executable code:
+ */
+ if (!(vma->vm_flags & VM_EXEC))
+ return;
+
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (!ctx)
continue;
- perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
}
rcu_read_unlock();
}
@@ -6726,7 +6938,7 @@ static void perf_event_switch(struct task_struct *task,
},
};
- perf_event_aux(perf_event_switch_output,
+ perf_iterate_sb(perf_event_switch_output,
&switch_event,
NULL);
}
@@ -6867,7 +7079,7 @@ static int __perf_event_overflow(struct perf_event *event,
irq_work_queue(&event->pending);
}
- event->overflow_handler(event, data, regs);
+ READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
@@ -7333,7 +7545,7 @@ static struct pmu perf_swevent = {
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
- void *record = data->raw->data;
+ void *record = data->raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -7389,8 +7601,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_event *event;
struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
+ .frag = {
+ .size = entry_size,
+ .data = record,
+ },
};
perf_sample_data_init(&data, 0, 0);
@@ -7480,11 +7694,83 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
+#ifdef CONFIG_BPF_SYSCALL
+static void bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .regs = regs,
+ };
+ int ret = 0;
+
+ preempt_disable();
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ if (!ret)
+ return;
+
+ event->orig_overflow_handler(event, data, regs);
+}
+
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ event->prog = prog;
+ event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
+ WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
+ return 0;
+}
+
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+ return -EOPNOTSUPP;
+}
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
+ if (event->attr.type == PERF_TYPE_HARDWARE ||
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ return perf_event_set_bpf_handler(event, prog_fd);
+
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
@@ -7525,6 +7811,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
{
struct bpf_prog *prog;
+ perf_event_free_bpf_handler(event);
+
if (!event->tp_event)
return;
@@ -7683,7 +7971,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
list_for_each_entry(filter, &ifh->list, entry) {
event->addr_filters_offs[count] = 0;
- if (perf_addr_filter_needs_mmap(filter))
+ /*
+ * Adjust base offset if the filter is associated with a binary
+ * that needs to be mapped:
+ */
+ if (filter->inode)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -7698,7 +7990,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
mmput(mm);
restart:
- perf_event_restart(event);
+ perf_event_stop(event, 1);
}
/*
@@ -7814,8 +8106,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail;
}
- if (token == IF_SRC_FILE) {
- filename = match_strdup(&args[2]);
+ if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+ int fpos = filter->range ? 2 : 1;
+
+ filename = match_strdup(&args[fpos]);
if (!filename) {
ret = -ENOMEM;
goto fail;
@@ -8648,6 +8942,28 @@ unlock:
return pmu;
}
+static void attach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_add_rcu(&event->sb_list, &pel->list);
+ raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ attach_sb_event(event);
+}
+
static void account_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -8728,6 +9044,8 @@ static void account_event(struct perf_event *event)
enabled:
account_event_cpu(event, event->cpu);
+
+ account_pmu_sb_event(event);
}
/*
@@ -8811,6 +9129,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
+ if (overflow_handler == bpf_overflow_handler) {
+ struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto err_ns;
+ }
+ event->prog = prog;
+ event->orig_overflow_handler =
+ parent_event->orig_overflow_handler;
+ }
+#endif
}
if (overflow_handler) {
@@ -8876,7 +9207,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- err = get_callchain_buffers();
+ err = get_callchain_buffers(attr->sample_max_stack);
if (err)
goto err_addr_filters;
}
@@ -9198,6 +9529,9 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ if (!attr.sample_max_stack)
+ attr.sample_max_stack = sysctl_perf_event_max_stack;
+
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9271,7 +9605,7 @@ SYSCALL_DEFINE5(perf_event_open,
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
- err = -ENOTSUPP;
+ err = -EOPNOTSUPP;
goto err_alloc;
}
}
@@ -9288,6 +9622,9 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
+
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
@@ -9301,7 +9638,7 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = group_leader->pmu;
} else if (is_software_event(group_leader) &&
- (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
/*
* In case the group is a pure software group, and we
* try to add a hardware event, move the whole group to
@@ -10233,10 +10570,15 @@ static void __init perf_event_init_all_cpus(void)
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+ raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10249,6 +10591,7 @@ static void perf_event_init_cpu(int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
+ return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10280,14 +10623,17 @@ static void perf_event_exit_cpu_context(int cpu)
}
srcu_read_unlock(&pmus_srcu, idx);
}
+#else
+
+static void perf_event_exit_cpu_context(int cpu) { }
-static void perf_event_exit_cpu(int cpu)
+#endif
+
+int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
+ return 0;
}
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10309,46 +10655,6 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- /*
- * This must be done before the CPU comes alive, because the
- * moment we can run tasks we can encounter (software) events.
- *
- * Specifically, someone can have inherited events on kthreadd
- * or a pre-existing worker thread that gets re-bound.
- */
- perf_event_init_cpu(cpu);
- break;
-
- case CPU_DOWN_PREPARE:
- /*
- * This must be done before the CPU dies because after that an
- * active event might want to IPI the CPU and that'll not work
- * so great for dead CPUs.
- *
- * XXX smp_call_function_single() return -ENXIO without a warn
- * so we could possibly deal with this.
- *
- * This is safe against new events arriving because
- * sys_perf_event_open() serializes against hotplug using
- * get_online_cpus().
- */
- perf_event_exit_cpu(cpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
void __init perf_event_init(void)
{
int ret;
@@ -10361,7 +10667,7 @@ void __init perf_event_init(void)
perf_pmu_register(&perf_cpu_clock, NULL, -1);
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
- perf_cpu_notifier(perf_cpu_notify);
+ perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
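
The PERF_SAMPLE_RAW rework above turns data->raw into a chain of perf_raw_frag fragments: perf_prepare_sample() sums the fragment sizes and computes the trailing pad, and perf_output_sample() walks the chain and emits the pad with __output_skip(). A hedged kernel-side sketch of emitting a two-part raw sample without first concatenating the buffers (this mirrors what the BPF output path does elsewhere in the series; the function and buffer names here are made up):

#include <linux/perf_event.h>

static void emit_two_part_raw_sample(struct perf_event *event,
				     struct pt_regs *regs,
				     void *meta, u32 meta_size,
				     void *ctx, u32 ctx_size)
{
	struct perf_sample_data sample;
	struct perf_raw_frag ctx_frag = {
		.copy	= NULL,		/* fall back to the default __output_copy() */
		.data	= ctx,
		.size	= ctx_size,
	};
	struct perf_raw_record raw = {
		.frag = {
			.next	= ctx_size ? &ctx_frag : NULL,
			.data	= meta,
			.size	= meta_size,
		},
	};

	perf_sample_data_init(&sample, 0, 0);
	sample.raw = &raw;
	perf_event_output(event, &sample, regs);
}
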
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
return rb->aux_nr_pages << PAGE_SHIFT;
}
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
-func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
- written = memcpy_func(handle->addr, buf, size); \
+ written = memcpy_func(__VA_ARGS__); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
- buf += written; \
+ if (advance_buf) \
+ buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+ const void *buf, unsigned long len)
+{
+ unsigned long orig_len = len;
+ __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+ orig_len - len, size)
+}
+
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ae9b90dc9a5a..257fa460b846 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!rb)
return NULL;
- if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+ if (!rb_has_aux(rb))
goto err;
/*
- * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
- * the aux buffer is in perf_mmap_close(), about to get freed.
+ * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
+ * about to get freed, so we leave immediately.
+ *
+ * Checking rb::aux_mmap_count and rb::refcount has to be done in
+ * the same order, see perf_mmap_close. Otherwise we end up freeing
+ * aux pages in this path, which is a bug, because in_atomic().
*/
if (!atomic_read(&rb->aux_mmap_count))
- goto err_put;
+ goto err;
+
+ if (!atomic_inc_not_zero(&rb->aux_refcount))
+ goto err;
/*
* Nesting is not supported for AUX area, make sure nested
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7edc95edfaee..d4129bb05e5d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
* Returns 0 on success, -EFAULT on failure.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, struct page *kpage)
+ struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
@@ -161,48 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
false);
if (err)
return err;
/* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(page);
+ lock_page(old_page);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
- ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ ptep = page_check_address(old_page, mm, addr, &ptl, 0);
+ if (!ptep) {
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
+ }
- get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr, false);
- mem_cgroup_commit_charge(kpage, memcg, false, false);
- lru_cache_add_active_or_unevictable(kpage, vma);
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
- if (!PageAnon(page)) {
- dec_mm_counter(mm, mm_counter_file(page));
+ if (!PageAnon(old_page)) {
+ dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+ set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(page, false);
- if (!page_mapped(page))
- try_to_free_swap(page);
+ page_remove_rmap(old_page, false);
+ if (!page_mapped(old_page))
+ try_to_free_swap(old_page);
pte_unmap_unlock(ptep, ptl);
if (vma->vm_flags & VM_LOCKED)
- munlock_vma_page(page);
- put_page(page);
+ munlock_vma_page(old_page);
+ put_page(old_page);
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- unlock_page(page);
+ unlock_page(old_page);
return err;
}
@@ -1130,7 +1131,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
struct vm_area_struct *vma;
int ret;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
if (mm->uprobes_state.xol_area) {
ret = -EALREADY;
goto fail;
@@ -1469,7 +1472,8 @@ static void dup_xol_work(struct callback_head *work)
if (current->flags & PF_EXITING)
return;
- if (!__create_xol_area(current->utask->dup_xol_addr))
+ if (!__create_xol_area(current->utask->dup_xol_addr) &&
+ !fatal_signal_pending(current))
uprobe_warn(current, "dup xol area");
}
@@ -1694,8 +1698,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
int result;
pagefault_disable();
- result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
- sizeof(opcode));
+ result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
if (likely(result == 0))
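
xol_add_vma() now takes mmap_sem with down_write_killable(), so a task hit by a fatal signal is no longer left blocked uninterruptibly on the lock; uprobe_register callers must instead cope with -EINTR. The pattern generalizes as in this small hedged sketch (update_mm_killable() is a made-up helper):

#include <linux/errno.h>
#include <linux/mm_types.h>
#include <linux/rwsem.h>

static int update_mm_killable(struct mm_struct *mm)
{
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;	/* a fatal signal arrived while waiting */

	/* ... modify the address space under the write lock ... */

	up_write(&mm->mmap_sem);
	return 0;
}
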
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195667e1..1e1d913914c0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
}
/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+ struct sighand_struct *sighand;
+ struct task_struct *task;
+
+ /*
+ * We need to verify that release_task() was not called and thus
+ * delayed_put_task_struct() can't run and drop the last reference
+ * before rcu_read_unlock(). We check task->sighand != NULL,
+ * but we might be reading already freed and reused memory.
+ */
+retry:
+ task = rcu_dereference(*ptask);
+ if (!task)
+ return NULL;
+
+ probe_kernel_address(&task->sighand, sighand);
+
+ /*
+ * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+ * was already freed we can not miss the preceding update of this
+ * pointer.
+ */
+ smp_rmb();
+ if (unlikely(task != READ_ONCE(*ptask)))
+ goto retry;
+
+ /*
+ * We've re-checked that "task == *ptask", now we have two different
+ * cases:
+ *
+ * 1. This is actually the same task/task_struct. In this case
+ * sighand != NULL tells us it is still alive.
+ *
+ * 2. This is another task which got the same memory for task_struct.
+ * We can't know this of course, and we can not trust
+ * sighand != NULL.
+ *
+ * In this case we actually return a random value, but this is
+ * correct.
+ *
+ * If we return NULL - we can pretend that we actually noticed that
+ * *ptask was updated when the previous task has exited. Or pretend
+ * that probe_kernel_address(&sighand) reads NULL.
+ *
+ * If we return the new task (because sighand is not NULL for any
+ * reason) - this is fine too. This (new) task can't go away before
+ * another gp pass.
+ *
+ * And note: We could even eliminate the false positive if we re-read
+ * task->sighand once again to avoid the false NULL. But this case
+ * is very unlikely so we don't care.
+ */
+ if (!sighand)
+ return NULL;
+
+ return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = task_rcu_dereference(ptask);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+}
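+
+/*
+ * try_get_task_struct() above is the refcount-taking wrapper around
+ * task_rcu_dereference(). A minimal usage sketch for a caller that only
+ * holds a shared pointer slot which other CPUs may update or clear at any
+ * time (the caller and the pr_info() consumer are illustrative, not part
+ * of this patch):
+ *
+ *	static void poke_task(struct task_struct **slot)
+ *	{
+ *		struct task_struct *p;
+ *
+ *		p = try_get_task_struct(slot);	// takes a reference or NULL
+ *		if (!p)
+ *			return;
+ *
+ *		pr_info("comm=%s pid=%d\n", p->comm, task_pid_nr(p));
+ *		put_task_struct(p);		// drop the reference taken above
+ *	}
+ */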
+
+/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
@@ -639,7 +715,7 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
@@ -649,7 +725,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-void do_exit(long code)
+void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
@@ -700,10 +776,14 @@ void do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
/*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
+ * Ensure that all new tsk->pi_lock acquisitions must observe
+ * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
*/
smp_mb();
+ /*
+ * Ensure that we must observe the pi_state in exit_mm() ->
+ * mm_release() -> exit_pi_state_list().
+ */
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic())) {
@@ -746,7 +826,7 @@ void do_exit(long code)
disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
- exit_thread();
+ exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
@@ -768,12 +848,7 @@ void do_exit(long code)
TASKS_RCU(preempt_enable());
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
-#ifdef CONFIG_NUMA
- task_lock(tsk);
- mpol_put(tsk->mempolicy);
- tsk->mempolicy = NULL;
- task_unlock(tsk);
-#endif
+ mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
@@ -807,29 +882,7 @@ void do_exit(long code)
exit_rcu();
TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
- /*
- * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- * when the following two conditions become true.
- * - There is race condition of mmap_sem (It is acquired by
- * exit_mm()), and
- * - SMI occurs before setting TASK_RUNINNG.
- * (or hypervisor of virtual machine switches to other guest)
- * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- *
- * To avoid it, we have to wait for releasing tsk->pi_lock which
- * is held by try_to_wake_up()
- */
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
-
- /* causes final put_task_struct in finish_task_switch(). */
- tsk->state = TASK_DEAD;
- tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
- schedule();
- BUG();
- /* Avoid "noreturn function does return". */
- for (;;)
- cpu_relax(); /* For when BUG is null */
+ do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
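
The open-coded death sequence at the end of do_exit() is replaced by do_task_dead() above. Judging purely from the removed lines, the helper has to perform roughly the following steps; this is a sketch reconstructed from those lines, not the scheduler's actual implementation:

    static void __noreturn do_task_dead_sketch(void)
    {
            /* Wait until a concurrent try_to_wake_up() has dropped pi_lock. */
            smp_mb();
            raw_spin_unlock_wait(&current->pi_lock);

            current->state = TASK_DEAD;     /* final put in finish_task_switch() */
            current->flags |= PF_NOFREEZE;  /* tell the freezer to ignore us */
            schedule();
            BUG();

            for (;;)
                    cpu_relax();            /* in case BUG() is compiled out */
    }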
@@ -918,17 +971,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
- /* Wait for all children (clone and not) if __WALL is set;
- * otherwise, wait for clone children *only* if __WCLONE is
- * set; otherwise, wait for non-clone children *only*. (Note:
- * A "clone" child here is one that reports to its parent
- * using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
- && !(wo->wo_flags & __WALL))
+
+ /*
+ * Wait for all children (clone and not) if __WALL is set or
+ * if it is traced by us.
+ */
+ if (ptrace || (wo->wo_flags & __WALL))
+ return 1;
+
+ /*
+ * Otherwise, wait for clone children *only* if __WCLONE is set;
+ * otherwise, wait for non-clone children *only*.
+ *
+ * Note: a "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD, or a non-leader thread which
+ * we can only see if it is traced by us.
+ */
+ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
return 0;
return 1;
@@ -1300,7 +1364,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (unlikely(exit_state == EXIT_DEAD))
return 0;
- ret = eligible_child(wo, p);
+ ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
@@ -1524,7 +1588,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
enum pid_type type;
long ret;
- if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+ if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
return -EINVAL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3e8451527cbe..9a05bd93f8e7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,57 +148,113 @@ static inline void free_task_struct(struct task_struct *tsk)
}
#endif
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
{
}
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
-# if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
- int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush. Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
- struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
- THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+ void *stack;
+ int i;
+
+ local_irq_disable();
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+ if (!s)
+ continue;
+ this_cpu_write(cached_stacks[i], NULL);
+
+ tsk->stack_vm_area = s;
+ local_irq_enable();
+ return s->addr;
+ }
+ local_irq_enable();
- if (page)
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- 1 << THREAD_SIZE_ORDER);
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ VMALLOC_START, VMALLOC_END,
+ THREADINFO_GFP | __GFP_HIGHMEM,
+ PAGE_KERNEL,
+ 0, node, __builtin_return_address(0));
+
+ /*
+ * We can't call find_vm_area() in interrupt context, and
+ * free_thread_stack() can be called in interrupt context,
+ * so cache the vm_struct.
+ */
+ if (stack)
+ tsk->stack_vm_area = find_vm_area(stack);
+ return stack;
+#else
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
+#endif
}
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(struct task_struct *tsk)
{
- struct page *page = virt_to_page(ti);
+#ifdef CONFIG_VMAP_STACK
+ if (task_stack_vm_area(tsk)) {
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_read(cached_stacks[i]))
+ continue;
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- -(1 << THREAD_SIZE_ORDER));
- __free_kmem_pages(page, THREAD_SIZE_ORDER);
+ this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+ local_irq_restore(flags);
+ return;
+ }
+ local_irq_restore(flags);
+
+ vfree(tsk->stack);
+ return;
+ }
+#endif
+
+ __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+ return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
}
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_info_cache, ti);
+ kmem_cache_free(thread_stack_cache, tsk->stack);
}
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
{
- thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
THREAD_SIZE, 0, NULL);
- BUG_ON(thread_info_cache == NULL);
+ BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif
@@ -221,18 +277,76 @@ struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
+{
+ void *stack = task_stack_page(tsk);
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+ if (vm) {
+ int i;
+
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_zone_page_state(page_zone(vm->pages[i]),
+ NR_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024 * account);
+ }
+
+ /* All stack pages belong to the same memcg. */
+ memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ } else {
+ /*
+ * All stack pages are in the same zone and belong to the
+ * same memcg.
+ */
+ struct page *first_page = virt_to_page(stack);
+
+ mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
+
+ memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ }
+}
+
+static void release_task_stack(struct task_struct *tsk)
{
- struct zone *zone = page_zone(virt_to_page(ti));
+ account_kernel_stack(tsk, -1);
+ arch_release_thread_stack(tsk->stack);
+ free_thread_stack(tsk);
+ tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = NULL;
+#endif
+}
- mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+ if (atomic_dec_and_test(&tsk->stack_refcount))
+ release_task_stack(tsk);
}
+#endif
void free_task(struct task_struct *tsk)
{
- account_kernel_stack(tsk->stack, -1);
- arch_release_thread_info(tsk->stack);
- free_thread_info(tsk->stack);
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * The task is finally done with both the stack and thread_info,
+ * so free both.
+ */
+ release_task_stack(tsk);
+#else
+ /*
+ * If the task had a separate stack allocation, it should be gone
+ * by now.
+ */
+ WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
put_seccomp_filter(tsk);
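
account_kernel_stack() now accounts stacks in kilobytes (NR_KERNEL_STACK_KB and MEMCG_KERNEL_STACK_KB) rather than in whole stacks, since a vmapped stack is backed by individual pages that are accounted per zone. A worked example with assumed values of 4 KiB pages and 16 KiB stacks: the vmapped path adds PAGE_SIZE/1024 = 4 to the zone of each of the four backing pages, while the memcg side is charged THREAD_SIZE/1024 = 16 once; the page-backed path charges the full 16 against the first page's zone. Sketch of that arithmetic:

    /* Illustrative values only; the real code derives them from
     * PAGE_SIZE and THREAD_SIZE. */
    static long stack_zone_delta_kb(bool vmapped, int account)
    {
            const long page_kb = 4;         /* PAGE_SIZE / 1024 */
            const long stack_kb = 16;       /* THREAD_SIZE / 1024 */

            if (vmapped)
                    return 4 * page_kb * account;   /* one update per backing page */

            return stack_kb * account;              /* single zone update */
    }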
@@ -304,6 +418,7 @@ int arch_task_struct_size __read_mostly;
void __init fork_init(void)
{
+ int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
@@ -323,6 +438,10 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ init_user_ns.ucount_max[i] = max_threads/2;
+ }
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -340,26 +459,43 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- struct thread_info *ti;
- int node = tsk_fork_get_node(orig);
+ unsigned long *stack;
+ struct vm_struct *stack_vm_area;
int err;
+ if (node == NUMA_NO_NODE)
+ node = tsk_fork_get_node(orig);
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
- ti = alloc_thread_info_node(tsk, node);
- if (!ti)
+ stack = alloc_thread_stack_node(tsk, node);
+ if (!stack)
goto free_tsk;
+ stack_vm_area = task_stack_vm_area(tsk);
+
err = arch_dup_task_struct(tsk, orig);
+
+ /*
+ * arch_dup_task_struct() clobbers the stack-related fields. Make
+ * sure they're properly initialized before using any stack-related
+ * functions again.
+ */
+ tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ atomic_set(&tsk->stack_refcount, 1);
+#endif
+
if (err)
- goto free_ti;
+ goto free_stack;
- tsk->stack = ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -391,14 +527,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- account_kernel_stack(ti, 1);
+ account_kernel_stack(tsk, 1);
kcov_task_init(tsk);
return tsk;
-free_ti:
- free_thread_info(ti);
+free_stack:
+ free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
@@ -413,7 +549,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
unsigned long charge;
uprobe_start_dup_mmap();
- down_write(&oldmm->mmap_sem);
+ if (down_write_killable(&oldmm->mmap_sem)) {
+ retval = -EINTR;
+ goto fail_uprobe_end;
+ }
flush_cache_dup_mm(oldmm);
uprobe_dup_mmap(oldmm, mm);
/*
@@ -525,6 +664,7 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
fail_nomem_anon_vma_fork:
@@ -699,6 +839,26 @@ void __mmdrop(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(__mmdrop);
+static inline void __mmput(struct mm_struct *mm)
+{
+ VM_BUG_ON(atomic_read(&mm->mm_users));
+
+ uprobe_clear_state(mm);
+ exit_aio(mm);
+ ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
+ exit_mmap(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+ mmdrop(mm);
+}
+
/*
* Decrement the use count and release all resources for an mm.
*/
@@ -706,24 +866,26 @@ void mmput(struct mm_struct *mm)
{
might_sleep();
+ if (atomic_dec_and_test(&mm->mm_users))
+ __mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
if (atomic_dec_and_test(&mm->mm_users)) {
- uprobe_clear_state(mm);
- exit_aio(mm);
- ksm_exit(mm);
- khugepaged_exit(mm); /* must run before exit_mmap */
- exit_mmap(mm);
- set_mm_exe_file(mm, NULL);
- if (!list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- list_del(&mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
- mmdrop(mm);
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
}
}
-EXPORT_SYMBOL_GPL(mmput);
+#endif
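
mmput() is split so that the heavy teardown (exit_aio(), exit_mmap() and friends) lives in __mmput(), and the new mmput_async() defers that teardown to a workqueue when the last reference is dropped. A minimal sketch of the intended call pattern for a caller that must not perform the teardown synchronously (the calling context is an assumption, not mandated by this hunk):

    static void drop_mm_reference_nosleep(struct mm_struct *mm)
    {
            /*
             * If this was the last user, __mmput() is queued via
             * schedule_work() and runs later in process context.
             */
            mmput_async(mm);
    }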
/**
* set_mm_exe_file - change a reference to the mm's executable file
@@ -774,6 +936,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
EXPORT_SYMBOL(get_mm_exe_file);
/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+ struct file *exe_file = NULL;
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (!(task->flags & PF_KTHREAD))
+ exe_file = get_mm_exe_file(mm);
+ }
+ task_unlock(task);
+ return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
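
get_task_exe_file() takes task_lock() internally, so a caller can resolve another task's executable without touching task->mm under its own locking; the returned file must be released with fput(). A minimal usage sketch (the pr_info() consumer is illustrative):

    static void report_task_exe(struct task_struct *task)
    {
            struct file *exe_file = get_task_exe_file(task);

            if (!exe_file)
                    return;         /* kernel thread or no executable */

            pr_info("%s[%d] exe: %pD\n", task->comm, task_pid_nr(task), exe_file);
            fput(exe_file);         /* drop the reference taken above */
    }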
+
+/**
* get_task_mm - acquire a reference to the task's mm
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
@@ -888,14 +1073,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
deactivate_mm(tsk, mm);
/*
- * If we're exiting normally, clear a user-space tid field if
- * requested. We leave this alone when dying by signal, to leave
- * the value intact in a core dump, and to save the unnecessary
- * trouble, say, a killed vfork parent shouldn't touch this mm.
- * Userland only wants this done for a sys_exit.
+ * Signal userspace if we're not exiting with a core dump
+ * because we want to leave the value intact for debugging
+ * purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->flags & PF_SIGNALED) &&
+ if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
@@ -1256,7 +1439,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
int __user *child_tidptr,
struct pid *pid,
int trace,
- unsigned long tls)
+ unsigned long tls,
+ int node)
{
int retval;
struct task_struct *p;
@@ -1308,7 +1492,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
retval = -ENOMEM;
- p = dup_task_struct(current);
+ p = dup_task_struct(current, node);
if (!p)
goto fork_out;
@@ -1378,7 +1562,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
- threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1470,7 +1653,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
- goto bad_fork_cleanup_io;
+ goto bad_fork_cleanup_thread;
}
}
@@ -1530,6 +1713,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
@@ -1630,8 +1814,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p);
bad_fork_free_pid:
+ threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
+bad_fork_cleanup_thread:
+ exit_thread(p);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
@@ -1660,12 +1847,12 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
- threadgroup_change_end(current);
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
+ put_task_stack(p);
free_task(p);
fork_out:
return ERR_PTR(retval);
@@ -1684,7 +1871,8 @@ static inline void init_idle_pids(struct pid_link *links)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+ cpu_to_node(cpu));
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1729,7 +1917,7 @@ long _do_fork(unsigned long clone_flags,
}
p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace, tls);
+ child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_thread_flag(TIF_MEMDIE))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/futex.c b/kernel/futex.c
index c20f06f38ef3..2c4be467fecd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
-#define FLAGS_SHARED 0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x01
+#else
+/*
+ * NOMMU does not have a per-process address space. Let the compiler
+ * optimize the code away.
+ */
+# define FLAGS_SHARED 0x00
+#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
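
With FLAGS_SHARED defined to 0x00 on !CONFIG_MMU, every "flags & FLAGS_SHARED" test folds to a compile-time 0, so the shared-futex branches disappear from the object code without extra #ifdefs. An illustrative sketch of the pattern (the function is hypothetical, not futex.c code):

    static int sketch_key_setup(unsigned int flags)
    {
            if (flags & FLAGS_SHARED) {
                    /* inode/mm based key handling; dead code on NOMMU */
                    return 1;
            }

            /* private-futex path, the only one that survives on NOMMU */
            return 0;
    }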
@@ -373,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
#endif
}
-/*
- * We hash on the keys returned from get_futex_key (see below).
+/**
+ * hash_futex - Return the hash bucket in the global hash
+ * @key: Pointer to the futex key for which the hash is calculated
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket in the global hash.
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
@@ -384,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
return &futex_queues[hash & (futex_hashsize - 1)];
}
-/*
+
+/**
+ * match_futex - Check whether two futex keys are equal
+ * @key1: Pointer to key1
+ * @key2: Pointer to key2
+ *
* Return 1 if two futex_keys are equal, 0 otherwise.
*/
static inline int match_futex(union futex_key *key1, union futex_key *key2)
@@ -405,6 +422,16 @@ static void get_futex_key_refs(union futex_key *key)
if (!key->both.ptr)
return;
+ /*
+ * On MMU-less systems futexes are always "private" as there is no per
+ * process address space. We need the smp_mb() nevertheless - yes,
+ * arch/blackfin has MMU-less SMP ...
+ */
+ if (!IS_ENABLED(CONFIG_MMU)) {
+ smp_mb(); /* explicit smp_mb(); (B) */
+ return;
+ }
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +463,9 @@ static void drop_futex_key_refs(union futex_key *key)
return;
}
+ if (!IS_ENABLED(CONFIG_MMU))
+ return;
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
iput(key->shared.inode);
@@ -469,7 +499,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
+ struct page *page, *tail;
struct address_space *mapping;
int err, ro = 0;
@@ -530,7 +560,15 @@ again:
* considered here and page lock forces unnecessary serialization
* From this point on, mapping will be re-verified if necessary and
* page lock will be acquired only if it is unavoidable
- */
+ *
+ * Mapping checks require the head page for any compound page, so the
+ * head page and mapping are looked up now. For anonymous pages, it
+ * does not matter if the page splits in the future as the key is
+ * based on the address. For filesystem-backed pages, the tail is
+ * required as the index of the page determines the key. For
+ * base pages, there is no tail page and tail == page.
+ */
+ tail = page;
page = compound_head(page);
mapping = READ_ONCE(page->mapping);
@@ -654,7 +692,7 @@ again:
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = inode;
- key->shared.pgoff = basepage_index(page);
+ key->shared.pgoff = basepage_index(tail);
rcu_read_unlock();
}
@@ -729,7 +767,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
int ret;
pagefault_disable();
- ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+ ret = __get_user(*dest, from);
pagefault_enable();
return ret ? -EFAULT : 0;
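
Both this hunk and the uprobes change earlier replace __copy_from_user_inatomic() with __get_user() for a fixed-size fetch, presumably because __get_user() expands to a single correctly sized access. The surrounding pattern, peeking at user memory with page faults disabled, looks like this (a sketch restating the code above, not a new API):

    static int peek_user_u32(u32 __user *uaddr, u32 *val)
    {
            int ret;

            pagefault_disable();
            ret = __get_user(*val, uaddr);
            pagefault_enable();

            return ret ? -EFAULT : 0;
    }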
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index c92e44855ddd..1276aabaab55 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
+ depends on !COMPILE_TEST
depends on GCOV_KERNEL
depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index e25e92fb44fa..6a5c239c7669 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,7 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d234022805dc..432c3d71d195 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -117,7 +117,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- debug_show_held_locks(t);
+ debug_show_all_locks();
touch_nmi_watchdog();
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e95a3ce..1d3ee3169202 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000000..17f51d63da56
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,154 @@
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+
+static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+ int cpus_per_vec)
+{
+ const struct cpumask *siblmsk;
+ int cpu, sibl;
+
+ for ( ; cpus_per_vec > 0; ) {
+ cpu = cpumask_first(nmsk);
+
+ /* Should not happen, but I'm too lazy to think about it */
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ cpumask_clear_cpu(cpu, nmsk);
+ cpumask_set_cpu(cpu, irqmsk);
+ cpus_per_vec--;
+
+ /* If the cpu has siblings, use them first */
+ siblmsk = topology_sibling_cpumask(cpu);
+ for (sibl = -1; cpus_per_vec > 0; ) {
+ sibl = cpumask_next(sibl, siblmsk);
+ if (sibl >= nr_cpu_ids)
+ break;
+ if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+ continue;
+ cpumask_set_cpu(sibl, irqmsk);
+ cpus_per_vec--;
+ }
+ }
+}
+
+static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
+{
+ int n, nodes;
+
+ /* Calculate the number of nodes in the supplied affinity mask */
+ for (n = 0, nodes = 0; n < num_online_nodes(); n++) {
+ if (cpumask_intersects(mask, cpumask_of_node(n))) {
+ node_set(n, *nodemsk);
+ nodes++;
+ }
+ }
+ return nodes;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @affinity: The affinity mask to spread. If NULL cpu_online_mask
+ * is used
+ * @nvec:	The number of vectors
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity,
+ int nvec)
+{
+ int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0;
+ nodemask_t nodemsk = NODE_MASK_NONE;
+ struct cpumask *masks;
+ cpumask_var_t nmsk;
+
+ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+ return NULL;
+
+ masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto out;
+
+ /* Stabilize the cpumasks */
+ get_online_cpus();
+ /* If the supplied affinity mask is NULL, use cpu online mask */
+ if (!affinity)
+ affinity = cpu_online_mask;
+
+ nodes = get_nodes_in_cpumask(affinity, &nodemsk);
+
+ /*
+ * If the number of nodes in the mask is less than or equal to the
+ * number of vectors, we just spread the vectors across the nodes.
+ */
+ if (nvec <= nodes) {
+ for_each_node_mask(n, nodemsk) {
+ cpumask_copy(masks + curvec, cpumask_of_node(n));
+ if (++curvec == nvec)
+ break;
+ }
+ goto outonl;
+ }
+
+ /* Spread the vectors per node */
+ vecs_per_node = nvec / nodes;
+ /* Account for rounding errors */
+ extra_vecs = nvec - (nodes * vecs_per_node);
+
+ for_each_node_mask(n, nodemsk) {
+ int ncpus, v, vecs_to_assign = vecs_per_node;
+
+ /* Get the cpus on this node which are in the mask */
+ cpumask_and(nmsk, affinity, cpumask_of_node(n));
+
+ /* Calculate the number of cpus per vector */
+ ncpus = cpumask_weight(nmsk);
+
+ for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) {
+ cpus_per_vec = ncpus / vecs_to_assign;
+
+ /* Account for extra vectors to compensate rounding errors */
+ if (extra_vecs) {
+ cpus_per_vec++;
+ if (!--extra_vecs)
+ vecs_per_node++;
+ }
+ irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+ }
+
+ if (curvec >= nvec)
+ break;
+ }
+
+outonl:
+ put_online_cpus();
+out:
+ free_cpumask_var(nmsk);
+ return masks;
+}
+
+/**
+ * irq_calc_affinity_vectors - Calculate the optimal number of vectors for a given affinity mask
+ * @affinity: The affinity mask to spread. If NULL cpu_online_mask
+ * is used
+ * @maxvec: The maximum number of vectors available
+ */
+int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
+{
+ int cpus, ret;
+
+ /* Stabilize the cpumasks */
+ get_online_cpus();
+ /* If the supplied affinity mask is NULL, use cpu online mask */
+ if (!affinity)
+ affinity = cpu_online_mask;
+
+ cpus = cpumask_weight(affinity);
+ ret = (cpus < maxvec) ? cpus : maxvec;
+
+ put_online_cpus();
+ return ret;
+}
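
The node/vector arithmetic above is easiest to see with numbers: for nvec = 8 vectors spread over 2 NUMA nodes with 8 CPUs each, vecs_per_node = 8 / 2 = 4, extra_vecs = 0, and each vector gets cpus_per_vec = 8 / 4 = 2 CPUs, with hyperthread siblings consumed first by irq_spread_init_one(). A sketch of how a multiqueue driver might consume the result (the driver context and nvec value are assumptions; error handling trimmed):

    static int sketch_setup_queue_affinity(int nvec)
    {
            struct cpumask *masks = irq_create_affinity_masks(NULL, nvec);
            int i;

            if (!masks)
                    return -ENOMEM;

            for (i = 0; i < nvec; i++)
                    pr_info("vec%d -> %*pbl\n", i, cpumask_pr_args(&masks[i]));

            kfree(masks);
            return 0;
    }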
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2f9f2b0e79f2..be3c34e4f2ac 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
if (!desc)
return -EINVAL;
- type &= IRQ_TYPE_SENSE_MASK;
ret = __irq_set_trigger(desc, type);
irq_put_desc_busunlock(desc, flags);
return ret;
@@ -426,6 +425,49 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
+/**
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Untracked interrupts are sent from a demultiplexing interrupt
+ * handler when the demultiplexer does not know which device in its
+ * multiplexed irq domain generated the interrupt. IRQs handled
+ * through here are not subjected to stats tracking, randomness, or
+ * spurious interrupt detection.
+ *
+ * Note: Like handle_simple_irq, the caller is expected to handle
+ * the ack, clear, mask and unmask issues if necessary.
+ */
+void handle_untracked_irq(struct irq_desc *desc)
+{
+ unsigned int flags = 0;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ goto out_unlock;
+ }
+
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock(&desc->lock);
+
+ __handle_irq_event_percpu(desc, &flags);
+
+ raw_spin_lock(&desc->lock);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+out_unlock:
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_untracked_irq);
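+
+/*
+ * handle_untracked_irq() is meant to be installed as the flow handler for
+ * demultiplexed child interrupts whose origin the demux handler cannot
+ * attribute precisely, keeping them out of the stats, randomness and
+ * spurious-IRQ machinery. A minimal sketch of wiring it up (the chip and
+ * virq come from a hypothetical demux driver, not from this patch):
+ *
+ *	static void sketch_map_child_irq(unsigned int virq, struct irq_chip *chip)
+ *	{
+ *		irq_set_chip_and_handler(virq, chip, handle_untracked_irq);
+ *	}
+ */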
+
/*
* Called unconditionally from handle_level_irq() and only for oneshot
* interrupts from handle_fasteoi_irq()
@@ -713,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
- void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
@@ -722,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
- trace_irq_handler_entry(irq, action);
- res = action->handler(irq, dev_id);
- trace_irq_handler_exit(irq, action, res);
+ if (likely(action)) {
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+ } else {
+ unsigned int cpu = smp_processor_id();
+ bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
+
+ if (enabled)
+ irq_percpu_disable(desc, cpu);
+
+ pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n",
+ enabled ? " and unmasked" : "", irq, cpu);
+ }
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
}
-void
+static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
{
@@ -777,6 +829,21 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
+ unsigned int type = irqd_get_trigger_type(&desc->irq_data);
+
+ /*
+ * We're about to start this interrupt immediately,
+ * hence the need to set the trigger configuration.
+ * But the .set_type callback may have overridden the
+ * flow handler, ignoring that we're dealing with a
+ * chained interrupt. Reset it immediately because we
+ * do know better.
+ */
+ if (type != IRQ_TYPE_NONE) {
+ __irq_set_trigger(desc, type);
+ desc->handle_irq = handle;
+ }
+
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
@@ -1093,3 +1160,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+
+/**
+ * irq_chip_pm_get - Enable power for an IRQ chip
+ * @data: Pointer to interrupt specific data
+ *
+ * Enable the power to the IRQ chip referenced by the interrupt data
+ * structure.
+ */
+int irq_chip_pm_get(struct irq_data *data)
+{
+ int retval;
+
+ if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
+ retval = pm_runtime_get_sync(data->chip->parent_device);
+ if (retval < 0) {
+ pm_runtime_put_noidle(data->chip->parent_device);
+ return retval;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * irq_chip_pm_put - Disable power for an IRQ chip
+ * @data: Pointer to interrupt specific data
+ *
+ * Disable the power to the IRQ chip referenced by the interrupt data
+ * structure. Note that power will only be disabled once this
+ * function has been called for all IRQs that have called irq_chip_pm_get().
+ */
+int irq_chip_pm_put(struct irq_data *data)
+{
+ int retval = 0;
+
+ if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
+ retval = pm_runtime_put(data->chip->parent_device);
+
+ return (retval < 0) ? retval : 0;
+}
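
irq_chip_pm_get() and irq_chip_pm_put() bracket the period during which an irqchip that sits behind a runtime-PM parent device must be powered; the put side only drops the parent's usage count, so power actually goes away once every IRQ that called the get has called the put. A minimal pairing sketch (the calling context is an assumption):

    static int sketch_use_powered_chip(struct irq_data *d)
    {
            int ret = irq_chip_pm_get(d);   /* pm_runtime_get_sync() on the parent */

            if (ret)
                    return ret;

            /* ... set up and use the interrupt while the chip is powered ... */

            return irq_chip_pm_put(d);      /* balanced pm_runtime_put() */
    }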
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index abd286afbd27..ee32870079c9 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
/**
- * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
* @d: irq domain for which to allocate chips
- * @irqs_per_chip: Number of interrupts each chip handles
+ * @irqs_per_chip: Number of interrupts each chip handles (max 32)
* @num_ct: Number of irq_chip_type instances associated with this
* @name: Name of the irq chip
* @handler: Default flow handler associated with these chips
@@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
* @set: IRQ_* bits to set in the mapping function
* @gcflags: Generic chip specific setup flags
*/
-int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
- int num_ct, const char *name,
- irq_flow_handler_t handler,
- unsigned int clr, unsigned int set,
- enum irq_gc_flags gcflags)
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
@@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
d->name = name;
return 0;
}
-EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
+
+static struct irq_chip_generic *
+__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ int idx;
+
+ if (!dgc)
+ return ERR_PTR(-ENODEV);
+ idx = hw_irq / dgc->irqs_per_chip;
+ if (idx >= dgc->num_chips)
+ return ERR_PTR(-EINVAL);
+ return dgc->gc[idx];
+}
/**
* irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
@@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
struct irq_chip_generic *
irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
{
- struct irq_domain_chip_generic *dgc = d->gc;
- int idx;
+ struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq);
- if (!dgc)
- return NULL;
- idx = hw_irq / dgc->irqs_per_chip;
- if (idx >= dgc->num_chips)
- return NULL;
- return dgc->gc[idx];
+ return !IS_ERR(gc) ? gc : NULL;
}
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
@@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
unsigned long flags;
int idx;
- if (!d->gc)
- return -ENODEV;
-
- idx = hw_irq / dgc->irqs_per_chip;
- if (idx >= dgc->num_chips)
- return -EINVAL;
- gc = dgc->gc[idx];
+ gc = __irq_get_domain_generic_chip(d, hw_irq);
+ if (IS_ERR(gc))
+ return PTR_ERR(gc);
idx = hw_irq % dgc->irqs_per_chip;
@@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+
+static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int hw_irq = data->hwirq;
+ struct irq_chip_generic *gc;
+ int irq_idx;
+
+ gc = irq_get_domain_generic_chip(d, hw_irq);
+ if (!gc)
+ return;
+
+ irq_idx = hw_irq % dgc->irqs_per_chip;
+
+ clear_bit(irq_idx, &gc->installed);
+ irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+ NULL);
+
+}
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
+ .unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b5485b446..d3f24905852c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
{
irqreturn_t retval = IRQ_NONE;
- unsigned int flags = 0, irq = desc->irq_data.irq;
+ unsigned int irq = desc->irq_data.irq;
struct irqaction *action;
for_each_action_of_desc(desc, action) {
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
/* Fall through to add to randomness */
case IRQ_HANDLED:
- flags |= action->flags;
+ *flags |= action->flags;
break;
default:
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
retval |= res;
}
- add_interrupt_randomness(irq, flags);
+ return retval;
+}
+
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+{
+ irqreturn_t retval;
+ unsigned int flags = 0;
+
+ retval = __handle_irq_event_percpu(desc, &flags);
+
+ add_interrupt_randomness(desc->irq_data.irq, flags);
if (!noirqdebug)
note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903c6d..bc226e783bd2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -7,6 +7,7 @@
*/
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
+#include <linux/pm_runtime.h>
#ifdef CONFIG_SPARSE_IRQ
# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
@@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+extern bool irq_can_set_affinity_usr(unsigned int irq);
+
extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index c42742208e5e..1a9abc1c8ea0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain,
}
}
- virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+ virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc descs\n");
return -ENOMEM;
}
virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
- (void *) dest, true);
+ (void *) dest, true, NULL);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
@@ -125,7 +125,7 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
domain = data->domain;
if (WARN_ON(domain == NULL))
- return;
+ return -EINVAL;
if (!irq_domain_is_ipi(domain)) {
pr_warn("Trying to destroy a non IPI domain!\n");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8731e1c5d1e7..00bb0aeea1d0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -15,6 +15,7 @@
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
#include <linux/irqdomain.h>
+#include <linux/sysfs.h>
#include "internals.h"
@@ -68,9 +69,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
return 0;
}
-static void desc_smp_init(struct irq_desc *desc, int node)
+static void desc_smp_init(struct irq_desc *desc, int node,
+ const struct cpumask *affinity)
{
- cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
+ if (!affinity)
+ affinity = irq_default_affinity;
+ cpumask_copy(desc->irq_common_data.affinity, affinity);
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
@@ -82,11 +87,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)
#else
static inline int
alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
-static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline void
+desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
int cpu;
@@ -107,7 +113,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->owner = owner;
for_each_possible_cpu(cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
- desc_smp_init(desc, node);
+ desc_smp_init(desc, node, affinity);
}
int nr_irqs = NR_IRQS;
@@ -118,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
#ifdef CONFIG_SPARSE_IRQ
+static void irq_kobj_release(struct kobject *kobj);
+
+#ifdef CONFIG_SYSFS
+static struct kobject *irq_kobj_base;
+
+#define IRQ_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+static ssize_t per_cpu_count_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ int cpu, irq = desc->irq_data.irq;
+ ssize_t ret = 0;
+ char *p = "";
+
+ for_each_possible_cpu(cpu) {
+ unsigned int c = kstat_irqs_cpu(irq, cpu);
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
+ p = ",";
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+IRQ_ATTR_RO(per_cpu_count);
+
+static ssize_t chip_name_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->irq_data.chip && desc->irq_data.chip->name) {
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n",
+ desc->irq_data.chip->name);
+ }
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+}
+IRQ_ATTR_RO(chip_name);
+
+static ssize_t hwirq_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->irq_data.domain)
+ ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+}
+IRQ_ATTR_RO(hwirq);
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ ret = sprintf(buf, "%s\n",
+ irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+
+}
+IRQ_ATTR_RO(type);
+
+static ssize_t name_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->name)
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+}
+IRQ_ATTR_RO(name);
+
+static ssize_t actions_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ struct irqaction *action;
+ ssize_t ret = 0;
+ char *p = "";
+
+ raw_spin_lock_irq(&desc->lock);
+ for (action = desc->action; action != NULL; action = action->next) {
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ p, action->name);
+ p = ",";
+ }
+ raw_spin_unlock_irq(&desc->lock);
+
+ if (ret)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+
+ return ret;
+}
+IRQ_ATTR_RO(actions);
+
+static struct attribute *irq_attrs[] = {
+ &per_cpu_count_attr.attr,
+ &chip_name_attr.attr,
+ &hwirq_attr.attr,
+ &type_attr.attr,
+ &name_attr.attr,
+ &actions_attr.attr,
+ NULL
+};
+
+static struct kobj_type irq_kobj_type = {
+ .release = irq_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = irq_attrs,
+};
+
+static void irq_sysfs_add(int irq, struct irq_desc *desc)
+{
+ if (irq_kobj_base) {
+ /*
+ * Continue even in case of failure as this is nothing
+ * crucial.
+ */
+ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
+ pr_warn("Failed to add kobject for irq %d\n", irq);
+ }
+}
+
+static int __init irq_sysfs_init(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ /* Prevent concurrent irq alloc/free */
+ irq_lock_sparse();
+
+ irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
+ if (!irq_kobj_base) {
+ irq_unlock_sparse();
+ return -ENOMEM;
+ }
+
+ /* Add the already allocated interrupts */
+ for_each_irq_desc(irq, desc)
+ irq_sysfs_add(irq, desc);
+ irq_unlock_sparse();
+
+ return 0;
+}
+postcore_initcall(irq_sysfs_init);
+
+#else /* !CONFIG_SYSFS */
+
+static struct kobj_type irq_kobj_type = {
+ .release = irq_kobj_release,
+};
+
+static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
+
+#endif /* CONFIG_SYSFS */
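
The kobjects registered above show up as /sys/kernel/irq/<N>/ with the per_cpu_count, chip_name, hwirq, type, name and actions attributes. A small userspace sketch that dumps them for one interrupt (the IRQ number is illustrative):

    #include <stdio.h>

    int main(void)
    {
            const char *attrs[] = { "per_cpu_count", "chip_name", "hwirq",
                                    "type", "name", "actions" };
            char path[64], line[256];

            for (unsigned int i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                    FILE *f;

                    snprintf(path, sizeof(path), "/sys/kernel/irq/16/%s", attrs[i]);
                    f = fopen(path, "r");
                    if (!f)
                            continue;       /* attribute may be empty or absent */
                    if (fgets(line, sizeof(line), f))
                            printf("%s: %s", attrs[i], line);
                    fclose(f);
            }

            return 0;
    }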
+
static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
@@ -158,7 +339,9 @@ void irq_unlock_sparse(void)
mutex_unlock(&sparse_irq_lock);
}
-static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
+static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
{
struct irq_desc *desc;
gfp_t gfp = GFP_KERNEL;
@@ -178,7 +361,9 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_rcu_head(&desc->rcu);
- desc_set_defaults(irq, desc, node, owner);
+ desc_set_defaults(irq, desc, node, affinity, owner);
+ irqd_set(&desc->irq_data, flags);
+ kobject_init(&desc->kobj, &irq_kobj_type);
return desc;
@@ -189,15 +374,22 @@ err_desc:
return NULL;
}
-static void delayed_free_desc(struct rcu_head *rhp)
+static void irq_kobj_release(struct kobject *kobj)
{
- struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
free_masks(desc);
free_percpu(desc->kstat_irqs);
kfree(desc);
}
+static void delayed_free_desc(struct rcu_head *rhp)
+{
+ struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+
+ kobject_put(&desc->kobj);
+}
+
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -209,8 +401,12 @@ static void free_desc(unsigned int irq)
* kstat_irq_usr(). Once we deleted the descriptor from the
* sparse tree we can free it. Access in proc will fail to
* lookup the descriptor.
+ *
+ * The sysfs entry must be serialized against a concurrent
+ * irq_sysfs_init() as well.
*/
mutex_lock(&sparse_irq_lock);
+ kobject_del(&desc->kobj);
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
@@ -223,17 +419,36 @@ static void free_desc(unsigned int irq)
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
+ const struct cpumask *mask = NULL;
struct irq_desc *desc;
+ unsigned int flags;
int i;
+ /* Validate affinity mask(s) */
+ if (affinity) {
+ for (i = 0, mask = affinity; i < cnt; i++, mask++) {
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ }
+
+ flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+ mask = NULL;
+
for (i = 0; i < cnt; i++) {
- desc = alloc_desc(start + i, node, owner);
+ if (affinity) {
+ node = cpu_to_node(cpumask_first(affinity));
+ mask = affinity;
+ affinity++;
+ }
+ desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
+ irq_sysfs_add(start + i, desc);
mutex_unlock(&sparse_irq_lock);
}
return start;
@@ -277,7 +492,7 @@ int __init early_irq_init(void)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
- desc = alloc_desc(i, node, NULL);
+ desc = alloc_desc(i, node, 0, NULL, NULL);
set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
@@ -311,7 +526,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], GFP_KERNEL, node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- desc_set_defaults(i, &desc[i], node, NULL);
+ desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
}
@@ -328,11 +543,12 @@ static void free_desc(unsigned int irq)
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ const struct cpumask *affinity,
struct module *owner)
{
u32 i;
@@ -453,12 +669,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
* @cnt: Number of consecutive irqs to allocate.
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an affinity mask array of size @cnt which
+ * hints where the irq descriptors should be allocated and which
+ * default affinities to use
*
* Returns the first irq number or error code
*/
int __ref
__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner)
+ struct module *owner, const struct cpumask *affinity)
{
int start, ret;
@@ -494,7 +713,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
bitmap_set(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, owner);
+ return alloc_descs(start, cnt, node, affinity, owner);
err:
mutex_unlock(&sparse_irq_lock);
@@ -512,7 +731,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
*/
unsigned int irq_alloc_hwirqs(int cnt, int node)
{
- int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
+ int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
if (irq < 0)
return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 503c5b9dd030..8c0a0ae43521 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
- * @of_node: optional device-tree node of the interrupt controller
+ * @fwnode: firmware node for the interrupt controller
* @size: Size of linear map; 0 for radix mapping only
* @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
@@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
const struct irq_domain_ops *ops,
void *host_data)
{
+ struct device_node *of_node = to_of_node(fwnode);
struct irq_domain *domain;
- struct device_node *of_node;
-
- of_node = to_of_node(fwnode);
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
@@ -139,12 +137,7 @@ void irq_domain_remove(struct irq_domain *domain)
{
mutex_lock(&irq_domain_mutex);
- /*
- * radix_tree_delete() takes care of destroying the root
- * node when all entries are removed. Shout if there are
- * any mappings left.
- */
- WARN_ON(domain->revmap_tree.height);
+ WARN_ON(!radix_tree_empty(&domain->revmap_tree));
list_del(&domain->link);
@@ -486,7 +479,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -572,6 +565,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
+ struct irq_data *irq_data;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int virq;
@@ -593,15 +587,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (irq_domain_translate(domain, fwspec, &hwirq, &type))
return 0;
- if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * WARN if the irqchip returns a type with bits
+ * outside the sense mask set, and clear those bits.
+ */
+ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
+ type &= IRQ_TYPE_SENSE_MASK;
+
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
/*
- * If we've already configured this interrupt,
- * don't do it again, or hell will break loose.
+ * If the trigger type is not specified or matches the
+ * current trigger type then we are done so return the
+ * interrupt number.
*/
- virq = irq_find_mapping(domain, hwirq);
- if (virq)
+ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
+ return virq;
+
+ /*
+ * If the trigger type has not been set yet, then set
+ * it now and return the interrupt number.
+ */
+ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data)
+ return 0;
+
+ irqd_set_trigger_type(irq_data, type);
return virq;
+ }
+ pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
+ hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
+ return 0;
+ }
+
+ if (irq_domain_is_hierarchy(domain)) {
virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
if (virq <= 0)
return 0;
@@ -612,10 +637,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
return virq;
}
- /* Set type if specified and different than the current one */
- if (type != IRQ_TYPE_NONE &&
- type != irq_get_trigger_type(virq))
- irq_set_irq_type(virq, type);
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data) {
+ if (irq_domain_is_hierarchy(domain))
+ irq_domain_free_irqs(virq, 1);
+ else
+ irq_dispose_mapping(virq);
+ return 0;
+ }
+
+ /* Store trigger type */
+ irqd_set_trigger_type(irq_data, type);
+
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
@@ -645,8 +678,12 @@ void irq_dispose_mapping(unsigned int virq)
if (WARN_ON(domain == NULL))
return;
- irq_domain_disassociate(domain, virq);
- irq_free_desc(virq);
+ if (irq_domain_is_hierarchy(domain)) {
+ irq_domain_free_irqs(virq, 1);
+ } else {
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+ }
}
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -829,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
if (WARN_ON(intsize < 1))
return -EINVAL;
*out_hwirq = intspec[0];
- *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
+ if (intsize > 1)
+ *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ else
+ *out_type = IRQ_TYPE_NONE;
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
@@ -840,19 +880,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
- int node)
+ int node, const struct cpumask *affinity)
{
unsigned int hint;
if (virq >= 0) {
- virq = irq_alloc_descs(virq, virq, cnt, node);
+ virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
+ affinity);
} else {
hint = hwirq % nr_irqs;
if (hint == 0)
hint++;
- virq = irq_alloc_descs_from(hint, cnt, node);
- if (virq <= 0 && hint > 1)
- virq = irq_alloc_descs_from(1, cnt, node);
+ virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
+ affinity);
+ if (virq <= 0 && hint > 1) {
+ virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
+ affinity);
+ }
}
return virq;
@@ -1100,6 +1144,7 @@ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
}
irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common);
/**
* irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
@@ -1148,8 +1193,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
if (recursive)
ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
nr_irqs, arg);
- if (ret >= 0)
- ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
if (ret < 0 && recursive)
irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
@@ -1164,6 +1211,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
* @node: NUMA node id for memory allocation
* @arg: domain specific argument
* @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
*
* Allocate IRQ numbers and initialize all data structures to support
* hierarchy IRQ domains.
@@ -1179,7 +1227,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
*/
int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
unsigned int nr_irqs, int node, void *arg,
- bool realloc)
+ bool realloc, const struct cpumask *affinity)
{
int i, ret, virq;
@@ -1197,7 +1245,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
- virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
+ affinity);
if (virq < 0) {
pr_debug("cannot allocate IRQ(base %d, count %d)\n",
irq_base, nr_irqs);
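
The reworked irq_create_fwspec_mapping() above reuses an existing mapping only when the requested trigger type is compatible with the one already recorded. A minimal user-space sketch of that decision, assuming simplified stand-ins for the IRQ_TYPE_* values and for the descriptor lookup helpers (not the kernel definitions):

#include <stdio.h>

enum { IRQ_TYPE_NONE = 0, IRQ_TYPE_EDGE_RISING = 1, IRQ_TYPE_LEVEL_HIGH = 4 };

/* Decide what to do with an existing mapping 'virq' whose current
 * trigger is 'cur' when a new request asks for 'want'; returns the
 * virq to reuse, or 0 on a conflicting trigger type. */
static int reuse_mapping(int virq, int cur, int want)
{
	if (want == IRQ_TYPE_NONE || want == cur)
		return virq;	/* nothing to change, reuse as-is */
	if (cur == IRQ_TYPE_NONE)
		return virq;	/* first real user sets the trigger */
	return 0;		/* mismatch: refuse, as the pr_warn() path does */
}

int main(void)
{
	printf("%d\n", reuse_mapping(38, IRQ_TYPE_NONE, IRQ_TYPE_LEVEL_HIGH));		/* 38 */
	printf("%d\n", reuse_mapping(38, IRQ_TYPE_EDGE_RISING, IRQ_TYPE_LEVEL_HIGH));	/* 0 */
	return 0;
}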
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3a70..0c5f1a5db654 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
-static int __irq_can_set_affinity(struct irq_desc *desc)
+static bool __irq_can_set_affinity(struct irq_desc *desc)
{
if (!desc || !irqd_can_balance(&desc->irq_data) ||
!desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
- return 1;
+ return false;
+ return true;
}
/**
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)
}
/**
+ * irq_can_set_affinity_usr - Check if the affinity of an irq can be set from user space
+ * @irq: Interrupt to check
+ *
+ * Like irq_can_set_affinity() above, but additionally checks for the
+ * AFFINITY_MANAGED flag.
+ */
+bool irq_can_set_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return __irq_can_set_affinity(desc) &&
+ !irqd_affinity_is_managed(&desc->irq_data);
+}
+
+/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
* @desc: irq descriptor which has affinity changed
*
@@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
return 0;
/*
- * Preserve an userspace affinity setup, but make sure that
- * one of the targets is online.
+ * Preserve the managed affinity setting and a userspace affinity
+ * setup, but make sure that one of the targets is online.
*/
- if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (irqd_affinity_is_managed(&desc->irq_data) ||
+ irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
if (cpumask_intersects(desc->irq_common_data.affinity,
cpu_online_mask))
set = desc->irq_common_data.affinity;
@@ -653,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
return 0;
}
- flags &= IRQ_TYPE_SENSE_MASK;
-
if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
if (!irqd_irq_masked(&desc->irq_data))
mask_irq(desc);
@@ -662,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
unmask = 1;
}
- /* caller masked out all except trigger mode flags */
+ /* Mask all flags except trigger mode */
+ flags &= IRQ_TYPE_SENSE_MASK;
ret = chip->irq_set_type(&desc->irq_data, flags);
switch (ret) {
@@ -1117,6 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
/*
+ * If the trigger type is not specified by the caller,
+ * then use the default for this interrupt.
+ */
+ if (!(new->flags & IRQF_TRIGGER_MASK))
+ new->flags |= irqd_get_trigger_type(&desc->irq_data);
+
+ /*
* Check whether the interrupt nests into another interrupt
* thread.
*/
@@ -1409,10 +1431,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return -EINVAL;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
+
return retval;
}
EXPORT_SYMBOL_GPL(setup_irq);
@@ -1506,6 +1536,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
}
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
kfree(action->secondary);
return action;
@@ -1648,11 +1679,18 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0) {
+ kfree(action);
+ return retval;
+ }
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action->secondary);
kfree(action);
}
@@ -1730,7 +1768,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
if (!desc)
return;
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
if (type != IRQ_TYPE_NONE) {
int ret;
@@ -1822,6 +1867,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
unregister_handler_proc(irq, action);
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
@@ -1884,10 +1930,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
+
return retval;
}
@@ -1931,12 +1985,20 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->percpu_dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0) {
+ kfree(action);
+ return retval;
+ }
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
- if (retval)
+ if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action);
+ }
return retval;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 38e89ce7b071..8a3e872798f3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,20 +18,42 @@
/* Temporary solution for building, will be removed later */
#include <linux/pci.h>
-struct msi_desc *alloc_msi_entry(struct device *dev)
+/**
+ * alloc_msi_entry - Allocate and initialize an msi_entry
+ * @dev: Pointer to the device for which this is allocated
+ * @nvec: The number of vectors used in this entry
+ * @affinity: Optional pointer to an affinity mask array of size @nvec
+ *
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks from @affinity are copied.
+ */
+struct msi_desc *
+alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
{
- struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ struct msi_desc *desc;
+
+ desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
+ desc->nvec_used = nvec;
+ if (affinity) {
+ desc->affinity = kmemdup(affinity,
+ nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ if (!desc->affinity) {
+ kfree(desc);
+ return NULL;
+ }
+ }
return desc;
}
void free_msi_entry(struct msi_desc *entry)
{
+ kfree(entry->affinity);
kfree(entry);
}
@@ -324,7 +346,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq = -1;
+ int i, ret, virq;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
@@ -332,13 +354,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
- dev_to_node(dev), &arg, false);
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
+ dev_to_node(dev), &arg, false,
+ desc->affinity);
if (virq < 0) {
ret = -ENOSPC;
if (ops->handle_error)
@@ -356,11 +375,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
ops->msi_finish(&arg, 0);
for_each_msi_entry(desc, dev) {
+ virq = desc->irq;
if (desc->nvec_used == 1)
dev_dbg(dev, "irq %d for MSI\n", virq);
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
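
alloc_msi_entry() now takes the vector count and an optional affinity array up front, duplicating the masks with kmemdup() and unwinding if the copy fails. A user-space sketch of that allocate-duplicate-rollback shape, with a simplified descriptor type and plain mask words standing in for struct cpumask (all names here are illustrative):

#include <stdlib.h>
#include <string.h>

struct fake_msi_desc {
	int nvec_used;
	unsigned long *affinity;	/* one mask word per vector (simplified) */
};

static struct fake_msi_desc *alloc_entry(int nvec, const unsigned long *affinity)
{
	struct fake_msi_desc *desc = calloc(1, sizeof(*desc));

	if (!desc)
		return NULL;
	desc->nvec_used = nvec;
	if (affinity) {
		/* mirrors the kmemdup() of the caller's affinity array */
		desc->affinity = malloc(nvec * sizeof(*desc->affinity));
		if (!desc->affinity) {
			free(desc);	/* roll back the partial allocation */
			return NULL;
		}
		memcpy(desc->affinity, affinity, nvec * sizeof(*desc->affinity));
	}
	return desc;
}

static void free_entry(struct fake_msi_desc *desc)
{
	free(desc->affinity);		/* free(NULL) is a no-op, like kfree() */
	free(desc);
}

int main(void)
{
	unsigned long masks[2] = { 0x1, 0x2 };
	struct fake_msi_desc *desc = alloc_entry(2, masks);

	if (desc)
		free_entry(desc);
	return 0;
}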
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726818..feaa813b84a9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
cpumask_var_t new_value;
int err;
- if (!irq_can_set_affinity(irq) || no_irq_affinity)
+ if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
!name_unique(irq, action))
return;
- memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
if (desc->dir)
goto out_unlock;
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#endif
remove_proc_entry("spurious", desc->dir);
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%u", irq);
remove_proc_entry(name, root_irq_dir);
}
@@ -421,12 +418,8 @@ void init_irq_proc(void)
/*
* Create entries for all existing IRQs.
*/
- for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
+ for_each_irq_desc(irq, desc)
register_irq_proc(irq, desc);
- }
}
#ifdef CONFIG_GENERIC_IRQ_SHOW
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..93ad6c1fb9b6 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
+#include <linux/bug.h>
#ifdef HAVE_JUMP_LABEL
@@ -56,15 +57,81 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
static void jump_label_update(struct static_key *key);
+/*
+ * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * The use of 'atomic_read()' requires atomic.h and it's problematic for some
+ * kernel headers such as kernel.h and others. Since static_key_count() is not
+ * used in the branch statements as it is for the !HAVE_JUMP_LABEL case, it's OK
+ * to have it be a function here. Similarly, for 'static_key_enable()' and
+ * 'static_key_disable()', which require bug.h. This should allow jump_label.h
+ * to be included from most/all places for HAVE_JUMP_LABEL.
+ */
+int static_key_count(struct static_key *key)
+{
+ /*
+ * -1 means the first static_key_slow_inc() is in progress.
+ * static_key_enabled() must return true, so return 1 here.
+ */
+ int n = atomic_read(&key->enabled);
+
+ return n >= 0 ? n : 1;
+}
+EXPORT_SYMBOL_GPL(static_key_count);
+
+void static_key_enable(struct static_key *key)
+{
+ int count = static_key_count(key);
+
+ WARN_ON_ONCE(count < 0 || count > 1);
+
+ if (!count)
+ static_key_slow_inc(key);
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable(struct static_key *key)
+{
+ int count = static_key_count(key);
+
+ WARN_ON_ONCE(count < 0 || count > 1);
+
+ if (count)
+ static_key_slow_dec(key);
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
void static_key_slow_inc(struct static_key *key)
{
+ int v, v1;
+
STATIC_KEY_CHECK_USE();
- if (atomic_inc_not_zero(&key->enabled))
- return;
+
+ /*
+ * Careful if we get concurrent static_key_slow_inc() calls;
+ * later calls must wait for the first one to _finish_ the
+ * jump_label_update() process. At the same time, however,
+ * the jump_label_update() call below wants to see
+ * static_key_enabled(&key) for jumps to be updated properly.
+ *
+ * So give a special meaning to negative key->enabled: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ for (v = atomic_read(&key->enabled); v > 0; v = v1) {
+ v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
+ if (likely(v1 == v))
+ return;
+ }
jump_label_lock();
- if (atomic_inc_return(&key->enabled) == 1)
+ if (atomic_read(&key->enabled) == 0) {
+ atomic_set(&key->enabled, -1);
jump_label_update(key);
+ atomic_set(&key->enabled, 1);
+ } else {
+ atomic_inc(&key->enabled);
+ }
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +139,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
static void __static_key_slow_dec(struct static_key *key,
unsigned long rate_limit, struct delayed_work *work)
{
+ /*
+ * The negative count check is valid even when a negative
+ * key->enabled is in use by static_key_slow_inc(); a
+ * __static_key_slow_dec() before the first static_key_slow_inc()
+ * returns is unbalanced, because all other static_key_slow_inc()
+ * instances block while the update is in progress.
+ */
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
@@ -205,6 +279,18 @@ void __init jump_label_init(void)
struct static_key *key = NULL;
struct jump_entry *iter;
+ /*
+ * Since we are initializing the static_key.enabled field with
+ * the 'raw' int values (to avoid pulling in atomic.h) in
+ * jump_label.h, let's make sure that is safe. There are only two
+ * cases to check since we initialize to 0 or 1.
+ */
+ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
+ BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
+
+ if (static_key_initialized)
+ return;
+
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
@@ -254,11 +340,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
+ preempt_disable();
mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ preempt_enable();
+
if (!mod)
return 0;
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
return __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
@@ -422,7 +511,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
return notifier_from_errno(ret);
}
-struct notifier_block jump_label_module_nb = {
+static struct notifier_block jump_label_module_nb = {
.notifier_call = jump_label_module_notify,
.priority = 1, /* higher than tracepoints */
};
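
The new static_key_slow_inc() fast path only increments strictly positive counts, so that 0 (disabled) and the transient -1 (first enable in progress) both fall through to the mutex-protected slow path. A user-space sketch of that cmpxchg loop using C11 atomics in place of the kernel's atomic_t (the function and variable names are stand-ins):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int enabled;

/* Returns true if the lockless fast path took a reference, false if
 * the caller must fall back to the slow, lock-protected path. */
static bool fast_inc(void)
{
	int v = atomic_load(&enabled);

	while (v > 0) {
		/* on failure, v is reloaded with the current value */
		if (atomic_compare_exchange_weak(&enabled, &v, v + 1))
			return true;
	}
	return false;	/* v was 0 or negative: take the slow path */
}

int main(void)
{
	printf("%d\n", fast_inc());			/* 0: count is 0 */
	atomic_store(&enabled, 1);
	printf("%d\n", fast_inc());			/* 1: fast path taken */
	printf("%d\n", atomic_load(&enabled));		/* 2 */
	return 0;
}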
diff --git a/kernel/kcov.c b/kernel/kcov.c
index a02f2dddd1d7..8d44b3fea9d0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {
static int __init kcov_init(void)
{
- if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+ /*
+ * The kcov debugfs file won't ever get removed and thus,
+ * there is no need to protect it against removal races. The
+ * use of debugfs_create_file_unsafe() is actually safe here.
+ */
+ if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
pr_err("failed to create kcov in debugfs\n");
return -ENOMEM;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ee70aef5cd81..980936a90ee6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
if (kexec_on_panic) {
/* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ if ((entry < phys_to_boot_phys(crashk_res.start)) ||
+ (entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
@@ -103,6 +104,65 @@ out_free_image:
return ret;
}
+static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
+ struct kexec_segment __user *segments, unsigned long flags)
+{
+ struct kimage **dest_image, *image;
+ unsigned long i;
+ int ret;
+
+ if (flags & KEXEC_ON_CRASH) {
+ dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ } else {
+ dest_image = &kexec_image;
+ }
+
+ if (nr_segments == 0) {
+ /* Uninstall image */
+ kimage_free(xchg(dest_image, NULL));
+ return 0;
+ }
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
+ * we corrupt it.
+ */
+ kimage_free(xchg(&kexec_crash_image, NULL));
+ }
+
+ ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
+ if (ret)
+ return ret;
+
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < nr_segments; i++) {
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /* Install the new kernel and uninstall the old */
+ image = xchg(dest_image, image);
+
+out:
+ if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
+ kimage_free(image);
+ return ret;
+}
+
/*
* Exec Kernel system call: for obvious reasons only root may call it.
*
@@ -127,7 +187,6 @@ out_free_image:
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
{
- struct kimage **dest_image, *image;
int result;
/* We only trust the superuser with rebooting the system. */
@@ -152,9 +211,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (nr_segments > KEXEC_SEGMENT_MAX)
return -EINVAL;
- image = NULL;
- result = 0;
-
/* Because we write directly to the reserved memory
* region when loading crash kernels we need a mutex here to
* prevent multiple crash kernels from attempting to load
@@ -166,53 +222,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH)
- dest_image = &kexec_crash_image;
- if (nr_segments > 0) {
- unsigned long i;
-
- if (flags & KEXEC_ON_CRASH) {
- /*
- * Loading another kernel to switch to if this one
- * crashes. Free any current crash dump kernel before
- * we corrupt it.
- */
-
- kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- crash_map_reserved_pages();
- } else {
- /* Loading another kernel to reboot into. */
-
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- }
- if (result)
- goto out;
-
- if (flags & KEXEC_PRESERVE_CONTEXT)
- image->preserve_context = 1;
- result = machine_kexec_prepare(image);
- if (result)
- goto out;
-
- for (i = 0; i < nr_segments; i++) {
- result = kimage_load_segment(image, &image->segment[i]);
- if (result)
- goto out;
- }
- kimage_terminate(image);
- if (flags & KEXEC_ON_CRASH)
- crash_unmap_reserved_pages();
- }
- /* Install the new kernel, and Uninstall the old */
- image = xchg(dest_image, image);
+ result = do_kexec_load(entry, nr_segments, segments, flags);
-out:
mutex_unlock(&kexec_mutex);
- kimage_free(image);
return result;
}
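
do_kexec_load() now brackets a crash-kernel (re)load with arch_kexec_unprotect_crashkres()/arch_kexec_protect_crashkres(), and the same bracket appears in kexec_file_load() further below. A toy model of that protect/unprotect discipline, with stub functions standing in for the arch hooks and a flag standing in for kexec_crash_image:

#include <stdio.h>

static int crash_image_loaded;

static void unprotect(void) { puts("crashkres: unprotected"); }
static void protect(void)   { puts("crashkres: protected"); }

/* The reserved region is writable only while a crash image is being
 * (re)loaded, and is re-protected on every exit path as long as an
 * image is installed. */
static int load(int on_crash, int ok)
{
	if (on_crash && crash_image_loaded)
		unprotect();

	if (ok && on_crash)
		crash_image_loaded = 1;	/* pretend the load succeeded */

	if (on_crash && crash_image_loaded)
		protect();
	return ok ? 0 : -1;
}

int main(void)
{
	load(1, 1);	/* first load: nothing to unprotect, protect afterwards */
	load(1, 1);	/* reload: unprotect, replace, protect again */
	return 0;
}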
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3ee3b86..561675589511 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p)
return 0;
}
+int kexec_crash_loaded(void)
+{
+ return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
/*
* When kexec transitions to the new kernel there is a one-to-one
* mapping between physical and virtual addresses. On processors
@@ -140,6 +146,7 @@ int kexec_should_crash(struct task_struct *p)
* allocating pages whose destination address we do not care about.
*/
#define KIMAGE_NO_DEST (-1UL)
+#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
@@ -147,8 +154,9 @@ static struct page *kimage_alloc_page(struct kimage *image,
int sanity_check_segment_list(struct kimage *image)
{
- int result, i;
+ int i;
unsigned long nr_segments = image->nr_segments;
+ unsigned long total_pages = 0;
/*
* Verify we have good destination addresses. The caller is
@@ -163,16 +171,17 @@ int sanity_check_segment_list(struct kimage *image)
* simply because addresses are changed to page size
* granularity.
*/
- result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
+ if (mstart > mend)
+ return -EADDRNOTAVAIL;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
+ return -EADDRNOTAVAIL;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
+ return -EADDRNOTAVAIL;
}
/* Verify our destination addresses do not overlap.
@@ -180,7 +189,6 @@ int sanity_check_segment_list(struct kimage *image)
* through very weird things can happen with no
* easy explanation as one segment stops on another.
*/
- result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
unsigned long j;
@@ -194,7 +202,7 @@ int sanity_check_segment_list(struct kimage *image)
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- return result;
+ return -EINVAL;
}
}
@@ -203,12 +211,26 @@ int sanity_check_segment_list(struct kimage *image)
* and it is easier to check up front than to be surprised
* later on.
*/
- result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
+ return -EINVAL;
+ }
+
+ /*
+ * Verify that no more than half of memory will be consumed. If the
+ * request from userspace is too large, a large amount of time will be
+ * wasted allocating pages, which can cause a soft lockup.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+ return -EINVAL;
+
+ total_pages += PAGE_COUNT(image->segment[i].memsz);
}
+ if (total_pages > totalram_pages / 2)
+ return -EINVAL;
+
/*
* Verify we have good destination addresses. Normally
* the caller is responsible for making certain we don't
@@ -220,16 +242,15 @@ int sanity_check_segment_list(struct kimage *image)
*/
if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
+ if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
+ (mend > phys_to_boot_phys(crashk_res.end)))
+ return -EADDRNOTAVAIL;
}
}
@@ -352,7 +373,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
if (!pages)
break;
- pfn = page_to_pfn(pages);
+ pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
eaddr = epfn << PAGE_SHIFT;
@@ -478,7 +499,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
return -ENOMEM;
ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
image->entry = ind_page;
image->last_entry = ind_page +
((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -533,13 +554,13 @@ void kimage_terminate(struct kimage *image)
#define for_each_kimage_entry(image, ptr, entry) \
for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
static void kimage_free_entry(kimage_entry_t entry)
{
struct page *page;
- page = pfn_to_page(entry >> PAGE_SHIFT);
+ page = boot_pfn_to_page(entry >> PAGE_SHIFT);
kimage_free_pages(page);
}
@@ -633,7 +654,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
* have a match.
*/
list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
if (addr == destination) {
list_del(&page->lru);
return page;
@@ -648,12 +669,12 @@ static struct page *kimage_alloc_page(struct kimage *image,
if (!page)
return NULL;
/* If the page cannot be used file it away */
- if (page_to_pfn(page) >
+ if (page_to_boot_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
list_add(&page->lru, &image->unusable_pages);
continue;
}
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
/* If it is the destination page we want use it */
if (addr == destination)
@@ -676,7 +697,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
struct page *old_page;
old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
copy_highpage(page, old_page);
*old = addr | (*old & ~PAGE_MASK);
@@ -732,7 +753,7 @@ static int kimage_load_normal_segment(struct kimage *image,
result = -ENOMEM;
goto out;
}
- result = kimage_add_page(image, page_to_pfn(page)
+ result = kimage_add_page(image, page_to_boot_pfn(page)
<< PAGE_SHIFT);
if (result < 0)
goto out;
@@ -793,7 +814,7 @@ static int kimage_load_crash_segment(struct kimage *image,
char *ptr;
size_t uchunk, mchunk;
- page = pfn_to_page(maddr >> PAGE_SHIFT);
+ page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
if (!page) {
result = -ENOMEM;
goto out;
@@ -893,6 +914,7 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
+ printk_nmi_flush_on_panic();
__crash_kexec(regs);
/*
@@ -920,7 +942,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+ free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -953,7 +975,6 @@ int crash_shrink_memory(unsigned long new_size)
start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
- crash_map_reserved_pages();
crash_free_reserved_phys_range(end, crashk_res.end);
if ((start == end) && (crashk_res.parent != NULL))
@@ -967,7 +988,6 @@ int crash_shrink_memory(unsigned long new_size)
crashk_res.end = end - 1;
insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
unlock:
mutex_unlock(&kexec_mutex);
@@ -1375,7 +1395,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
void __weak arch_crash_save_vmcoreinfo(void)
{}
-unsigned long __weak paddr_vmcoreinfo_note(void)
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa((unsigned long)(char *)&vmcoreinfo_note);
}
@@ -1410,7 +1430,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_STRUCT_SIZE(list_head);
VMCOREINFO_SIZE(nodemask_t);
VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, _refcount);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
@@ -1552,13 +1572,14 @@ int kernel_kexec(void)
}
/*
- * Add and remove page tables for crashkernel memory
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
*
* Provide an empty default implementation here -- architecture
* code may override this
*/
-void __weak crash_map_reserved_pages(void)
+void __weak arch_kexec_protect_crashkres(void)
{}
-void __weak crash_unmap_reserved_pages(void)
+void __weak arch_kexec_unprotect_crashkres(void)
{}
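
sanity_check_segment_list() now rejects requests whose segments would consume more than half of RAM, both per segment and in total, to avoid soft lockups from pathological allocations. A stand-alone version of that check with made-up sizes (PAGE_COUNT() matches the macro added above; the RAM size is an example value):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12
#define PAGE_COUNT(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

static int check_segments(const unsigned long *memsz, int n,
			  unsigned long totalram_pages)
{
	unsigned long total_pages = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (PAGE_COUNT(memsz[i]) > totalram_pages / 2)
			return -1;		/* one segment alone is too large */
		total_pages += PAGE_COUNT(memsz[i]);
	}
	if (total_pages > totalram_pages / 2)
		return -1;			/* the sum is too large */
	return 0;
}

int main(void)
{
	unsigned long small[] = { 8UL << 20, 16UL << 20 };	/* 24 MB total */
	unsigned long big[]   = { 200UL << 20 };		/* one 200 MB segment */
	unsigned long ram_pages = 65536;			/* 256 MB of RAM */

	printf("%d\n", check_segments(small, 2, ram_pages));	/* 0  */
	printf("%d\n", check_segments(big, 1, ram_pages));	/* -1 */
	return 0;
}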
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c72d2ff5896e..037c321c5618 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
return -EBUSY;
dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
+ if (flags & KEXEC_FILE_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }
if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+ if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);
return ret;
@@ -881,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
return 0;
out:
vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
return ret;
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 152da4a48867..ee1bc1bb8feb 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", !!kexec_crash_image);
+ return sprintf(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
@@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size);
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lx %x\n",
- paddr_vmcoreinfo_note(),
+ phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
+ return sprintf(buf, "%pa %x\n", &vmcore_base,
(unsigned int)sizeof(vmcoreinfo_note));
}
KERNEL_ATTR_RO(vmcoreinfo);
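
vmcoreinfo_show() switches to %pa, which takes the address of a phys_addr_t so the printed width always matches the type, whether it is 32 or 64 bits on the running kernel. The equivalent portability concern in plain C is handled with the <inttypes.h> width macros; a small sketch with an arbitrary example address:

#include <inttypes.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;	/* assume a 64-bit physical address here */

int main(void)
{
	phys_addr_t vmcore_base = 0x12345678ULL;	/* example value */

	/* A fixed-width macro avoids the %lx-vs-%llx mismatch that the
	 * kernel's %pa specifier also sidesteps. */
	printf("%" PRIx64 " %x\n", vmcore_base, 2048u);
	return 0;
}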
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9ff173dca1ae..4ab4c3766a80 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
static struct kthread *to_live_kthread(struct task_struct *k)
{
struct completion *vfork = ACCESS_ONCE(k->vfork_done);
- if (likely(vfork))
+ if (likely(vfork) && try_get_task_stack(k))
return __to_kthread(vfork);
return NULL;
}
@@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_live_kthread(k);
- if (kthread)
+ if (kthread) {
__kthread_unpark(k, kthread);
+ put_task_stack(k);
+ }
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k)
wait_for_completion(&kthread->parked);
}
}
+ put_task_stack(k);
ret = 0;
}
return ret;
@@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k)
__kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
+ put_task_stack(k);
}
ret = k->exit_code;
put_task_struct(k);
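
The kthread changes make to_live_kthread() succeed only when the task's stack could be pinned, and every caller (park, unpark, stop) drops that pin on its way out. A toy model of the pin/unpin pairing using a plain counter; try_get_stack()/put_stack() are simplified stand-ins for try_get_task_stack()/put_task_stack(), not the real API:

#include <stdbool.h>
#include <stdio.h>

struct task {
	int stack_refs;		/* 0 models a task whose stack is already gone */
};

static bool try_get_stack(struct task *t)
{
	if (t->stack_refs == 0)
		return false;
	t->stack_refs++;
	return true;
}

static void put_stack(struct task *t)
{
	t->stack_refs--;
}

static void park(struct task *t)
{
	if (!try_get_stack(t))
		return;			/* task is exiting; nothing to park */
	puts("parking while the stack is pinned");
	put_stack(t);			/* drop the pin on every exit path */
}

int main(void)
{
	struct task alive = { .stack_refs = 1 };
	struct task dead  = { .stack_refs = 0 };

	park(&alive);
	park(&dead);
	return 0;
}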
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index d68fbf63b083..af4643873e71 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,8 @@
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
#include <asm/cacheflush.h>
/**
@@ -204,76 +206,108 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-/*
- * external symbols are located outside the parent object (where the parent
- * object is either vmlinux or the kmod being patched).
- */
-static int klp_find_external_symbol(struct module *pmod, const char *name,
- unsigned long *addr)
+static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
{
- const struct kernel_symbol *sym;
-
- /* first, check if it's an exported symbol */
- preempt_disable();
- sym = find_symbol(name, NULL, NULL, true, true);
- if (sym) {
- *addr = sym->value;
- preempt_enable();
- return 0;
- }
- preempt_enable();
+ int i, cnt, vmlinux, ret;
+ char objname[MODULE_NAME_LEN];
+ char symname[KSYM_NAME_LEN];
+ char *strtab = pmod->core_kallsyms.strtab;
+ Elf_Rela *relas;
+ Elf_Sym *sym;
+ unsigned long sympos, addr;
/*
- * Check if it's in another .o within the patch module. This also
- * checks that the external symbol is unique.
+ * Since the field widths for objname and symname in the sscanf()
+ * call are hard-coded and correspond to MODULE_NAME_LEN and
+ * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN
+ * and KSYM_NAME_LEN have the values we expect them to have.
+ *
+ * Because the value of MODULE_NAME_LEN can differ among architectures,
+ * we use the smallest/strictest upper bound possible (56, based on
+ * the current definition of MODULE_NAME_LEN) to prevent overflows.
*/
- return klp_find_object_symbol(pmod->name, name, 0, addr);
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+ relas = (Elf_Rela *) relasec->sh_addr;
+ /* For each rela in this klp relocation section */
+ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
+ sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
+ if (sym->st_shndx != SHN_LIVEPATCH) {
+ pr_err("symbol %s is not marked as a livepatch symbol",
+ strtab + sym->st_name);
+ return -EINVAL;
+ }
+
+ /* Format: .klp.sym.objname.symname,sympos */
+ cnt = sscanf(strtab + sym->st_name,
+ ".klp.sym.%55[^.].%127[^,],%lu",
+ objname, symname, &sympos);
+ if (cnt != 3) {
+ pr_err("symbol %s has an incorrectly formatted name",
+ strtab + sym->st_name);
+ return -EINVAL;
+ }
+
+ /* klp_find_object_symbol() treats a NULL objname as vmlinux */
+ vmlinux = !strcmp(objname, "vmlinux");
+ ret = klp_find_object_symbol(vmlinux ? NULL : objname,
+ symname, sympos, &addr);
+ if (ret)
+ return ret;
+
+ sym->st_value = addr;
+ }
+
+ return 0;
}
static int klp_write_object_relocations(struct module *pmod,
struct klp_object *obj)
{
- int ret = 0;
- unsigned long val;
- struct klp_reloc *reloc;
+ int i, cnt, ret = 0;
+ const char *objname, *secname;
+ char sec_objname[MODULE_NAME_LEN];
+ Elf_Shdr *sec;
if (WARN_ON(!klp_is_object_loaded(obj)))
return -EINVAL;
- if (WARN_ON(!obj->relocs))
- return -EINVAL;
+ objname = klp_is_module(obj) ? obj->name : "vmlinux";
- module_disable_ro(pmod);
+ /* For each klp relocation section */
+ for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) {
+ sec = pmod->klp_info->sechdrs + i;
+ secname = pmod->klp_info->secstrings + sec->sh_name;
+ if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
+ continue;
- for (reloc = obj->relocs; reloc->name; reloc++) {
- /* discover the address of the referenced symbol */
- if (reloc->external) {
- if (reloc->sympos > 0) {
- pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
- reloc->name);
- ret = -EINVAL;
- goto out;
- }
- ret = klp_find_external_symbol(pmod, reloc->name, &val);
- } else
- ret = klp_find_object_symbol(obj->name,
- reloc->name,
- reloc->sympos,
- &val);
+ /*
+ * Format: .klp.rela.sec_objname.section_name
+ * See comment in klp_resolve_symbols() for an explanation
+ * of the selected field width value.
+ */
+ cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
+ if (cnt != 1) {
+ pr_err("section %s has an incorrectly formatted name",
+ secname);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (strcmp(objname, sec_objname))
+ continue;
+
+ ret = klp_resolve_symbols(sec, pmod);
if (ret)
- goto out;
+ break;
- ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
- val + reloc->addend);
- if (ret) {
- pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
- reloc->name, val, ret);
- goto out;
- }
+ ret = apply_relocate_add(pmod->klp_info->sechdrs,
+ pmod->core_kallsyms.strtab,
+ pmod->klp_info->symndx, i, pmod);
+ if (ret)
+ break;
}
-out:
- module_enable_ro(pmod);
return ret;
}
@@ -298,6 +332,19 @@ unlock:
rcu_read_unlock();
}
+/*
+ * Convert a function address into the appropriate ftrace location.
+ *
+ * Usually this is just the address of the function, but on some architectures
+ * it's more complicated so allow them to provide a custom behaviour.
+ */
+#ifndef klp_get_ftrace_location
+static unsigned long klp_get_ftrace_location(unsigned long faddr)
+{
+ return faddr;
+}
+#endif
+
static void klp_disable_func(struct klp_func *func)
{
struct klp_ops *ops;
@@ -312,8 +359,14 @@ static void klp_disable_func(struct klp_func *func)
return;
if (list_is_singular(&ops->func_stack)) {
+ unsigned long ftrace_loc;
+
+ ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ if (WARN_ON(!ftrace_loc))
+ return;
+
WARN_ON(unregister_ftrace_function(&ops->fops));
- WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
list_del_rcu(&func->stack_node);
list_del(&ops->node);
@@ -338,6 +391,15 @@ static int klp_enable_func(struct klp_func *func)
ops = klp_find_ops(func->old_addr);
if (!ops) {
+ unsigned long ftrace_loc;
+
+ ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ if (!ftrace_loc) {
+ pr_err("failed to find location for function '%s'\n",
+ func->old_name);
+ return -EINVAL;
+ }
+
ops = kzalloc(sizeof(*ops), GFP_KERNEL);
if (!ops)
return -ENOMEM;
@@ -352,7 +414,7 @@ static int klp_enable_func(struct klp_func *func)
INIT_LIST_HEAD(&ops->func_stack);
list_add_rcu(&func->stack_node, &ops->func_stack);
- ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
+ ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
if (ret) {
pr_err("failed to set ftrace filter for function '%s' (%d)\n",
func->old_name, ret);
@@ -363,7 +425,7 @@ static int klp_enable_func(struct klp_func *func)
if (ret) {
pr_err("failed to register ftrace handler for function '%s' (%d)\n",
func->old_name, ret);
- ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+ ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
goto err;
}
@@ -483,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
list_prev_entry(patch, list)->state == KLP_DISABLED)
return -EBUSY;
- pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
- add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
-
pr_notice("enabling patch '%s'\n", patch->mod->name);
klp_for_each_object(patch, obj) {
@@ -683,6 +742,9 @@ static void klp_free_patch(struct klp_patch *patch)
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
{
+ if (!func->old_name || !func->new_func)
+ return -EINVAL;
+
INIT_LIST_HEAD(&func->stack_node);
func->state = KLP_DISABLED;
@@ -696,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
+/* Arches may override this to finish any remaining arch-specific tasks */
+void __weak arch_klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+}
+
/* parts of the initialization that are done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -703,12 +771,16 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
- if (obj->relocs) {
- ret = klp_write_object_relocations(patch->mod, obj);
- if (ret)
- return ret;
+ module_disable_ro(patch->mod);
+ ret = klp_write_object_relocations(patch->mod, obj);
+ if (ret) {
+ module_enable_ro(patch->mod, true);
+ return ret;
}
+ arch_klp_init_object_loaded(patch, obj);
+ module_enable_ro(patch->mod, true);
+
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
@@ -842,12 +914,18 @@ int klp_register_patch(struct klp_patch *patch)
{
int ret;
- if (!klp_initialized())
- return -ENODEV;
-
if (!patch || !patch->mod)
return -EINVAL;
+ if (!is_livepatch_module(patch->mod)) {
+ pr_err("module %s is not marked as a livepatch module",
+ patch->mod->name);
+ return -EINVAL;
+ }
+
+ if (!klp_initialized())
+ return -ENODEV;
+
/*
* A reference is taken on the patch module to prevent it from being
* unloaded. Right now, we don't allow patch modules to unload since
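
klp_resolve_symbols() parses livepatch symbol names of the form ".klp.sym.<objname>.<symname>,<sympos>" with sscanf(), using field widths of 55 and 127 to stay within MODULE_NAME_LEN and KSYM_NAME_LEN. The format string below is taken from the hunk above; the sample name and the surrounding buffer handling are illustrative:

#include <stdio.h>

int main(void)
{
	const char *name = ".klp.sym.vmlinux.printk,0";	/* example symbol */
	char objname[56];
	char symname[128];
	unsigned long sympos;

	if (sscanf(name, ".klp.sym.%55[^.].%127[^,],%lu",
		   objname, symname, &sympos) != 3) {
		puts("badly formatted livepatch symbol");
		return 1;
	}
	printf("obj=%s sym=%s pos=%lu\n", objname, symname, sympos);
	return 0;
}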
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 31322a4275cd..6f88e352cd4f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
-obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
deleted file mode 100644
index 951cfcd10b4a..000000000000
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/* See include/linux/lglock.h for description */
-#include <linux/module.h>
-#include <linux/lglock.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-
-/*
- * Note there is no uninit, so lglocks cannot be defined in
- * modules (but it's fine to use them from there)
- * Could be added though, just undo lg_lock_init
- */
-
-void lg_lock_init(struct lglock *lg, char *name)
-{
- LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
-}
-EXPORT_SYMBOL(lg_lock_init);
-
-void lg_local_lock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock);
-
-void lg_local_unlock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock);
-
-void lg_local_lock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock_cpu);
-
-void lg_local_unlock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock_cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
-{
- BUG_ON(cpu1 == cpu2);
-
- /* lock in cpu order, just like lg_global_lock */
- if (cpu2 < cpu1)
- swap(cpu1, cpu2);
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
-}
-
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
-{
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
- preempt_enable();
-}
-
-void lg_global_lock(struct lglock *lg)
-{
- int i;
-
- preempt_disable();
- lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_lock(lock);
- }
-}
-EXPORT_SYMBOL(lg_global_lock);
-
-void lg_global_unlock(struct lglock *lg)
-{
- int i;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_unlock(lock);
- }
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81f1a7107c0e..589d763a49b3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -46,6 +46,7 @@
#include <linux/gfp.h>
#include <linux/kmemcheck.h>
#include <linux/random.h>
+#include <linux/jhash.h>
#include <asm/sections.h>
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];
* It's a 64-bit hash, because it's important for the keys to be
* unique.
*/
-#define iterate_chain_key(key1, key2) \
- (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
- ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
- (key2))
+static inline u64 iterate_chain_key(u64 key, u32 idx)
+{
+ u32 k0 = key, k1 = key >> 32;
+
+ __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
+
+ return k0 | (u64)k1 << 32;
+}
void lockdep_off(void)
{
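
The new iterate_chain_key() splits the 64-bit chain key into two 32-bit halves and mixes them with the class index using the Jenkins hash mix step from <linux/jhash.h>, replacing the old shift/xor scheme. A stand-alone version for experimentation; rol32() and __jhash_mix() are re-declared here to keep the sketch self-contained, with the mix constants following the standard lookup3 definition:

#include <stdint.h>
#include <stdio.h>

#define rol32(x, k)	(((x) << (k)) | ((x) >> (32 - (k))))

#define __jhash_mix(a, b, c)			\
{						\
	a -= c;  a ^= rol32(c, 4);  c += b;	\
	b -= a;  b ^= rol32(a, 6);  a += c;	\
	c -= b;  c ^= rol32(b, 8);  b += a;	\
	a -= c;  a ^= rol32(c, 16); c += b;	\
	b -= a;  b ^= rol32(a, 19); a += c;	\
	c -= b;  c ^= rol32(b, 4);  b += a;	\
}

static uint64_t iterate_chain_key(uint64_t key, uint32_t idx)
{
	uint32_t k0 = key, k1 = key >> 32;

	__jhash_mix(idx, k0, k1);	/* macro that modifies its arguments */

	return k0 | (uint64_t)k1 << 32;
}

int main(void)
{
	printf("%016llx\n", (unsigned long long)iterate_chain_key(0, 1));
	printf("%016llx\n", (unsigned long long)iterate_chain_key(0, 2));
	return 0;
}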
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
/* Mark the current thread as blocked on the lock: */
- ti->task->blocked_on = waiter;
+ task->blocked_on = waiter;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
- DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
- DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
- ti->task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(waiter->task != task);
+ DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+ task->blocked_on = NULL;
list_del_init(&waiter->list);
waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..57a871ae3c81 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,21 +20,21 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
- struct thread_info *ti);
+ struct task_struct *task);
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti);
+ struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current;
+ WRITE_ONCE(lock->owner, current);
}
static inline void mutex_clear_owner(struct mutex *lock)
{
- lock->owner = NULL;
+ WRITE_ONCE(lock->owner, NULL);
}
#define spin_lock_mutex(lock, flags) \
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e364b424b019..a70b90db3909 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
if (!hold_ctx)
return 0;
- if (unlikely(ctx == hold_ctx))
- return -EALREADY;
-
if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
(ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
#ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
unsigned long flags;
int ret;
+ if (use_ww_ctx) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+ }
+
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_add_waiter(lock, &waiter, task);
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
__set_task_state(task, TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current_thread_info());
+ mutex_remove_waiter(lock, &waiter, task);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@@ -602,7 +605,7 @@ skip_wait:
return 0;
err:
- mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ mutex_remove_waiter(lock, &waiter, task);
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..6cd6b8e9efd7 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,18 +13,24 @@
do { spin_lock(lock); (void)(flags); } while (0)
#define spin_unlock_mutex(lock, flags) \
do { spin_unlock(lock); (void)(flags); } while (0)
-#define mutex_remove_waiter(lock, waiter, ti) \
+#define mutex_remove_waiter(lock, waiter, task) \
__list_del((waiter)->list.prev, (waiter)->list.next)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * The mutex owner can be read and written locklessly.
+ * We should use WRITE_ONCE when writing the owner value to
+ * avoid store tearing; otherwise, a thread could potentially
+ * read a partially written and incomplete owner value.
+ */
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current;
+ WRITE_ONCE(lock->owner, current);
}
static inline void mutex_clear_owner(struct mutex *lock)
{
- lock->owner = NULL;
+ WRITE_ONCE(lock->owner, NULL);
}
#else
static inline void mutex_set_owner(struct mutex *lock)
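
The owner field is now written with WRITE_ONCE() so the pointer cannot be torn into partial stores by the compiler. In portable C the closest analogue is a relaxed atomic store; a small sketch with a made-up mutex type (struct fake_mutex and the helper names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

struct fake_mutex {
	_Atomic(void *) owner;
};

static void set_owner(struct fake_mutex *lock, void *task)
{
	/* one indivisible store, like WRITE_ONCE(lock->owner, current) */
	atomic_store_explicit(&lock->owner, task, memory_order_relaxed);
}

static void clear_owner(struct fake_mutex *lock)
{
	atomic_store_explicit(&lock->owner, NULL, memory_order_relaxed);
}

int main(void)
{
	struct fake_mutex m;
	int task;

	atomic_init(&m.owner, NULL);
	set_owner(&m, &task);
	printf("%d\n", atomic_load_explicit(&m.owner, memory_order_relaxed) == (void *)&task);
	clear_owner(&m);
	return 0;
}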
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..ce182599cf2e 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,151 +8,186 @@
#include <linux/sched.h>
#include <linux/errno.h>
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->read_count = alloc_percpu(int);
+ if (unlikely(!sem->read_count))
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ init_waitqueue_head(&sem->writer);
+ sem->readers_block = 0;
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
/*
* XXX: temporary kludge. The error path in alloc_super()
* assumes that percpu_free_rwsem() is safe after kzalloc().
*/
- if (!brw->fast_read_ctr)
+ if (!sem->read_count)
return;
- rcu_sync_dtor(&brw->rss);
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->read_count);
+ sem->read_count = NULL; /* catch use after free bugs */
}
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
{
- bool success;
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * If the reader misses the writer's assignment of readers_block, then
+ * the writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->read_count, so that it doesn't matter that the
+ * writer missed them.
+ */
- preempt_disable();
- success = rcu_sync_is_idle(&brw->rss);
- if (likely(success))
- __this_cpu_add(*brw->fast_read_ctr, val);
- preempt_enable();
+ smp_mb(); /* A matches D */
- return success;
-}
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(!smp_load_acquire(&sem->readers_block)))
+ return 1;
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
- might_sleep();
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
- if (likely(update_fast_ctr(brw, +1)))
- return;
+ if (try)
+ return 0;
- /* Avoid rwsem_acquire_read() and rwsem_release() */
- __down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
-}
-EXPORT_SYMBOL_GPL(percpu_down_read);
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
-{
- if (unlikely(!update_fast_ctr(brw, +1))) {
- if (!__down_read_trylock(&brw->rw_sem))
- return 0;
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
- }
-
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ /*
+ * Avoid lockdep for the down/up_read(); we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ this_cpu_inc(*sem->read_count);
+ __up_read(&sem->rw_sem);
+
+ preempt_disable();
return 1;
}
+EXPORT_SYMBOL_GPL(__percpu_down_read);
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
- if (likely(update_fast_ctr(brw, -1)))
- return;
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
+ /* Prod writer to recheck readers_active */
+ wake_up(&sem->writer);
}
-EXPORT_SYMBOL_GPL(percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- unsigned int sum = 0;
- int cpu;
+ if (per_cpu_sum(*sem->read_count) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
+ smp_mb(); /* C matches B */
- return sum;
+ return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
+
+ down_write(&sem->rw_sem);
+
/*
- * Make rcu_sync_is_idle() == F and thus disable the fast-path in
- * percpu_down_read() and percpu_up_read(), and wait for gp pass.
- *
- * The latter synchronises us with the preceding readers which used
- * the fast-past, so we can not miss the result of __this_cpu_add()
- * or anything else inside their criticial sections.
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
*/
- rcu_sync_enter(&brw->rss);
+ WRITE_ONCE(sem->readers_block, 1);
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
+ smp_mb(); /* D matches A */
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+ /*
+ * If they don't see our write of readers_block, then we are
+ * guaranteed to see their sem->read_count increment, and therefore
+ * will wait for them.
+ */
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+ /* Wait for all now active readers to complete. */
+ wait_event(sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
/*
- * Enable the fast-path in percpu_down_read() and percpu_up_read()
- * but only after another gp pass; this adds the necessary barrier
- * to ensure the reader can't miss the changes done by us.
+ * Signal the writer is done, no fast path yet.
+ *
+ * One reason that we cannot just immediately re-enable the reader
+ * fast path is that new readers might fail to see the results of
+ * this writer's critical section.
+ *
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
+ */
+ smp_store_release(&sem->readers_block, 0);
+
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);
+
+ /*
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because it's counting.
*/
- rcu_sync_exit(&brw->rss);
+ rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
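
For reference, a minimal caller-side sketch of the reworked per-CPU rwsem API (the semaphore and function names below are hypothetical, and error handling beyond the init check is elided). Readers take the per-CPU fast path unless a writer has switched the semaphore into slow mode via rcu_sync:

static struct percpu_rw_semaphore example_sem;	/* hypothetical */

static int example_setup(void)
{
	/* allocates sem->read_count; pair with percpu_free_rwsem() on teardown */
	return percpu_init_rwsem(&example_sem);
}

static void example_reader(void)
{
	percpu_down_read(&example_sem);		/* usually just a per-CPU increment */
	/* read-side critical section */
	percpu_up_read(&example_sem);
}

static void example_writer(void)
{
	percpu_down_write(&example_sem);	/* drains the per-CPU read counts */
	/* exclusive critical section */
	percpu_up_write(&example_sem);
}
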
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec082338668..19248ddf37ce 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
rspin_until_writer_unlock(lock, cnts);
/*
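
The qrwlock change above leans on a simple identity: fetch_add returns the value before the addition, add_return the value after, so the explicit subtraction of _QR_BIAS becomes unnecessary. A standalone illustration with C11 atomics (plain userspace code, not the kernel atomics API):

#include <assert.h>
#include <stdatomic.h>

int main(void)
{
	atomic_int cnts = ATOMIC_VAR_INIT(3);
	const int bias = 16;	/* stands in for _QR_BIAS */

	/* like atomic_fetch_add_acquire(): yields the pre-add value directly */
	int old = atomic_fetch_add_explicit(&cnts, bias, memory_order_acquire);

	assert(old == 3);
	assert(atomic_load(&cnts) == 3 + bias);	/* what add_return would report */
	return 0;
}
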
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..b2caec7315af 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
* therefore increment the cpu number by one.
*/
-static inline u32 encode_tail(int cpu, int idx)
+static inline __pure u32 encode_tail(int cpu, int idx)
{
u32 tail;
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)
return tail;
}
-static inline struct mcs_spinlock *decode_tail(u32 tail)
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
@@ -267,6 +267,123 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
+/*
+ * Various notes on spin_is_locked() and spin_unlock_wait(), which are
+ * 'interesting' functions:
+ *
+ * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
+ * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
+ * PPC). Also qspinlock has a similar issue per construction, the setting of
+ * the locked byte can be unordered with respect to acquiring the lock proper.
+ *
+ * This gets to be 'interesting' in the following cases, where the /should/s
+ * end up false because of this issue.
+ *
+ *
+ * CASE 1:
+ *
+ * So the spin_is_locked() correctness issue comes from something like:
+ *
+ * CPU0 CPU1
+ *
+ * global_lock(); local_lock(i)
+ * spin_lock(&G) spin_lock(&L[i])
+ * for (i) if (!spin_is_locked(&G)) {
+ * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
+ * return;
+ * }
+ * // deal with fail
+ *
+ * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
+ * that there is exclusion between the two critical sections.
+ *
+ * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
+ * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
+ * /should/ be constrained by the ACQUIRE from spin_lock(&G).
+ *
+ * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
+ *
+ *
+ * CASE 2:
+ *
+ * For spin_unlock_wait() there is a second correctness issue, namely:
+ *
+ * CPU0 CPU1
+ *
+ * flag = set;
+ * smp_mb(); spin_lock(&l)
+ * spin_unlock_wait(&l); if (!flag)
+ * // add to lockless list
+ * spin_unlock(&l);
+ * // iterate lockless list
+ *
+ * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
+ * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
+ * semantics etc..)
+ *
+ * Where flag /should/ be ordered against the locked store of l.
+ */
+
+/*
+ * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
+ * issuing an _unordered_ store to set _Q_LOCKED_VAL.
+ *
+ * This means that the store can be delayed, but no later than the
+ * store-release from the unlock. This means that simply observing
+ * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
+ *
+ * There are two paths that can issue the unordered store:
+ *
+ * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
+ *
+ * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
+ * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
+ *
+ * However, in both cases we have other !0 state we've set before to queue
+ * ourselves:
+ *
+ * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
+ * load is constrained by that ACQUIRE to not pass before that, and thus must
+ * observe the store.
+ *
+ * For (2) we have a more interesting scenario. We enqueue ourselves using
+ * xchg_tail(), which ends up being a RELEASE. This in itself is not
+ * sufficient, however that is followed by an smp_cond_acquire() on the same
+ * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
+ * guarantees we must observe that store.
+ *
+ * Therefore both cases have other !0 state that is observable before the
+ * unordered locked byte store comes through. This means we can use that to
+ * wait for the lock store, and then wait for an unlock.
+ */
+#ifndef queued_spin_unlock_wait
+void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+ u32 val;
+
+ for (;;) {
+ val = atomic_read(&lock->val);
+
+ if (!val) /* not locked, we're done */
+ goto done;
+
+ if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
+ break;
+
+ /* not locked, but pending, wait until we observe the lock */
+ cpu_relax();
+ }
+
+ /* any unlock is good */
+ while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+done:
+ smp_acquire__after_ctrl_dep();
+}
+EXPORT_SYMBOL(queued_spin_unlock_wait);
+#endif
+
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
@@ -358,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
- smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
/*
* take ownership and clear the pending bit.
@@ -395,6 +512,8 @@ queue:
* pending stuff.
*
* p,*,* -> n,*,*
+ *
+ * RELEASE, such that the stores to @node must be complete.
*/
old = xchg_tail(lock, tail);
next = NULL;
@@ -405,6 +524,15 @@ queue:
*/
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
+ /*
+ * The above xchg_tail() is also a load of @lock which generates,
+ * through decode_tail(), a pointer.
+ *
+ * The address dependency matches the RELEASE of xchg_tail()
+ * such that the access to @prev must happen after.
+ */
+ smp_read_barrier_depends();
+
WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
@@ -434,7 +562,7 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * smp_cond_load_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +573,7 @@ queue:
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+ val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
@@ -465,9 +593,9 @@ locked:
break;
}
/*
- * The smp_cond_acquire() call above has provided the necessary
- * acquire semantics required for locking. At most two
- * iterations of this loop may be ran.
+ * The smp_cond_load_acquire() call above has provided the
+ * necessary acquire semantics required for locking. At most
+ * two iterations of this loop may be run.
*/
old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
@@ -491,7 +619,7 @@ release:
/*
* release the node
*/
- this_cpu_dec(mcs_nodes[0].count);
+ __this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
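
To make CASE 2 of the comment block above concrete, here is a hedged sketch of the flag/lockless-list pattern that queued_spin_unlock_wait() is meant to support. All names are illustrative, and the two functions are assumed to run on different CPUs:

static DEFINE_SPINLOCK(example_lock);
static LIST_HEAD(example_list);
static int example_flag;

/* "CPU1": only queue an entry while the flag is still clear */
static void example_producer(struct list_head *entry)
{
	spin_lock(&example_lock);
	if (!READ_ONCE(example_flag))
		list_add(entry, &example_list);
	spin_unlock(&example_lock);
}

/* "CPU0": stop producers, wait out any current lock holder, then drain */
static void example_drain(void)
{
	WRITE_ONCE(example_flag, 1);
	smp_mb();
	spin_unlock_wait(&example_lock);
	/* every entry queued before the flag was observed is now visible */
}
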
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57f68b3..e3b5520005db 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -70,11 +70,14 @@ struct pv_node {
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
- int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
- qstat_inc(qstat_pv_lock_stealing, ret);
- return ret;
+ if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+
+ return false;
}
/*
@@ -112,12 +115,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
{
- atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+ atomic_or(_Q_PENDING_VAL, &lock->val);
}
static __always_inline void clear_pending(struct qspinlock *lock)
{
- atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
}
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
@@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
-
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
@@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
- int waitcnt = 0;
int loop;
bool wait_early;
- /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
- for (;; waitcnt++) {
+ for (;;) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
@@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
if (!READ_ONCE(node->locked)) {
qstat_inc(qstat_pv_wait_node, true);
- qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
}
@@ -450,18 +449,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
goto gotlock;
}
}
- WRITE_ONCE(pn->state, vcpu_halted);
+ WRITE_ONCE(pn->state, vcpu_hashed);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
- * The unlocker should have freed the lock before kicking the
- * CPU. So if the lock is still not free, it is a spurious
- * wakeup or another vCPU has stolen the lock. The current
- * vCPU should spin again.
+ * Because of lock stealing, the queue head vCPU may not be
+ * able to acquire the lock before it has to wait again.
*/
- qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
@@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309845..eb0a599fcf58 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -24,8 +24,8 @@
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
* pv_lock_slowpath - # of locking operations via the slowpath
* pv_lock_stealing - # of lock stealing operations
- * pv_spurious_wakeup - # of spurious wakeups
- * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
+ * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
+ * pv_wait_again - # of wait's after a queue head vCPU kick
* pv_wait_early - # of early vCPU wait's
* pv_wait_head - # of vCPU wait's at the queue head
* pv_wait_node - # of vCPU wait's at a non-head queue node
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
*/
if ((counter == qstat_pv_latency_kick) ||
(counter == qstat_pv_latency_wake)) {
- stat = 0;
if (kicks)
stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..1ec0f48962b3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
- if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
return 0;
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c6225e5..2337b4bb2366 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->count = RWSEM_UNLOCKED_VALUE;
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -114,97 +114,106 @@ enum rwsem_wake_type {
* - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
* - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
* - there must be someone on the queue
- * - the spinlock must be held by the caller
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
+ * - writers are only marked woken if downgrading is false
*/
-static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+static void __rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- struct list_head *next;
- long oldcount, woken, loop, adjustment;
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- /* Wake writer at the front of the queue, but do not
- * grant it the lock yet as we want other writers
- * to be able to steal it. Readers, on the other hand,
- * will block as they will notice the queued writer.
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
*/
- wake_up_process(waiter->task);
- goto out;
+ wake_q_add(wake_q, waiter->task);
+ }
+
+ return;
}
- /* Writers might steal the lock before we grant it to the next reader.
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
* We prefer to do the first reader grant before counting readers
* so we can bail out early if a writer stole the lock.
*/
- adjustment = 0;
if (wake_type != RWSEM_WAKE_READ_OWNED) {
adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
- oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /* A writer stole the lock. Undo our reader grant. */
- if (rwsem_atomic_update(-adjustment, sem) &
- RWSEM_ACTIVE_MASK)
- goto out;
+ /*
+ * If the count is still less than RWSEM_WAITING_BIAS
+ * after removing the adjustment, it is assumed that
+ * a writer has stolen the lock. We have to undo our
+ * reader grant.
+ */
+ if (atomic_long_add_return(-adjustment, &sem->count) <
+ RWSEM_WAITING_BIAS)
+ return;
+
/* Last active locker left. Retry waking readers. */
goto try_reader_grant;
}
+ /*
+ * It is not really necessary to set it to reader-owned here,
+ * but it gives the spinners an early indication that the
+ * readers now have the lock.
+ */
+ rwsem_set_reader_owned(sem);
}
- /* Grant an infinite number of read locks to the readers at the front
- * of the queue. Note we increment the 'active part' of the count by
- * the number of readers before waking any processes up.
+ /*
+ * Grant an infinite number of read locks to the readers at the front
+ * of the queue. We know that woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
*/
- woken = 0;
- do {
- woken++;
+ list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ struct task_struct *tsk;
- if (waiter->list.next == &sem->wait_list)
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
break;
- waiter = list_entry(waiter->list.next,
- struct rwsem_waiter, list);
+ woken++;
+ tsk = waiter->task;
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
+ wake_q_add(wake_q, tsk);
+ list_del(&waiter->list);
+ /*
+ * Ensure that the last operation is setting the reader
+ * waiter to nil such that rwsem_down_read_failed() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ }
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- if (waiter->type != RWSEM_WAITING_FOR_WRITE)
+ if (list_empty(&sem->wait_list)) {
/* hit end of list above */
adjustment -= RWSEM_WAITING_BIAS;
+ }
if (adjustment)
- rwsem_atomic_add(adjustment, sem);
-
- next = sem->wait_list.next;
- loop = woken;
- do {
- waiter = list_entry(next, struct rwsem_waiter, list);
- next = waiter->list.next;
- tsk = waiter->task;
- /*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
- */
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- } while (--loop);
-
- sem->wait_list.next = next;
- next->prev = &sem->wait_list;
-
- out:
- return sem;
+ atomic_long_add(adjustment, &sem->count);
}
/*
@@ -216,11 +225,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
+ WAKE_Q(wake_q);
- /* set up my own style of waitqueue */
waiter.task = tsk;
waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list))
@@ -228,9 +236,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ count = atomic_long_add_return(adjustment, &sem->count);
- /* If there are no active locks, wake the front queued process(es).
+ /*
+ * If there are no active locks, wake the front queued process(es).
*
* If there are no writers and we are first in the queue,
* wake our own waiter to join the existing active readers !
@@ -238,9 +247,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
if (count == RWSEM_WAITING_BIAS ||
(count > RWSEM_WAITING_BIAS &&
adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
/* wait to be given the lock */
while (true) {
@@ -255,17 +265,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ */
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
/*
- * Try acquiring the write lock. Check count first in order
- * to reduce unnecessary expensive cmpxchg() operations.
+ * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
+ */
+ if (count != RWSEM_WAITING_BIAS)
+ return false;
+
+ /*
+ * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
+ * are other tasks on the wait list, we need to add on WAITING_BIAS.
*/
- if (count == RWSEM_WAITING_BIAS &&
- cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ count = list_is_singular(&sem->wait_list) ?
+ RWSEM_ACTIVE_WRITE_BIAS :
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
+
+ if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
+ == RWSEM_WAITING_BIAS) {
rwsem_set_owner(sem);
return true;
}
@@ -279,13 +301,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = READ_ONCE(sem->count);
+ long old, count = atomic_long_read(&sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg_acquire(&sem->count, count,
+ old = atomic_long_cmpxchg_acquire(&sem->count, count,
count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
@@ -306,16 +328,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!owner) {
- long count = READ_ONCE(sem->count);
+ if (!rwsem_owner_is_writer(owner)) {
/*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath with the lock being active, then there is a possibility
- * reader(s) may have the lock. To be safe, bail spinning in these
- * situations.
+ * Don't spin if the rwsem is readers owned.
*/
- if (count & RWSEM_ACTIVE_MASK)
- ret = false;
+ ret = !rwsem_owner_is_reader(owner);
goto done;
}
@@ -325,10 +342,15 @@ done:
return ret;
}
-static noinline
-bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+/*
+ * Return true only if we can still spin on the owner field of the rwsem.
+ */
+static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- long count;
+ struct task_struct *owner = READ_ONCE(sem->owner);
+
+ if (!rwsem_owner_is_writer(owner))
+ goto out;
rcu_read_lock();
while (sem->owner == owner) {
@@ -349,22 +371,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
cpu_relax_lowlatency();
}
rcu_read_unlock();
-
- if (READ_ONCE(sem->owner))
- return true; /* new owner, continue spinning */
-
+out:
/*
- * When the owner is not set, the lock could be free or
- * held by readers. Check the counter to verify the
- * state.
+ * If there is a new owner or the owner is not set, we continue
+ * spinning.
*/
- count = READ_ONCE(sem->count);
- return (count == 0 || count == RWSEM_WAITING_BIAS);
+ return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
- struct task_struct *owner;
bool taken = false;
preempt_disable();
@@ -376,12 +392,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (!osq_lock(&sem->osq))
goto done;
- while (true) {
- owner = READ_ONCE(sem->owner);
- if (owner && !rwsem_spin_on_owner(sem, owner))
- break;
-
- /* wait_lock will be acquired if write_lock is obtained */
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock as we can't determine if they are
+ * actively running or not.
+ */
+ while (rwsem_spin_on_owner(sem)) {
+ /*
+ * Try to acquire the lock
+ */
if (rwsem_try_write_lock_unqueued(sem)) {
taken = true;
break;
@@ -393,7 +414,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* we're an RT task that will live-lock because we won't let
* the owner complete.
*/
- if (!owner && (need_resched() || rt_task(current)))
+ if (!sem->owner && (need_resched() || rt_task(current)))
break;
/*
@@ -440,9 +461,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
+ WAKE_Q(wake_q);
/* undo write bias from down_write operation, stop active locking */
- count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
/* do optimistic spinning and steal lock if possible */
if (rwsem_optimistic_spin(sem))
@@ -465,18 +487,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = READ_ONCE(sem->count);
+ count = atomic_long_read(&sem->count);
/*
* If there were already threads queued before us and there are
* no active writers, the lock must be read owned; so we try to
* wake any read locks that were queued ahead of us.
*/
- if (count > RWSEM_WAITING_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ if (count > RWSEM_WAITING_BIAS) {
+ WAKE_Q(wake_q);
+
+ __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+ /*
+ * The wakeup is normally called _after_ the wait_lock
+ * is released, but given that we are proactively waking
+ * readers we can deal with the wake_q overhead as it is
+ * similar to releasing and taking the wait_lock again
+ * for attempting rwsem_try_write_lock().
+ */
+ wake_up_q(&wake_q);
+ }
} else
- count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
/* wait until we successfully acquire the lock */
set_current_state(state);
@@ -492,7 +525,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
schedule();
set_current_state(state);
- } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+ } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
@@ -507,10 +540,11 @@ out_nolock:
raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
if (list_empty(&sem->wait_list))
- rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
else
- __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
return ERR_PTR(-EINTR);
}
@@ -537,6 +571,7 @@ __visible
struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
/*
* If a spinner is present, it is not necessary to do the wakeup.
@@ -571,11 +606,11 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
locked:
- /* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
@@ -590,14 +625,15 @@ __visible
struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- /* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
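
The recurring pattern in the rwsem-xadd.c conversion above is: mark tasks with wake_q_add() while wait_lock is held, then issue the wakeups with wake_up_q() after dropping it. A minimal sketch of that pattern with a hypothetical waiter type:

struct example_waiter {
	struct list_head list;
	struct task_struct *task;
};

static void example_wake_all(spinlock_t *lock, struct list_head *waiters)
{
	struct example_waiter *w, *tmp;
	WAKE_Q(wake_q);

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, list) {
		wake_q_add(&wake_q, w->task);	/* takes a task reference, no wakeup yet */
		list_del(&w->list);
	}
	spin_unlock(lock);

	wake_up_q(&wake_q);	/* the actual wakeups, outside the lock */
}
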
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c817216c1615..45ba475d4be3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)
{
int ret = __down_read_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_reader_owned(sem);
+ }
return ret;
}
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
- rwsem_clear_owner(sem);
+ rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read_nested);
@@ -173,6 +177,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_owner(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable_nested);
+
void up_read_non_owner(struct rw_semaphore *sem)
{
__up_read(sem);
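
The killable write-lock entry points touched above are typically used as below; this is an illustrative sketch only, with the semaphore assumed to be initialised elsewhere:

static int example_modify(struct rw_semaphore *sem)
{
	if (down_write_killable(sem))
		return -EINTR;	/* a fatal signal arrived while sleeping */

	/* exclusive critical section */

	up_write(sem);
	return 0;
}
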
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 870ed9a5b426..a699f4048ba1 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,14 +1,58 @@
+/*
+ * The owner field of the rw_semaphore structure will be set to
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
+ * the owner field when it unlocks. A reader, on the other hand, will
+ * not touch the owner field when it unlocks.
+ *
+ * In essence, the owner field now has the following 3 states:
+ * 1) 0
+ * - lock is free or the owner hasn't set the field yet
+ * 2) RWSEM_READER_OWNED
+ * - lock is currently or previously owned by readers (lock is free
+ * or not set by owner yet)
+ * 3) Other non-zero value
+ * - a writer owns the lock
+ */
+#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
- sem->owner = current;
+ WRITE_ONCE(sem->owner, current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
- sem->owner = NULL;
+ WRITE_ONCE(sem->owner, NULL);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * We check the owner value first to make sure that we will only
+ * do a write to the rwsem cacheline when it is really necessary
+ * to minimize cacheline contention.
+ */
+ if (sem->owner != RWSEM_READER_OWNED)
+ WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
+}
+
+static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+{
+ return owner && owner != RWSEM_READER_OWNED;
}
+static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+{
+ return owner == RWSEM_READER_OWNED;
+}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
@@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+}
#endif
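
For illustration, the three owner states documented at the top of this header can be classified with the helpers it now provides; the function below is a hypothetical debugging aid under CONFIG_RWSEM_SPIN_ON_OWNER, not part of the patch:

static inline const char *example_owner_state(struct rw_semaphore *sem)
{
	struct task_struct *owner = READ_ONCE(sem->owner);

	if (rwsem_owner_is_reader(owner))
		return "reader-owned (currently or recently)";
	if (rwsem_owner_is_writer(owner))
		return "writer-owned";
	return "free, or owner not recorded yet";
}
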
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a6d382312e6f..b501e390bb34 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
}
#endif
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+ return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
static void *try_ram_remap(resource_size_t offset, size_t size)
{
unsigned long pfn = PHYS_PFN(offset);
@@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
/* In the simple case just return the existing linear address */
if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
return __va(offset);
- return NULL; /* fallback to ioremap_cache */
+ return NULL; /* fallback to arch_memremap_wb */
}
/**
@@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size);
if (!addr)
- addr = ioremap_cache(offset, size);
+ addr = arch_memremap_wb(offset, size);
}
/*
@@ -162,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
- return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -246,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
arch_remove_memory(align_start, align_size);
+ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
"%s: failed to free all reserved pages\n", __func__);
@@ -281,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
resource_size_t key, align_start, align_size, align_end;
+ pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
int error, nid, is_ram;
@@ -301,12 +304,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
- dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
- __func__);
- return ERR_PTR(-ENXIO);
- }
-
if (!ref)
return ERR_PTR(-EINVAL);
@@ -356,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (nid < 0)
nid = numa_mem_id();
+ error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
+ align_size);
+ if (error)
+ goto err_pfn_remap;
+
error = arch_add_memory(nid, align_start, align_size, true);
if (error)
goto err_add_memory;
@@ -376,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
return __va(res->start);
err_add_memory:
+ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ err_pfn_remap:
err_radix:
pgmap_radix_release(res);
devres_free(page_map);
@@ -394,7 +398,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns;
}
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
{
/*
@@ -420,5 +423,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/module.c b/kernel/module.c
index 041200ca4a2d..f57dd63186e6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,6 +60,7 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -264,7 +265,7 @@ static void module_assert_mutex_or_preempt(void)
if (unlikely(!debug_locks))
return;
- WARN_ON(!rcu_read_lock_sched_held() &&
+ WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
!lockdep_is_held(&module_mutex));
#endif
}
@@ -336,7 +337,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
* A thread that wants to hold a reference to a module only while it
* is running can call this to safely exit. nfsd and lockd use this.
*/
-void __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_exit(struct module *mod, long code)
{
module_put(mod);
do_exit(code);
@@ -1148,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf)
buf[l++] = 'C';
if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
buf[l++] = 'E';
+ if (mod->taints & (1 << TAINT_LIVEPATCH))
+ buf[l++] = 'K';
/*
* TAINT_FORCED_RMMOD: could be added.
* TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -1693,8 +1696,7 @@ static int module_add_modinfo_attrs(struct module *mod)
temp_attr = mod->modinfo_attrs;
for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
- if (!attr->test ||
- (attr->test && attr->test(mod))) {
+ if (!attr->test || attr->test(mod)) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
error = sysfs_create_file(&mod->mkobj.kobj,
@@ -1858,10 +1860,11 @@ static void mod_sysfs_teardown(struct module *mod)
* from modification and any data from execution.
*
* General layout of module is:
- * [text] [read-only-data] [writable data]
- * text_size -----^ ^ ^
- * ro_size ------------------------| |
- * size -------------------------------------------|
+ * [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^ ^ ^ ^
+ * ro_size ------------------------| | |
+ * ro_after_init_size -----------------------------| |
+ * size -----------------------------------------------------------|
*
* These values are always page-aligned (as is base)
*/
@@ -1884,14 +1887,24 @@ static void frob_rodata(const struct module_layout *layout,
(layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
+static void frob_ro_after_init(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
static void frob_writable_data(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
{
BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->size - layout->ro_size) >> PAGE_SHIFT);
+ set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+ (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
}
/* livepatching wants to disable read-only so it can frob module. */
@@ -1899,21 +1912,26 @@ void module_disable_ro(const struct module *mod)
{
frob_text(&mod->core_layout, set_memory_rw);
frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_ro_after_init(&mod->core_layout, set_memory_rw);
frob_text(&mod->init_layout, set_memory_rw);
frob_rodata(&mod->init_layout, set_memory_rw);
}
-void module_enable_ro(const struct module *mod)
+void module_enable_ro(const struct module *mod, bool after_init)
{
frob_text(&mod->core_layout, set_memory_ro);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
frob_rodata(&mod->init_layout, set_memory_ro);
+
+ if (after_init)
+ frob_ro_after_init(&mod->core_layout, set_memory_ro);
}
static void module_enable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_ro_after_init(&mod->core_layout, set_memory_nx);
frob_writable_data(&mod->core_layout, set_memory_nx);
frob_rodata(&mod->init_layout, set_memory_nx);
frob_writable_data(&mod->init_layout, set_memory_nx);
@@ -1922,6 +1940,7 @@ static void module_enable_nx(const struct module *mod)
static void module_disable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_x);
+ frob_ro_after_init(&mod->core_layout, set_memory_x);
frob_writable_data(&mod->core_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_x);
frob_writable_data(&mod->init_layout, set_memory_x);
@@ -1964,6 +1983,8 @@ static void disable_ro_nx(const struct module_layout *layout)
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_rodata(layout, set_memory_x);
+ frob_ro_after_init(layout, set_memory_rw);
+ frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
@@ -1973,6 +1994,83 @@ static void module_enable_nx(const struct module *mod) { }
static void module_disable_nx(const struct module *mod) { }
#endif
+#ifdef CONFIG_LIVEPATCH
+/*
+ * Persist Elf information about a module. Copy the Elf header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ unsigned int size, symndx;
+ int ret;
+
+ size = sizeof(*mod->klp_info);
+ mod->klp_info = kmalloc(size, GFP_KERNEL);
+ if (mod->klp_info == NULL)
+ return -ENOMEM;
+
+ /* Elf header */
+ size = sizeof(mod->klp_info->hdr);
+ memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+ /* Elf section header table */
+ size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL);
+ if (mod->klp_info->sechdrs == NULL) {
+ ret = -ENOMEM;
+ goto free_info;
+ }
+ memcpy(mod->klp_info->sechdrs, info->sechdrs, size);
+
+ /* Elf section name string table */
+ size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL);
+ if (mod->klp_info->secstrings == NULL) {
+ ret = -ENOMEM;
+ goto free_sechdrs;
+ }
+ memcpy(mod->klp_info->secstrings, info->secstrings, size);
+
+ /* Elf symbol section index */
+ symndx = info->index.sym;
+ mod->klp_info->symndx = symndx;
+
+ /*
+ * For livepatch modules, core_kallsyms.symtab is a complete
+ * copy of the original symbol table. Adjust sh_addr to point
+ * to core_kallsyms.symtab since the copy of the symtab in module
+ * init memory is freed at the end of do_init_module().
+ */
+ mod->klp_info->sechdrs[symndx].sh_addr = \
+ (unsigned long) mod->core_kallsyms.symtab;
+
+ return 0;
+
+free_sechdrs:
+ kfree(mod->klp_info->sechdrs);
+free_info:
+ kfree(mod->klp_info);
+ return ret;
+}
+
+static void free_module_elf(struct module *mod)
+{
+ kfree(mod->klp_info->sechdrs);
+ kfree(mod->klp_info->secstrings);
+ kfree(mod->klp_info);
+}
+#else /* !CONFIG_LIVEPATCH */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ return 0;
+}
+
+static void free_module_elf(struct module *mod)
+{
+}
+#endif /* CONFIG_LIVEPATCH */
+
void __weak module_memfree(void *module_region)
{
vfree(module_region);
@@ -2011,6 +2109,9 @@ static void free_module(struct module *mod)
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
+ if (is_livepatch_module(mod))
+ free_module_elf(mod);
+
/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
@@ -2126,6 +2227,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
(long)sym[i].st_value);
break;
+ case SHN_LIVEPATCH:
+ /* Livepatch symbols are resolved by livepatch */
+ break;
+
case SHN_UNDEF:
ksym = resolve_symbol_wait(mod, info, name);
/* Ok if resolved. */
@@ -2174,6 +2279,10 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
continue;
+ /* Livepatch relocation sections are applied by livepatch */
+ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+ continue;
+
if (info->sechdrs[i].sh_type == SHT_REL)
err = apply_relocate(info->sechdrs, info->strtab,
info->index.sym, i, mod);
@@ -2218,6 +2327,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
* finder in the two loops below */
{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
};
@@ -2249,7 +2359,11 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->core_layout.size = debug_align(mod->core_layout.size);
mod->core_layout.ro_size = mod->core_layout.size;
break;
- case 3: /* whole core */
+ case 2: /* RO after init */
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_after_init_size = mod->core_layout.size;
+ break;
+ case 4: /* whole core */
mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
@@ -2279,7 +2393,14 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->init_layout.size = debug_align(mod->init_layout.size);
mod->init_layout.ro_size = mod->init_layout.size;
break;
- case 3: /* whole init */
+ case 2:
+ /*
+ * RO after init doesn't apply to init_layout (only
+ * core_layout), so it just takes the value of ro_size.
+ */
+ mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+ break;
+ case 4: /* whole init */
mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
@@ -2469,7 +2590,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
- if (i == 0 ||
+ if (i == 0 || is_livepatch_module(mod) ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
strtab_size += strlen(&info->strtab[src[i].st_name])+1;
@@ -2528,7 +2649,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
src = mod->kallsyms->symtab;
for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
- if (i == 0 ||
+ if (i == 0 || is_livepatch_module(mod) ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
dst[ndst] = src[i];
@@ -2599,13 +2720,18 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
int err = -ENOKEY;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
const void *mod = info->hdr;
- if (info->len > markerlen &&
+ /*
+ * Require flags == 0, as a module with version information
+ * removed is no longer the module that was signed
+ */
+ if (flags == 0 &&
+ info->len > markerlen &&
memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
@@ -2624,7 +2750,7 @@ static int module_sig_check(struct load_info *info)
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
return 0;
}
@@ -2667,6 +2793,29 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
return 0;
}
+#ifdef CONFIG_LIVEPATCH
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+ if (get_modinfo(info, "livepatch")) {
+ mod->klp = true;
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ }
+
+ return 0;
+}
+#else /* !CONFIG_LIVEPATCH */
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+ if (get_modinfo(info, "livepatch")) {
+ pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+ mod->name);
+ return -ENOEXEC;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -2812,8 +2961,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return -ENOEXEC;
}
- if (!get_modinfo(info, "intree"))
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -2821,6 +2974,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
"is unknown, you have been warned.\n", mod->name);
}
+ err = check_modinfo_livepatch(mod, info);
+ if (err)
+ return err;
+
/* Set up license info based on the info section */
set_license(mod, get_modinfo(info, "license"));
@@ -2978,6 +3135,8 @@ static int move_module(struct module *mod, struct load_info *info)
static int check_module_license_and_versions(struct module *mod)
{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2996,6 +3155,9 @@ static int check_module_license_and_versions(struct module *mod)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
LOCKDEP_NOW_UNRELIABLE);
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -3043,16 +3205,41 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(char *module_name)
+{
+ const char *p;
+ size_t len;
+
+ if (!module_blacklist)
+ return false;
+
+ for (p = module_blacklist; *p; p += len) {
+ len = strcspn(p, ",");
+ if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ return true;
+ if (p[len] == ',')
+ len++;
+ }
+ return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
+ unsigned int ndx;
int err;
mod = setup_load_info(info, flags);
if (IS_ERR(mod))
return mod;
+ if (blacklisted(mod->name))
+ return ERR_PTR(-EPERM);
+
err = check_modinfo(mod, info, flags);
if (err)
return ERR_PTR(err);
@@ -3066,6 +3253,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ /*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+ ndx = find_sec(info, ".data..ro_after_init");
+ if (ndx)
+ info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
special cases for the architectures. */
@@ -3232,12 +3428,14 @@ static noinline int do_init_module(struct module *mod)
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
+ module_enable_ro(mod, true);
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
mod->init_layout.ro_size = 0;
+ mod->init_layout.ro_after_init_size = 0;
mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
@@ -3329,8 +3527,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions */
- module_enable_ro(mod);
+ module_enable_ro(mod, false);
module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
@@ -3386,7 +3583,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err;
char *after_dashes;
- err = module_sig_check(info);
+ err = module_sig_check(info, flags);
if (err)
goto free_copy;
@@ -3494,6 +3691,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err < 0)
goto coming_cleanup;
+ if (is_livepatch_module(mod)) {
+ err = copy_module_elf(mod, info);
+ if (err < 0)
+ goto sysfs_cleanup;
+ }
+
/* Get rid of temporary copy. */
free_copy(info);
@@ -3502,11 +3705,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
return do_init_module(mod);
+ sysfs_cleanup:
+ mod_sysfs_teardown(mod);
coming_cleanup:
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
-
bug_cleanup:
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
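
/*
 * Illustrative sketch (not part of the patch) of the error-unwind pattern the
 * new sysfs_cleanup label follows in load_module(): each setup step that
 * succeeds gets a label placed above the labels of the steps before it, so a
 * failure at step N unwinds N-1 .. 1 in reverse order.  setup_a/b/c and the
 * undo_* helpers are made-up stand-ins.
 */
#include <stdio.h>

static int setup_a(void) { puts("setup a"); return 0; }
static int setup_b(void) { puts("setup b"); return 0; }
static int setup_c(void) { puts("setup c"); return -1; }	/* fails */
static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

static int do_load(void)
{
	int err;

	err = setup_a();
	if (err)
		goto out;
	err = setup_b();
	if (err)
		goto undo_a_label;
	err = setup_c();
	if (err)
		goto undo_b_label;	/* newest step fails -> newest label */
	return 0;

undo_b_label:
	undo_b();
undo_a_label:
	undo_a();
out:
	return err;
}

int main(void)
{
	return do_load() ? 1 : 0;	/* prints setup a/b/c, undo b, undo a */
}
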
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 64b9dead4a07..937c844bee4a 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -12,7 +12,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
-#include <keys/system_keyring.h>
+#include <linux/verification.h>
#include <crypto/public_key.h>
#include "module-internal.h"
@@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
return -EBADMSG;
}
- return system_verify_data(mod, modlen, mod + modlen, sig_len,
- VERIFYING_MODULE_SIGNATURE);
+ return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+ NULL, VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
}
diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9c466a..7848f0566403 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
+#include <linux/module.h>
#define MAX_OBJ_NUM 1000
@@ -607,33 +608,6 @@ out_replace:
}
/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- * one is used by parallel workers and the second one
- * by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
- cpumask_var_t cbcpumask)
-{
- int err;
-
- mutex_lock(&pinst->lock);
- get_online_cpus();
-
- err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
- put_online_cpus();
- mutex_unlock(&pinst->lock);
-
- return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
-/**
* padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
* equivalent to @cpumask.
*
@@ -674,6 +648,43 @@ out:
}
EXPORT_SYMBOL(padata_set_cpumask);
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+ int err = 0;
+
+ mutex_lock(&pinst->lock);
+
+ if (pinst->flags & PADATA_INVALID)
+ err = -EINVAL;
+
+ __padata_start(pinst);
+
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+ mutex_lock(&pinst->lock);
+ __padata_stop(pinst);
+ mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd;
@@ -694,42 +705,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
return 0;
}
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_add_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd = NULL;
@@ -789,95 +764,49 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
}
EXPORT_SYMBOL(padata_remove_cpu);
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
-{
- int err = 0;
-
- mutex_lock(&pinst->lock);
-
- if (pinst->flags & PADATA_INVALID)
- err =-EINVAL;
-
- __padata_start(pinst);
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
- mutex_lock(&pinst->lock);
- __padata_stop(pinst);
- mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
}
-
-static int padata_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
{
- int err;
struct padata_instance *pinst;
- int cpu = (unsigned long)hcpu;
+ int ret;
- pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+ pinst = hlist_entry_safe(node, struct padata_instance, node);
+ if (!pinst_has_cpu(pinst, cpu))
+ return 0;
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- err = __padata_add_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
- if (err)
- return notifier_from_errno(err);
- break;
+ mutex_lock(&pinst->lock);
+ ret = __padata_add_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ return ret;
+}
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- err = __padata_remove_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
- if (err)
- return notifier_from_errno(err);
- break;
- }
+static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
+{
+ struct padata_instance *pinst;
+ int ret;
- return NOTIFY_OK;
+ pinst = hlist_entry_safe(node, struct padata_instance, node);
+ if (!pinst_has_cpu(pinst, cpu))
+ return 0;
+
+ mutex_lock(&pinst->lock);
+ ret = __padata_remove_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ return ret;
}
+
+static enum cpuhp_state hp_online;
#endif
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
- unregister_hotcpu_notifier(&pinst->cpu_notifier);
+ cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
#endif
padata_stop(pinst);
@@ -1075,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
- pinst->cpu_notifier.notifier_call = padata_cpu_callback;
- pinst->cpu_notifier.priority = 0;
- register_hotcpu_notifier(&pinst->cpu_notifier);
+ cpuhp_state_add_instance_nocalls(hp_online, &pinst->node);
#endif
-
return pinst;
err_free_masks:
@@ -1091,7 +1017,6 @@ err_free_inst:
err:
return NULL;
}
-EXPORT_SYMBOL(padata_alloc);
/**
* padata_free - free a padata instance
@@ -1103,3 +1028,26 @@ void padata_free(struct padata_instance *pinst)
kobject_put(&pinst->kobj);
}
EXPORT_SYMBOL(padata_free);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static __init int padata_driver_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
+ padata_cpu_online,
+ padata_cpu_prep_down);
+ if (ret < 0)
+ return ret;
+ hp_online = ret;
+ return 0;
+}
+module_init(padata_driver_init);
+
+static __exit void padata_driver_exit(void)
+{
+ cpuhp_remove_multi_state(hp_online);
+}
+module_exit(padata_driver_exit);
+#endif
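
/*
 * Condensed sketch (not part of the patch) of the multi-instance hotplug
 * state pattern padata is converted to above.  Only cpuhp_* calls that appear
 * in this diff are used; struct my_inst, my_online() and my_prep_down() are
 * made-up names, and this only builds inside a kernel tree.  Teardown would
 * mirror the diff with cpuhp_state_remove_instance_nocalls() and
 * cpuhp_remove_multi_state().
 */
#include <linux/cpuhotplug.h>
#include <linux/list.h>

struct my_inst {
	struct hlist_node node;		/* links this instance to the cpuhp state */
	/* ... per-instance data ... */
};

static enum cpuhp_state my_hp_online;

static int my_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_inst *inst = hlist_entry_safe(node, struct my_inst, node);

	/* react to @cpu coming up for this instance */
	(void)inst;
	return 0;
}

static int my_prep_down(unsigned int cpu, struct hlist_node *node)
{
	struct my_inst *inst = hlist_entry_safe(node, struct my_inst, node);

	/* quiesce this instance's use of @cpu before it goes away */
	(void)inst;
	return 0;
}

static int my_init(struct my_inst *inst)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "my:online",
				      my_online, my_prep_down);
	if (ret < 0)
		return ret;
	my_hp_online = ret;		/* dynamically allocated state */
	return cpuhp_state_add_instance_nocalls(my_hp_online, &inst->node);
}
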
diff --git a/kernel/panic.c b/kernel/panic.c
index 535c96510a44..ca8cea1ef673 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -108,6 +108,7 @@ void panic(const char *fmt, ...)
long i, i_next = 0;
int state = 0;
int old_cpu, this_cpu;
+ bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
@@ -160,8 +161,10 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!crash_kexec_post_notifiers)
+ if (!_crash_kexec_post_notifiers) {
+ printk_nmi_flush_on_panic();
__crash_kexec(NULL);
+ }
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -176,6 +179,8 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+ /* Call flush even twice. It tries harder with a single online CPU */
+ printk_nmi_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -187,7 +192,7 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (crash_kexec_post_notifiers)
+ if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
bust_spinlocks(0);
@@ -567,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
-
-static int __init setup_crash_kexec_post_notifiers(char *s)
-{
- crash_kexec_post_notifiers = true;
- return 0;
-}
-early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
static int __init oops_setup(char *s)
{
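
/*
 * Sketch (not part of the patch) of why panic() above samples
 * crash_kexec_post_notifiers into a local once the knob becomes a writable
 * core_param: both decision points within a single run must see the same
 * value even if another context flips the global in between.  The names and
 * actions below are made up.
 */
#include <stdbool.h>
#include <stdio.h>

static volatile bool post_notifiers;	/* runtime-writable knob */

static void handle_event(void)
{
	bool local_post_notifiers = post_notifiers;	/* sample exactly once */

	if (!local_post_notifiers)
		puts("early action");
	/* ... the knob may be flipped here by someone else ... */
	if (local_post_notifiers)
		puts("late action");
	/* exactly one of the two branches runs, never both or neither */
}

int main(void)
{
	handle_event();
	post_notifiers = true;
	handle_event();
	return 0;
}
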
diff --git a/kernel/pid.c b/kernel/pid.c
index 4d73a834c7e6..f66162f2359b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
- if (IS_ERR_VALUE(nr)) {
+ if (nr < 0) {
retval = nr;
goto out_free;
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba137fd15..df9e8e9e0be7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work)
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
+}
+
+static void dec_pid_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
+}
+
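
/*
 * Illustrative userspace sketch (not part of the patch) of the shape the new
 * inc_pid_namespaces()/dec_pid_namespaces() helpers follow: acquisition
 * either returns a counter to charge against or NULL when the per-user limit
 * is hit, and every successful acquisition is paired with one release.
 * struct ucount_demo and the limit value are made up; the real
 * inc_ucount()/dec_ucount() are kernel helpers introduced elsewhere.
 */
#include <stdio.h>

struct ucount_demo {
	long count;
	long max;
};

static struct ucount_demo *acquire(struct ucount_demo *uc)
{
	if (uc->count >= uc->max)
		return NULL;		/* over the limit: creation must fail */
	uc->count++;
	return uc;
}

static void release(struct ucount_demo *uc)
{
	uc->count--;
}

int main(void)
{
	struct ucount_demo uc = { .count = 0, .max = 2 };
	struct ucount_demo *a = acquire(&uc);
	struct ucount_demo *b = acquire(&uc);
	struct ucount_demo *c = acquire(&uc);	/* NULL: third one is refused */

	printf("a=%p b=%p c=%p\n", (void *)a, (void *)b, (void *)c);
	if (b)
		release(b);
	if (a)
		release(a);
	return 0;
}
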
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
+ struct ucounts *ucounts;
int i;
int err;
- if (level > MAX_PID_NS_LEVEL) {
- err = -EINVAL;
+ err = -ENOSPC;
+ if (level > MAX_PID_NS_LEVEL)
+ goto out;
+ ucounts = inc_pid_namespaces(user_ns);
+ if (!ucounts)
goto out;
- }
err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
- goto out;
+ goto out_dec;
ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!ns->pidmap[0].page)
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
+ ns->ucounts = ucounts;
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
@@ -129,6 +143,8 @@ out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+ dec_pid_namespaces(ucounts);
out:
return ERR_PTR(err);
}
@@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
+ dec_pid_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static struct ns_common *pidns_get_parent(struct ns_common *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *pid_ns, *p;
+
+ /* See if the parent is in the current namespace */
+ pid_ns = p = to_pid_ns(ns)->parent;
+ for (;;) {
+ if (!p)
+ return ERR_PTR(-EPERM);
+ if (p == active)
+ break;
+ p = p->parent;
+ }
+
+ return &get_pid_ns(pid_ns)->ns;
+}
+
+static struct user_namespace *pidns_owner(struct ns_common *ns)
+{
+ return to_pid_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
+ .owner = pidns_owner,
+ .get_parent = pidns_get_parent,
};
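
/*
 * Userspace sketch (not part of the patch) of the ancestry walk in
 * pidns_get_parent() above: starting from the requested namespace's parent,
 * follow ->parent links and only hand the parent back if the caller's active
 * namespace is found somewhere on that chain.  struct ns_demo is made up.
 */
#include <stddef.h>
#include <stdio.h>

struct ns_demo {
	const char *name;
	struct ns_demo *parent;
};

static struct ns_demo *get_parent(struct ns_demo *ns, struct ns_demo *active)
{
	struct ns_demo *parent = ns->parent, *p;

	for (p = parent; ; p = p->parent) {
		if (!p)
			return NULL;	/* active is not an ancestor: refuse (-EPERM) */
		if (p == active)
			break;
	}
	return parent;
}

int main(void)
{
	struct ns_demo root = { "root", NULL };
	struct ns_demo mid  = { "mid",  &root };
	struct ns_demo leaf = { "leaf", &mid };

	struct ns_demo *ok  = get_parent(&leaf, &root);	/* root is an ancestor */
	struct ns_demo *bad = get_parent(&root, &mid);	/* mid is not above root */

	printf("%s %s\n", ok ? ok->name : "(denied)", bad ? bad->name : "(denied)");
	return 0;
}
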
static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 68d3ebc12601..e8517b63eb37 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG
config DPM_WATCHDOG
bool "Device suspend/resume watchdog"
- depends on PM_DEBUG && PSTORE
+ depends on PM_DEBUG && PSTORE && EXPERT
---help---
Sets up a watchdog timer to capture drivers that are
locked up attempting to suspend/resume a device.
@@ -197,7 +197,7 @@ config DPM_WATCHDOG
config DPM_WATCHDOG_TIMEOUT
int "Watchdog timeout in seconds"
range 1 120
- default 60
+ default 120
depends on DPM_WATCHDOG
config PM_TRACE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..eb4f717705ba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,8 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+KASAN_SANITIZE_snapshot.o := n
+
obj-y += qos.o
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..0e781798b0b3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
return ret;
}
-int pm_prepare_console(void)
+void pm_prepare_console(void)
{
if (!pm_vt_switch())
- return 0;
+ return;
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
- return 1;
+ return;
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
- return 0;
+ return;
}
void pm_restore_console(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..b26dbc48c75b 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -52,6 +52,7 @@ enum {
#ifdef CONFIG_SUSPEND
HIBERNATION_SUSPEND,
#endif
+ HIBERNATION_TEST_RESUME,
/* keep last */
__HIBERNATION_AFTER_LAST
};
@@ -299,14 +300,16 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
+ restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- /* Restore control flow magically appears here */
- restore_processor_state();
- if (!in_suspend)
+ if (!in_suspend) {
events_check_enabled = false;
+ clear_free_pages();
+ }
platform_leave(platform_mode);
@@ -409,6 +412,11 @@ int hibernation_snapshot(int platform_mode)
goto Close;
}
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+ return disable_nonboot_cpus();
+}
+
/**
* resume_target_kernel - Restore system state from a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +441,7 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
- error = disable_nonboot_cpus();
+ error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -642,12 +650,39 @@ static void power_down(void)
cpu_relax();
}
+static int load_image_and_restore(void)
+{
+ int error;
+ unsigned int flags;
+
+ pr_debug("PM: Loading hibernation image.\n");
+
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Unlock;
+
+ error = swsusp_read(&flags);
+ swsusp_close(FMODE_READ);
+ if (!error)
+ hibernation_restore(flags & SF_PLATFORM_MODE);
+
+ printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+ swsusp_free();
+ free_basic_memory_bitmaps();
+ Unlock:
+ unlock_device_hotplug();
+
+ return error;
+}
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
int hibernate(void)
{
- int error;
+ int error, nr_calls = 0;
+ bool snapshot_test = false;
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +697,11 @@ int hibernate(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Exit;
+ }
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
@@ -697,8 +734,12 @@ int hibernate(void)
pr_debug("PM: writing image.\n");
error = swsusp_write(flags);
swsusp_free();
- if (!error)
- power_down();
+ if (!error) {
+ if (hibernation_mode == HIBERNATION_TEST_RESUME)
+ snapshot_test = true;
+ else
+ power_down();
+ }
in_suspend = 0;
pm_restore_gfp_mask();
} else {
@@ -709,12 +750,18 @@ int hibernate(void)
free_basic_memory_bitmaps();
Thaw:
unlock_device_hotplug();
+ if (snapshot_test) {
+ pr_debug("PM: Checking hibernation image\n");
+ error = swsusp_check();
+ if (!error)
+ error = load_image_and_restore();
+ }
thaw_processes();
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
@@ -740,8 +787,7 @@ int hibernate(void)
*/
static int software_resume(void)
{
- int error;
- unsigned int flags;
+ int error, nr_calls = 0;
/*
* If the user said "noresume".. bail out early.
@@ -827,35 +873,20 @@ static int software_resume(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Close_Finish;
+ }
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
-
- pr_debug("PM: Loading hibernation image.\n");
-
- lock_device_hotplug();
- error = create_basic_memory_bitmaps();
- if (error)
- goto Thaw;
-
- error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
- if (!error)
- hibernation_restore(flags & SF_PLATFORM_MODE);
-
- printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
- swsusp_free();
- free_basic_memory_bitmaps();
- Thaw:
- unlock_device_hotplug();
+ error = load_image_and_restore();
thaw_processes();
Finish:
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
@@ -878,6 +909,7 @@ static const char * const hibernation_modes[] = {
#ifdef CONFIG_SUSPEND
[HIBERNATION_SUSPEND] = "suspend",
#endif
+ [HIBERNATION_TEST_RESUME] = "test_resume",
};
/*
@@ -924,6 +956,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -970,6 +1003,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
@@ -1115,13 +1149,16 @@ static int __init resume_offset_setup(char *str)
static int __init hibernate_setup(char *str)
{
- if (!strncmp(str, "noresume", 8))
+ if (!strncmp(str, "noresume", 8)) {
noresume = 1;
- else if (!strncmp(str, "nocompress", 10))
+ } else if (!strncmp(str, "nocompress", 10)) {
nocompress = 1;
- else if (!strncmp(str, "no", 2)) {
+ } else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
+ } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+ && !strncmp(str, "protect_image", 13)) {
+ enable_restore_image_protection();
}
return 1;
}
@@ -1154,27 +1191,6 @@ static int __init nohibernate_setup(char *str)
return 1;
}
-static int __init kaslr_nohibernate_setup(char *str)
-{
- return nohibernate_setup(str);
-}
-
-static int __init page_poison_nohibernate_setup(char *str)
-{
-#ifdef CONFIG_PAGE_POISONING_ZERO
- /*
- * The zeroing option for page poison skips the checks on alloc.
- * since hibernation doesn't save free pages there's no way to
- * guarantee the pages will still be zeroed.
- */
- if (!strcmp(str, "on")) {
- pr_info("Disabling hibernation due to page poisoning\n");
- return nohibernate_setup(str);
- }
-#endif
- return 1;
-}
-
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
@@ -1182,5 +1198,3 @@ __setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
-__setup("kaslr", kaslr_nohibernate_setup);
-__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975eff0..281a697fd458 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
{
- int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+ int ret;
+
+ ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+ nr_to_call, nr_calls);
return notifier_to_errno(ret);
}
+int pm_notifier_call_chain(unsigned long val)
+{
+ return __pm_notifier_call_chain(val, -1, NULL);
+}
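
/*
 * Userspace sketch (not part of the patch) of the nr_calls accounting that
 * __pm_notifier_call_chain() exposes: run callbacks until one fails, remember
 * how many were invoked, and on the cleanup pass notify only those that
 * actually ran.  The callback table and event numbers below are made up.
 */
#include <stdio.h>

typedef int (*cb_t)(int event);

static int cb_ok(int event)   { printf("cb_ok(%d)\n", event);   return 0; }
static int cb_fail(int event) { printf("cb_fail(%d)\n", event); return -1; }

static int call_chain(cb_t *cbs, int n, int event, int nr_to_call, int *nr_calls)
{
	int i, ret = 0;

	for (i = 0; i < n && nr_to_call; i++, nr_to_call--) {
		if (nr_calls)
			(*nr_calls)++;
		ret = cbs[i](event);
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	cb_t cbs[] = { cb_ok, cb_ok, cb_fail, cb_ok };
	int nr_calls = 0;

	if (call_chain(cbs, 4, 1 /* PREPARE */, -1, &nr_calls)) {
		nr_calls--;	/* as in hibernate(): skip the one that failed */
		call_chain(cbs, 4, 2 /* POST */, nr_calls, NULL);
	}
	return 0;
}
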
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
@@ -637,6 +644,7 @@ static int __init pm_init(void)
return error;
hibernate_image_size_init();
hibernate_reserved_size_init();
+ pm_states_init();
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b17c88..56d1d0dedf76 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+extern int hibernate_resume_nonboot_cpu_disable(void);
+
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
+#ifdef CONFIG_DEBUG_RODATA
+/* kernel/power/snapshot.c */
+extern void enable_restore_image_protection(void);
+#else
+static inline void enable_restore_image_protection(void) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
@@ -101,6 +110,8 @@ extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
extern int hibernate_preallocate_memory(void);
+extern void clear_free_pages(void);
+
/**
* Auxiliary structure used for reading the snapshot image data and
* metadata from and writing them to the list of page backup entries
@@ -200,6 +211,8 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+ int *nr_calls);
extern int pm_notifier_call_chain(unsigned long val);
#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..8f27d5a8adf6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
+ if (wq_busy)
+ show_workqueue_state();
+
if (!wakeup) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -146,6 +149,18 @@ int freeze_processes(void)
if (!error && !oom_killer_disable())
error = -EBUSY;
+ /*
+ * There is a hard to fix race between oom_reaper kernel thread
+ * and oom_killer_disable. oom_reaper calls exit_oom_victim
+ * before the victim reaches exit_mm so try to freeze all the tasks
+ * again and catch such a left over task.
+ */
+ if (!error) {
+ pr_info("Double checking all user space processes after OOM killer disable... ");
+ error = try_to_freeze_tasks(true);
+ pr_cont("\n");
+ }
+
if (error)
thaw_processes();
return error;
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..168ff442ebde 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
- cancel_delayed_work_sync(&req->work);
+ /*
+ * This function may be called very early during boot, for example,
+ * from of_clk_init(), where irq needs to stay disabled.
+ * cancel_delayed_work_sync() assumes that irq is enabled on
+ * invocation and re-enables it on return. Avoid calling it until
+ * workqueue is initialized.
+ */
+ if (keventd_up())
+ cancel_delayed_work_sync(&req->work);
+
__pm_qos_update_request(req, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..4f0f0604f1c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,6 +38,43 @@
#include "power.h"
+#ifdef CONFIG_DEBUG_RODATA
+static bool hibernate_restore_protection;
+static bool hibernate_restore_protection_active;
+
+void enable_restore_image_protection(void)
+{
+ hibernate_restore_protection = true;
+}
+
+static inline void hibernate_restore_protection_begin(void)
+{
+ hibernate_restore_protection_active = hibernate_restore_protection;
+}
+
+static inline void hibernate_restore_protection_end(void)
+{
+ hibernate_restore_protection_active = false;
+}
+
+static inline void hibernate_restore_protect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ set_memory_ro((unsigned long)page_address, 1);
+}
+
+static inline void hibernate_restore_unprotect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ set_memory_rw((unsigned long)page_address, 1);
+}
+#else
+static inline void hibernate_restore_protection_begin(void) {}
+static inline void hibernate_restore_protection_end(void) {}
+static inline void hibernate_restore_protect_page(void *page_address) {}
+static inline void hibernate_restore_unprotect_page(void *page_address) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
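
/*
 * Userspace analogy (not part of the patch) for
 * hibernate_restore_protect_page()/unprotect_page() above: flip a single page
 * between read-only and read-write around the window in which it must not be
 * modified.  The kernel uses set_memory_ro()/set_memory_rw(); mmap/mprotect
 * merely illustrate the same one-page granularity.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *page = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED)
		return 1;

	strcpy(page, "restored image data");

	mprotect(page, pagesize, PROT_READ);		  /* "protect" the page  */
	/* a stray write here would fault, like a stray write into the image  */
	mprotect(page, pagesize, PROT_READ | PROT_WRITE); /* "unprotect" again   */

	page[0] = 'R';					  /* writable once more  */
	puts(page);
	munmap(page, pagesize);
	return 0;
}
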
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
@@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void)
image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
}
-/* List of PBEs needed for restoring the pages that were allocated before
+/*
+ * List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
* directly to their "original" page frames.
*/
struct pbe *restore_pblist;
-/* Pointer to an auxiliary buffer (1 page) */
-static void *buffer;
+/* struct linked_page is used to build chains of pages */
-/**
- * @safe_needed - on resume, for storing the PBE list and the image,
- * we can only use memory pages that do not conflict with the pages
- * used before suspend. The unsafe pages have PageNosaveFree set
- * and we count them using unsafe_pages.
- *
- * Each allocated image page is marked as PageNosave and PageNosaveFree
- * so that swsusp_free() can release it.
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+/*
+ * List of "safe" pages (ie. pages that were not used by the image kernel
+ * before hibernation) that may be used as temporary storage for image kernel
+ * memory contents.
*/
+static struct linked_page *safe_pages_list;
+
+/* Pointer to an auxiliary buffer (1 page) */
+static void *buffer;
#define PG_ANY 0
#define PG_SAFE 1
@@ -94,6 +138,19 @@ static void *buffer;
static unsigned int allocated_unsafe_pages;
+/**
+ * get_image_page - Allocate a page for a hibernation image.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages that were not used before hibernation (restore only)
+ *
+ * During image restoration, for storing the PBE list and the image data, we can
+ * only use memory pages that do not conflict with the pages used before
+ * hibernation. The "unsafe" pages have PageNosaveFree set and we count them
+ * using allocated_unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree so that
+ * swsusp_free() can release it.
+ */
static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
@@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
return res;
}
+static void *__get_safe_page(gfp_t gfp_mask)
+{
+ if (safe_pages_list) {
+ void *ret = safe_pages_list;
+
+ safe_pages_list = safe_pages_list->next;
+ memset(ret, 0, PAGE_SIZE);
+ return ret;
+ }
+ return get_image_page(gfp_mask, PG_SAFE);
+}
+
unsigned long get_safe_page(gfp_t gfp_mask)
{
- return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+ return (unsigned long)__get_safe_page(gfp_mask);
}
static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
return page;
}
+static void recycle_safe_page(void *page_address)
+{
+ struct linked_page *lp = page_address;
+
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
+}
+
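
/*
 * Userspace sketch (not part of the patch) of the safe_pages_list handling
 * above: a free page's own first bytes hold the pointer to the next free page
 * (struct linked_page), so keeping the list costs no extra memory and
 * __get_safe_page()/recycle_safe_page() reduce to a pop/push.  The page size
 * and calloc() fallback stand in for the kernel's page allocator.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096

struct linked_page {
	struct linked_page *next;
	char data[DEMO_PAGE_SIZE - sizeof(void *)];
};

static struct linked_page *safe_pages_list;

static void recycle_safe_page(void *page_address)	/* push onto the free list */
{
	struct linked_page *lp = page_address;

	lp->next = safe_pages_list;
	safe_pages_list = lp;
}

static void *get_safe_page(void)			/* pop, fall back to allocator */
{
	if (safe_pages_list) {
		void *ret = safe_pages_list;

		safe_pages_list = safe_pages_list->next;
		memset(ret, 0, DEMO_PAGE_SIZE);
		return ret;
	}
	return calloc(1, DEMO_PAGE_SIZE);
}

int main(void)
{
	void *a = get_safe_page();		/* fresh allocation */
	recycle_safe_page(a);			/* now on the free list */
	void *b = get_safe_page();		/* reuses the same page */

	printf("%s\n", a == b ? "recycled" : "fresh");
	free(b);
	return 0;
}
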
/**
- * free_image_page - free page represented by @addr, allocated with
- * get_image_page (page flags set by it must be cleared)
+ * free_image_page - Free a page allocated for hibernation image.
+ * @addr: Address of the page to free.
+ * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
+ *
+ * The page to free should have been allocated by get_image_page() (page flags
+ * set by it are affected).
*/
-
static inline void free_image_page(void *addr, int clear_nosave_free)
{
struct page *page;
@@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
__free_page(page);
}
-/* struct linked_page is used to build chains of pages */
-
-#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
-
-struct linked_page {
- struct linked_page *next;
- char data[LINKED_PAGE_DATA_SIZE];
-} __packed;
-
-static inline void
-free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+static inline void free_list_of_pages(struct linked_page *list,
+ int clear_page_nosave)
{
while (list) {
struct linked_page *lp = list->next;
@@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)
}
}
-/**
- * struct chain_allocator is used for allocating small objects out of
- * a linked list of pages called 'the chain'.
- *
- * The chain grows each time when there is no room for a new object in
- * the current page. The allocated objects cannot be freed individually.
- * It is only possible to free them all at once, by freeing the entire
- * chain.
- *
- * NOTE: The chain allocator may be inefficient if the allocated objects
- * are not much smaller than PAGE_SIZE.
- */
-
+/*
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
struct chain_allocator {
struct linked_page *chain; /* the chain */
unsigned int used_space; /* total size of objects allocated out
- * of the current page
- */
+ of the current page */
gfp_t gfp_mask; /* mask for allocating pages */
int safe_needed; /* if set, only "safe" pages are allocated */
};
-static void
-chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
+ int safe_needed)
{
ca->chain = NULL;
ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
struct linked_page *lp;
- lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+ lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
+ get_image_page(ca->gfp_mask, PG_ANY);
if (!lp)
return NULL;
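
/*
 * Userspace sketch (not part of the patch) of the chain allocator used above:
 * small objects are carved sequentially out of page-sized links, a new link
 * is prepended whenever the current one has no room, and nothing can be freed
 * individually, only the whole chain at once.  Sizes and the demo objects are
 * made up.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE	4096
#define DATA_SIZE	(DEMO_PAGE_SIZE - sizeof(void *))

struct link {
	struct link *next;
	char data[DATA_SIZE];
};

struct chain_alloc {
	struct link *chain;	/* most recently added link */
	size_t used;		/* bytes used in the current link */
};

static void *chain_alloc_obj(struct chain_alloc *ca, size_t size)
{
	void *ret;

	if (size > DATA_SIZE)
		return NULL;
	if (!ca->chain || DATA_SIZE - ca->used < size) {
		struct link *lp = malloc(sizeof(*lp));

		if (!lp)
			return NULL;
		lp->next = ca->chain;	/* prepend the new link */
		ca->chain = lp;
		ca->used = 0;
	}
	ret = ca->chain->data + ca->used;
	ca->used += size;
	return ret;
}

static void chain_free_all(struct chain_alloc *ca)
{
	while (ca->chain) {
		struct link *lp = ca->chain->next;

		free(ca->chain);
		ca->chain = lp;
	}
}

int main(void)
{
	struct chain_alloc ca = { NULL, 0 };
	int i;

	for (i = 0; i < 1000; i++)
		chain_alloc_obj(&ca, 64);	/* many small objects, few links */
	chain_free_all(&ca);
	return 0;
}
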
@@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
}
/**
- * Data types related to memory bitmaps.
+ * Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
- * objects. The main list's elements are of type struct zone_bitmap
- * and each of them corresonds to one zone. For each zone bitmap
- * object there is a list of objects of type struct bm_block that
- * represent each blocks of bitmap in which information is stored.
+ * Memory bitmap is a structure consisting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each block of the bitmap in which information is stored.
*
- * struct memory_bitmap contains a pointer to the main list of zone
- * bitmap objects, a struct bm_position used for browsing the bitmap,
- * and a pointer to the list of pages used for allocating all of the
- * zone bitmap objects and bitmap block objects.
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
*
- * NOTE: It has to be possible to lay out the bitmap in memory
- * using only allocations of order 0. Additionally, the bitmap is
- * designed to work with arbitrary number of zones (this is over the
- * top for now, but let's avoid making unnecessary assumptions ;-).
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
*
- * struct zone_bitmap contains a pointer to a list of bitmap block
- * objects and a pointer to the bitmap block object that has been
- * most recently used for setting bits. Additionally, it contains the
- * pfns that correspond to the start and end of the represented zone.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * PFNs that correspond to the start and end of the represented zone.
*
- * struct bm_block contains a pointer to the memory page in which
- * information is stored (in the form of a block of bitmap)
- * It also contains the pfns that correspond to the start and end of
- * the represented memory area.
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap)
+ * It also contains the pfns that correspond to the start and end of
+ * the represented memory area.
*
- * The memory bitmap is organized as a radix tree to guarantee fast random
- * access to the bits. There is one radix tree for each zone (as returned
- * from create_mem_extents).
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
*
- * One radix tree is represented by one struct mem_zone_bm_rtree. There are
- * two linked lists for the nodes of the tree, one for the inner nodes and
- * one for the leave nodes. The linked leave nodes are used for fast linear
- * access of the memory bitmap.
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leave nodes. The linked leave nodes are used for fast linear
+ * access of the memory bitmap.
*
- * The struct rtree_node represents one node of the radix tree.
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
@@ -305,9 +375,8 @@ struct bm_position {
struct memory_bitmap {
struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
- * bitmap objects and bitmap block
- * objects
- */
+ bitmap objects and bitmap block
+ objects */
struct bm_position cur; /* most recently used bit position */
};
@@ -321,12 +390,12 @@ struct memory_bitmap {
#endif
#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
-/*
- * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+/**
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
*
- * This function is used to allocate inner nodes as well as the
- * leave nodes of the radix tree. It also adds the node to the
- * corresponding linked list passed in by the *list parameter.
+ * This function is used to allocate inner nodes as well as the
+ * leaf nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
*/
static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
struct chain_allocator *ca,
@@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
return node;
}
-/*
- * add_rtree_block - Add a new leave node to the radix tree
+/**
+ * add_rtree_block - Add a new leave node to the radix tree.
*
+ * The leaf nodes need to be allocated in order to keep the leaves
- * linked list in order. This is guaranteed by the zone->blocks
- * counter.
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
int safe_needed, struct chain_allocator *ca)
@@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free);
-/*
- * create_zone_bm_rtree - create a radix tree for one zone
+/**
+ * create_zone_bm_rtree - Create a radix tree for one zone.
*
- * Allocated the mem_zone_bm_rtree structure and initializes it.
- * This function also allocated and builds the radix tree for the
- * zone.
+ * Allocates the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocates and builds the radix tree for the
+ * zone.
*/
-static struct mem_zone_bm_rtree *
-create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
- struct chain_allocator *ca,
- unsigned long start, unsigned long end)
+static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
+ int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start,
+ unsigned long end)
{
struct mem_zone_bm_rtree *zone;
unsigned int i, nr_blocks;
@@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
return zone;
}
-/*
- * free_zone_bm_rtree - Free the memory of the radix tree
+/**
+ * free_zone_bm_rtree - Free the memory of the radix tree.
*
- * Free all node pages of the radix tree. The mem_zone_bm_rtree
- * structure itself is not freed here nor are the rtree_node
- * structs.
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
*/
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free)
@@ -492,8 +562,8 @@ struct mem_extent {
};
/**
- * free_mem_extents - free a list of memory extents
- * @list - list of extents to empty
+ * free_mem_extents - Free a list of memory extents.
+ * @list: List of extents to free.
*/
static void free_mem_extents(struct list_head *list)
{
@@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list)
}
/**
- * create_mem_extents - create a list of memory extents representing
- * contiguous ranges of PFNs
- * @list - list to put the extents into
- * @gfp_mask - mask to use for memory allocations
+ * create_mem_extents - Create a list of memory extents.
+ * @list: List to put the extents into.
+ * @gfp_mask: Mask to use for memory allocations.
+ *
+ * The extents represent contiguous ranges of PFNs.
*/
static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
{
@@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
}
/**
- * memory_bm_create - allocate memory for a memory bitmap
- */
-static int
-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+ * memory_bm_create - Allocate memory for a memory bitmap.
+ */
+static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+ int safe_needed)
{
struct chain_allocator ca;
struct list_head mem_extents;
@@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
/**
- * memory_bm_free - free memory occupied by the memory bitmap @bm
- */
+ * memory_bm_free - Free memory occupied by the memory bitmap.
+ * @bm: Memory bitmap.
+ */
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
struct mem_zone_bm_rtree *zone;
@@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
}
/**
- * memory_bm_find_bit - Find the bit for pfn in the memory
- * bitmap
+ * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
*
- * Find the bit in the bitmap @bm that corresponds to given pfn.
- * The cur.zone, cur.block and cur.node_pfn member of @bm are
- * updated.
- * It walks the radix tree to find the page which contains the bit for
- * pfn and returns the bit position in **addr and *bit_nr.
+ * Find the bit in memory bitmap @bm that corresponds to the given PFN.
+ * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
+ *
+ * Walk the radix tree to find the page containing the bit that represents @pfn
+ * and return the position of the bit in @addr and @bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
@@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
zone_found:
/*
- * We have a zone. Now walk the radix tree to find the leave
- * node for our pfn.
+ * We have found the zone. Now walk the radix tree to find the leaf node
+ * for our PFN.
*/
-
node = bm->cur.node;
if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
@@ -754,20 +824,20 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
}
/*
- * rtree_next_node - Jumps to the next leave node
+ * rtree_next_node - Jump to the next leaf node.
*
- * Sets the position to the beginning of the next node in the
- * memory bitmap. This is either the next node in the current
- * zone's radix tree or the first node in the radix tree of the
- * next zone.
+ * Set the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * Returns true if there is a next node, false otherwise.
+ * Return true if there is a next node, false otherwise.
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -775,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
@@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * @bm: Memory bitmap.
*
- * Starting from the last returned position this function searches
- * for the next set bit in the memory bitmap and returns its
- * number. If no more bit is set BM_END_OF_MAP is returned.
+ * Starting from the last returned position this function searches for the next
+ * set bit in @bm and returns the PFN represented by it. If no more bits are
+ * set, BM_END_OF_MAP is returned.
*
- * It is required to run memory_bm_position_reset() before the
- * first call to this function.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function for the given memory bitmap.
*/
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
@@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
return BM_END_OF_MAP;
}
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
+/*
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during hibernation.
*/
-
struct nosave_region {
struct list_head list;
unsigned long start_pfn;
@@ -832,15 +902,42 @@ struct nosave_region {
static LIST_HEAD(nosave_regions);
+static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ recycle_safe_page(node->data);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ recycle_safe_page(node->data);
+}
+
+static void memory_bm_recycle(struct memory_bitmap *bm)
+{
+ struct mem_zone_bm_rtree *zone;
+ struct linked_page *p_list;
+
+ list_for_each_entry(zone, &bm->zones, list)
+ recycle_zone_bm_rtree(zone);
+
+ p_list = bm->p_list;
+ while (p_list) {
+ struct linked_page *lp = p_list;
+
+ p_list = lp->next;
+ recycle_safe_page(lp);
+ }
+}
+
/**
- * register_nosave_region - register a range of page frames the contents
- * of which should not be saved during the suspend (to be used in the early
- * initialization code)
+ * register_nosave_region - Register a region of unsaveable memory.
+ *
+ * Register a range of page frames the contents of which should not be saved
+ * during hibernation (to be used in the early initialization code).
*/
-
-void __init
-__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
- int use_kmalloc)
+void __init __register_nosave_region(unsigned long start_pfn,
+ unsigned long end_pfn, int use_kmalloc)
{
struct nosave_region *region;
@@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
}
}
if (use_kmalloc) {
- /* during init, this shouldn't fail */
+ /* During init, this shouldn't fail */
region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
BUG_ON(!region);
- } else
+ } else {
/* This allocation cannot fail */
region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+ }
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
}
/**
- * mark_nosave_pages - set bits corresponding to the page frames the
- * contents of which should not be saved in a given bitmap.
+ * mark_nosave_pages - Mark pages that should not be saved.
+ * @bm: Memory bitmap.
+ *
+ * Set the bits in @bm that correspond to the page frames the contents of which
+ * should not be saved.
*/
-
static void mark_nosave_pages(struct memory_bitmap *bm)
{
struct nosave_region *region;
@@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
/**
- * create_basic_memory_bitmaps - create bitmaps needed for marking page
- * frames that should not be saved and free page frames. The pointers
- * forbidden_pages_map and free_pages_map are only modified if everything
- * goes well, because we don't want the bits to be used before both bitmaps
- * are set up.
+ * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
+ *
+ * Create bitmaps needed for marking page frames that should not be saved and
+ * free page frames. The forbidden_pages_map and free_pages_map pointers are
+ * only modified if everything goes well, because we don't want the bits to be
+ * touched before both bitmaps are set up.
*/
-
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void)
}
/**
- * free_basic_memory_bitmaps - free memory bitmaps allocated by
- * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
- * so that the bitmaps themselves are not referred to while they are being
- * freed.
+ * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
+ *
+ * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The
+ * auxiliary pointers are necessary so that the bitmaps themselves are not
+ * referred to while they are being freed.
*/
-
void free_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1032,12 +1132,36 @@ void free_basic_memory_bitmaps(void)
pr_debug("PM: Basic memory bitmaps freed\n");
}
+void clear_free_pages(void)
+{
+#ifdef CONFIG_PAGE_POISONING_ZERO
+ struct memory_bitmap *bm = free_pages_map;
+ unsigned long pfn;
+
+ if (WARN_ON(!(free_pages_map)))
+ return;
+
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (pfn_valid(pfn))
+ clear_highpage(pfn_to_page(pfn));
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ memory_bm_position_reset(bm);
+ pr_info("PM: free pages cleared after restore\n");
+#endif /* PAGE_POISONING_ZERO */
+}
+
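
/*
 * Userspace sketch (not part of the patch) of the loop in clear_free_pages():
 * walk every set bit of a "free pages" bitmap and zero the block that bit
 * describes.  The flat bitmap and fixed block count stand in for the kernel's
 * radix-tree memory bitmap and the memory_bm_next_pfn() cursor.
 */
#include <stdio.h>
#include <string.h>

#define NR_BLOCKS	64
#define BLOCK_SIZE	128

static unsigned long long free_map;		/* bit n set => block n is free */
static char blocks[NR_BLOCKS][BLOCK_SIZE];

static int next_free(int prev)
{
	int i;

	for (i = prev + 1; i < NR_BLOCKS; i++)
		if (free_map & (1ULL << i))
			return i;
	return -1;				/* end of map */
}

int main(void)
{
	int n, cleared = 0;

	memset(blocks, 0xaa, sizeof(blocks));	/* stale contents */
	free_map = (1ULL << 3) | (1ULL << 17) | (1ULL << 42);

	for (n = next_free(-1); n >= 0; n = next_free(n)) {
		memset(blocks[n], 0, BLOCK_SIZE);	/* clear_highpage() analogue */
		cleared++;
	}
	printf("cleared %d free blocks\n", cleared);	/* 3 */
	return 0;
}
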
/**
- * snapshot_additional_pages - estimate the number of additional pages
- * be needed for setting up the suspend image data structures for given
- * zone (usually the returned value is greater than the exact number)
+ * snapshot_additional_pages - Estimate the number of extra pages needed.
+ * @zone: Memory zone to carry out the computation for.
+ *
+ * Estimate the number of additional pages needed for setting up the
+ * hibernation image data structures for @zone (usually, the returned value is
+ * greater than the exact number).
*/
-
unsigned int snapshot_additional_pages(struct zone *zone)
{
unsigned int rtree, nodes;
@@ -1055,10 +1179,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
#ifdef CONFIG_HIGHMEM
/**
- * count_free_highmem_pages - compute the total number of free highmem
- * pages, system-wide.
+ * count_free_highmem_pages - Compute the total number of free highmem pages.
+ *
+ * The returned number is system-wide.
*/
-
static unsigned int count_free_highmem_pages(void)
{
struct zone *zone;
@@ -1072,11 +1196,12 @@ static unsigned int count_free_highmem_pages(void)
}
/**
- * saveable_highmem_page - Determine whether a highmem page should be
- * included in the suspend image.
+ * saveable_highmem_page - Check if a highmem page is saveable.
*
- * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
- * and it isn't a part of a free chunk of pages.
+ * Determine whether a highmem page should be included in a hibernation image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't part of a free chunk of pages.
*/
static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
@@ -1102,10 +1227,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_highmem_pages - compute the total number of saveable highmem
- * pages.
+ * count_highmem_pages - Compute the total number of saveable highmem pages.
*/
-
static unsigned int count_highmem_pages(void)
{
struct zone *zone;
@@ -1133,12 +1256,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
#endif /* CONFIG_HIGHMEM */
/**
- * saveable_page - Determine whether a non-highmem page should be included
- * in the suspend image.
+ * saveable_page - Check if the given page is saveable.
*
- * We should save the page if it isn't Nosave, and is not in the range
- * of pages statically defined as 'unsaveable', and it isn't a part of
- * a free chunk of pages.
+ * Determine whether a non-highmem page should be included in a hibernation
+ * image.
+ *
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't part of
+ * a free chunk of pages.
*/
static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
@@ -1167,10 +1292,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_data_pages - compute the total number of saveable non-highmem
- * pages.
+ * count_data_pages - Compute the total number of saveable non-highmem pages.
*/
-
static unsigned int count_data_pages(void)
{
struct zone *zone;
@@ -1190,7 +1313,8 @@ static unsigned int count_data_pages(void)
return n;
}
-/* This is needed, because copy_page and memcpy are not usable for copying
+/*
+ * This is needed, because copy_page and memcpy are not usable for copying
* task structs.
*/
static inline void do_copy_page(long *dst, long *src)
@@ -1201,12 +1325,12 @@ static inline void do_copy_page(long *dst, long *src)
*dst++ = *src++;
}
-
/**
- * safe_copy_page - check if the page we are going to copy is marked as
- * present in the kernel page tables (this always is the case if
- * CONFIG_DEBUG_PAGEALLOC is not set and in that case
- * kernel_page_present() always returns 'true').
+ * safe_copy_page - Copy a page in a safe way.
+ *
+ * Check if the page we are going to copy is marked as present in the kernel
+ * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
+ * and in that case kernel_page_present() always returns 'true').
*/
static void safe_copy_page(void *dst, struct page *s_page)
{
@@ -1219,10 +1343,8 @@ static void safe_copy_page(void *dst, struct page *s_page)
}
}
-
#ifdef CONFIG_HIGHMEM
-static inline struct page *
-page_is_saveable(struct zone *zone, unsigned long pfn)
+static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
{
return is_highmem(zone) ?
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1243,7 +1365,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
kunmap_atomic(src);
} else {
if (PageHighMem(d_page)) {
- /* Page pointed to by src may contain some kernel
+ /*
+ * The page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
safe_copy_page(buffer, s_page);
@@ -1265,8 +1388,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
}
#endif /* CONFIG_HIGHMEM */
-static void
-copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+static void copy_data_pages(struct memory_bitmap *copy_bm,
+ struct memory_bitmap *orig_bm)
{
struct zone *zone;
unsigned long pfn;
@@ -1315,12 +1438,11 @@ static struct memory_bitmap orig_bm;
static struct memory_bitmap copy_bm;
/**
- * swsusp_free - free pages allocated for the suspend.
+ * swsusp_free - Free pages allocated for hibernation image.
*
- * Suspend pages are alocated before the atomic copy is made, so we
- * need to release them after the resume.
+ * Image pages are allocated before snapshot creation, so they need to be
+ * released after resume.
*/
-
void swsusp_free(void)
{
unsigned long fb_pfn, fr_pfn;
@@ -1351,6 +1473,7 @@ loop:
memory_bm_clear_current(forbidden_pages_map);
memory_bm_clear_current(free_pages_map);
+ hibernate_restore_unprotect_page(page_address(page));
__free_page(page);
goto loop;
}
@@ -1362,6 +1485,7 @@ out:
buffer = NULL;
alloc_normal = 0;
alloc_highmem = 0;
+ hibernate_restore_protection_end();
}
/* Helper functions used for the shrinking of memory. */
@@ -1369,7 +1493,7 @@ out:
#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
/**
- * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * preallocate_image_pages - Allocate a number of pages for hibernation image.
* @nr_pages: Number of page frames to allocate.
* @mask: GFP flags to use for the allocation.
*
@@ -1419,7 +1543,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
/**
- * __fraction - Compute (an approximation of) x * (multiplier / base)
+ * __fraction - Compute (an approximation of) x * (multiplier / base).
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
@@ -1429,8 +1553,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
}
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
unsigned long alloc = __fraction(nr_pages, highmem, total);
@@ -1443,15 +1567,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
return 0;
}
#endif /* CONFIG_HIGHMEM */
/**
- * free_unnecessary_pages - Release preallocated pages not needed for the image
+ * free_unnecessary_pages - Release preallocated pages not needed for the image.
*/
static unsigned long free_unnecessary_pages(void)
{
@@ -1505,7 +1629,7 @@ static unsigned long free_unnecessary_pages(void)
}
/**
- * minimum_image_size - Estimate the minimum acceptable size of an image
+ * minimum_image_size - Estimate the minimum acceptable size of an image.
* @saveable: Number of saveable pages in the system.
*
* We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1525,17 +1649,17 @@ static unsigned long minimum_image_size(unsigned long saveable)
unsigned long size;
size = global_page_state(NR_SLAB_RECLAIMABLE)
- + global_page_state(NR_ACTIVE_ANON)
- + global_page_state(NR_INACTIVE_ANON)
- + global_page_state(NR_ACTIVE_FILE)
- + global_page_state(NR_INACTIVE_FILE)
- - global_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_ACTIVE_ANON)
+ + global_node_page_state(NR_INACTIVE_ANON)
+ + global_node_page_state(NR_ACTIVE_FILE)
+ + global_node_page_state(NR_INACTIVE_FILE)
+ - global_node_page_state(NR_FILE_MAPPED);
return saveable <= size ? 0 : saveable - size;
}
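Restated as a compact sketch (illustration only, not part of the patch), the estimate computed above is:

	unsigned long reclaimable = slab_reclaimable + active_anon + inactive_anon
				  + active_file + inactive_file - file_mapped;
	unsigned long min_image_size = saveable > reclaimable ?
				       saveable - reclaimable : 0;

i.e. the image cannot be made smaller than the saveable pages minus what is expected to be reclaimable without heavy effort.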
/**
- * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image.
*
* To create a hibernation image it is necessary to make a copy of every page
* frame in use. We also need a number of page frames to be free during
@@ -1708,10 +1832,11 @@ int hibernate_preallocate_memory(void)
#ifdef CONFIG_HIGHMEM
/**
- * count_pages_for_highmem - compute the number of non-highmem pages
- * that will be necessary for creating copies of highmem pages.
- */
-
+ * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
+ *
+ * Compute the number of non-highmem pages that will be necessary for creating
+ * copies of highmem pages.
+ */
static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
{
unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1724,15 +1849,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
return nr_highmem;
}
#else
-static unsigned int
-count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * enough_free_mem - Make sure we have enough free memory for the
- * snapshot image.
+ * enough_free_mem - Check if there is enough free memory for the image.
*/
-
static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
{
struct zone *zone;
@@ -1751,10 +1873,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
#ifdef CONFIG_HIGHMEM
/**
- * get_highmem_buffer - if there are some highmem pages in the suspend
- * image, we may need the buffer to copy them and/or load their data.
+ * get_highmem_buffer - Allocate a buffer for highmem pages.
+ *
+ * If there are some highmem pages in the hibernation image, we may need a
+ * buffer to copy them and/or load their data.
*/
-
static inline int get_highmem_buffer(int safe_needed)
{
buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1762,13 +1885,13 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - allocate some highmem pages for the image.
- * Try to allocate as many pages as needed, but if the number of free
- * highmem pages is lesser than that, allocate them all.
+ * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ *
+ * Try to allocate as many pages as needed, but if the number of free highmem
+ * pages is less than that, allocate them all.
*/
-
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int nr_highmem)
{
unsigned int to_alloc = count_free_highmem_pages();
@@ -1787,25 +1910,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
#else
static inline int get_highmem_buffer(int safe_needed) { return 0; }
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int n) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * swsusp_alloc - allocate memory for the suspend image
+ * swsusp_alloc - Allocate memory for hibernation image.
*
- * We first try to allocate as many highmem pages as there are
- * saveable highmem pages in the system. If that fails, we allocate
- * non-highmem pages for the copies of the remaining highmem ones.
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
*
- * In this approach it is likely that the copies of highmem pages will
- * also be located in the high memory, because of the way in which
- * copy_data_pages() works.
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
*/
-
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
- unsigned int nr_pages, unsigned int nr_highmem)
+static int swsusp_alloc(struct memory_bitmap *orig_bm,
+ struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
{
if (nr_highmem > 0) {
if (get_highmem_buffer(PG_ANY))
@@ -1855,7 +1977,8 @@ asmlinkage __visible int swsusp_save(void)
return -ENOMEM;
}
- /* During allocating of suspend pagedir, new cold pages may appear.
+ /*
+ * During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages(NULL);
@@ -1918,12 +2041,14 @@ static int init_header(struct swsusp_info *info)
}
/**
- * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
- * are stored in the array @buf[] (1 page at a time)
+ * pack_pfns - Prepare PFNs for saving.
+ * @bm: Memory bitmap.
+ * @buf: Memory buffer to store the PFNs in.
+ *
+ * PFNs corresponding to set bits in @bm are stored in the area of memory
+ * pointed to by @buf (1 page at a time).
*/
-
-static inline void
-pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
@@ -1937,22 +2062,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
}
/**
- * snapshot_read_next - used for reading the system memory snapshot.
+ * snapshot_read_next - Get the address to read the next image page from.
+ * @handle: Snapshot handle to be used for the reading.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure is then populated and a pointer to it should be
+ * passed to this function on every subsequent call.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to read up to the returned number of bytes from the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the end of data stream condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the end of the data stream condition,
+ * and negative numbers are returned on errors. If that happens, the structure
+ * pointed to by @handle is not updated and should not be used any more.
*/
-
int snapshot_read_next(struct snapshot_handle *handle)
{
if (handle->cur > nr_meta_pages + nr_copy_pages)
@@ -1981,7 +2105,8 @@ int snapshot_read_next(struct snapshot_handle *handle)
page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
if (PageHighMem(page)) {
- /* Highmem pages are copied to the buffer,
+ /*
+ * Highmem pages are copied to the buffer,
* because we can't return with a kmapped
* highmem page (we may not be called again).
*/
@@ -1999,53 +2124,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
return PAGE_SIZE;
}
-/**
- * mark_unsafe_pages - mark the pages that cannot be used for storing
- * the image during resume, because they conflict with the pages that
- * had been used before suspend
- */
-
-static int mark_unsafe_pages(struct memory_bitmap *bm)
+static void duplicate_memory_bitmap(struct memory_bitmap *dst,
+ struct memory_bitmap *src)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn;
- /* Clear page flags */
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn))
- swsusp_unset_page_free(pfn_to_page(pfn));
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
-
- /* Mark pages that correspond to the "original" pfns as "unsafe" */
- memory_bm_position_reset(bm);
- do {
- pfn = memory_bm_next_pfn(bm);
- if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)))
- swsusp_set_page_free(pfn_to_page(pfn));
- else
- return -EFAULT;
- }
- } while (pfn != BM_END_OF_MAP);
-
- allocated_unsafe_pages = 0;
-
- return 0;
}
-static void
-duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
+/**
+ * mark_unsafe_pages - Mark pages that were used before hibernation.
+ *
+ * Mark the pages that cannot be used for storing the image during restoration,
+ * because they conflict with the pages that had been used before hibernation.
+ */
+static void mark_unsafe_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
- memory_bm_position_reset(src);
- pfn = memory_bm_next_pfn(src);
+ /* Clear the "free"/"unsafe" bit for all PFNs */
+ memory_bm_position_reset(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
while (pfn != BM_END_OF_MAP) {
- memory_bm_set_bit(dst, pfn);
- pfn = memory_bm_next_pfn(src);
+ memory_bm_clear_current(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
}
+
+ /* Mark pages that correspond to the "original" PFNs as "unsafe" */
+ duplicate_memory_bitmap(free_pages_map, bm);
+
+ allocated_unsafe_pages = 0;
}
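Both duplicate_memory_bitmap() and the reworked mark_unsafe_pages() above rely on the same bitmap-walking idiom; a minimal sketch (the for_each_set_pfn() helper is hypothetical, shown only to make the pattern explicit):

static void for_each_set_pfn(struct memory_bitmap *bm,
			     void (*fn)(unsigned long pfn))
{
	unsigned long pfn;

	memory_bm_position_reset(bm);		/* rewind the bitmap cursor */
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm))
		fn(pfn);			/* visit each set PFN in turn */
}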
static int check_header(struct swsusp_info *info)
@@ -2063,11 +2176,9 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - check the image header and copy data from it
+ * load header - Check the image header and copy the data from it.
*/
-
-static int
-load_header(struct swsusp_info *info)
+static int load_header(struct swsusp_info *info)
{
int error;
@@ -2081,8 +2192,12 @@ load_header(struct swsusp_info *info)
}
/**
- * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
- * the corresponding bit in the memory bitmap @bm
+ * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
+ * @bm: Memory bitmap.
+ * @buf: Area of memory containing the PFNs.
+ *
+ * For each element of the array pointed to by @buf (1 page at a time), set the
+ * corresponding bit in @bm.
*/
static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
@@ -2095,7 +2210,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
/* Extract and buffer page key for data page (s390 only). */
page_key_memorize(buf + j);
- if (memory_bm_pfn_present(bm, buf[j]))
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
memory_bm_set_bit(bm, buf[j]);
else
return -EFAULT;
@@ -2104,13 +2219,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
return 0;
}
-/* List of "safe" pages that may be used to store data loaded from the suspend
- * image
- */
-static struct linked_page *safe_pages_list;
-
#ifdef CONFIG_HIGHMEM
-/* struct highmem_pbe is used for creating the list of highmem pages that
+/*
+ * struct highmem_pbe is used for creating the list of highmem pages that
* should be restored atomically during the resume from disk, because the page
* frames they have occupied before the suspend are in use.
*/
@@ -2120,7 +2231,8 @@ struct highmem_pbe {
struct highmem_pbe *next;
};
-/* List of highmem PBEs needed for restoring the highmem pages that were
+/*
+ * List of highmem PBEs needed for restoring the highmem pages that were
* allocated before the suspend and included in the suspend image, but have
* also been allocated by the "resume" kernel, so their contents cannot be
* written directly to their "original" page frames.
@@ -2128,11 +2240,11 @@ struct highmem_pbe {
static struct highmem_pbe *highmem_pblist;
/**
- * count_highmem_image_pages - compute the number of highmem pages in the
- * suspend image. The bits in the memory bitmap @bm that correspond to the
- * image pages are assumed to be set.
+ * count_highmem_image_pages - Compute the number of highmem pages in the image.
+ * @bm: Memory bitmap.
+ *
+ * The bits in @bm that correspond to image pages are assumed to be set.
*/
-
static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
@@ -2149,24 +2261,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
return cnt;
}
-/**
- * prepare_highmem_image - try to allocate as many highmem pages as
- * there are highmem image pages (@nr_highmem_p points to the variable
- * containing the number of highmem image pages). The pages that are
- * "safe" (ie. will not be overwritten when the suspend image is
- * restored) have the corresponding bits set in @bm (it must be
- * unitialized).
- *
- * NOTE: This function should not be called if there are no highmem
- * image pages.
- */
-
static unsigned int safe_highmem_pages;
static struct memory_bitmap *safe_highmem_bm;
-static int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+/**
+ * prepare_highmem_image - Allocate memory for loading highmem data from image.
+ * @bm: Pointer to an uninitialized memory bitmap structure.
+ * @nr_highmem_p: Pointer to the number of highmem image pages.
+ *
+ * Try to allocate as many highmem pages as there are highmem image pages
+ * (@nr_highmem_p points to the variable containing the number of highmem image
+ * pages). The pages that are "safe" (i.e. will not be overwritten when the
+ * hibernation image is restored entirely) have the corresponding bits set in
+ * @bm (it must be uninitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem image pages.
+ */
+static int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p)
{
unsigned int to_alloc;
@@ -2201,39 +2314,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
return 0;
}
+static struct page *last_highmem_page;
+
/**
- * get_highmem_page_buffer - for given highmem image page find the buffer
- * that suspend_write_next() should set for its caller to write to.
+ * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
*
- * If the page is to be saved to its "original" page frame or a copy of
- * the page is to be made in the highmem, @buffer is returned. Otherwise,
- * the copy of the page is to be made in normal memory, so the address of
- * the copy is returned.
+ * For a given highmem image page get a buffer that suspend_write_next() should
+ * return to its caller to write to.
*
- * If @buffer is returned, the caller of suspend_write_next() will write
- * the page's contents to @buffer, so they will have to be copied to the
- * right location on the next call to suspend_write_next() and it is done
- * with the help of copy_last_highmem_page(). For this purpose, if
- * @buffer is returned, @last_highmem page is set to the page to which
- * the data will have to be copied from @buffer.
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
*/
-
-static struct page *last_highmem_page;
-
-static void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
struct highmem_pbe *pbe;
void *kaddr;
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
last_highmem_page = page;
return buffer;
}
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2263,11 +2379,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
}
/**
- * copy_last_highmem_page - copy the contents of a highmem image from
- * @buffer, where the caller of snapshot_write_next() has place them,
- * to the right location represented by @last_highmem_page .
+ * copy_last_highmem_page - Copy the most recent highmem image page.
+ *
+ * Copy the contents of a highmem image from @buffer, where the caller of
+ * snapshot_write_next() has stored them, to the right location represented by
+ * @last_highmem_page.
*/
-
static void copy_last_highmem_page(void)
{
if (last_highmem_page) {
@@ -2294,17 +2411,13 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static unsigned int
-count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
-static inline int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
-{
- return 0;
-}
+static inline int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p) { return 0; }
-static inline void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static inline void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
return ERR_PTR(-EINVAL);
}
@@ -2314,27 +2427,27 @@ static inline int last_highmem_page_copied(void) { return 1; }
static inline void free_highmem_data(void) {}
#endif /* CONFIG_HIGHMEM */
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
/**
- * prepare_image - use the memory bitmap @bm to mark the pages that will
- * be overwritten in the process of restoring the system memory state
- * from the suspend image ("unsafe" pages) and allocate memory for the
- * image.
+ * prepare_image - Make room for loading hibernation image.
+ * @new_bm: Uninitialized memory bitmap structure.
+ * @bm: Memory bitmap with unsafe pages marked.
*
- * The idea is to allocate a new memory bitmap first and then allocate
- * as many pages as needed for the image data, but not to assign these
- * pages to specific tasks initially. Instead, we just mark them as
- * allocated and create a lists of "safe" pages that will be used
- * later. On systems with high memory a list of "safe" highmem pages is
- * also created.
+ * Use @bm to mark the pages that will be overwritten in the process of
+ * restoring the system memory state from the suspend image ("unsafe" pages)
+ * and allocate memory for the image.
+ *
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for image data, but without specifying what those
+ * pages will be used for just yet. Instead, we mark them all as allocated and
+ * create a list of "safe" pages to be used later. On systems with high
+ * memory a list of "safe" highmem pages is created too.
*/
-
-#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-
-static int
-prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
{
unsigned int nr_pages, nr_highmem;
- struct linked_page *sp_list, *lp;
+ struct linked_page *lp;
int error;
/* If there is no highmem, the buffer will not be necessary */
@@ -2342,9 +2455,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
buffer = NULL;
nr_highmem = count_highmem_image_pages(bm);
- error = mark_unsafe_pages(bm);
- if (error)
- goto Free;
+ mark_unsafe_pages(bm);
error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
if (error)
@@ -2357,14 +2468,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
if (error)
goto Free;
}
- /* Reserve some safe pages for potential later use.
+ /*
+ * Reserve some safe pages for potential later use.
*
* NOTE: This way we make sure there will be enough safe pages for the
* chain_alloc() in get_buffer(). It is a bit wasteful, but
* nr_copy_pages cannot be greater than 50% of the memory anyway.
+ *
+ * Also, nr_copy_pages cannot be less than allocated_unsafe_pages.
*/
- sp_list = NULL;
- /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
@@ -2373,12 +2485,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
error = -ENOMEM;
goto Free;
}
- lp->next = sp_list;
- sp_list = lp;
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
nr_pages--;
}
/* Preallocate memory for the image */
- safe_pages_list = NULL;
nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2396,12 +2507,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
swsusp_set_page_free(virt_to_page(lp));
nr_pages--;
}
- /* Free the reserved safe pages so that chain_alloc() can use them */
- while (sp_list) {
- lp = sp_list->next;
- free_image_page(sp_list, PG_UNSAFE_CLEAR);
- sp_list = lp;
- }
return 0;
Free:
@@ -2410,10 +2515,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
}
/**
- * get_buffer - compute the address that snapshot_write_next() should
- * set for its caller to write to.
+ * get_buffer - Get the address to store the next image data page.
+ *
+ * Get the address that snapshot_write_next() should return to its caller to
+ * write to.
*/
-
static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
@@ -2428,12 +2534,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return get_highmem_page_buffer(page, ca);
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
return page_address(page);
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2450,22 +2558,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
}
/**
- * snapshot_write_next - used for writing the system memory snapshot.
+ * snapshot_write_next - Get the address to store the next image page.
+ * @handle: Snapshot handle structure to guide the writing.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure is then populated and a pointer to it should be
+ * passed to this function on every subsequent call.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to write up to the returned number of bytes to the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the "end of file" condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the "end of file" condition. Negative
+ * numbers are returned on errors, in which case the structure pointed to by
+ * @handle is not updated and should not be used any more.
*/
-
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
@@ -2491,6 +2598,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ safe_pages_list = NULL;
+
error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
if (error)
return error;
@@ -2500,6 +2609,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ hibernate_restore_protection_begin();
} else if (handle->cur <= nr_meta_pages + 1) {
error = unpack_orig_pfns(buffer, &copy_bm);
if (error)
@@ -2522,6 +2632,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
copy_last_highmem_page();
/* Restore page key for data page (s390 only). */
page_key_write(handle->buffer);
+ hibernate_restore_protect_page(handle->buffer);
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
@@ -2533,22 +2644,23 @@ int snapshot_write_next(struct snapshot_handle *handle)
}
/**
- * snapshot_write_finalize - must be called after the last call to
- * snapshot_write_next() in case the last page in the image happens
- * to be a highmem page and its contents should be stored in the
- * highmem. Additionally, it releases the memory that will not be
- * used any more.
+ * snapshot_write_finalize - Complete the loading of a hibernation image.
+ *
+ * Must be called after the last call to snapshot_write_next() in case the last
+ * page in the image happens to be a highmem page and its contents should be
+ * stored in highmem. Additionally, it recycles bitmap memory that's not
+ * necessary any more.
*/
-
void snapshot_write_finalize(struct snapshot_handle *handle)
{
copy_last_highmem_page();
/* Restore page key for data page (s390 only). */
page_key_write(handle->buffer);
page_key_free();
- /* Free only if we have loaded the image entirely */
+ hibernate_restore_protect_page(handle->buffer);
+ /* Do that only if we have loaded the image entirely */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
- memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ memory_bm_recycle(&orig_bm);
free_highmem_data();
}
}
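Taken together, the hibernate_restore_*() calls added in this file bracket the image-load path roughly as follows (a sketch only; the helpers themselves are introduced earlier in this file's changes):

	hibernate_restore_protection_begin();			/* snapshot_write_next(): before loading data pages */
	...
	hibernate_restore_protect_page(handle->buffer);		/* after each loaded data page is complete */
	...
	hibernate_restore_unprotect_page(page_address(page));	/* swsusp_free(): before a page is freed again */
	hibernate_restore_protection_end();			/* swsusp_free(): once everything is released */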
@@ -2561,8 +2673,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
#ifdef CONFIG_HIGHMEM
/* Assumes that @buf is ready and points to a "safe" page */
-static inline void
-swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+static inline void swap_two_pages_data(struct page *p1, struct page *p2,
+ void *buf)
{
void *kaddr1, *kaddr2;
@@ -2576,15 +2688,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
}
/**
- * restore_highmem - for each highmem page that was allocated before
- * the suspend and included in the suspend image, and also has been
- * allocated by the "resume" kernel swap its current (ie. "before
- * resume") contents with the previous (ie. "before suspend") one.
+ * restore_highmem - Put highmem image pages into their original locations.
+ *
+ * For each highmem page that was in use before hibernation and is included in
+ * the image, and also has been allocated by the "restore" kernel, swap its
+ * current contents with the previous (ie. "before hibernation") ones.
*
- * If the resume eventually fails, we can call this function once
- * again and restore the "before resume" highmem state.
+ * If the restore eventually fails, we can call this function once again and
+ * restore the highmem state as seen by the restore kernel.
*/
-
int restore_highmem(void)
{
struct highmem_pbe *pbe = highmem_pblist;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b871e..1e7f5da648d9 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state)
*/
static bool relative_states;
+void __init pm_states_init(void)
+{
+ /*
+ * The freeze state should be supported even without any suspend_ops,
+ * so initialize pm_states accordingly here.
+ */
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+}
+
static int __init sleep_states_setup(char *str)
{
relative_states = !strncmp(str, "1", 1);
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
return 1;
}
@@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
return freeze_ops->begin();
- else if (suspend_ops->begin)
+ else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
return 0;
@@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
freeze_ops->end();
- else if (suspend_ops->end)
+ else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
@@ -266,16 +274,18 @@ static int suspend_test(int level)
*/
static int suspend_prepare(suspend_state_t state)
{
- int error;
+ int error, nr_calls = 0;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Finish;
+ }
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -286,7 +296,7 @@ static int suspend_prepare(suspend_state_t state)
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
+ __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
pm_restore_console();
return error;
}
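The __pm_notifier_call_chain() pattern used above (and again in kernel/power/user.c below) can be read as follows, assuming the second argument limits how many callbacks are invoked and the third, when non-NULL, returns how many actually ran:

	int error, nr_calls = 0;

	/* call every PM_SUSPEND_PREPARE callback, remembering how many ran */
	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
	if (error)
		nr_calls--;	/* the failing callback must not see PM_POST_SUSPEND */
	...
	/* notify only the callbacks that completed the prepare stage */
	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);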
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..a3b1e617bcdc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct hib_bio_batch *hb)
{
struct page *page = virt_to_page(addr);
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = hib_resume_bdev;
+ bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
bio->bi_end_io = hib_end_io;
bio->bi_private = hb;
atomic_inc(&hb->count);
- submit_bio(rw, bio);
+ submit_bio(bio);
} else {
- error = submit_bio_wait(rw, bio);
+ error = submit_bio_wait(bio);
bio_put(bio);
}
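With the widened prototype, callers now pass the request operation and its flags separately instead of a combined rw value; a typical synchronous read, as used later in this file, looks like this (sketch):

	error = hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
			      swsusp_header, NULL);	/* hb == NULL: wait for completion */
	if (error)
		return error;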
@@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block, swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
error = -ENODEV;
@@ -348,6 +350,12 @@ static int swsusp_swap_check(void)
if (res < 0)
blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ /*
+ * Update the resume device to the one actually used,
+ * so the test_resume mode can use it in case it is
+ * invoked from hibernate() to test the snapshot.
+ */
+ swsusp_resume_device = hib_resume_bdev->bd_dev;
return res;
}
@@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(WRITE_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
+ tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(READ_SYNC, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1525,7 +1534,8 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(READ_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1533,7 +1543,8 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
@@ -1577,10 +1588,12 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available())
return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
} else {
/*
* Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- }
+ } else
+ nr_calls--;
+
if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
}
if (error)
atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..abb0042a427b 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,3 @@
obj-y = printk.o
+obj-$(CONFIG_PRINTK_NMI) += nmi.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
char *_braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
*str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
+ } else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
if (!*str)
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..7fd2838fa417
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,57 @@
+/*
+ * internal.h - printk internal definitions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/percpu.h>
+
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+
+#ifdef CONFIG_PRINTK_NMI
+
+extern raw_spinlock_t logbuf_lock;
+
+/*
+ * printk() cannot take logbuf_lock in NMI context. Instead,
+ * it temporarily stores the strings into a per-CPU buffer.
+ * The alternative implementation is chosen transparently
+ * via a per-CPU variable.
+ */
+DECLARE_PER_CPU(printk_func_t, printk_func);
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+extern atomic_t nmi_message_lost;
+static inline int get_nmi_message_lost(void)
+{
+ return atomic_xchg(&nmi_message_lost, 0);
+}
+
+#else /* CONFIG_PRINTK_NMI */
+
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+static inline int get_nmi_message_lost(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PRINTK_NMI */
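For context, the printk.c changes later in this patch make every printk() funnel through this hook; after the patch the top-level function reduces to roughly:

asmlinkage __visible int printk(const char *fmt, ...)
{
	va_list args;
	int r;

	va_start(args, fmt);
	r = vprintk_func(fmt, args);	/* vprintk_nmi() in NMI context, vprintk_default() otherwise */
	va_end(args);

	return r;
}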
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
new file mode 100644
index 000000000000..16bab471c7e2
--- /dev/null
+++ b/kernel/printk/nmi.c
@@ -0,0 +1,268 @@
+/*
+ * nmi.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/debug_locks.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() cannot take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporarily stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via the @printk_func per-CPU variable.
+ *
+ * The implementation also allows the strings to be flushed from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_nmi_irq_ready;
+atomic_t nmi_message_lost;
+
+#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \
+ sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct nmi_seq_buf {
+ atomic_t len; /* length of written data */
+ struct irq_work work; /* IRQ work that flushes the buffer */
+ unsigned char buffer[NMI_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ int add = 0;
+ size_t len;
+
+again:
+ len = atomic_read(&s->len);
+
+ if (len >= sizeof(s->buffer)) {
+ atomic_inc(&nmi_message_lost);
+ return 0;
+ }
+
+ /*
+ * Make sure that all old data have been read before the buffer was
+ * reset. This is not needed when we just append data.
+ */
+ if (!len)
+ smp_rmb();
+
+ add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+ /*
+ * Do it once again if the buffer has been flushed in the meantime.
+ * Note that atomic_cmpxchg() is an implicit memory barrier that
+ * makes sure that the data were written before updating s->len.
+ */
+ if (atomic_cmpxchg(&s->len, len, len + add) != len)
+ goto again;
+
+ /* Get flushed in a more safe context. */
+ if (add && printk_nmi_irq_ready) {
+ /* Make sure that IRQ work is really initialized. */
+ smp_rmb();
+ irq_work_queue(&s->work);
+ }
+
+ return add;
+}
+
+static void printk_nmi_flush_line(const char *text, int len)
+{
+ /*
+ * The buffers are flushed in NMI only on panic. The messages must
+ * go only into the ring buffer at this stage. Consoles will get
+ * explicitly called later when a crashdump is not generated.
+ */
+ if (in_nmi())
+ printk_deferred("%.*s", len, text);
+ else
+ printk("%.*s", len, text);
+}
+
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
+ int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk_nmi_flush_line(buf, (end - start) + 1);
+}
+
+/*
+ * Flush data from the associated per-CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ static raw_spinlock_t read_lock =
+ __RAW_SPIN_LOCK_INITIALIZER(read_lock);
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ unsigned long flags;
+ size_t len, size;
+ int i, last_i;
+
+ /*
+ * The lock has two functions. First, one reader has to flush all
+ * available messages to make the lockless synchronization with
+ * writers easier. Second, we do not want to mix messages from
+ * different CPUs. This is especially important when printing
+ * a backtrace.
+ */
+ raw_spin_lock_irqsave(&read_lock, flags);
+
+ i = 0;
+more:
+ len = atomic_read(&s->len);
+
+ /*
+ * This is just a paranoid check that nobody has manipulated
+ * the buffer in an unexpected way. If we printed something then
+ * @len must only increase.
+ */
+ if (i && i >= len) {
+ const char *msg = "printk_nmi_flush: internal error\n";
+
+ printk_nmi_flush_line(msg, strlen(msg));
+ }
+
+ if (!len)
+ goto out; /* Someone else has already flushed the buffer. */
+
+ /* Make sure that data has been written up to the @len */
+ smp_rmb();
+
+ size = min(len, sizeof(s->buffer));
+ last_i = i;
+
+ /* Print line by line. */
+ for (; i < size; i++) {
+ if (s->buffer[i] == '\n') {
+ printk_nmi_flush_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < size) {
+ printk_nmi_flush_seq_line(s, last_i, size - 1);
+ printk_nmi_flush_line("\n", strlen("\n"));
+ }
+
+ /*
+ * Check that nothing has been added in the meantime and truncate
+ * the buffer. Note that atomic_cmpxchg() is an implicit memory
+ * barrier that makes sure that the data were copied before
+ * updating s->len.
+ */
+ if (atomic_cmpxchg(&s->len, len, 0) != len)
+ goto more;
+
+out:
+ raw_spin_unlock_irqrestore(&read_lock, flags);
+}
+
+/**
+ * printk_nmi_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_nmi_flush(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+/**
+ * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ * goes down.
+ *
+ * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * the system goes down. It makes a best effort to get NMI messages into
+ * the main ring buffer.
+ *
+ * Note that it could try harder when there is only one CPU online.
+ */
+void printk_nmi_flush_on_panic(void)
+{
+ /*
+ * Make sure that we can access the main ring buffer.
+ * Do not risk a double release when more CPUs are up.
+ */
+ if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+ if (num_online_cpus() > 1)
+ return;
+
+ debug_locks_off();
+ raw_spin_lock_init(&logbuf_lock);
+ }
+
+ printk_nmi_flush();
+}
+
+void __init printk_nmi_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ }
+
+ /* Make sure that IRQ works are initialized before enabling. */
+ smp_wmb();
+ printk_nmi_irq_ready = 1;
+
+ /* Flush pending messages that did not have scheduled IRQ works. */
+ printk_nmi_flush();
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
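A sketch of how the enter/exit hooks are intended to be used (assuming, as elsewhere in this series, that the generic NMI entry and exit paths call them; the handler below is hypothetical):

void example_nmi_handler(void)
{
	printk_nmi_enter();	/* printk() on this CPU now resolves to vprintk_nmi() */
	pr_err("NMI: something unexpected happened\n");	/* buffered in nmi_print_seq */
	printk_nmi_exit();	/* back to vprintk_default(); irq_work flushes the buffer */
}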
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bfbf284e4218..eea6dbc2d8cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -26,7 +26,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
@@ -48,13 +47,14 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
#include "console_cmdline.h"
#include "braille.h"
+#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -85,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = {
};
#endif
+enum devkmsg_log_bits {
+ __DEVKMSG_LOG_BIT_ON = 0,
+ __DEVKMSG_LOG_BIT_OFF,
+ __DEVKMSG_LOG_BIT_LOCK,
+};
+
+enum devkmsg_log_masks {
+ DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON),
+ DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF),
+ DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK),
+};
+
+/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
+#define DEVKMSG_LOG_MASK_DEFAULT 0
+
+static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+
+static int __control_devkmsg(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strncmp(str, "on", 2)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_ON;
+ return 2;
+ } else if (!strncmp(str, "off", 3)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_OFF;
+ return 3;
+ } else if (!strncmp(str, "ratelimit", 9)) {
+ devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+ return 9;
+ }
+ return -EINVAL;
+}
+
+static int __init control_devkmsg(char *str)
+{
+ if (__control_devkmsg(str) < 0)
+ return 1;
+
+ /*
+ * Set sysctl string accordingly:
+ */
+ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) {
+ memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+ strncpy(devkmsg_log_str, "on", 2);
+ } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) {
+ memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+ strncpy(devkmsg_log_str, "off", 3);
+ }
+ /* else "ratelimit" which is set by default. */
+
+ /*
+ * Sysctl cannot change it anymore. Setting this parameter on the kernel
+ * command line forces the setting to be permanent throughout the runtime
+ * of the system. This is a precaution against userspace trying to be a
+ * smarta** and attempting to change it up on us.
+ */
+ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
+
+ return 0;
+}
+__setup("printk.devkmsg=", control_devkmsg);
+
+char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
+
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char old_str[DEVKMSG_STR_MAX_SIZE];
+ unsigned int old;
+ int err;
+
+ if (write) {
+ if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
+ return -EINVAL;
+
+ old = devkmsg_log;
+ strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+ }
+
+ err = proc_dostring(table, write, buffer, lenp, ppos);
+ if (err)
+ return err;
+
+ if (write) {
+ err = __control_devkmsg(devkmsg_log_str);
+
+ /*
+ * Do not accept an unknown string OR a known string with
+ * trailing crap...
+ */
+ if (err < 0 || (err + 1 != *lenp)) {
+
+ /* ... and restore old setting. */
+ devkmsg_log = old;
+ strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
/*
* Number of registered extended console drivers.
*
@@ -244,7 +349,7 @@ __packed __aligned(4)
* within the scheduler's rq lock. It must be released before calling
* console_unlock() or anything else that might wake up a process.
*/
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
+DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
@@ -613,6 +718,7 @@ struct devkmsg_user {
u64 seq;
u32 idx;
enum log_flags prev;
+ struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
};
@@ -622,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
char *buf, *line;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
+ struct file *file = iocb->ki_filp;
+ struct devkmsg_user *user = file->private_data;
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (len > LOG_LINE_MAX)
+ if (!user || len > LOG_LINE_MAX)
return -EINVAL;
+
+ /* Ignore when user logging is disabled. */
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return len;
+
+ /* Ratelimit when not explicitly enabled. */
+ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
+ if (!___ratelimit(&user->rs, current->comm))
+ return ret;
+ }
+
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
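From userspace, the effect of the new checks can be sketched like this (hypothetical example): opening /dev/kmsg fails with -EPERM when printk.devkmsg=off, and writes are rate-limited per opener unless printk.devkmsg=on was given on the kernel command line:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/kmsg", O_WRONLY);	/* devkmsg_open(): -EPERM if logging is off */

	if (fd < 0)
		return 1;
	/* devkmsg_write(): dropped when off, rate-limited by default */
	write(fd, "<6>hello from userspace\n", 24);
	close(fd);
	return 0;
}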
@@ -799,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file)
struct devkmsg_user *user;
int err;
- /* write-only does not need any file context */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- return 0;
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return -EPERM;
- err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- if (err)
- return err;
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+ }
user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
+ ratelimit_default_init(&user->rs);
+ ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
+
mutex_init(&user->lock);
raw_spin_lock_irq(&logbuf_lock);
@@ -830,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file)
if (!user)
return 0;
+ ratelimit_state_exit(&user->rs);
+
mutex_destroy(&user->lock);
kfree(user);
return 0;
@@ -985,6 +1111,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
"ignore loglevel setting (prints all kernel messages to the console)");
+static bool suppress_message_printing(int level)
+{
+ return (level >= console_loglevel && !ignore_loglevel);
+}
+
#ifdef CONFIG_BOOT_PRINTK_DELAY
static int boot_delay; /* msecs delay after each printk during bootup */
@@ -1014,7 +1145,7 @@ static void boot_delay_msec(int level)
unsigned long timeout;
if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
- || (level >= console_loglevel && !ignore_loglevel)) {
+ || suppress_message_printing(level)) {
return;
}
@@ -1438,8 +1569,6 @@ static void call_console_drivers(int level,
trace_console(text, len);
- if (level >= console_loglevel && !ignore_loglevel)
- return;
if (!console_drivers)
return;
@@ -1616,6 +1745,7 @@ asmlinkage int vprintk_emit(int facility, int level,
unsigned long flags;
int this_cpu;
int printed_len = 0;
+ int nmi_message_lost;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
static unsigned int logbuf_cpu = UINT_MAX;
@@ -1666,6 +1796,15 @@ asmlinkage int vprintk_emit(int facility, int level,
strlen(recursion_msg));
}
+ nmi_message_lost = get_nmi_message_lost();
+ if (unlikely(nmi_message_lost)) {
+ text_len = scnprintf(textbuf, sizeof(textbuf),
+ "BAD LUCK: lost %d message(s) from NMI context!",
+ nmi_message_lost);
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, textbuf, text_len);
+ }
+
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
@@ -1807,14 +1946,6 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
/**
* printk - print a kernel message
* @fmt: format string
@@ -1838,21 +1969,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;
va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);
return r;
@@ -1895,6 +2016,7 @@ static void call_console_drivers(int level,
static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
/* Still needs to be defined for users */
DEFINE_PER_CPU(printk_func_t, printk_func);
@@ -2174,6 +2296,13 @@ static void console_cont_flush(char *text, size_t size)
if (!cont.len)
goto out;
+ if (suppress_message_printing(cont.level)) {
+ cont.cons = cont.len;
+ if (cont.flushed)
+ cont.len = 0;
+ goto out;
+ }
+
/*
* We still queue earlier records, likely because the console was
* busy. The earlier ones need to be printed before this one, we
@@ -2277,10 +2406,13 @@ skip:
break;
msg = log_from_idx(console_idx);
- if (msg->flags & LOG_NOCONS) {
+ level = msg->level;
+ if ((msg->flags & LOG_NOCONS) ||
+ suppress_message_printing(level)) {
/*
* Skip record we have buffered and already printed
- * directly to the console when we received it.
+ * directly to the console when we received it, and any
+ * record whose level is above the console loglevel.
*/
console_idx = log_next(console_idx);
console_seq++;
@@ -2294,7 +2426,6 @@ skip:
goto skip;
}
- level = msg->level;
len += msg_print_text(msg, console_prev, false,
text + len, sizeof(text) - len);
if (nr_ext_console_drivers) {
@@ -3184,9 +3315,8 @@ void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
+ printk("%stask: %p task.stack: %p\n",
+ log_lvl, current, task_stack_page(current));
}
#endif
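
Editor's note on the printk.c hunks above: they wire a per-open ratelimit_state into the /dev/kmsg writer (initialized on open, reported and torn down on release) and fold the repeated console-loglevel test into the new suppress_message_printing() helper. As a rough, hedged illustration of that ratelimit pattern (not part of the patch; my_writer and my_emit are made-up names), a per-object limiter is typically used like this:

#include <linux/errno.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>

struct my_writer {
	struct ratelimit_state rs;
};

static struct my_writer *my_writer_create(void)
{
	struct my_writer *w = kmalloc(sizeof(*w), GFP_KERNEL);

	if (!w)
		return NULL;
	ratelimit_default_init(&w->rs);		/* default interval and burst */
	ratelimit_set_flags(&w->rs, RATELIMIT_MSG_ON_RELEASE);
	return w;
}

static int my_emit(struct my_writer *w)
{
	if (!__ratelimit(&w->rs))
		return -EBUSY;			/* over the limit: drop this one */
	/* ... do the rate-limited work ... */
	return 0;
}

static void my_writer_destroy(struct my_writer *w)
{
	/* With RATELIMIT_MSG_ON_RELEASE set, the suppressed-message count
	 * is reported here rather than inline in __ratelimit().
	 */
	ratelimit_state_exit(&w->rs);
	kfree(w);
}
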
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9901c9..2dbccf2d806c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
put_cpu();
}
-static int profile_cpu_callback(struct notifier_block *info,
- unsigned long action, void *__cpu)
+static int profile_dead_cpu(unsigned int cpu)
{
- int node, cpu = (unsigned long)__cpu;
struct page *page;
+ int i;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_mem(cpu);
- per_cpu(cpu_profile_flip, cpu) = 0;
- if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- return notifier_from_errno(-ENOMEM);
- per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
- }
- if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- goto out_free;
- per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
- }
- break;
-out_free:
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- return notifier_from_errno(-ENOMEM);
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_set_cpu(cpu, prof_cpu_mask);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_clear_cpu(cpu, prof_cpu_mask);
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ if (prof_cpu_mask != NULL)
+ cpumask_clear_cpu(cpu, prof_cpu_mask);
+
+ for (i = 0; i < 2; i++) {
+ if (per_cpu(cpu_profile_hits, cpu)[i]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
+ per_cpu(cpu_profile_hits, cpu)[i] = NULL;
__free_page(page);
}
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
+ }
+ return 0;
+}
+
+static int profile_prepare_cpu(unsigned int cpu)
+{
+ int i, node = cpu_to_mem(cpu);
+ struct page *page;
+
+ per_cpu(cpu_profile_flip, cpu) = 0;
+
+ for (i = 0; i < 2; i++) {
+ if (per_cpu(cpu_profile_hits, cpu)[i])
+ continue;
+
+ page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page) {
+ profile_dead_cpu(cpu);
+ return -ENOMEM;
}
- break;
+ per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
+
}
- return NOTIFY_OK;
+ return 0;
+}
+
+static int profile_online_cpu(unsigned int cpu)
+{
+ if (prof_cpu_mask != NULL)
+ cpumask_set_cpu(cpu, prof_cpu_mask);
+
+ return 0;
}
+
#else /* !CONFIG_SMP */
#define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0)
-#define profile_cpu_callback NULL
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_SMP
-static void profile_nop(void *unused)
-{
-}
-
-static int create_hash_tables(void)
+int __ref create_proc_profile(void)
{
- int cpu;
-
- for_each_online_cpu(cpu) {
- int node = cpu_to_mem(cpu);
- struct page *page;
-
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[1]
- = (struct profile_hit *)page_address(page);
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[0]
- = (struct profile_hit *)page_address(page);
- }
- return 0;
-out_cleanup:
- prof_on = 0;
- smp_mb();
- on_each_cpu(profile_nop, NULL, 1);
- for_each_online_cpu(cpu) {
- struct page *page;
-
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
- __free_page(page);
- }
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- }
- }
- return -1;
-}
-#else
-#define create_hash_tables() ({ 0; })
+ struct proc_dir_entry *entry;
+#ifdef CONFIG_SMP
+ enum cpuhp_state online_state;
#endif
-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
-{
- struct proc_dir_entry *entry;
int err = 0;
if (!prof_on)
return 0;
-
- cpu_notifier_register_begin();
-
- if (create_hash_tables()) {
- err = -ENOMEM;
- goto out;
- }
-
+#ifdef CONFIG_SMP
+ err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
+ profile_prepare_cpu, profile_dead_cpu);
+ if (err)
+ return err;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
+ profile_online_cpu, NULL);
+ if (err < 0)
+ goto err_state_prep;
+ online_state = err;
+ err = 0;
+#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &proc_profile_operations);
if (!entry)
- goto out;
+ goto err_state_onl;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
- __hotcpu_notifier(profile_cpu_callback, 0);
-out:
- cpu_notifier_register_done();
+ return err;
+err_state_onl:
+#ifdef CONFIG_SMP
+ cpuhp_remove_state(online_state);
+err_state_prep:
+ cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
+#endif
return err;
}
subsys_initcall(create_proc_profile);
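
Editor's note: the profile.c change above is one instance of the notifier-to-hotplug-state conversion that recurs through this series (rcutorture and the RCU tree code below follow the same shape). In this kernel a prepare-time state still needs its own enum entry (CPUHP_PROFILE_PREPARE here), while online-time callbacks can use the dynamic CPUHP_AP_ONLINE_DYN range, in which case cpuhp_setup_state() returns the allocated state number. Setup also invokes the startup callback for CPUs that are already online, and cpuhp_remove_state() invokes the teardown on the online CPUs, which is what lets the converted code drop its for_each_possible_cpu() loops. A minimal sketch of the pattern, with hypothetical names (my_online_cpu and friends), might look like:

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int my_online_cpu(unsigned int cpu)
{
	/* Bring per-CPU state up; also called for already-online CPUs. */
	return 0;
}

static int my_offline_cpu(unsigned int cpu)
{
	/* Undo my_online_cpu() for this CPU. */
	return 0;
}

static enum cpuhp_state my_online_state;

static int __init my_setup(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "my:online",
				my_online_cpu, my_offline_cpu);
	if (ret < 0)
		return ret;		/* partial setup is rolled back */
	my_online_state = ret;		/* dynamic range returns the state */
	return 0;
}

static void my_teardown(void)
{
	/* Runs my_offline_cpu() on each online CPU, then frees the state. */
	cpuhp_remove_state(my_online_state);
}
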
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d49bfa1e53e6..1d3b7665d0be 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
return -EINVAL;
if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
- !config_enabled(CONFIG_SECCOMP))
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
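
Editor's note: the ptrace.c hunk (like the perf_runnable/torture_runnable defaults further down) swaps config_enabled() for IS_ENABLED(), which evaluates to 1 when the option is built in or modular and 0 otherwise, so the check can sit in an ordinary C conditional and both branches remain visible to the compiler. A tiny sketch with a hypothetical helper, not taken from the patch:

#include <linux/errno.h>
#include <linux/kconfig.h>

static int my_check_feature(void)
{
	/* The branch below is parsed and type-checked even when the
	 * option is off; the dead half is simply optimized away.
	 */
	if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE))
		return -EINVAL;
	return 0;
}
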
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cee0d8393ed..123ccbd22449 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -52,13 +52,13 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define PERF_FLAG "-perf:"
#define PERFOUT_STRING(s) \
- pr_alert("%s" PERF_FLAG s "\n", perf_type)
+ pr_alert("%s" PERF_FLAG " %s\n", perf_type, s)
#define VERBOSE_PERFOUT_STRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
-torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
@@ -96,12 +96,7 @@ static int rcu_perf_writer_state;
#define MAX_MEAS 10000
#define MIN_MEAS 100
-#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
-#define RCUPERF_RUNNABLE_INIT 1
-#else
-#define RCUPERF_RUNNABLE_INIT 0
-#endif
-static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+static int perf_runnable = IS_ENABLED(MODULE);
module_param(perf_runnable, int, 0444);
MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
@@ -363,8 +358,6 @@ rcu_perf_writer(void *arg)
u64 *wdpp = writer_durations[me];
VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
- WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
- WARN_ON(rcu_gp_is_normal() && gp_exp);
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
sp.sched_priority = 1;
@@ -407,9 +400,8 @@ rcu_perf_writer(void *arg)
sp.sched_priority = 0;
sched_setscheduler_nocheck(current,
SCHED_NORMAL, &sp);
- pr_alert("%s" PERF_FLAG
- "rcu_perf_writer %ld has %d measurements\n",
- perf_type, me, MIN_MEAS);
+ pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
+ perf_type, PERF_FLAG, me, MIN_MEAS);
if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
nrealwriters) {
schedule_timeout_interruptible(10);
@@ -631,12 +623,24 @@ rcu_perf_init(void)
firsterr = -ENOMEM;
goto unwind;
}
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ if (rcu_gp_is_normal() && gp_exp) {
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
for (i = 0; i < nrealwriters; i++) {
writer_durations[i] =
kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
GFP_KERNEL);
- if (!writer_durations[i])
+ if (!writer_durations[i]) {
+ firsterr = -ENOMEM;
goto unwind;
+ }
firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
writer_tasks[i]);
if (firsterr)
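
Editor's note: besides moving the expedited-versus-normal sanity checks from the writer kthread into rcu_perf_init(), the hunk above fixes a goto-unwind slip, namely jumping to the cleanup label without first setting the error code, so the caller would have seen success. A generic sketch of the pattern (hypothetical names, not tied to rcuperf):

#include <linux/errno.h>
#include <linux/slab.h>

static int my_init_things(void **pa, void **pb)
{
	void *a = NULL, *b = NULL;
	int err = 0;

	a = kzalloc(16, GFP_KERNEL);
	if (!a) {
		err = -ENOMEM;		/* must be set before the jump */
		goto unwind;
	}
	b = kzalloc(16, GFP_KERNEL);
	if (!b) {
		err = -ENOMEM;
		goto unwind;
	}
	*pa = a;
	*pb = b;
	return 0;
unwind:
	kfree(b);			/* kfree(NULL) is a no-op */
	kfree(a);
	return err;
}
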
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 084a28a732eb..bf08fee53dc7 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
-#define RCUTORTURE_RUNNABLE_INIT 1
-#else
-#define RCUTORTURE_RUNNABLE_INIT 0
-#endif
-static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
+static int torture_runnable = IS_ENABLED(MODULE);
module_param(torture_runnable, int, 0444);
MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
@@ -1243,6 +1238,7 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
static unsigned long rtcv_snap = ULONG_MAX;
+ struct task_struct *wtp;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -1263,8 +1259,9 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_barrier_error,
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
@@ -1317,10 +1314,12 @@ rcu_torture_stats_print(void)
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
+ wtp = READ_ONCE(writer_task);
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
- gpnum, completed, flags);
+ gpnum, completed, flags,
+ wtp == NULL ? ~0UL : wtp->state);
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1367,12 +1366,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
onoff_interval, onoff_holdoff);
}
-static void rcutorture_booster_cleanup(int cpu)
+static int rcutorture_booster_cleanup(unsigned int cpu)
{
struct task_struct *t;
if (boost_tasks[cpu] == NULL)
- return;
+ return 0;
mutex_lock(&boost_mutex);
t = boost_tasks[cpu];
boost_tasks[cpu] = NULL;
@@ -1380,9 +1379,10 @@ static void rcutorture_booster_cleanup(int cpu)
/* This must be outside of the mutex, otherwise deadlock! */
torture_stop_kthread(rcu_torture_boost, t);
+ return 0;
}
-static int rcutorture_booster_init(int cpu)
+static int rcutorture_booster_init(unsigned int cpu)
{
int retval;
@@ -1476,7 +1476,7 @@ static int rcu_torture_barrier_cbs(void *arg)
break;
/*
* The above smp_load_acquire() ensures barrier_phase load
- * is ordered before the folloiwng ->call().
+ * is ordered before the following ->call().
*/
local_irq_disable(); /* Just to test no-irq call_rcu(). */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1582,28 +1582,7 @@ static void rcu_torture_barrier_cleanup(void)
}
}
-static int rcutorture_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- (void)rcutorture_booster_init(cpu);
- break;
- case CPU_DOWN_PREPARE:
- rcutorture_booster_cleanup(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block rcutorture_cpu_nb = {
- .notifier_call = rcutorture_cpu_notify,
-};
+static enum cpuhp_state rcutor_hp;
static void
rcu_torture_cleanup(void)
@@ -1643,11 +1622,8 @@ rcu_torture_cleanup(void)
for (i = 0; i < ncbflooders; i++)
torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
if ((test_boost == 1 && cur_ops->can_boost) ||
- test_boost == 2) {
- unregister_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i)
- rcutorture_booster_cleanup(i);
- }
+ test_boost == 2)
+ cpuhp_remove_state(rcutor_hp);
/*
* Wait for all RCU callbacks to fire, then do flavor-specific
@@ -1874,14 +1850,13 @@ rcu_torture_init(void)
test_boost == 2) {
boost_starttime = jiffies + test_boost_interval * HZ;
- register_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i) {
- if (cpu_is_offline(i))
- continue; /* Heuristic: CPU can go offline. */
- firsterr = rcutorture_booster_init(i);
- if (firsterr)
- goto unwind;
- }
+
+ firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
+ rcutorture_booster_init,
+ rcutorture_booster_cleanup);
+ if (firsterr < 0)
+ goto unwind;
+ rcutor_hp = firsterr;
}
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9f3d37..50d1861f7759 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
"suspicious rcu_sync_is_idle() usage");
}
+
+EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
#endif
/**
@@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+ rsp->gp_count++;
+ rsp->gp_state = GP_PASSED;
+}
+
+/**
* rcu_sync_enter() - Force readers onto slowpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7f1bc4f817c..7e2e03879c2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -41,7 +41,6 @@
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
-#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -60,7 +59,6 @@
#include "tree.h"
#include "rcu.h"
-MODULE_ALIAS("rcutree");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
@@ -125,12 +123,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
+/* panic() on RCU Stall sysctl. */
+int sysctl_panic_on_rcu_stall __read_mostly;
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
* can assume that there is but one task, allowing RCU to (for example)
- * optimize synchronize_sched() to a simple barrier(). When this variable
+ * optimize synchronize_rcu() to a simple barrier(). When this variable
* is one, RCU must actually do all the hard work required to detect real
* grace periods. This variable is also used to suppress boot-time false
* positives from lockdep-RCU error checking.
@@ -159,6 +159,7 @@ static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_state *rsp,
struct rcu_data *rdp, bool wake);
+static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -1070,11 +1071,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
* offline to continue to use RCU for one jiffy after marking itself
* offline in the cpu_online_mask. This leniency is necessary given the
* non-atomic nature of the online and offline processing, for example,
- * the fact that a CPU enters the scheduler after completing the CPU_DYING
- * notifiers.
+ * the fact that a CPU enters the scheduler after completing the teardown
+ * of the CPU.
*
- * This is also why RCU internally marks CPUs online during the
- * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ * This is also why RCU internally marks CPUs online during the
+ * preparation phase and offline after the CPU has been taken down.
*
* Disable checking if in an NMI handler because we cannot safely report
* errors from NMI handlers anyway.
@@ -1284,9 +1285,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask != 0) {
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu))
- dump_cpu_task(rnp->grplo + cpu);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ dump_cpu_task(cpu);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -1311,6 +1312,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
}
}
+static inline void panic_on_rcu_stall(void)
+{
+ if (sysctl_panic_on_rcu_stall)
+ panic("RCU Stall\n");
+}
+
static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
@@ -1351,10 +1358,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu)) {
- print_cpu_stall_info(rsp,
- rnp->grplo + cpu);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ print_cpu_stall_info(rsp, cpu);
ndetected++;
}
}
@@ -1390,6 +1396,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
rcu_check_gp_kthread_starvation(rsp);
+ panic_on_rcu_stall();
+
force_quiescent_state(rsp); /* Kick them all. */
}
@@ -1430,6 +1438,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ panic_on_rcu_stall();
+
/*
* Attempt to revive the RCU machinery by forcing a context switch.
*
@@ -1836,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
bool ret;
+ bool need_gp;
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
@@ -1862,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
*/
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
- rdp->cpu_no_qs.b.norm = true;
+ need_gp = !!(rnp->qsmask & rdp->grpmask);
+ rdp->cpu_no_qs.b.norm = need_gp;
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
+ rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
}
@@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
* of the tree within the rsp->node[] array. Note that other CPUs
* will access only the leaves of the hierarchy, thus seeing that no
* grace period is in progress, at least until the corresponding
- * leaf node has been initialized. In addition, we have excluded
- * CPU-hotplug operations.
+ * leaf node has been initialized.
*
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
@@ -2333,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
unsigned long *maxj),
bool *isidle, unsigned long *maxj)
{
- unsigned long bit;
int cpu;
unsigned long flags;
unsigned long mask;
@@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
continue;
}
}
- cpu = rnp->grplo;
- bit = 1;
- for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
@@ -2961,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
- swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s)
return ULONG_CMP_GE(READ_ONCE(*sp), s);
}
-/* Wrapper functions for expedited grace periods. */
-static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
-{
- rcu_seq_start(&rsp->expedited_sequence);
-}
-static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
-{
- rcu_seq_end(&rsp->expedited_sequence);
- smp_mb(); /* Ensure that consecutive grace periods serialize. */
-}
-static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
-{
- unsigned long s;
-
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- s = rcu_seq_snap(&rsp->expedited_sequence);
- trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
- return s;
-}
-static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
-{
- return rcu_seq_done(&rsp->expedited_sequence, s);
-}
-
-/*
- * Reset the ->expmaskinit values in the rcu_node tree to reflect any
- * recent CPU-online activity. Note that these masks are not cleared
- * when CPUs go offline, so they reflect the union of all CPUs that have
- * ever been online. This means that this function normally takes its
- * no-work-to-do fastpath.
- */
-static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
-{
- bool done;
- unsigned long flags;
- unsigned long mask;
- unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
- struct rcu_node *rnp;
- struct rcu_node *rnp_up;
-
- /* If no new CPUs onlined since last time, nothing to do. */
- if (likely(ncpus == rsp->ncpus_snap))
- return;
- rsp->ncpus_snap = ncpus;
-
- /*
- * Each pass through the following loop propagates newly onlined
- * CPUs for the current rcu_node structure up the rcu_node tree.
- */
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (rnp->expmaskinit == rnp->expmaskinitnext) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- continue; /* No new CPUs, nothing to do. */
- }
-
- /* Update this node's mask, track old value for propagation. */
- oldmask = rnp->expmaskinit;
- rnp->expmaskinit = rnp->expmaskinitnext;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- /* If was already nonzero, nothing to propagate. */
- if (oldmask)
- continue;
-
- /* Propagate the new CPU up the tree. */
- mask = rnp->grpmask;
- rnp_up = rnp->parent;
- done = false;
- while (rnp_up) {
- raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
- if (rnp_up->expmaskinit)
- done = true;
- rnp_up->expmaskinit |= mask;
- raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
- if (done)
- break;
- mask = rnp_up->grpmask;
- rnp_up = rnp_up->parent;
- }
- }
-}
-
-/*
- * Reset the ->expmask values in the rcu_node tree in preparation for
- * a new expedited grace period.
- */
-static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_node *rnp;
-
- sync_exp_reset_tree_hotplug(rsp);
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- WARN_ON_ONCE(rnp->expmask);
- rnp->expmask = rnp->expmaskinit;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-}
-
-/*
- * Return non-zero if there is no RCU expedited grace period in progress
- * for the specified rcu_node structure, in other words, if all CPUs and
- * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the rcu_state's exp_mutex.
- */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
-{
- return rnp->exp_tasks == NULL &&
- READ_ONCE(rnp->expmask) == 0;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period. This event is reported either to the rcu_node structure on
- * which the task was queued or to one of that rcu_node structure's ancestors,
- * recursively up the tree. (Calm down, calm down, we do the recursion
- * iteratively!)
- *
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
- */
-static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake, unsigned long flags)
- __releases(rnp->lock)
-{
- unsigned long mask;
-
- for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
- if (!rnp->expmask)
- rcu_initiate_boost(rnp, flags);
- else
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- break;
- }
- if (rnp->parent == NULL) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
- swake_up(&rsp->expedited_wq);
- }
- break;
- }
- mask = rnp->grpmask;
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
- rnp = rnp->parent;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
- WARN_ON_ONCE(!(rnp->expmask & mask));
- rnp->expmask &= ~mask;
- }
-}
-
-/*
- * Report expedited quiescent state for specified node. This is a
- * lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
- */
-static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
- struct rcu_node *rnp, bool wake)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- __rcu_report_exp_rnp(rsp, rnp, wake, flags);
-}
-
-/*
- * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the rcu_state's
- * exp_mutex.
- */
-static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
- unsigned long mask, bool wake)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (!(rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- rnp->expmask &= ~mask;
- __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
-}
-
-/*
- * Report expedited quiescent state for specified rcu_data (CPU).
- */
-static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
- bool wake)
-{
- rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
-}
-
-/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
- unsigned long s)
-{
- if (rcu_exp_gp_seq_done(rsp, s)) {
- trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
- /* Ensure test happens before caller kfree(). */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(stat);
- return true;
- }
- return false;
-}
-
-/*
- * Funnel-lock acquisition for expedited grace periods. Returns true
- * if some other task completed an expedited grace period that this task
- * can piggy-back on, and with no mutex held. Otherwise, returns false
- * with the mutex held, indicating that the caller must actually do the
- * expedited grace period.
- */
-static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
-{
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- struct rcu_node *rnp = rdp->mynode;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- /* Low-contention fastpath. */
- if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
- (rnp == rnp_root ||
- ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
- !mutex_is_locked(&rsp->exp_mutex) &&
- mutex_trylock(&rsp->exp_mutex))
- goto fastpath;
-
- /*
- * Each pass through the following loop works its way up
- * the rcu_node tree, returning if others have done the work or
- * otherwise falls through to acquire rsp->exp_mutex. The mapping
- * from CPU to rcu_node structure can be inexact, as it is just
- * promoting locality and is not strictly needed for correctness.
- */
- for (; rnp != NULL; rnp = rnp->parent) {
- if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
- return true;
-
- /* Work not done, either wait here or go up. */
- spin_lock(&rnp->exp_lock);
- if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
-
- /* Someone else doing GP, so wait for them. */
- spin_unlock(&rnp->exp_lock);
- trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
- rnp->grplo, rnp->grphi,
- TPS("wait"));
- wait_event(rnp->exp_wq[(s >> 1) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone2, s));
- return true;
- }
- rnp->exp_seq_rq = s; /* Followers can wait on us. */
- spin_unlock(&rnp->exp_lock);
- trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
- rnp->grphi, TPS("nxtlvl"));
- }
- mutex_lock(&rsp->exp_mutex);
-fastpath:
- if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
- mutex_unlock(&rsp->exp_mutex);
- return true;
- }
- rcu_exp_gp_seq_start(rsp);
- trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
- return false;
-}
-
-/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *data)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- struct rcu_state *rsp = data;
-
- rdp = this_cpu_ptr(rsp->rda);
- rnp = rdp->mynode;
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- if (rcu_is_cpu_rrupt_from_idle()) {
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data), true);
- return;
- }
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
- resched_cpu(smp_processor_id());
-}
-
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
- struct rcu_data *rdp;
- int ret;
- struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_sched_state;
-
- rdp = per_cpu_ptr(rsp->rda, cpu);
- rnp = rdp->mynode;
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
- return;
- ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
- WARN_ON_ONCE(ret);
-}
-
-/*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
- */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
- smp_call_func_t func)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- unsigned long mask_ofl_test;
- unsigned long mask_ofl_ipi;
- int ret;
- struct rcu_node *rnp;
-
- sync_exp_reset_tree(rsp);
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
- /* Each pass checks a CPU for identity, offline, and idle. */
- mask_ofl_test = 0;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (raw_smp_processor_id() == cpu ||
- !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- mask_ofl_test |= rdp->grpmask;
- }
- mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited
- * GP until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- rnp->exp_tasks = rnp->blkd_tasks.next;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- /* IPI the remaining CPUs for expedited quiescent state. */
- mask = 1;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
- if (!(mask_ofl_ipi & mask))
- continue;
-retry_ipi:
- ret = smp_call_function_single(cpu, func, rsp, 0);
- if (!ret) {
- mask_ofl_ipi &= ~mask;
- continue;
- }
- /* Failed, raced with offline. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- schedule_timeout_uninterruptible(1);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask))
- goto retry_ipi;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- }
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
- /* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
- }
-}
-
-static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
-{
- int cpu;
- unsigned long jiffies_stall;
- unsigned long jiffies_start;
- unsigned long mask;
- int ndetected;
- struct rcu_node *rnp;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
- int ret;
-
- jiffies_stall = rcu_jiffies_till_stall_check();
- jiffies_start = jiffies;
-
- for (;;) {
- ret = swait_event_timeout(
- rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root),
- jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
- return;
- if (ret < 0) {
- /* Hit a signal, disable CPU stall warnings. */
- swait_event(rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root));
- return;
- }
- pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
- rsp->name);
- ndetected = 0;
- rcu_for_each_leaf_node(rsp, rnp) {
- ndetected += rcu_print_task_exp_stall(rnp);
- mask = 1;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
- struct rcu_data *rdp;
-
- if (!(rnp->expmask & mask))
- continue;
- ndetected++;
- rdp = per_cpu_ptr(rsp->rda, cpu);
- pr_cont(" %d-%c%c%c", cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
- }
- mask <<= 1;
- }
- pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- jiffies - jiffies_start, rsp->expedited_sequence,
- rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
- if (ndetected) {
- pr_err("blocking rcu_node structures:");
- rcu_for_each_node_breadth_first(rsp, rnp) {
- if (rnp == rnp_root)
- continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done(rnp))
- continue;
- pr_cont(" l=%u:%d-%d:%#lx/%c",
- rnp->level, rnp->grplo, rnp->grphi,
- rnp->expmask,
- ".T"[!!rnp->exp_tasks]);
- }
- pr_cont("\n");
- }
- rcu_for_each_leaf_node(rsp, rnp) {
- mask = 1;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
- if (!(rnp->expmask & mask))
- continue;
- dump_cpu_task(cpu);
- }
- }
- jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
- }
-}
-
-/*
- * Wait for the current expedited grace period to complete, and then
- * wake up everyone who piggybacked on the just-completed expedited
- * grace period. Also update all the ->exp_seq_rq counters as needed
- * in order to avoid counter-wrap problems.
- */
-static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
-{
- struct rcu_node *rnp;
-
- synchronize_sched_expedited_wait(rsp);
- rcu_exp_gp_seq_end(rsp);
- trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-
- /*
- * Switch over to wakeup mode, allowing the next GP, but -only- the
- * next GP, to proceed.
- */
- mutex_lock(&rsp->exp_wake_mutex);
- mutex_unlock(&rsp->exp_mutex);
-
- rcu_for_each_node_breadth_first(rsp, rnp) {
- if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
- spin_lock(&rnp->exp_lock);
- /* Recheck, avoid hang in case someone just arrived. */
- if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
- rnp->exp_seq_rq = s;
- spin_unlock(&rnp->exp_lock);
- }
- wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
- }
- trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
- mutex_unlock(&rsp->exp_wake_mutex);
-}
-
-/**
- * synchronize_sched_expedited - Brute-force RCU-sched grace period
- *
- * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_sched_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_sched() instead.
- *
- * This implementation can be thought of as an application of sequence
- * locking to expedited grace periods, but using the sequence counter to
- * determine when someone else has already done the work instead of for
- * retrying readers.
- */
-void synchronize_sched_expedited(void)
-{
- unsigned long s;
- struct rcu_state *rsp = &rcu_sched_state;
-
- /* If only one CPU, this is automatically a grace period. */
- if (rcu_blocking_is_gp())
- return;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu_sched);
- return;
- }
-
- /* Take a snapshot of the sequence number. */
- s = rcu_exp_gp_seq_snap(rsp);
- if (exp_funnel_lock(rsp, s))
- return; /* Someone else did our work for us. */
-
- /* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
-
- /* Wait and clean up, including waking everyone. */
- rcu_exp_wait_wake(rsp, s);
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -4326,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rnp = rdp->mynode;
mask = rdp->grpmask;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinitnext |= mask;
- rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
@@ -4340,12 +3804,84 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static void rcu_prepare_cpu(int cpu)
+int rcutree_prepare_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
rcu_init_percpu_data(cpu, rsp);
+
+ rcu_prepare_kthreads(cpu);
+ rcu_spawn_all_nocb_kthreads(cpu);
+
+ return 0;
+}
+
+static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+
+ rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+}
+
+int rcutree_online_cpu(unsigned int cpu)
+{
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
+ return 0;
+}
+
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ rcutree_affinity_setting(cpu, cpu);
+ return 0;
+}
+
+
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
+ return 0;
+}
+
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
+ return 0;
+}
+
+/*
+ * Mark the specified CPU as being online so that subsequent grace periods
+ * (both expedited and normal) will wait on it. Note that this means that
+ * incoming CPUs are not allowed to use RCU read-side critical sections
+ * until this function is called. Failing to observe this restriction
+ * will result in lockdep splats.
+ */
+void rcu_cpu_starting(unsigned int cpu)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->qsmaskinitnext |= mask;
+ rnp->expmaskinitnext |= mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -4364,9 +3900,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return;
-
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
@@ -4388,52 +3921,6 @@ void rcu_report_dead(unsigned int cpu)
}
#endif
-/*
- * Handle CPU online/offline notification events.
- */
-int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
- struct rcu_state *rsp;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_prepare_cpu(cpu);
- rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- sync_sched_exp_online_cleanup(cpu);
- rcu_boost_kthread_setaffinity(rnp, -1);
- break;
- case CPU_DOWN_PREPARE:
- rcu_boost_kthread_setaffinity(rnp, cpu);
- break;
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- for_each_rcu_flavor(rsp)
- rcu_cleanup_dying_cpu(rsp);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp) {
- rcu_cleanup_dead_cpu(cpu, rsp);
- do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
- }
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
static int rcu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -4745,10 +4232,12 @@ void __init rcu_init(void)
* this is called early in boot, before either interrupts
* or the scheduler are operational.
*/
- cpu_notifier(rcu_cpu_notify, 0);
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu)
- rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ for_each_online_cpu(cpu) {
+ rcutree_prepare_cpu(cpu);
+ rcu_cpu_starting(cpu);
+ }
}
+#include "tree_exp.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e3959f5e6ddf..e99a5234d9ed 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -254,6 +254,13 @@ struct rcu_node {
} ____cacheline_internodealigned_in_smp;
/*
+ * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
+ * are indexed relative to this interval rather than the global CPU ID space.
+ * This generates the bit for a CPU in node-local masks.
+ */
+#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
+
+/*
* Do a full breadth-first scan of the rcu_node structures for the
* specified rcu_state structure.
*/
@@ -281,6 +288,14 @@ struct rcu_node {
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
+ * Iterate over all possible CPUs in a leaf RCU node.
+ */
+#define for_each_leaf_node_possible_cpu(rnp, cpu) \
+ for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
+ cpu <= rnp->grphi; \
+ cpu = cpumask_next((cpu), cpu_possible_mask))
+
+/*
* Union to allow "aggregate OR" operation on the need for a quiescent
* state by the normal and expedited grace periods.
*/
@@ -385,6 +400,7 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ atomic_long_t exp_workdone0; /* # done by workqueue. */
atomic_long_t exp_workdone1; /* # done by others #1. */
atomic_long_t exp_workdone2; /* # done by others #2. */
atomic_long_t exp_workdone3; /* # done by others #3. */
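
Editor's note: the two macros added to tree.h capture the mapping between a leaf rcu_node's CPU span [grplo, grphi] and bit positions in its node-local masks, replacing the open-coded "bit = 1; ... bit <<= 1" loops removed from tree.c and reused heavily in tree_exp.h below. A standalone demonstration of just that index arithmetic (plain C, made-up values, not kernel code):

#include <stdio.h>

int main(void)
{
	/* A hypothetical leaf node covering CPUs 16..31. */
	int grplo = 16, grphi = 31;
	unsigned long qsmask = 0;
	int cpu;

	/* Bit 0 of the node-local mask corresponds to grplo, so a CPU's
	 * bit is 1UL << (cpu - grplo), i.e. what leaf_node_cpu_bit() yields.
	 */
	for (cpu = grplo; cpu <= grphi; cpu += 2)
		qsmask |= 1UL << (cpu - grplo);	/* pretend these owe a QS */

	for (cpu = grplo; cpu <= grphi; cpu++)
		if (qsmask & (1UL << (cpu - grplo)))
			printf("CPU %d still blocks the grace period\n", cpu);
	return 0;
}
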
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
new file mode 100644
index 000000000000..24343eb87b58
--- /dev/null
+++ b/kernel/rcu/tree_exp.h
@@ -0,0 +1,685 @@
+/*
+ * RCU expedited grace periods
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2016
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+/* Wrapper functions for expedited grace periods. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = rcu_seq_snap(&rsp->expedited_sequence);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
+ return s;
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity. Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online. This means that this function normally takes its
+ * no-work-to-do fastpath.
+ */
+static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
+{
+ bool done;
+ unsigned long flags;
+ unsigned long mask;
+ unsigned long oldmask;
+ int ncpus = READ_ONCE(rsp->ncpus);
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_up;
+
+ /* If no new CPUs onlined since last time, nothing to do. */
+ if (likely(ncpus == rsp->ncpus_snap))
+ return;
+ rsp->ncpus_snap = ncpus;
+
+ /*
+ * Each pass through the following loop propagates newly onlined
+ * CPUs for the current rcu_node structure up the rcu_node tree.
+ */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->expmaskinit == rnp->expmaskinitnext) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue; /* No new CPUs, nothing to do. */
+ }
+
+ /* Update this node's mask, track old value for propagation. */
+ oldmask = rnp->expmaskinit;
+ rnp->expmaskinit = rnp->expmaskinitnext;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ /* If was already nonzero, nothing to propagate. */
+ if (oldmask)
+ continue;
+
+ /* Propagate the new CPU up the tree. */
+ mask = rnp->grpmask;
+ rnp_up = rnp->parent;
+ done = false;
+ while (rnp_up) {
+ raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
+ if (rnp_up->expmaskinit)
+ done = true;
+ rnp_up->expmaskinit |= mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
+ if (done)
+ break;
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ }
+ }
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ */
+static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree_hotplug(rsp);
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ WARN_ON_ONCE(rnp->expmask);
+ rnp->expmask = rnp->expmaskinit;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
+/*
+ * Return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period. Works only for preemptible
+ * RCU -- other RCU implementations use other means.
+ *
+ * Caller must hold the rcu_state's exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+ return rnp->exp_tasks == NULL &&
+ READ_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period. This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree. (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
+ */
+static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long mask;
+
+ for (;;) {
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ if (!rnp->expmask)
+ rcu_initiate_boost(rnp, flags);
+ else
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ break;
+ }
+ if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+ swake_up(&rsp->expedited_wq);
+ }
+ break;
+ }
+ mask = rnp->grpmask;
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
+ rnp = rnp->parent;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
+ WARN_ON_ONCE(!(rnp->expmask & mask));
+ rnp->expmask &= ~mask;
+ }
+}
+
+/*
+ * Report expedited quiescent state for specified node. This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ *
+ * Caller must hold the rcu_state's exp_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ __rcu_report_exp_rnp(rsp, rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure. Caller must hold the rcu_state's
+ * exp_mutex.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
+ unsigned long mask, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!(rnp->expmask & mask)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ rnp->expmask &= ~mask;
+ __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ */
+static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
+ bool wake)
+{
+ rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+ unsigned long s)
+{
+ if (rcu_exp_gp_seq_done(rsp, s)) {
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held. Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
+ */
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ /* Low-contention fastpath. */
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+ (rnp == rnp_root ||
+ ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+ mutex_trylock(&rsp->exp_mutex))
+ goto fastpath;
+
+ /*
+ * Each pass through the following loop works its way up
+ * the rcu_node tree, returning if others have done the work or
+ * otherwise falls through to acquire rsp->exp_mutex. The mapping
+ * from CPU to rcu_node structure can be inexact, as it is just
+ * promoting locality and is not strictly needed for correctness.
+ */
+ for (; rnp != NULL; rnp = rnp->parent) {
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ return true;
+
+ /* Work not done, either wait here or go up. */
+ spin_lock(&rnp->exp_lock);
+ if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+ /* Someone else doing GP, so wait for them. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+ rnp->grplo, rnp->grphi,
+ TPS("wait"));
+ wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ sync_exp_work_done(rsp,
+ &rdp->exp_workdone2, s));
+ return true;
+ }
+ rnp->exp_seq_rq = s; /* Followers can wait on us. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+ rnp->grphi, TPS("nxtlvl"));
+ }
+ mutex_lock(&rsp->exp_mutex);
+fastpath:
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ mutex_unlock(&rsp->exp_mutex);
+ return true;
+ }
+ rcu_exp_gp_seq_start(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+ return false;
+}
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static void sync_sched_exp_handler(void *data)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = data;
+
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ if (rcu_is_cpu_rrupt_from_idle()) {
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
+ return;
+ }
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+ resched_cpu(smp_processor_id());
+}
+
+/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
+static void sync_sched_exp_online_cleanup(int cpu)
+{
+ struct rcu_data *rdp;
+ int ret;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+ return;
+ ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
+ WARN_ON_ONCE(ret);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask_ofl_test;
+ unsigned long mask_ofl_ipi;
+ int ret;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree(rsp);
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ mask_ofl_test = 0;
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1) ||
+ !(rnp->qsmaskinitnext & rdp->grpmask))
+ mask_ofl_test |= rdp->grpmask;
+ }
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
+
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited
+ * GP until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(mask_ofl_ipi & mask))
+ continue;
+retry_ipi:
+ ret = smp_call_function_single(cpu, func, rsp, 0);
+ if (!ret) {
+ mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with CPU hotplug operation. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if ((rnp->qsmaskinitnext & mask) &&
+ (rnp->expmask & mask)) {
+ /* Online, so delay for a bit and try again. */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ schedule_timeout_uninterruptible(1);
+ goto retry_ipi;
+ }
+ /* CPU really is offline, so we can ignore it. */
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ /* Report quiescent states for those that went offline. */
+ mask_ofl_test |= mask_ofl_ipi;
+ if (mask_ofl_test)
+ rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+ }
+}
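sync_rcu_exp_select_cpus() above decides which CPUs to IPI purely by bitmask arithmetic: CPUs that are the current CPU, idle, or going offline are gathered into mask_ofl_test and never IPIed, and the remainder of the leaf's ->expmask becomes mask_ofl_ipi. The sketch below is a standalone userspace illustration of that arithmetic only; the idle/offline inputs, CPU count, and names are invented for the example.

/*
 * Userspace sketch of the mask bookkeeping in sync_rcu_exp_select_cpus().
 * The per-CPU idle[]/offline[] arrays are made-up inputs.
 */
#include <stdio.h>

#define NCPUS 8

int main(void)
{
	unsigned long expmask = 0xffUL;		/* all 8 CPUs still block the GP */
	int idle[NCPUS]    = { 0, 1, 0, 0, 1, 0, 0, 0 };
	int offline[NCPUS] = { 0, 0, 0, 0, 0, 0, 1, 0 };
	int self = 3;				/* CPU running the selection */
	unsigned long mask_ofl_test = 0, mask_ofl_ipi;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		unsigned long bit = 1UL << cpu;

		if (cpu == self || idle[cpu] || offline[cpu])
			mask_ofl_test |= bit;	/* no IPI needed for these */
	}
	mask_ofl_ipi = expmask & ~mask_ofl_test;

	printf("expmask       = %#lx\n", expmask);
	printf("mask_ofl_test = %#lx\n", mask_ofl_test);
	printf("mask_ofl_ipi  = %#lx\n", mask_ofl_ipi);
	return 0;
}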
+
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ unsigned long mask;
+ int ndetected;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+ int ret;
+
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = swait_event_timeout(
+ rsp->expedited_wq,
+ sync_rcu_preempt_exp_done(rnp_root),
+ jiffies_stall);
+ if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+ return;
+ WARN_ON(ret < 0); /* workqueues should not be signaled. */
+ if (rcu_cpu_stall_suppress)
+ continue;
+ panic_on_rcu_stall();
+ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
+ rsp->name);
+ ndetected = 0;
+ rcu_for_each_leaf_node(rsp, rnp) {
+ ndetected += rcu_print_task_exp_stall(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp;
+
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(rnp->expmask & mask))
+ continue;
+ ndetected++;
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ pr_cont(" %d-%c%c%c", cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rnp->expmaskinit)],
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ }
+ }
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ jiffies - jiffies_start, rsp->expedited_sequence,
+ rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ if (ndetected) {
+ pr_err("blocking rcu_node structures:");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_preempt_exp_done(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi,
+ rnp->expmask,
+ ".T"[!!rnp->exp_tasks]);
+ }
+ pr_cont("\n");
+ }
+ rcu_for_each_leaf_node(rsp, rnp) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(rnp->expmask & mask))
+ continue;
+ dump_cpu_task(cpu);
+ }
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
+
+/*
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
+ * grace period. Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_node *rnp;
+
+ synchronize_sched_expedited_wait(rsp);
+ rcu_exp_gp_seq_end(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+
+ /*
+ * Switch over to wakeup mode, allowing the next GP, but -only- the
+ * next GP, to proceed.
+ */
+ mutex_lock(&rsp->exp_wake_mutex);
+
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+ spin_lock(&rnp->exp_lock);
+ /* Recheck, avoid hang in case someone just arrived. */
+ if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+ rnp->exp_seq_rq = s;
+ spin_unlock(&rnp->exp_lock);
+ }
+ wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
+ }
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+ mutex_unlock(&rsp->exp_wake_mutex);
+}
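Both the funnel lock and the wakeup path above pick one of four per-node wait queues with (s >> 1) & 0x3. Shifting away the low bit, which in this scheme flags a grace period in progress, makes consecutive grace periods land on consecutive queues, so only waiters for the just-completed period are woken. A standalone userspace sketch of just that indexing arithmetic, not kernel code:

/*
 * Userspace sketch of the exp_wq[] indexing used above: four wait
 * queues per rcu_node, selected by (s >> 1) & 0x3.
 */
#include <stdio.h>

static unsigned int exp_wq_index(unsigned long s)
{
	return (s >> 1) & 0x3;
}

int main(void)
{
	unsigned long s;

	/* Even values of s: snapshots of completed/future grace periods. */
	for (s = 0; s <= 16; s += 2)
		printf("s = %2lu -> exp_wq[%u]\n", s, exp_wq_index(s));
	return 0;
}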
+
+/* Let the workqueue handler know what it is supposed to do. */
+struct rcu_exp_work {
+ smp_call_func_t rew_func;
+ struct rcu_state *rew_rsp;
+ unsigned long rew_s;
+ struct work_struct rew_work;
+};
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+}
+
+/*
+ * Given an rcu_state pointer and a smp_call_function() handler, kick
+ * off the specified flavor of expedited grace period.
+ */
+static void _synchronize_rcu_expedited(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ struct rcu_data *rdp;
+ struct rcu_exp_work rew;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(rsp->call);
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
+ if (exp_funnel_lock(rsp, s))
+ return; /* Someone else did our work for us. */
+
+ /* Marshall arguments and schedule the expedited grace period. */
+ rew.rew_func = func;
+ rew.rew_rsp = rsp;
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ schedule_work(&rew.rew_work);
+
+ /* Wait for expedited grace period to complete. */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ rnp = rcu_get_root(rsp);
+ wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ sync_exp_work_done(rsp,
+ &rdp->exp_workdone0, s));
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rsp->exp_mutex);
+}
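The pattern in _synchronize_rcu_expedited() is: marshal the arguments into a structure on the requester's stack, hand that structure to an asynchronous worker (a workqueue item here), then sleep until the worker reports completion. The sketch below is a loose userspace pthread analog of that shape, not kernel code; the names and the condition-variable completion are stand-ins for INIT_WORK_ONSTACK()/schedule_work()/wait_event().

/*
 * Userspace pthread analog of "marshal args on the stack, kick a
 * worker, wait for completion".
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct exp_work {
	unsigned long s;		/* marshalled argument (sequence snapshot) */
	bool done;			/* completion flag */
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void *exp_worker(void *arg)
{
	struct exp_work *w = arg;

	printf("worker: driving expedited GP for snapshot %lu\n", w->s);

	pthread_mutex_lock(&w->lock);
	w->done = true;				/* "grace period" complete */
	pthread_cond_broadcast(&w->cond);
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

int main(void)
{
	struct exp_work w = {
		.s    = 42,
		.done = false,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, exp_worker, &w);

	/* Requester side: block until the worker signals completion. */
	pthread_mutex_lock(&w.lock);
	while (!w.done)
		pthread_cond_wait(&w.cond, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(tid, NULL);
	return 0;
}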
+
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * and is thus not recommended for any sort of common-case code. In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
+ */
+void synchronize_sched_expedited(void)
+{
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ /* If only one CPU, this is automatically a grace period. */
+ if (rcu_blocking_is_gp())
+ return;
+
+ _synchronize_rcu_expedited(rsp, sync_sched_exp_handler);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Remote handler for smp_call_function_single(). If there is an
+ * RCU read-side critical section in effect, request that the
+ * next rcu_read_unlock() record the quiescent state up the
+ * ->expmask fields in the rcu_node tree. Otherwise, immediately
+ * report the quiescent state.
+ */
+static void sync_rcu_exp_handler(void *info)
+{
+ struct rcu_data *rdp;
+ struct rcu_state *rsp = info;
+ struct task_struct *t = current;
+
+ /*
+ * Within an RCU read-side critical section, request that the next
+ * rcu_read_unlock() report. Unless this RCU read-side critical
+ * section has already blocked, in which case it is already set
+ * up for the expedited grace period to wait on it.
+ */
+ if (t->rcu_read_lock_nesting > 0 &&
+ !t->rcu_read_unlock_special.b.blocked) {
+ t->rcu_read_unlock_special.b.exp_need_qs = true;
+ return;
+ }
+
+ /*
+ * We are either exiting an RCU read-side critical section (negative
+ * values of t->rcu_read_lock_nesting) or are not in one at all
+ * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
+ * read-side critical section that blocked before this expedited
+ * grace period started. Either way, we can immediately report
+ * the quiescent state.
+ */
+ rdp = this_cpu_ptr(rsp->rda);
+ rcu_report_exp_rdp(rsp, rdp, true);
+}
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU-preempt grace period, but expedite it. The basic
+ * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
+ * checks whether the CPU is in an RCU-preempt critical section, and
+ * if so, it sets a flag that causes the outermost rcu_read_unlock()
+ * to report the quiescent state. On the other hand, if the CPU is
+ * not in an RCU read-side critical section, the IPI handler reports
+ * the quiescent state immediately.
+ *
+ * Although this is a great improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, and is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then use a single synchronize_rcu()
+ * instead.
+ */
+void synchronize_rcu_expedited(void)
+{
+ struct rcu_state *rsp = rcu_state_p;
+
+ _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptible RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+ synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ff1cd4e1188d..85c5a883c6e3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
- if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
- pr_info("\tRCU torture testing starts during boot.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -681,84 +679,6 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-/*
- * Remote handler for smp_call_function_single(). If there is an
- * RCU read-side critical section in effect, request that the
- * next rcu_read_unlock() record the quiescent state up the
- * ->expmask fields in the rcu_node tree. Otherwise, immediately
- * report the quiescent state.
- */
-static void sync_rcu_exp_handler(void *info)
-{
- struct rcu_data *rdp;
- struct rcu_state *rsp = info;
- struct task_struct *t = current;
-
- /*
- * Within an RCU read-side critical section, request that the next
- * rcu_read_unlock() report. Unless this RCU read-side critical
- * section has already blocked, in which case it is already set
- * up for the expedited grace period to wait on it.
- */
- if (t->rcu_read_lock_nesting > 0 &&
- !t->rcu_read_unlock_special.b.blocked) {
- t->rcu_read_unlock_special.b.exp_need_qs = true;
- return;
- }
-
- /*
- * We are either exiting an RCU read-side critical section (negative
- * values of t->rcu_read_lock_nesting) or are not in one at all
- * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
- * read-side critical section that blocked before this expedited
- * grace period started. Either way, we can immediately report
- * the quiescent state.
- */
- rdp = this_cpu_ptr(rsp->rda);
- rcu_report_exp_rdp(rsp, rdp, true);
-}
-
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
- * checks whether the CPU is in an RCU-preempt critical section, and
- * if so, it sets a flag that causes the outermost rcu_read_unlock()
- * to report the quiescent state. On the other hand, if the CPU is
- * not in an RCU read-side critical section, the IPI handler reports
- * the quiescent state immediately.
- *
- * Although this is a greate improvement over previous expedited
- * implementations, it is still unfriendly to real-time workloads, so is
- * thus not recommended for any sort of common-case code. In fact, if
- * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
- * instead.
- */
-void synchronize_rcu_expedited(void)
-{
- struct rcu_state *rsp = rcu_state_p;
- unsigned long s;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
- return;
- }
-
- s = rcu_exp_gp_seq_snap(rsp);
- if (exp_funnel_lock(rsp, s))
- return; /* Someone else did our work for us. */
-
- /* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
-
- /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
- rcu_exp_wait_wake(rsp, s);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
*
@@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void)
}
/*
- * Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptible RCU does not exist, map to rcu-sched.
- */
-void synchronize_rcu_expedited(void)
-{
- synchronize_sched_expedited();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
@@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
- if ((mask & 0x1) && cpu != outgoingcpu)
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+ cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
if (cpumask_weight(cm) == 0)
cpumask_setall(cm);
@@ -2262,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg)
cl++;
c++;
local_bh_enable();
+ cond_resched_rcu_qs();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 86782f9a4604..b1f28972872c 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v)
int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
struct rcu_data *rdp;
- unsigned long s1 = 0, s2 = 0, s3 = 0;
+ unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(rsp->rda, cpu);
+ s0 += atomic_long_read(&rdp->exp_workdone0);
s1 += atomic_long_read(&rdp->exp_workdone1);
s2 += atomic_long_read(&rdp->exp_workdone2);
s3 += atomic_long_read(&rdp->exp_workdone3);
}
- seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s1, s2, s3,
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence, s0, s1, s2, s3,
atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ccdc8eebc5a..f19271dce0a9 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -46,7 +46,7 @@
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
@@ -54,7 +54,6 @@
#include "rcu.h"
-MODULE_ALIAS("rcupdate");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
@@ -380,29 +379,9 @@ void destroy_rcu_head(struct rcu_head *head)
debug_object_free(head, &rcuhead_debug_descr);
}
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
{
- struct rcu_head *head = addr;
-
- switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. We just make sure that it is
- * tracked in the object tracker.
- */
- debug_object_init(head, &rcuhead_debug_descr);
- debug_object_activate(head, &rcuhead_debug_descr);
- return 0;
- default:
- return 1;
- }
+ return true;
}
/**
@@ -440,7 +419,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
- .fixup_activate = rcuhead_fixup_activate,
+ .is_static_object = rcuhead_is_static_object,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -548,6 +527,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
module_param(rcu_task_stall_timeout, int, 0644);
static void rcu_spawn_tasks_kthread(void);
+static struct task_struct *rcu_tasks_kthread_ptr;
/*
* Post an RCU-tasks callback. First call must be from process context
@@ -557,6 +537,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
+ bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
rhp->next = NULL;
rhp->func = func;
@@ -565,7 +546,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
*rcu_tasks_cbs_tail = rhp;
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
- if (needwake) {
+ /* We can't create the thread unless interrupts are enabled. */
+ if ((needwake && havetask) ||
+ (!havetask && !irqs_disabled_flags(flags))) {
rcu_spawn_tasks_kthread();
wake_up(&rcu_tasks_cbs_wq);
}
@@ -810,7 +793,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
static void rcu_spawn_tasks_kthread(void)
{
static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
- static struct task_struct *rcu_tasks_kthread_ptr;
struct task_struct *t;
if (READ_ONCE(rcu_tasks_kthread_ptr)) {
diff --git a/kernel/relay.c b/kernel/relay.c
index 074994bcfa9b..fc9b4a4af463 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
__free_page(buf->page_array[i]);
relay_free_page_array(buf->page_array);
}
- chan->buf[buf->cpu] = NULL;
+ *per_cpu_ptr(chan->buf, buf->cpu) = NULL;
kfree(buf->padding);
kfree(buf);
kref_put(&chan->kref, relay_destroy_channel);
@@ -382,20 +382,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
*/
void relay_reset(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
- if (chan->is_global && chan->buf[0]) {
- __relay_reset(chan->buf[0], 0);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+ __relay_reset(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
- if (chan->buf[i])
- __relay_reset(chan->buf[i], 0);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ __relay_reset(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_reset);
@@ -440,7 +441,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
struct dentry *dentry;
if (chan->is_global)
- return chan->buf[0];
+ return *per_cpu_ptr(chan->buf, 0);
buf = relay_create_buf(chan);
if (!buf)
@@ -451,13 +452,20 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
if (!dentry)
goto free_buf;
relay_set_buf_dentry(buf, dentry);
+ } else {
+ /* Only retrieve global info, nothing more, nothing less */
+ dentry = chan->cb->create_buf_file(NULL, NULL,
+ S_IRUSR, buf,
+ &chan->is_global);
+ if (WARN_ON(dentry))
+ goto free_buf;
}
buf->cpu = cpu;
__relay_reset(buf, 1);
if(chan->is_global) {
- chan->buf[0] = buf;
+ *per_cpu_ptr(chan->buf, 0) = buf;
buf->cpu = 0;
}
@@ -505,46 +513,25 @@ static void setup_callbacks(struct rchan *chan,
chan->cb = cb;
}
-/**
- * relay_hotcpu_callback - CPU hotplug callback
- * @nb: notifier block
- * @action: hotplug action to take
- * @hcpu: CPU number
- *
- * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
- */
-static int relay_hotcpu_callback(struct notifier_block *nb,
- unsigned long action,
- void *hcpu)
+int relay_prepare_cpu(unsigned int cpu)
{
- unsigned int hotcpu = (unsigned long)hcpu;
struct rchan *chan;
+ struct rchan_buf *buf;
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&relay_channels_mutex);
- list_for_each_entry(chan, &relay_channels, list) {
- if (chan->buf[hotcpu])
- continue;
- chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
- if(!chan->buf[hotcpu]) {
- printk(KERN_ERR
- "relay_hotcpu_callback: cpu %d buffer "
- "creation failed\n", hotcpu);
- mutex_unlock(&relay_channels_mutex);
- return notifier_from_errno(-ENOMEM);
- }
+ mutex_lock(&relay_channels_mutex);
+ list_for_each_entry(chan, &relay_channels, list) {
+ if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+ continue;
+ buf = relay_open_buf(chan, cpu);
+ if (!buf) {
+ pr_err("relay: cpu %d buffer creation failed\n", cpu);
+ mutex_unlock(&relay_channels_mutex);
+ return -ENOMEM;
}
- mutex_unlock(&relay_channels_mutex);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- /* No need to flush the cpu : will be flushed upon
- * final relay_flush() call. */
- break;
+ *per_cpu_ptr(chan->buf, cpu) = buf;
}
- return NOTIFY_OK;
+ mutex_unlock(&relay_channels_mutex);
+ return 0;
}
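relay_prepare_cpu() above follows the common hotplug-prepare shape: when a CPU comes up, walk all channels under the channel mutex and lazily create that CPU's buffer only if it does not already exist, failing the callback on allocation error. The following standalone userspace sketch models just that shape, not the relay code itself; the channel list, buffer type, and CPU count are invented for the illustration.

/*
 * Userspace sketch of the lazy per-CPU buffer creation pattern used by
 * relay_prepare_cpu().
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4

struct buf { int cpu; };

struct chan {
	struct buf *buf[NCPUS];		/* stand-in for the per-CPU pointers */
};

static struct chan channels[2];
static pthread_mutex_t channels_mutex = PTHREAD_MUTEX_INITIALIZER;

static int demo_prepare_cpu(unsigned int cpu)
{
	size_t i;

	pthread_mutex_lock(&channels_mutex);
	for (i = 0; i < sizeof(channels) / sizeof(channels[0]); i++) {
		struct chan *chan = &channels[i];
		struct buf *buf;

		if (chan->buf[cpu])
			continue;		/* already created earlier */
		buf = calloc(1, sizeof(*buf));
		if (!buf) {
			fprintf(stderr, "cpu %u buffer creation failed\n", cpu);
			pthread_mutex_unlock(&channels_mutex);
			return -1;
		}
		buf->cpu = (int)cpu;
		chan->buf[cpu] = buf;
	}
	pthread_mutex_unlock(&channels_mutex);
	return 0;
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		printf("prepare cpu %u -> %d\n", cpu, demo_prepare_cpu(cpu));
	return 0;
}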
/**
@@ -562,6 +549,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb,
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
+ *
+ * If opening a buffer (@parent = NULL) that you later wish to register
+ * in a filesystem, call relay_late_setup_files() once the @parent dentry
+ * is available.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
@@ -572,6 +563,7 @@ struct rchan *relay_open(const char *base_filename,
{
unsigned int i;
struct rchan *chan;
+ struct rchan_buf *buf;
if (!(subbuf_size && n_subbufs))
return NULL;
@@ -582,6 +574,7 @@ struct rchan *relay_open(const char *base_filename,
if (!chan)
return NULL;
+ chan->buf = alloc_percpu(struct rchan_buf *);
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
@@ -597,9 +590,10 @@ struct rchan *relay_open(const char *base_filename,
mutex_lock(&relay_channels_mutex);
for_each_online_cpu(i) {
- chan->buf[i] = relay_open_buf(chan, i);
- if (!chan->buf[i])
+ buf = relay_open_buf(chan, i);
+ if (!buf)
goto free_bufs;
+ *per_cpu_ptr(chan->buf, i) = buf;
}
list_add(&chan->list, &relay_channels);
mutex_unlock(&relay_channels_mutex);
@@ -608,12 +602,13 @@ struct rchan *relay_open(const char *base_filename,
free_bufs:
for_each_possible_cpu(i) {
- if (chan->buf[i])
- relay_close_buf(chan->buf[i]);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_close_buf(buf);
}
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
+ kfree(chan);
return NULL;
}
EXPORT_SYMBOL_GPL(relay_open);
@@ -639,8 +634,12 @@ static void __relay_set_buf_dentry(void *info)
*
* Returns 0 if successful, non-zero otherwise.
*
- * Use to setup files for a previously buffer-only channel.
- * Useful to do early tracing in kernel, before VFS is up, for example.
+ * Use to setup files for a previously buffer-only channel created
+ * by relay_open() with a NULL parent dentry.
+ *
+ * For example, this is useful for performing early tracing in the kernel,
+ * before VFS is up and then exposing the early results once the dentry
+ * is available.
*/
int relay_late_setup_files(struct rchan *chan,
const char *base_filename,
@@ -650,6 +649,7 @@ int relay_late_setup_files(struct rchan *chan,
unsigned int i, curr_cpu;
unsigned long flags;
struct dentry *dentry;
+ struct rchan_buf *buf;
struct rchan_percpu_buf_dispatcher disp;
if (!chan || !base_filename)
@@ -665,6 +665,21 @@ int relay_late_setup_files(struct rchan *chan,
}
chan->has_base_filename = 1;
chan->parent = parent;
+
+ if (chan->is_global) {
+ err = -EINVAL;
+ buf = *per_cpu_ptr(chan->buf, 0);
+ if (!WARN_ON_ONCE(!buf)) {
+ dentry = relay_create_buf_file(chan, buf, 0);
+ if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
+ relay_set_buf_dentry(buf, dentry);
+ err = 0;
+ }
+ }
+ mutex_unlock(&relay_channels_mutex);
+ return err;
+ }
+
curr_cpu = get_cpu();
/*
* The CPU hotplug notifier ran before us and created buffers with
@@ -672,13 +687,14 @@ int relay_late_setup_files(struct rchan *chan,
* on all currently online CPUs.
*/
for_each_online_cpu(i) {
- if (unlikely(!chan->buf[i])) {
+ buf = *per_cpu_ptr(chan->buf, i);
+ if (unlikely(!buf)) {
WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
- dentry = relay_create_buf_file(chan, chan->buf[i], i);
+ dentry = relay_create_buf_file(chan, buf, i);
if (unlikely(!dentry)) {
err = -EINVAL;
break;
@@ -686,10 +702,10 @@ int relay_late_setup_files(struct rchan *chan,
if (curr_cpu == i) {
local_irq_save(flags);
- relay_set_buf_dentry(chan->buf[i], dentry);
+ relay_set_buf_dentry(buf, dentry);
local_irq_restore(flags);
} else {
- disp.buf = chan->buf[i];
+ disp.buf = buf;
disp.dentry = dentry;
smp_mb();
/* relay_channels_mutex must be held, so wait. */
@@ -705,6 +721,7 @@ int relay_late_setup_files(struct rchan *chan,
return err;
}
+EXPORT_SYMBOL_GPL(relay_late_setup_files);
/**
* relay_switch_subbuf - switch to a new sub-buffer
@@ -791,11 +808,10 @@ void relay_subbufs_consumed(struct rchan *chan,
if (!chan)
return;
- if (cpu >= NR_CPUS || !chan->buf[cpu] ||
- subbufs_consumed > chan->n_subbufs)
+ buf = *per_cpu_ptr(chan->buf, cpu);
+ if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs)
return;
- buf = chan->buf[cpu];
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
@@ -811,18 +827,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
*/
void relay_close(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
mutex_lock(&relay_channels_mutex);
- if (chan->is_global && chan->buf[0])
- relay_close_buf(chan->buf[0]);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0)))
+ relay_close_buf(buf);
else
for_each_possible_cpu(i)
- if (chan->buf[i])
- relay_close_buf(chan->buf[i]);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_close_buf(buf);
if (chan->last_toobig)
printk(KERN_WARNING "relay: one or more items not logged "
@@ -843,20 +860,21 @@ EXPORT_SYMBOL_GPL(relay_close);
*/
void relay_flush(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
- if (chan->is_global && chan->buf[0]) {
- relay_switch_subbuf(chan->buf[0], 0);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+ relay_switch_subbuf(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
- if (chan->buf[i])
- relay_switch_subbuf(chan->buf[i], 0);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_switch_subbuf(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_flush);
@@ -1346,12 +1364,3 @@ const struct file_operations relay_file_operations = {
.splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
-
-static __init int relay_init(void)
-{
-
- hotcpu_notifier(relay_hotcpu_callback, 0);
- return 0;
-}
-
-early_initcall(relay_init);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 404c0784b1fc..94732d1ab00a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
+#include <linux/prefetch.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -580,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu)
* If needed we can still optimize that later with an
* empty IRQ.
*/
+ if (cpu_is_offline(cpu))
+ return true; /* Don't try to wake offline CPUs. */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
@@ -590,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu)
return false;
}
+/*
+ * Wake up the specified CPU. If the CPU is going offline, it is the
+ * caller's responsibility to deal with the lost wakeup, for example,
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
+ */
void wake_up_nohz_cpu(int cpu)
{
if (!wake_up_full_nohz_cpu(cpu))
@@ -1062,8 +1070,12 @@ static int migration_cpu_stop(void *data)
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
- if (task_rq(p) == rq && task_on_rq_queued(p))
- rq = __migrate_task(rq, p, arg->dest_cpu);
+ if (task_rq(p) == rq) {
+ if (task_on_rq_queued(p))
+ rq = __migrate_task(rq, p, arg->dest_cpu);
+ else
+ p->wake_cpu = arg->dest_cpu;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
@@ -1104,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (running)
+ set_curr_task(rq, p);
}
/*
@@ -1264,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
- * previous cpu our targer instead of where it really is.
+ * previous cpu our target instead of where it really is.
*/
p->wake_cpu = cpu;
}
@@ -1536,7 +1548,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- if (!cpu_active(dest_cpu))
+ if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+ continue;
+ if (!cpu_online(dest_cpu))
continue;
goto out;
}
@@ -1626,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
-#ifdef CONFIG_SCHEDSTATS
- struct rq *rq = this_rq();
+ struct rq *rq;
-#ifdef CONFIG_SMP
- int this_cpu = smp_processor_id();
+ if (!schedstat_enabled())
+ return;
- if (cpu == this_cpu) {
- schedstat_inc(rq, ttwu_local);
- schedstat_inc(p, se.statistics.nr_wakeups_local);
+ rq = this_rq();
+
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu) {
+ schedstat_inc(rq->ttwu_local);
+ schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
- for_each_domain(this_cpu, sd) {
+ for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
+ schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
@@ -1650,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+ schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(p, se.statistics.nr_wakeups);
+ schedstat_inc(rq->ttwu_count);
+ schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+ schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -1768,13 +1781,15 @@ void sched_ttwu_pending(void)
cookie = lockdep_pin_lock(&rq->lock);
while (llist) {
+ int wake_flags = 0;
+
p = llist_entry(llist, struct task_struct, wake_entry);
llist = llist_next(llist);
- /*
- * See ttwu_queue(); we only call ttwu_queue_remote() when
- * its a x-cpu wakeup.
- */
- ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+ if (p->sched_remote_wakeup)
+ wake_flags = WF_MIGRATED;
+
+ ttwu_do_activate(rq, p, wake_flags, cookie);
}
lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1834,12 @@ void scheduler_ipi(void)
irq_exit();
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
if (!set_nr_if_polling(rq->idle))
smp_send_reschedule(cpu);
@@ -1869,7 +1886,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
- ttwu_queue_remote(p, cpu);
+ ttwu_queue_remote(p, cpu, wake_flags);
return;
}
#endif
@@ -1931,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(!X->on_cpu)
*
* Example:
*
@@ -1942,7 +1959,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
@@ -1968,7 +1985,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
*
*/
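The comment above describes a release/acquire handoff: the scheduling-out CPU publishes on_cpu = 0 with release semantics, and the waker polls it with acquire loads, so everything the old CPU did beforehand is visible once the poll succeeds. A standalone userspace C11-atomics analog of that pairing, not kernel code and not the kernel's primitives, only the same ordering idea:

/*
 * smp_store_release() / smp_cond_load_acquire() analog with C11
 * atomics: the acquire poll guarantees task_state is seen as 42.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;		/* task still running on the old CPU */
static int task_state;			/* ordinary data protected by the handoff */

static void *old_cpu(void *arg)
{
	(void)arg;
	task_state = 42;		/* work done while the task was on-CPU */
	/* smp_store_release(&X->on_cpu, 0) analog: */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	/* smp_cond_load_acquire(&X->on_cpu, !VAL) analog: */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;			/* spin */
	printf("waker sees task_state = %d\n", task_state);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, old_cpu, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}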
@@ -2009,6 +2026,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
@@ -2041,7 +2080,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2055,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu, wake_flags);
stat:
- if (schedstat_enabled())
- ttwu_stat(p, cpu, wake_flags);
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2066,6 +2104,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2104,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0, cookie);
- if (schedstat_enabled())
- ttwu_stat(p, smp_processor_id(), 0);
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2249,9 +2287,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+#ifdef CONFIG_SCHEDSTATS
+
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
-#ifdef CONFIG_SCHEDSTATS
static void set_schedstats(bool enabled)
{
if (enabled)
@@ -2274,11 +2314,16 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
+ /*
+ * This code is called before jump labels have been set up, so we can't
+ * change the static branch directly just yet. Instead set a temporary
+ * variable so init_schedstats() can do it later.
+ */
if (!strcmp(str, "enable")) {
- set_schedstats(true);
+ __sched_schedstats = true;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_schedstats(false);
+ __sched_schedstats = false;
ret = 1;
}
out:
@@ -2289,6 +2334,11 @@ out:
}
__setup("schedstats=", setup_schedstats);
+static void __init init_schedstats(void)
+{
+ set_schedstats(__sched_schedstats);
+}
+
#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2309,8 +2359,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
set_schedstats(state);
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#else /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
/*
* fork()/clone()-time setup:
@@ -2322,11 +2374,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2363,8 +2415,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2374,7 +2425,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2506,21 +2563,22 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Post initialize new task's util average when its cfs_rq is set */
+ rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
- rq = __task_rq_lock(p, &rf);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2723,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* task and put them back on the free list.
*/
kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
+
put_task_struct(prev);
}
@@ -2946,6 +3008,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
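prefetch_curr_exec_start() simply issues prefetches for data that will be touched shortly, so the cache miss overlaps with the work done in between. The standalone userspace sketch below shows the same idea with __builtin_prefetch(), the GCC/Clang primitive the kernel's prefetch() typically maps to; the struct and numbers are invented for the example.

/*
 * Userspace sketch of prefetching data ahead of its use.
 */
#include <stdio.h>

struct entity {
	unsigned long exec_start;
	unsigned long sum_exec_runtime;
};

static unsigned long update_runtime(struct entity *curr, unsigned long now)
{
	/* Hint the hardware before the (possibly cold) accesses below. */
	__builtin_prefetch(curr);
	__builtin_prefetch(&curr->exec_start);

	/* ... unrelated work could overlap with the prefetch here ... */

	curr->sum_exec_runtime += now - curr->exec_start;
	curr->exec_start = now;
	return curr->sum_exec_runtime;
}

int main(void)
{
	struct entity e = { .exec_start = 100, .sum_exec_runtime = 0 };

	printf("runtime = %lu\n", update_runtime(&e, 150));
	return 0;
}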
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -2979,6 +3058,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
@@ -3125,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { }
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
if (oops_in_progress)
return;
@@ -3135,13 +3218,15 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
-#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3152,7 +3237,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(task_stack_end_corrupted(prev));
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
#endif
if (unlikely(in_atomic_preempt_off())) {
@@ -3163,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
- schedstat_inc(this_rq(), sched_count);
+ schedstat_inc(this_rq()->sched_count);
}
/*
@@ -3256,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- /*
- * do_exit() calls schedule() with preemption disabled as an exception;
- * however we must fix that up, otherwise the next task will see an
- * inconsistent (higher) preempt count.
- *
- * It also avoids the below schedule_debug() test from complaining
- * about this.
- */
- if (unlikely(prev->state == TASK_DEAD))
- preempt_enable_no_resched_notrace();
-
schedule_debug(prev);
if (sched_feat(HRTICK))
@@ -3332,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt)
balance_callback(rq);
}
-STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
+
+void __noreturn do_task_dead(void)
+{
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is a race condition on mmap_sem (it is acquired by
+ * exit_mm()), and
+ * - an SMI occurs before setting TASK_RUNNING
+ * (or the hypervisor of a virtual machine switches to another guest).
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
+ *
+ * To avoid this, we have to wait until tsk->pi_lock, which is held
+ * by try_to_wake_up(), has been released.
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&current->pi_lock);
+
+ /* causes final put_task_struct in finish_task_switch(). */
+ __set_current_state(TASK_DEAD);
+ current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
+ __schedule(false);
+ BUG();
+ /* Avoid "noreturn function does return". */
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
+}
static inline void sched_submit_work(struct task_struct *tsk)
{
@@ -3616,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, queue_flag);
+ if (running)
+ set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3633,7 +3734,8 @@ out_unlock:
void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, queued;
+ bool queued, running;
+ int old_prio, delta;
struct rq_flags rf;
struct rq *rq;
@@ -3655,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
+ if (running)
+ put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -3673,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice)
if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_curr(rq);
}
+ if (running)
+ set_curr_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4172,8 +4279,6 @@ change:
prev_class = p->sched_class;
__setscheduler(rq, p, attr, pi);
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued) {
/*
* We enqueue to tail when the priority of a task is
@@ -4184,6 +4289,8 @@ change:
enqueue_task(rq, p, queue_flags);
}
+ if (running)
+ set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
preempt_disable(); /* avoid rq from going away on us */
@@ -4732,7 +4839,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4774,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield)
{
struct rq *rq = this_rq_lock();
- schedstat_inc(rq, yld_count);
+ schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
/*
@@ -4791,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+#ifndef CONFIG_PREEMPT
int __sched _cond_resched(void)
{
if (should_resched(0)) {
@@ -4800,6 +4909,7 @@ int __sched _cond_resched(void)
return 0;
}
EXPORT_SYMBOL(_cond_resched);
+#endif
/*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4925,7 +5035,7 @@ again:
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
- schedstat_inc(rq, yld_count);
+ schedstat_inc(rq->yld_count);
/*
* Make p's CPU reschedule; pick_next_entity takes care of
* fairness.
@@ -5129,14 +5239,16 @@ void show_state_filter(unsigned long state_filter)
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
- touch_all_softlockup_watchdogs();
-
#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
@@ -5343,10 +5455,10 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid;
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (running)
+ set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -5372,13 +5484,15 @@ void idle_task_exit(void)
/*
* Since this CPU is going 'away' for a while, fold any nr_active delta
* we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
*
* Also see the comment "Global load-average calculations".
*/
static void calc_load_migrate(struct rq *rq)
{
- long delta = calc_load_fold_active(rq);
+ long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
}
@@ -5641,6 +5755,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
}
#else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
@@ -5659,6 +5775,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
@@ -5689,6 +5806,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
@@ -5833,10 +5951,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
} while (sg != first);
}
-static void free_sched_domain(struct rcu_head *rcu)
+static void destroy_sched_domain(struct sched_domain *sd)
{
- struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
/*
* If its an overlapping domain it has private groups, iterate and
* nuke them all.
@@ -5847,18 +5963,26 @@ static void free_sched_domain(struct rcu_head *rcu)
kfree(sd->groups->sgc);
kfree(sd->groups);
}
+ if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+ kfree(sd->shared);
kfree(sd);
}
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
- call_rcu(&sd->rcu, free_sched_domain);
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ while (sd) {
+ struct sched_domain *parent = sd->parent;
+ destroy_sched_domain(sd);
+ sd = parent;
+ }
}
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+static void destroy_sched_domains(struct sched_domain *sd)
{
- for (; sd; sd = sd->parent)
- destroy_sched_domain(sd, cpu);
+ if (sd)
+ call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}
/*
@@ -5873,14 +5997,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
+ struct sched_domain_shared *sds = NULL;
struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
int id = cpu;
int size = 1;
@@ -5888,13 +6012,13 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- busy_sd = sd->parent; /* sd_busy */
+ sds = sd->shared;
}
- rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+ rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -5930,7 +6054,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
*/
if (parent->flags & SD_PREFER_SIBLING)
tmp->flags |= SD_PREFER_SIBLING;
- destroy_sched_domain(parent, cpu);
+ destroy_sched_domain(parent);
} else
tmp = tmp->parent;
}
@@ -5938,7 +6062,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd && sd_degenerate(sd)) {
tmp = sd;
sd = sd->parent;
- destroy_sched_domain(tmp, cpu);
+ destroy_sched_domain(tmp);
if (sd)
sd->child = NULL;
}
@@ -5948,7 +6072,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd);
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
+ destroy_sched_domains(tmp);
update_top_cache_domain(cpu);
}
@@ -6191,7 +6315,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
return;
update_group_capacity(sd, cpu);
- atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -6279,6 +6402,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+ *per_cpu_ptr(sdd->sds, cpu) = NULL;
+
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
@@ -6298,26 +6424,37 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ *
+ * Odd one out, which besides describing the topology also prescribes
+ * the desired behaviour that goes along with it:
*
- * Odd one out:
- * SD_ASYM_PACKING - describes SMT quirks
+ * SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN)
static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map,
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int sd_weight, sd_flags = 0;
+ struct sd_data *sdd = &tl->data;
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ int sd_id, sd_weight, sd_flags = 0;
#ifdef CONFIG_NUMA
/*
@@ -6366,15 +6503,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
+ .child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
};
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sched_domain_span(sd));
+
/*
* Convert topological properties into behaviour.
*/
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
@@ -6406,7 +6554,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
sd->idle_idx = 1;
}
- sd->private = &tl->data;
+ /*
+ * For all levels sharing cache, connect a sched_domain_shared
+ * instance.
+ */
+ if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+ }
+
+ sd->private = sdd;
return sd;
}
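sd_init() attaches domains that share a cache to one sched_domain_shared instance with atomic_inc(&sd->shared->ref), and destroy_sched_domain() frees it when atomic_dec_and_test() drops the last reference. The sketch below is a loose standalone userspace analog of that inc/dec-and-test-and-free lifetime rule using C11 atomics; it is not the kernel's allocation scheme, and the names and reference ownership are invented for the illustration.

/*
 * Userspace sketch of a shared object freed by whoever drops the last
 * reference, mirroring the atomic_inc()/atomic_dec_and_test() pair.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared {
	atomic_int ref;
	int nr_busy_cpus;
};

static struct shared *shared_get(struct shared *s)
{
	atomic_fetch_add_explicit(&s->ref, 1, memory_order_relaxed);
	return s;
}

static void shared_put(struct shared *s)
{
	/* acq_rel so the freer sees all prior users' writes. */
	if (atomic_fetch_sub_explicit(&s->ref, 1, memory_order_acq_rel) == 1) {
		printf("last reference dropped, freeing\n");
		free(s);
	}
}

int main(void)
{
	struct shared *s = calloc(1, sizeof(*s));
	struct shared *a, *b;

	atomic_init(&s->ref, 1);	/* the allocation itself holds one ref */
	a = shared_get(s);		/* first domain attaches */
	b = shared_get(s);		/* second domain attaches */

	shared_put(a);			/* domains detach... */
	shared_put(b);
	shared_put(s);			/* ...and the allocator's ref goes last */
	return 0;
}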
@@ -6433,6 +6591,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
void set_sched_topology(struct sched_domain_topology_level *tl)
{
+ if (WARN_ON_ONCE(sched_smp_initialized))
+ return;
+
sched_domain_topology = tl;
}
@@ -6713,6 +6874,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sd)
return -ENOMEM;
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!sdd->sds)
+ return -ENOMEM;
+
sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg)
return -ENOMEM;
@@ -6723,6 +6888,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
+ struct sched_domain_shared *sds;
struct sched_group *sg;
struct sched_group_capacity *sgc;
@@ -6733,6 +6899,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sd, j) = sd;
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sds, j) = sds;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sg)
@@ -6772,6 +6945,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
kfree(*per_cpu_ptr(sdd->sd, j));
}
+ if (sdd->sds)
+ kfree(*per_cpu_ptr(sdd->sds, j));
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
if (sdd->sgc)
@@ -6779,6 +6954,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
free_percpu(sdd->sd);
sdd->sd = NULL;
+ free_percpu(sdd->sds);
+ sdd->sds = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
free_percpu(sdd->sgc);
@@ -6790,16 +6967,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
- sd->child = child;
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
@@ -6830,6 +7003,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
+ struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6880,11 +7054,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
+
+ /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+ if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
+ if (rq && sched_debug_enabled) {
+ pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+ }
+
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7209,7 +7394,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
@@ -7244,6 +7428,22 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+
+static void sched_init_smt(void)
+{
+ /*
+ * We've enumerated all CPUs and will assume that if any CPU
+ * has SMT siblings, CPU0 will too.
+ */
+ if (cpumask_weight(cpu_smt_mask(0)) > 1)
+ static_branch_enable(&sched_smt_present);
+}
+#else
+static inline void sched_init_smt(void) { }
+#endif
+
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7273,6 +7473,9 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+
+ sched_init_smt();
+
sched_smp_initialized = true;
}
@@ -7310,6 +7513,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
@@ -7346,6 +7550,8 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -7448,10 +7654,6 @@ void __init sched_init(void)
set_load_weight(&init_task);
-#ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -7459,11 +7661,6 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
- /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7483,6 +7680,8 @@ void __init sched_init(void)
#endif
init_sched_fair_class();
+ init_schedstats();
+
scheduler_running = 1;
}
@@ -7515,6 +7714,7 @@ EXPORT_SYMBOL(__might_sleep);
void ___might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
+ unsigned long preempt_disable_ip;
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7525,6 +7725,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
return;
prev_jiffy = jiffies;
+ /* Save this before calling printk(), since that will clobber it */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
printk(KERN_ERR
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
@@ -7539,14 +7742,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (!preempt_count_equals(preempt_offset)) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
-#endif
dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
#endif
@@ -7567,12 +7770,10 @@ void normalize_rt_tasks(void)
if (p->flags & PF_KTHREAD)
continue;
- p->se.exec_start = 0;
-#ifdef CONFIG_SCHEDSTATS
- p->se.statistics.wait_start = 0;
- p->se.statistics.sleep_start = 0;
- p->se.statistics.block_start = 0;
-#endif
+ p->se.exec_start = 0;
+ schedstat_set(p->se.statistics.wait_start, 0);
+ schedstat_set(p->se.statistics.sleep_start, 0);
+ schedstat_set(p->se.statistics.block_start, 0);
if (!dl_task(p) && !rt_task(p)) {
/*
@@ -7633,7 +7834,7 @@ struct task_struct *curr_task(int cpu)
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/
-void set_curr_task(int cpu, struct task_struct *p)
+void ia64_set_curr_task(int cpu, struct task_struct *p)
{
cpu_curr(cpu) = p;
}
@@ -7687,6 +7888,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -7715,27 +7918,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7748,16 +7933,42 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
- tsk->sched_class->set_curr_task(rq);
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
+
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+ if (unlikely(running))
+ set_curr_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
@@ -8180,15 +8391,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits; all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8199,8 +8422,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if it's
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * the move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS,
};
-enum cpuacct_usage_index {
- CPUACCT_USAGE_USER, /* ... user mode */
- CPUACCT_USAGE_SYSTEM, /* ... kernel mode */
-
- CPUACCT_USAGE_NRUSAGE,
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
};
struct cpuacct_usage {
- u64 usages[CPUACCT_USAGE_NRUSAGE];
+ u64 usages[CPUACCT_STAT_NSTATS];
};
/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
/*
- * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+ * We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
- BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+ BUG_ON(index > CPUACCT_STAT_NSTATS);
#ifndef CONFIG_64BIT
/*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- if (index == CPUACCT_USAGE_NRUSAGE) {
+ if (index == CPUACCT_STAT_NSTATS) {
int i = 0;
data = 0;
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i];
} else {
data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/* return total cpu usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_USER);
+ return __cpuusage_read(css, CPUACCT_STAT_USER);
}
static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+ return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
}
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+ return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
}
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
}
static int __cpuacct_percpu_seq_show(struct seq_file *m,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
}
static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
}
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
}
-static const char * const cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ int index;
+ int cpu;
+
+ seq_puts(m, "cpu");
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %s", cpuacct_stat_desc[index]);
+ seq_puts(m, "\n");
+
+ for_each_possible_cpu(cpu) {
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+ seq_printf(m, "%d", cpu);
+
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit
+ * platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ seq_printf(m, " %llu", cpuusage->usages[index]);
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+ }
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
+ s64 val[CPUACCT_STAT_NSTATS];
int cpu;
- s64 val = 0;
+ int stat;
+ memset(val, 0, sizeof(val));
for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val = 0;
- for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
}
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+ for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+ seq_printf(sf, "%s %lld\n",
+ cpuacct_stat_desc[stat],
+ cputime64_to_clock_t(val[stat]));
+ }
return 0;
}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
.seq_show = cpuacct_percpu_sys_seq_show,
},
{
+ .name = "usage_all",
+ .seq_show = cpuacct_all_seq_show,
+ },
+ {
.name = "stat",
.seq_show = cpuacct_stats_show,
},
@@ -316,11 +348,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int index = CPUACCT_USAGE_SYSTEM;
+ int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk);
if (regs && user_mode(regs))
- index = CPUACCT_USAGE_USER;
+ index = CPUACCT_STAT_USER;
rcu_read_lock();
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..e73119013c53 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
return (i << 1) + 2;
}
-static void cpudl_exchange(struct cpudl *cp, int a, int b)
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
{
- int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+ int l, r, largest;
- swap(cp->elements[a].cpu, cp->elements[b].cpu);
- swap(cp->elements[a].dl , cp->elements[b].dl );
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
- swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
-}
-
-static void cpudl_heapify(struct cpudl *cp, int idx)
-{
- int l, r, largest;
+ if (left_child(idx) >= cp->size)
+ return;
/* adapted from lib/prio_heap.c */
while(1) {
+ u64 largest_dl;
l = left_child(idx);
r = right_child(idx);
largest = idx;
+ largest_dl = orig_dl;
- if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
- cp->elements[l].dl))
+ if ((l < cp->size) && dl_time_before(orig_dl,
+ cp->elements[l].dl)) {
largest = l;
- if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
- cp->elements[r].dl))
+ largest_dl = cp->elements[l].dl;
+ }
+ if ((r < cp->size) && dl_time_before(largest_dl,
+ cp->elements[r].dl))
largest = r;
+
if (largest == idx)
break;
- /* Push idx down the heap one level and bump one up */
- cpudl_exchange(cp, largest, idx);
+ /* pull largest child onto idx */
+ cp->elements[idx].cpu = cp->elements[largest].cpu;
+ cp->elements[idx].dl = cp->elements[largest].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
idx = largest;
}
+ /* actual push down of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
}
-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
{
- WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+ int p;
- if (dl_time_before(new_dl, cp->elements[idx].dl)) {
- cp->elements[idx].dl = new_dl;
- cpudl_heapify(cp, idx);
- } else {
- cp->elements[idx].dl = new_dl;
- while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
- cp->elements[idx].dl)) {
- cpudl_exchange(cp, idx, parent(idx));
- idx = parent(idx);
- }
- }
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
+
+ if (idx == 0)
+ return;
+
+ do {
+ p = parent(idx);
+ if (dl_time_before(orig_dl, cp->elements[p].dl))
+ break;
+ /* pull parent onto idx */
+ cp->elements[idx].cpu = cp->elements[p].cpu;
+ cp->elements[idx].dl = cp->elements[p].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+ idx = p;
+ } while (idx != 0);
+ /* actual push up of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl))
+ cpudl_heapify_up(cp, idx);
+ else
+ cpudl_heapify_down(cp, idx);
}
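Both helpers avoid the old swap-per-level cpudl_exchange(): the moving entry is saved once, larger children (or the parent) are pulled into the hole, and the saved entry is written back exactly once at its final position. A standalone sketch of that sift-down on a bare array of deadlines, without the cpu/idx bookkeeping of cpudeadline.c:

#include <stdio.h>
#include <stdint.h>

/*
 * Minimal "hole propagation" sift-down: save the moving key once, shift
 * larger children up, place the key at the end. Max-heap of deadlines only.
 */
static void sift_down(uint64_t *heap, int size, int idx)
{
        uint64_t key = heap[idx];

        for (;;) {
                int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
                uint64_t largest_key = key;

                if (l < size && heap[l] > largest_key) {
                        largest = l;
                        largest_key = heap[l];
                }
                if (r < size && heap[r] > largest_key)
                        largest = r;
                if (largest == idx)
                        break;

                heap[idx] = heap[largest];      /* pull child onto the hole */
                idx = largest;
        }
        heap[idx] = key;                        /* single final write of the saved key */
}

int main(void)
{
        uint64_t dl[] = { 10, 500, 300, 200, 100 };
        int n = sizeof(dl) / sizeof(dl[0]);
        int i;

        sift_down(dl, n, 0);
        for (i = 0; i < n; i++)
                printf("%llu ", (unsigned long long)dl[i]);
        printf("\n");
        return 0;
}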
static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
}
/*
- * cpudl_set - update the cpudl max-heap
+ * cpudl_clear - remove a cpu from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_clear(struct cpudl *cp, int cpu)
{
int old_idx, new_cpu;
unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
+
old_idx = cp->elements[cpu].idx;
- if (!is_valid) {
- /* remove item */
- if (old_idx == IDX_INVALID) {
- /*
- * Nothing to remove if old_idx was invalid.
- * This could happen if a rq_offline_dl is
- * called for a CPU without -dl tasks running.
- */
- goto out;
- }
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if a rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ } else {
new_cpu = cp->elements[cp->size - 1].cpu;
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
- while (old_idx > 0 && dl_time_before(
- cp->elements[parent(old_idx)].dl,
- cp->elements[old_idx].dl)) {
- cpudl_exchange(cp, old_idx, parent(old_idx));
- old_idx = parent(old_idx);
- }
- cpumask_set_cpu(cpu, cp->free_cpus);
- cpudl_heapify(cp, old_idx);
+ cpudl_heapify(cp, old_idx);
- goto out;
+ cpumask_set_cpu(cpu, cp->free_cpus);
}
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
+{
+ int old_idx;
+ unsigned long flags;
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+
+ old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) {
- cp->size++;
- cp->elements[cp->size - 1].dl = 0;
- cp->elements[cp->size - 1].cpu = cpu;
- cp->elements[cpu].idx = cp->size - 1;
- cpudl_change_key(cp, cp->size - 1, dl);
+ int new_idx = cp->size++;
+ cp->elements[new_idx].dl = dl;
+ cp->elements[new_idx].cpu = cpu;
+ cp->elements[cpu].idx = new_idx;
+ cpudl_heapify_up(cp, new_idx);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
- cpudl_change_key(cp, old_idx, dl);
+ cp->elements[old_idx].dl = dl;
+ cpudl_heapify(cp, old_idx);
}
-out:
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index fcbdf83fed7e..f7da8c55bba0 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu);
int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954e73b4..dbc51442ecbc 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
*/
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void (*func)(struct update_util_data *data, u64 time,
- unsigned long util, unsigned long max))
+ unsigned int flags))
{
if (WARN_ON(!data || !func))
return;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 154ae3a51e86..69e06898997d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -9,8 +9,9 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cpufreq.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/power.h>
@@ -45,10 +46,15 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cached_raw_freq;
+ unsigned long iowait_boost;
+ unsigned long iowait_boost_max;
+ u64 last_update;
+
/* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
- u64 last_update;
+ unsigned int flags;
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -104,7 +110,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @policy: cpufreq policy object to compute the new frequency for.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -119,43 +125,108 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq = C * curr_freq * util_raw / max
*
* Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency that is equal to or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct cpufreq_policy *policy,
- unsigned long util, unsigned long max)
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+ unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
- return (freq + (freq >> 2)) * util / max;
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_cpu->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
+}
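A worked example of the formula above, assuming frequency invariance so that the base frequency is cpuinfo.max_freq: freq + (freq >> 2) is the 1.25 factor, and util/max = 0.8 lands exactly on max_freq. The cached_raw_freq shortcut and the cpufreq_driver_resolve_freq() step are left out of this sketch, and the numbers are made up:

#include <stdio.h>

static unsigned int raw_next_freq(unsigned int freq, unsigned long util,
                                  unsigned long max)
{
        /* next_freq = 1.25 * freq * util / max */
        return (freq + (freq >> 2)) * util / max;
}

int main(void)
{
        unsigned int max_freq = 2000000;        /* kHz, illustrative */

        /* util/max = 0.8 is the tipping point: 1.25 * 0.8 = 1.0, i.e. max_freq. */
        printf("util 80%%  -> %u kHz\n", raw_next_freq(max_freq, 819, 1024));
        printf("util 50%%  -> %u kHz\n", raw_next_freq(max_freq, 512, 1024));
        return 0;
}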
+
+static void sugov_get_util(unsigned long *util, unsigned long *max)
+{
+ struct rq *rq = this_rq();
+ unsigned long cfs_max;
+
+ cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+
+ *util = min(rq->cfs.avg.util_avg, cfs_max);
+ *max = cfs_max;
+}
+
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ if (flags & SCHED_CPUFREQ_IOWAIT) {
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU appears to have been idle. */
+ if (delta_ns > TICK_NSEC)
+ sg_cpu->iowait_boost = 0;
+ }
+}
+
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
+ unsigned long *max)
+{
+ unsigned long boost_util = sg_cpu->iowait_boost;
+ unsigned long boost_max = sg_cpu->iowait_boost_max;
+
+ if (!boost_util)
+ return;
+
+ if (*util * boost_max < *max * boost_util) {
+ *util = boost_util;
+ *max = boost_max;
+ }
+ sg_cpu->iowait_boost >>= 1;
}
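The boost is set to iowait_boost_max on a SCHED_CPUFREQ_IOWAIT update, wins over the plain utilization whenever boost_util/boost_max exceeds util/max (the cross-multiplied comparison above), and is halved every time it is consumed. A small sketch of that decay, with made-up numbers:

#include <stdio.h>

/*
 * Illustrative decay of the iowait boost: it starts at the CPU's max
 * frequency value and is halved on each consumption, so it fades over a
 * handful of updates unless refreshed by another iowait wakeup.
 */
int main(void)
{
        unsigned long boost_max = 2000000;      /* e.g. policy->cpuinfo.max_freq */
        unsigned long boost = boost_max;        /* set on SCHED_CPUFREQ_IOWAIT */
        int update = 0;

        while (boost) {
                printf("update %d: boost = %lu\n", update++, boost);
                boost >>= 1;                    /* sg_cpu->iowait_boost >>= 1 */
        }
        printf("boost decayed after %d updates\n", update);
        return 0;
}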
static void sugov_update_single(struct update_util_data *hook, u64 time,
- unsigned long util, unsigned long max)
+ unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned long util, max;
unsigned int next_f;
+ sugov_set_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
if (!sugov_should_update_freq(sg_policy, time))
return;
- next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
- get_next_freq(policy, util, max);
+ if (flags & SCHED_CPUFREQ_RT_DL) {
+ next_f = policy->cpuinfo.max_freq;
+ } else {
+ sugov_get_util(&util, &max);
+ sugov_iowait_boost(sg_cpu, &util, &max);
+ next_f = get_next_freq(sg_cpu, util, max);
+ }
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
- unsigned long util, unsigned long max)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
+ unsigned long util, unsigned long max,
+ unsigned int flags)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
unsigned int j;
- if (util == ULONG_MAX)
+ if (flags & SCHED_CPUFREQ_RT_DL)
return max_f;
+ sugov_iowait_boost(sg_cpu, &util, &max);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu;
unsigned long j_util, j_max;
@@ -170,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
* frequency update and the time elapsed between the last update
* of the CPU utilization and the last frequency update is long
* enough, don't take the CPU into account as it probably is
- * idle now.
+ * idle now (and clear iowait_boost for it).
*/
delta_ns = last_freq_update_time - j_sg_cpu->last_update;
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
+ j_sg_cpu->iowait_boost = 0;
continue;
-
- j_util = j_sg_cpu->util;
- if (j_util == ULONG_MAX)
+ }
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
return max_f;
+ j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
+
+ sugov_iowait_boost(j_sg_cpu, &util, &max);
}
- return get_next_freq(policy, util, max);
+ return get_next_freq(sg_cpu, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
- unsigned long util, unsigned long max)
+ unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned long util, max;
unsigned int next_f;
+ sugov_get_util(&util, &max);
+
raw_spin_lock(&sg_policy->update_lock);
sg_cpu->util = util;
sg_cpu->max = max;
+ sg_cpu->flags = flags;
+
+ sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_policy, util, max);
+ next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
sugov_update_commit(sg_policy, time, next_f);
}
@@ -388,11 +468,11 @@ static int sugov_init(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
- pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+ pr_err("initialization failed (error %d)\n", ret);
return ret;
}
-static int sugov_exit(struct cpufreq_policy *policy)
+static void sugov_exit(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
struct sugov_tunables *tunables = sg_policy->tunables;
@@ -410,7 +490,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
- return 0;
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -429,9 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
if (policy_is_shared(policy)) {
- sg_cpu->util = ULONG_MAX;
+ sg_cpu->util = 0;
sg_cpu->max = 0;
+ sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->last_update = 0;
+ sg_cpu->cached_raw_freq = 0;
+ sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_shared);
} else {
@@ -442,7 +525,7 @@ static int sugov_start(struct cpufreq_policy *policy)
return 0;
}
-static int sugov_stop(struct cpufreq_policy *policy)
+static void sugov_stop(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
@@ -454,77 +537,40 @@ static int sugov_stop(struct cpufreq_policy *policy)
irq_work_sync(&sg_policy->irq_work);
cancel_work_sync(&sg_policy->work);
- return 0;
}
-static int sugov_limits(struct cpufreq_policy *policy)
+static void sugov_limits(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
if (!policy->fast_switch_enabled) {
mutex_lock(&sg_policy->work_lock);
-
- if (policy->max < policy->cur)
- __cpufreq_driver_target(policy, policy->max,
- CPUFREQ_RELATION_H);
- else if (policy->min > policy->cur)
- __cpufreq_driver_target(policy, policy->min,
- CPUFREQ_RELATION_L);
-
+ cpufreq_policy_apply_limits(policy);
mutex_unlock(&sg_policy->work_lock);
}
sg_policy->need_freq_update = true;
- return 0;
-}
-
-int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
-{
- if (event == CPUFREQ_GOV_POLICY_INIT) {
- return sugov_init(policy);
- } else if (policy->governor_data) {
- switch (event) {
- case CPUFREQ_GOV_POLICY_EXIT:
- return sugov_exit(policy);
- case CPUFREQ_GOV_START:
- return sugov_start(policy);
- case CPUFREQ_GOV_STOP:
- return sugov_stop(policy);
- case CPUFREQ_GOV_LIMITS:
- return sugov_limits(policy);
- }
- }
- return -EINVAL;
}
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
- .governor = sugov_governor,
.owner = THIS_MODULE,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
};
-static int __init sugov_module_init(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-
-static void __exit sugov_module_exit(void)
-{
- cpufreq_unregister_governor(&schedutil_gov);
-}
-
-MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
-MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
-MODULE_LICENSE("GPL");
-
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &schedutil_gov;
}
-
-fs_initcall(sugov_module_init);
-#else
-module_init(sugov_module_init);
#endif
-module_exit(sugov_module_exit);
+
+static int __init sugov_register(void)
+{
+ return cpufreq_register_governor(&schedutil_gov);
+}
+fs_initcall(sugov_register);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..5ebee3164e64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -23,10 +23,8 @@
* task when irq is in progress while we read rq->clock. That is a worthy
* compromise in place of having locks on each irq in account_system_time.
*/
-DEFINE_PER_CPU(u64, cpu_hardirq_time);
-DEFINE_PER_CPU(u64, cpu_softirq_time);
+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
@@ -39,30 +37,24 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
-#ifndef CONFIG_64BIT
-DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-#endif /* CONFIG_64BIT */
-
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
void irqtime_account_irq(struct task_struct *curr)
{
- unsigned long flags;
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
s64 delta;
int cpu;
if (!sched_clock_irqtime)
return;
- local_irq_save(flags);
-
cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
+ delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+ irqtime->irq_start_time += delta;
- irq_time_write_begin();
+ u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -70,49 +62,52 @@ void irqtime_account_irq(struct task_struct *curr)
* that does not consume any time, but still wants to run.
*/
if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
+ irqtime->hardirq_time += delta;
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
+ irqtime->softirq_time += delta;
- irq_time_write_end();
- local_irq_restore(flags);
+ u64_stats_update_end(&irqtime->sync);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
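The per-CPU irqtime struct bundles hardirq time, softirq time and a u64_stats_sync; on 32-bit that sync is a seqcount, so readers retry if a writer raced with them. A single-threaded userspace sketch of that retry pattern follows; the real helpers are u64_stats_update_begin()/_end() on the writer side and u64_stats_fetch_begin()/_retry() on the reader side, and the struct and function names below are invented:

#include <stdio.h>
#include <stdint.h>

struct demo_irqtime {
        uint64_t hardirq_time;
        uint64_t softirq_time;
        unsigned int seq;       /* odd while a writer is in progress */
};

static void writer_add_hardirq(struct demo_irqtime *it, uint64_t delta)
{
        it->seq++;                      /* begin: sequence becomes odd */
        it->hardirq_time += delta;
        it->seq++;                      /* end: sequence becomes even again */
}

static uint64_t reader_total(const struct demo_irqtime *it)
{
        unsigned int start;
        uint64_t total;

        do {
                start = it->seq;
                total = it->hardirq_time + it->softirq_time;
        } while ((start & 1) || start != it->seq);      /* retry on torn read */

        return total;
}

int main(void)
{
        struct demo_irqtime it = { 0, 0, 0 };

        writer_add_hardirq(&it, 1000);
        printf("irq time: %llu ns\n", (unsigned long long)reader_total(&it));
        return 0;
}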
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t irq_cputime;
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
+ irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
+ irq_cputime = min(irq_cputime, maxtime);
+ cpustat[idx] += irq_cputime;
+
+ return irq_cputime;
}
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
+ CPUTIME_IRQ, maxtime);
+}
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
+{
+ return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
+ CPUTIME_SOFTIRQ, maxtime);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
#define sched_clock_irqtime (0)
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+ return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+ return 0;
+}
+
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,32 +252,73 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
-static __always_inline bool steal_account_process_tick(void)
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
+ cputime_t steal_cputime;
u64 steal;
- unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
- /*
- * steal is in nsecs but our caller is expecting steal
- * time in jiffies. Lets cast the result to jiffies
- * granularity and account the rest on the next rounds.
- */
- steal_jiffies = nsecs_to_jiffies(steal);
- this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+ steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+ account_steal_time(steal_cputime);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
- account_steal_time(jiffies_to_cputime(steal_jiffies));
- return steal_jiffies;
+ return steal_cputime;
}
#endif
- return false;
+ return 0;
}
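Only up to maxtime is accounted per call, and prev_steal_time advances by the accounted amount only, so any clamped remainder is naturally picked up by a later call. A small sketch of that carry-over, ignoring the nsec/cputime conversions and with invented numbers:

#include <stdio.h>

static unsigned long prev_steal_time;   /* this_rq()->prev_steal_time analogue */

/* Account at most "maxtime" of the steal reported so far; carry the rest. */
static unsigned long steal_account(unsigned long steal_clock, unsigned long maxtime)
{
        unsigned long steal = steal_clock - prev_steal_time;

        if (steal > maxtime)
                steal = maxtime;
        prev_steal_time += steal;
        return steal;
}

int main(void)
{
        unsigned long clock = 2500;     /* total steal reported by the hypervisor */

        printf("tick 1: accounted %lu\n", steal_account(clock, 1000));
        printf("tick 2: accounted %lu\n", steal_account(clock, 1000));
        printf("tick 3: accounted %lu\n", steal_account(clock, 1000));
        return 0;
}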
/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+ cputime_t accounted;
+
+ /* Shall be converted to a lockdep-enabled lightweight check */
+ WARN_ON_ONCE(!irqs_disabled());
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_account_hi_update(max - accounted);
+
+ if (accounted < max)
+ accounted += irqtime_account_si_update(max - accounted);
+
+ return accounted;
+}
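account_other_time() hands one time budget down the chain: steal first, then hardirq, then softirq, each clamped so the total never exceeds the elapsed time passed in. A minimal sketch of that cascading clamp with arbitrary values:

#include <stdio.h>

/* Take at most "budget" out of "pending"; the caller shrinks the budget. */
static unsigned long take(unsigned long pending, unsigned long budget)
{
        return pending < budget ? pending : budget;
}

int main(void)
{
        unsigned long max = 1000;               /* elapsed time to distribute */
        unsigned long steal = 400, hi = 500, si = 700;
        unsigned long accounted;

        accounted  = take(steal, max);
        accounted += take(hi, max - accounted);
        accounted += take(si, max - accounted);

        printf("accounted %lu of %lu (steal+irq+softirq clamped)\n",
               accounted, max);
        return 0;
}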
+
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime;
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif
+
+/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
*/
@@ -294,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
unsigned int seq, nextseq;
unsigned long flags;
+ /*
+ * Update current task runtime to account pending time since last
+ * scheduler action or thread_group_cputime() call. This thread group
+ * might have other running tasks on different CPUs, but updating
+ * their runtime can affect syscall performance, so we skip accounting
+ * those pending times and rely only on values updated on tick or
+ * other scheduler action.
+ */
+ if (same_thread_group(current, tsk))
+ (void) task_sched_runtime(current);
+
rcu_read_lock();
/* Attempt a lockless read on the first round. */
nextseq = 0;
@@ -308,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
+ times->sum_exec_runtime += read_sum_exec_runtime(t);
}
/* If lockless access failed, take the lock. */
nextseq = 1;
@@ -342,21 +389,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+ cputime_t scaled, other;
- if (steal_account_process_tick())
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, irq, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(ULONG_MAX);
+ if (other >= cputime)
return;
+ cputime -= other;
+ scaled = cputime_to_scaled(cputime);
- cputime *= ticks;
- scaled *= ticks;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += cputime;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += cputime;
- } else if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time does not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
@@ -406,6 +455,10 @@ void vtime_common_task_switch(struct task_struct *prev)
}
#endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
@@ -415,33 +468,16 @@ void vtime_common_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
- }
-
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
- }
- vtime_account_system(tsk);
+ if (!in_interrupt() && is_idle_task(tsk))
+ vtime_account_idle(tsk);
+ else
+ vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
*ut = p->utime;
@@ -466,7 +502,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t cputime, scaled, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -477,26 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
- if (steal_account_process_tick())
+ cputime = cputime_one_jiffy;
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
return;
+ cputime -= steal;
+ scaled = cputime_to_scaled(cputime);
+
if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_idle_time(cputime);
}
/*
@@ -505,13 +536,21 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+ cputime_t cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
+ cputime = jiffies_to_cputime(ticks);
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
@@ -603,19 +642,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicity code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -638,7 +683,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
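cputime_adjust() splits the precise rtime between user and system time in the ratio of the tick-sampled stime:utime, which is what scale_stime() computes (with overflow protection) as stime = rtime * stime / (stime + utime). A worked example with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t rtime = 1000000;       /* precise sum_exec_runtime */
        uint64_t stime = 30;            /* sampled system ticks */
        uint64_t utime = 70;            /* sampled user ticks */
        uint64_t adj_stime, adj_utime;

        /* Proportional split; utime gets whatever is left of rtime. */
        adj_stime = rtime * stime / (stime + utime);
        adj_utime = rtime - adj_stime;

        printf("stime = %llu, utime = %llu (sum = rtime = %llu)\n",
               (unsigned long long)adj_stime,
               (unsigned long long)adj_utime,
               (unsigned long long)rtime);
        return 0;
}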
@@ -681,12 +725,21 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- unsigned long delta = now - tsk->vtime_snap;
+ cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, so there is no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+ delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
- return jiffies_to_cputime(delta);
+ return delta - other;
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -706,16 +759,6 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_end(&tsk->vtime_seqcount);
}
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- if (context_tracking_in_user())
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
-}
-
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..37e2449186c4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
- bool fallback = false;
later_rq = find_lock_later_rq(p, rq);
-
if (!later_rq) {
int cpu;
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online cpu.
*/
- fallback = true;
cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
if (cpu >= nr_cpu_ids) {
/*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
- /*
- * By now the task is replenished and enqueued; migrate it.
- */
- deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
- activate_task(later_rq, p, 0);
-
- if (!fallback)
- resched_curr(later_rq);
-
double_unlock_balance(later_rq, rq);
return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
* one, and to (try to!) reconcile itself with its own scheduling
* parameters.
*/
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ WARN_ON(dl_se->dl_boosted);
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+ dl_se->runtime = dl_se->dl_runtime;
}
/*
@@ -641,6 +629,24 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
+#ifdef CONFIG_SMP
+ if (unlikely(!rq->online)) {
+ /*
+ * If the runqueue is no longer available, migrate the
+ * task elsewhere. This necessarily changes rq.
+ */
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(&rq->lock);
+
+ /*
+ * Now that the task has been migrated to the new RQ and we
+ * have that locked, proceed as normal and enqueue the task
+ * there.
+ */
+ }
+#endif
+
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -649,19 +655,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
#ifdef CONFIG_SMP
/*
- * Perform balancing operations here; after the replenishments. We
- * cannot drop rq->lock before this, otherwise the assertion in
- * start_dl_timer() about not missing updates is not true.
- *
- * If we find that the rq the task was on is no longer available, we
- * need to select a new rq.
- *
- * XXX figure out if select_task_rq_dl() deals with offline cpus.
- */
- if (unlikely(!rq->online))
- rq = dl_task_offline_migration(rq, p);
-
- /*
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
*/
@@ -732,9 +725,8 @@ static void update_curr_dl(struct rq *rq)
return;
}
- /* kick cpufreq (see the comment in linux/cpufreq.h). */
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_trigger_update(rq_clock(rq));
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -795,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
dl_rq->earliest_dl.curr = deadline;
- cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
}
@@ -810,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
struct rb_node *leftmost = dl_rq->rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
- cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
}
@@ -1668,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
- cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
}
/* Assumes rq->lock is held */
@@ -1677,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
@@ -1720,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+
+ /* If p is not queued we will update its parameters at next wakeup. */
+ if (!task_on_rq_queued(p))
+ return;
+
+ /*
+ * If p is boosted we already updated its params in
+ * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
+ * p's deadline being now already after rq_clock(rq).
+ */
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
- setup_new_dl_entity(&p->dl, &p->dl);
+ setup_new_dl_entity(&p->dl);
- if (task_on_rq_queued(p) && rq->curr != p) {
+ if (rq->curr != p) {
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cf905f655ba1..13935886a471 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) \
+ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) \
+ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se)
return;
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
+ PN_SCHEDSTAT(se->statistics.wait_start);
+ PN_SCHEDSTAT(se->statistics.sleep_start);
+ PN_SCHEDSTAT(se->statistics.block_start);
+ PN_SCHEDSTAT(se->statistics.sleep_max);
+ PN_SCHEDSTAT(se->statistics.block_max);
+ PN_SCHEDSTAT(se->statistics.exec_max);
+ PN_SCHEDSTAT(se->statistics.slice_max);
+ PN_SCHEDSTAT(se->statistics.wait_max);
+ PN_SCHEDSTAT(se->statistics.wait_sum);
+ P_SCHEDSTAT(se->statistics.wait_count);
}
-#endif
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
#endif
+
+#undef PN_SCHEDSTAT
#undef PN
+#undef P_SCHEDSTAT
#undef P
}
#endif
@@ -427,19 +432,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
-#ifdef CONFIG_SCHEDSTATS
- if (schedstat_enabled()) {
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.statistics.wait_sum),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
- }
-#else
+
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- 0LL, 0L,
+ SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
- 0LL, 0L);
-#endif
+ SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
+
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
@@ -633,9 +631,7 @@ do { \
#undef P64
#endif
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
if (schedstat_enabled()) {
P(yld_count);
P(sched_count);
@@ -643,9 +639,8 @@ do { \
P(ttwu_count);
P(ttwu_local);
}
-
#undef P
-#endif
+
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
@@ -875,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
PN(se.exec_start);
PN(se.vruntime);
@@ -886,39 +885,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
-#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations);
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
- PN(se.statistics.sum_sleep_runtime);
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
+ PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+ PN_SCHEDSTAT(se.statistics.wait_start);
+ PN_SCHEDSTAT(se.statistics.sleep_start);
+ PN_SCHEDSTAT(se.statistics.block_start);
+ PN_SCHEDSTAT(se.statistics.sleep_max);
+ PN_SCHEDSTAT(se.statistics.block_max);
+ PN_SCHEDSTAT(se.statistics.exec_max);
+ PN_SCHEDSTAT(se.statistics.slice_max);
+ PN_SCHEDSTAT(se.statistics.wait_max);
+ PN_SCHEDSTAT(se.statistics.wait_sum);
+ P_SCHEDSTAT(se.statistics.wait_count);
+ PN_SCHEDSTAT(se.statistics.iowait_sum);
+ P_SCHEDSTAT(se.statistics.iowait_count);
+ P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+ P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+ P_SCHEDSTAT(se.statistics.nr_wakeups);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
@@ -937,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
__PN(avg_atom);
__PN(avg_per_cpu);
}
-#endif
+
__P(nr_switches);
SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
@@ -954,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#endif
P(policy);
P(prio);
+#undef PN_SCHEDSTAT
#undef PN
#undef __PN
+#undef P_SCHEDSTAT
#undef P
#undef __P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..502e95a6e927 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * 1024 < capacity * margin
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
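With SCHED_CAPACITY_SCALE at 1024, a margin of 1280 means a task is only considered to fit a CPU if its utilization stays below capacity * 1280/1024, i.e. the CPU keeps roughly 20% headroom. A small standalone sketch of that comparison (the helper name is made up for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024UL

    static unsigned long capacity_margin = 1280;    /* ~20% headroom */

    /* The "util * 1024 < capacity * margin" test described in the comment. */
    static bool fits_capacity(unsigned long util, unsigned long capacity)
    {
            return util * SCHED_CAPACITY_SCALE < capacity * capacity_margin;
    }

    int main(void)
    {
            /* A little CPU with capacity 430: the threshold util is ~537. */
            printf("util 400 fits: %d\n", fits_capacity(400, 430));  /* 1 */
            printf("util 600 fits: %d\n", fits_capacity(600, 430));  /* 0 */
            return 0;
    }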
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
static inline struct task_struct *task_of(struct sched_entity *se)
{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!entity_is_task(se));
-#endif
+ SCHED_WARN_ON(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *curr = cfs_rq->curr;
+
u64 vruntime = cfs_rq->min_vruntime;
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
+ if (curr) {
+ if (curr->on_rq)
+ vruntime = curr->vruntime;
+ else
+ curr = NULL;
+ }
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
- if (!cfs_rq->curr)
+ if (!curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
@@ -690,6 +700,11 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -720,6 +735,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ u64 now = cfs_rq_clock_task(cfs_rq);
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -733,18 +749,41 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = now;
+ return;
+ }
+ }
+
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
}
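From the lines visible here, a freshly created entity is seeded with at most half of the spare utilization left on its cfs_rq (cap = (SCHED_CAPACITY_SCALE - util_avg) / 2), and util_sum is kept consistent by scaling with LOAD_AVG_MAX. A back-of-the-envelope check of that seeding rule (the utilization figure is invented):

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024L
    #define LOAD_AVG_MAX 47742      /* PELT's maximum accumulated sum */

    int main(void)
    {
            long cfs_util_avg = 600;        /* current cfs_rq utilization */

            /* Half of the spare capacity, as in post_init_entity_util_avg(). */
            long cap = (SCHED_CAPACITY_SCALE - cfs_util_avg) / 2;

            printf("initial util_avg cap: %ld\n", cap);                  /* 212 */
            printf("matching util_sum:    %ld\n", cap * LOAD_AVG_MAX);
            return 0;
    }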
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
-#else
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
-#endif
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -768,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
@@ -789,26 +828,34 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
-#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start = rq_clock(rq_of(cfs_rq));
+ u64 wait_start, prev_wait_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ wait_start = rq_clock(rq_of(cfs_rq));
+ prev_wait_start = schedstat_val(se->statistics.wait_start);
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > se->statistics.wait_start))
- wait_start -= se->statistics.wait_start;
+ likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
- se->statistics.wait_start = wait_start;
+ schedstat_set(se->statistics.wait_start, wait_start);
}
-static void
+static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta;
- delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ if (!schedstat_enabled())
+ return;
+
+ delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
p = task_of(se);
@@ -818,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- se->statistics.wait_start = delta;
+ schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- se->statistics.wait_max = max(se->statistics.wait_max, delta);
- se->statistics.wait_count++;
- se->statistics.wait_sum += delta;
- se->statistics.wait_start = 0;
+ schedstat_set(se->statistics.wait_max,
+ max(schedstat_val(se->statistics.wait_max), delta));
+ schedstat_inc(se->statistics.wait_count);
+ schedstat_add(se->statistics.wait_sum, delta);
+ schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *tsk = NULL;
+ u64 sleep_start, block_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ sleep_start = schedstat_val(se->statistics.sleep_start);
+ block_start = schedstat_val(se->statistics.block_start);
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ schedstat_set(se->statistics.sleep_max, delta);
+
+ schedstat_set(se->statistics.sleep_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
+ }
+ if (block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ schedstat_set(se->statistics.block_max, delta);
+
+ schedstat_set(se->statistics.block_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ if (tsk->in_iowait) {
+ schedstat_add(se->statistics.iowait_sum, delta);
+ schedstat_inc(se->statistics.iowait_count);
+ trace_sched_stat_iowait(tsk, delta);
+ }
+
+ trace_sched_stat_blocked(tsk, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+ }
}
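A right shift by 20 divides the nanosecond delta by 2^20 = 1048576, a cheap stand-in for dividing by one million, so profile_hits() gets a roughly millisecond-sized count; the delta >> 10 passed to account_scheduler_latency() is the same trick for microseconds. A quick arithmetic check:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long delta_ns = 3000000ULL;    /* 3 ms of blocking */

            /* >> 20 divides by 1048576: a cheap ~millisecond approximation */
            printf("approx ms: %llu\n", delta_ns >> 20); /* 2 (exact: 3)    */
            /* >> 10 divides by 1024: a cheap ~microsecond approximation    */
            printf("approx us: %llu\n", delta_ns >> 10); /* 2929 (exact: 3000) */
            return 0;
    }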
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (!schedstat_enabled())
+ return;
+
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper(cfs_rq, se);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+
+ if (!schedstat_enabled())
+ return;
+
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
@@ -854,39 +980,17 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ schedstat_set(se->statistics.sleep_start,
+ rq_clock(rq_of(cfs_rq)));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ schedstat_set(se->statistics.block_start,
+ rq_clock(rq_of(cfs_rq)));
}
-
-}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
}
-#endif
/*
* We are picking a new current task - update its stats:
@@ -1305,6 +1409,8 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1372,31 +1478,11 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
@@ -1479,7 +1565,6 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1501,20 +1586,21 @@ balance:
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ if (!cur) {
+ /*
+ * select_idle_sibling() uses a per-cpu cpumask that can also be
+ * used from IRQ context, so disable IRQs around the call.

+ */
+ local_irq_disable();
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
+ local_irq_enable();
+ }
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2287,7 +2373,7 @@ void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0;
long pages, virtpages;
- WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
@@ -2499,28 +2585,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
-
- return tg_weight;
-}
+ load = scale_load_down(cfs_rq->load.weight);
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2539,6 +2619,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
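The rewritten calc_cfs_shares() hands each cfs_rq a slice of tg->shares proportional to its own weight over the group-wide weight, after bumping the group weight so it can never be smaller than the local contribution. A numeric sketch of that proportion (all figures invented):

    #include <stdio.h>

    int main(void)
    {
            long tg_shares = 1024;   /* group's configured shares            */
            long tg_load   = 3000;   /* sum of all cfs_rq contributions      */
            long contrib   = 800;    /* what this cfs_rq last contributed    */
            long load      = 1200;   /* this cfs_rq's current load.weight    */

            /* Ensure tg_weight >= load, as in calc_cfs_shares(). */
            long tg_weight = tg_load - contrib + load;      /* 3400 */

            long shares = tg_shares * load;
            if (tg_weight)
                    shares /= tg_weight;          /* 1024 * 1200 / 3400 = 361 */

            printf("per-cpu shares: %ld\n", shares);
            return 0;
    }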
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
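The kernel-doc just added describes a differential update: each cfs_rq remembers what it last propagated into the shared tg->load_avg and only publishes the difference once it is no longer "small", which keeps contention on the global atomic low. A standalone sketch of that pattern, assuming a 1/64 "small enough" threshold similar to what the in-tree helper uses; the names are illustrative:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static atomic_long tg_load_avg = 0;      /* shared, group-wide value */

    struct cfs_rq_sketch {
            long load_avg;                   /* this runqueue's average   */
            long tg_load_avg_contrib;        /* what we last propagated   */
    };

    /* Publish only when the delta outgrows 1/64th of the last contribution. */
    static void update_tg_load_avg_sketch(struct cfs_rq_sketch *cfs_rq, int force)
    {
            long delta = cfs_rq->load_avg - cfs_rq->tg_load_avg_contrib;

            if (force || labs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                    atomic_fetch_add(&tg_load_avg, delta);
                    cfs_rq->tg_load_avg_contrib = cfs_rq->load_avg;
            }
    }

    int main(void)
    {
            struct cfs_rq_sketch rq = { .load_avg = 1000 };

            update_tg_load_avg_sketch(&rq, 0);  /* delta 1000: published        */
            rq.load_avg = 1005;
            update_tg_load_avg_sketch(&rq, 0);  /* delta 5 <= 1000/64: skipped  */
            printf("tg_load_avg = %ld\n", atomic_load(&tg_load_avg));  /* 1000 */
            return 0;
    }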
@@ -2873,16 +2966,9 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
-
- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- unsigned long max = rq->cpu_capacity_orig;
-
+ if (&this_rq()->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -2899,12 +2985,44 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ cpufreq_update_util(rq_of(cfs_rq), 0);
}
}
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
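sub_positive() is a read-once/clamp/write-once pattern: a single READ_ONCE snapshot, an unsigned-underflow check (res > var can only hold if the subtraction wrapped), and a single WRITE_ONCE, so a lockless reader sees either the old or the new value but never a wrapped-around huge one. A userspace rendition, with volatile accesses standing in for READ_ONCE/WRITE_ONCE:

    #include <stdio.h>

    /* Userspace stand-ins for the kernel's READ_ONCE/WRITE_ONCE. */
    #define READ_ONCE(x)      (*(volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, v)  (*(volatile __typeof__(x) *)&(x) = (v))

    #define sub_positive(_ptr, _val) do {                           \
            __typeof__(_ptr) ptr = (_ptr);                          \
            __typeof__(*ptr) val = (_val);                          \
            __typeof__(*ptr) res, var = READ_ONCE(*ptr);            \
            res = var - val;                                        \
            if (res > var)          /* unsigned underflow */        \
                    res = 0;                                        \
            WRITE_ONCE(*ptr, res);                                  \
    } while (0)

    int main(void)
    {
            unsigned long load_avg = 100;

            sub_positive(&load_avg, 40);    /* 60                             */
            sub_positive(&load_avg, 90);    /* clamps to 0 instead of wrapping */
            printf("%lu\n", load_avg);
            return 0;
    }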
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the caller do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities' (blocked and runnable)
+ * averages. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
@@ -2913,15 +3031,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- sa->load_avg = max_t(long, sa->load_avg - r, 0);
- sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->load_avg, r);
+ sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sa->util_avg = max_t(long, sa->util_avg - r, 0);
- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->util_avg, r);
+ sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
}
@@ -2959,6 +3077,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
update_tg_load_avg(cfs_rq, 0);
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2967,6 +3093,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time.
+ *
+ * Or we're fresh through post_init_entity_util_avg().
*/
if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2988,16 +3116,24 @@ skip_aging:
cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
&se->avg, se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
cfs_rq_util_change(cfs_rq);
}
@@ -3072,11 +3208,14 @@ void remove_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
last_update_time = cfs_rq_last_update_time(cfs_rq);
@@ -3099,12 +3238,15 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- struct rq *rq = rq_of(cfs_rq);
+ return 0;
+}
- cpufreq_trigger_update(rq_clock(rq));
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+ cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
static inline void
@@ -3125,68 +3267,6 @@ static inline int idle_balance(struct rq *rq)
#endif /* CONFIG_SMP */
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
- struct task_struct *tsk = NULL;
-
- if (entity_is_task(se))
- tsk = task_of(se);
-
- if (se->statistics.sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.sleep_max))
- se->statistics.sleep_max = delta;
-
- se->statistics.sleep_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (se->statistics.block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.block_max))
- se->statistics.block_max = delta;
-
- se->statistics.block_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- if (tsk->in_iowait) {
- se->statistics.iowait_sum += delta;
- se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
-#endif
-}
-
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -3196,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
d = -d;
if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq, nr_spread_over);
+ schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
@@ -3246,7 +3326,7 @@ static inline void check_schedstat_required(void)
trace_sched_stat_iowait_enabled() ||
trace_sched_stat_blocked_enabled() ||
trace_sched_stat_runtime_enabled()) {
- pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
"stat_blocked and stat_runtime require the "
"kernel parameter schedstats=enabled or "
"kernel.sched_schedstats=1\n");
@@ -3313,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
- if (flags & ENQUEUE_WAKEUP) {
+ if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
- if (schedstat_enabled())
- enqueue_sleeper(cfs_rq, se);
- }
check_schedstat_required();
- if (schedstat_enabled()) {
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- }
+ update_stats_enqueue(cfs_rq, se, flags);
+ check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3390,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- if (schedstat_enabled())
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3401,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
account_entity_dequeue(cfs_rq, se);
/*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized position.
+ * Normalize after update_curr(), which will also have moved
+ * min_vruntime if @se is the one holding it back. But do this before
+ * update_min_vruntime() runs again, since that will discount @se's
+ * position and can move min_vruntime even further forward.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
@@ -3411,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
+
+ /*
+ * Now advance min_vruntime if @se was the entity holding it back,
+ * except when DEQUEUE_SAVE && !DEQUEUE_MOVE; in that case we'll be
+ * put back on, and if we advance min_vruntime, we'll be placed back
+ * further than we started -- i.e. we'll be penalized.
+ */
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ update_min_vruntime(cfs_rq);
}
/*
@@ -3465,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- if (schedstat_enabled())
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->statistics.slice_max = max(se->statistics.slice_max,
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ schedstat_set(se->statistics.slice_max,
+ max((u64)schedstat_val(se->statistics.slice_max),
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
-#endif
+
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
@@ -3562,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- if (schedstat_enabled()) {
- check_spread(cfs_rq, prev);
- if (prev->on_rq)
- update_stats_wait_start(cfs_rq, prev);
- }
+ check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -3688,7 +3768,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
@@ -3826,13 +3906,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
-#endif
return 0;
}
@@ -4199,6 +4277,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -4338,6 +4433,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4382,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- WARN_ON(task_rq(p) != rq);
+ SCHED_WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -4435,6 +4531,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -4446,7 +4550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -4500,15 +4604,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && parent_entity(se))
- set_next_buddy(parent_entity(se));
-
- /* avoid re-evaluating load for this entity */
- se = parent_entity(se);
+ if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ set_next_buddy(se);
break;
}
flags |= DEQUEUE_SLEEP;
@@ -4532,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' array crap; XXX kill this.
@@ -4910,27 +5018,32 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * (long)tg->shares) / W;
+ wl = (w * (long)scale_load_down(tg->shares)) / W;
else
- wl = tg->shares;
+ wl = scale_load_down(tg->shares);
/*
* Per the above, wl is the new se->load.weight value; since
@@ -5013,18 +5126,18 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
@@ -5068,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
balanced = this_eff_load <= prev_eff_load;
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (!balanced)
return 0;
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
return 1;
}
@@ -5150,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
@@ -5187,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
}
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs, so that not all CPUs looking for an
+ * idle CPU find the same one. The downside is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * tbench in particular has been found to be sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+ int next;
+
+again:
+ next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+ if (*wrapped) {
+ if (next >= start)
+ return nr_cpumask_bits;
+ } else {
+ if (next >= nr_cpumask_bits) {
+ *wrapped = 1;
+ n = -1;
+ goto again;
+ }
+ }
+
+ return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+ for ((wrap) = 0, (cpu) = (start)-1; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+ (cpu) < nr_cpumask_bits; )
+
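for_each_cpu_wrap() walks the mask from @start to the end and then wraps to bit 0, stopping just before @start again; because each caller starts at a different CPU, concurrent searchers do not all converge on the first idle bit in the mask. The same idea over a plain array (a sketch, not the cpumask API):

    #include <stdio.h>

    #define NR 8

    /*
     * Visit every index of a length-NR array exactly once, starting at
     * 'start' and wrapping around, the idea behind for_each_cpu_wrap().
     */
    int main(void)
    {
            int idle[NR] = { 0, 0, 0, 1, 0, 1, 0, 0 };
            int start = 5;

            for (int n = 0; n < NR; n++) {
                    int cpu = (start + n) % NR;

                    if (idle[cpu]) {
                            printf("picked cpu %d\n", cpu);  /* 5, not 3 */
                            break;
                    }
            }
            return 0;
    }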
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core, true))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int core, cpu, wrap;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+ for_each_cpu_wrap(core, cpus, target, wrap) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ return core;
+ }
+
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ set_idle_cores(target, 0);
+
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ u64 avg_idle = this_rq()->avg_idle;
+ u64 avg_cost = this_sd->avg_scan_cost;
+ u64 time, cost;
+ s64 delta;
+ int cpu, wrap;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particular is sensitive here.
+ */
+ if ((avg_idle / 512) < avg_cost)
+ return -1;
+
+ time = local_clock();
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ break;
+ }
+
+ time = local_clock() - time;
+ cost = this_sd->avg_scan_cost;
+ delta = (s64)(time - cost) / 8;
+ this_sd->avg_scan_cost += delta;
+
+ return cpu;
+}
+
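select_idle_cpu() throttles itself in two ways: it refuses to scan when the rq's average idle time is much shorter than the last measured scan cost (the /512 fuzz factor), and it folds each measured scan time back into sd->avg_scan_cost with a 1/8 exponential moving average (delta = (time - cost) / 8). An arithmetic check of both pieces (the numbers are invented):

    #include <stdio.h>

    int main(void)
    {
            long long avg_scan_cost = 2000;     /* ns, running estimate  */
            long long sample        = 3600;     /* ns, cost of this scan */

            /* avg += (sample - avg) / 8, as done at the end of the scan. */
            avg_scan_cost += (sample - avg_scan_cost) / 8;
            printf("new avg_scan_cost: %lld ns\n", avg_scan_cost);      /* 2200 */

            /* Gate for the next scan: proceed only if avg_idle/512 >= avg cost. */
            long long avg_idle = 2000000;       /* 2 ms of average idle time */
            printf("scan allowed: %d\n", (avg_idle / 512) >= avg_scan_cost); /* 1 */
            return 0;
    }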
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
+ */
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
- int i = task_cpu(p);
+ int i;
if (idle_cpu(target))
return target;
/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ return prev;
- /*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
- */
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
- }
-done:
return target;
}
@@ -5282,6 +5572,32 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util;
}
+static inline int task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
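wake_cap() only fires on asymmetric-capacity systems: when the smallest and largest CPU capacities differ by less than max_cap/8 the affine fast path is kept, otherwise the task must fit the smaller candidate CPU under the same ~20% capacity_margin introduced earlier. A worked example with invented big.LITTLE-style numbers:

    #include <stdio.h>

    int main(void)
    {
            long min_cap = 430, max_cap = 1024;     /* little vs. big CPU */
            long task_util = 400;
            long capacity_margin = 1280;

            /* Capacities close enough? (difference under max_cap/8 = 128) */
            if ((max_cap - min_cap) < (max_cap >> 3)) {
                    printf("symmetric enough, wake_affine allowed\n");
                    return 0;
            }

            /* Otherwise require the task to fit the smaller CPU with margin. */
            int too_big = min_cap * 1024 < task_util * capacity_margin;
            printf("disable wake_affine: %d\n", too_big); /* 1: 440320 < 512000 */
            return 0;
    }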
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5305,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
rcu_read_lock();
@@ -5331,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else while (sd) {
struct sched_group *group;
@@ -5861,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* The adjacency matrix of the resulting graph is given by:
*
- * log_2 n
+ * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
@@ -5907,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
- */
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
@@ -6055,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu;
- schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
@@ -6086,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_running);
return 0;
}
@@ -6103,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->se.statistics.nr_forced_migrations);
}
return 1;
}
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
return 0;
}
@@ -6149,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* so we can safely collect stats here rather than
* inside detach_tasks().
*/
- schedstat_inc(env->sd, lb_gained[env->idle]);
+ schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
return NULL;
@@ -6241,7 +6558,7 @@ next:
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], detached);
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
@@ -6569,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
- */
+ */
group = child->groups;
do {
@@ -7069,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= NICE_0_LOAD;
+ load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
@@ -7276,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
@@ -7382,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask);
- schedstat_inc(sd, lb_count[idle]);
+ schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
@@ -7392,19 +7706,19 @@ redo:
group = find_busiest_group(&env);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[idle]);
+ schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[idle]);
+ schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
@@ -7511,7 +7825,7 @@ more_balance:
}
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
@@ -7594,7 +7908,7 @@ out_all_pinned:
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
- schedstat_inc(sd, lb_balanced[idle]);
+ schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
@@ -7626,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
}
static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
unsigned long interval, next;
- interval = get_sd_balance_interval(sd, cpu_busy);
+ /* used by idle balance, so cpu_busy = 0 */
+ interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
@@ -7660,7 +7975,7 @@ static int idle_balance(struct rq *this_rq)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
@@ -7678,7 +7993,7 @@ static int idle_balance(struct rq *this_rq)
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
break;
}
@@ -7696,7 +8011,7 @@ static int idle_balance(struct rq *this_rq)
curr_cost += domain_cost;
}
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
@@ -7786,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data)
.idle = CPU_IDLE,
};
- schedstat_inc(sd, alb_count);
+ schedstat_inc(sd->alb_count);
p = detach_one_task(&env);
if (p) {
- schedstat_inc(sd, alb_pushed);
+ schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
- schedstat_inc(sd, alb_failed);
+ schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
@@ -7886,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void)
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7903,13 +8218,13 @@ void set_cpu_sd_state_idle(void)
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -8136,8 +8451,8 @@ end:
static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
struct sched_domain *sd;
- struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
bool kick = false;
@@ -8165,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
return true;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
- sgc = sd->groups->sgc;
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
-
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
goto unlock;
@@ -8283,31 +8600,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8320,8 +8623,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -8377,6 +8679,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
if (!vruntime_normalized(p)) {
/*
@@ -8388,13 +8691,16 @@ static void detach_task_cfs_rq(struct task_struct *p)
}
/* Catch up with the cfs_rq and remove our load when we leave */
+ update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8405,7 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif
/* Synchronize task with its cfs_rq */
+ update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8465,6 +8773,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -8477,6 +8793,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -8496,8 +8825,9 @@ void free_fair_sched_group(struct task_group *tg)
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8842,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -8525,7 +8857,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
- post_init_entity_util_avg(se);
}
return 1;
@@ -8536,6 +8867,23 @@ err:
return 0;
}
+void online_fair_sched_group(struct task_group *tg)
+{
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ sync_throttle(tg, i);
+ raw_spin_unlock_irq(&rq->lock);
+ }
+}
+
void unregister_fair_sched_group(struct task_group *tg)
{
unsigned long flags;
@@ -8640,6 +8988,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
+void online_fair_sched_group(struct task_group *tg) { }
+
void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8699,7 +9049,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
};
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index bd12c6c714ec..9fb873cfc75c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
static void cpuidle_idle_call(void)
{
- struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_device *dev = cpuidle_get_device();
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
@@ -201,6 +201,8 @@ exit_idle:
*/
static void cpu_idle_loop(void)
{
+ int cpu = smp_processor_id();
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 2ce5458bbe1d..5405d3feb112 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,8 +27,8 @@ static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
-
- schedstat_inc(rq, sched_goidle);
+ update_idle_core(rq);
+ schedstat_inc(rq->sched_goidle);
return rq->idle;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..a2d6eb71f06b 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
-long calc_load_fold_active(struct rq *this_rq)
+long calc_load_fold_active(struct rq *this_rq, long adjust)
{
long nr_active, delta = 0;
- nr_active = this_rq->nr_running;
+ nr_active = this_rq->nr_running - adjust;
nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
@@ -188,7 +188,7 @@ void calc_load_enter_idle(void)
* We're going into NOHZ mode, if there's any pending delta, fold it
* into the pending idle delta.
*/
- delta = calc_load_fold_active(this_rq);
+ delta = calc_load_fold_active(this_rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq)
if (time_before(jiffies, this_rq->calc_load_update))
return;
- delta = calc_load_fold_active(this_rq);
+ delta = calc_load_fold_active(this_rq, 0);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
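calc_load_fold_active() now takes an adjust count so a caller can exclude tasks from the fold; both call sites above pass 0. A sketch of the intended non-zero use (the calling context is illustrative, not taken from this diff):

/* Sketch: fold the load average while discounting one running task. */
static void example_fold_discounting_one(struct rq *rq)
{
	long delta = calc_load_fold_active(rq, 1);

	if (delta)
		atomic_long_add(delta, &calc_load_tasks);
}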
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b722691..2516b8df6dbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
- /* Kick cpufreq (see the comment in linux/cpufreq.h). */
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_trigger_update(rq_clock(rq));
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..055f935d4421 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
+#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
@@ -15,6 +16,12 @@
#include "cpudeadline.h"
#include "cpuacct.h"
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+#else
+#define SCHED_WARN_ON(x) ((void)(x))
+#endif
+
struct rq;
struct cpuidle_state;
@@ -28,7 +35,7 @@ extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
-extern long calc_load_fold_active(struct rq *this_rq);
+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
@@ -321,6 +328,7 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -564,6 +572,8 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
+
+ unsigned long max_cpu_capacity;
};
extern struct root_domain def_root_domain;
@@ -596,7 +606,6 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
#endif /* CONFIG_SMP */
- u64 nohz_stamp;
unsigned long nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
@@ -722,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+
+#ifdef CONFIG_SCHED_SMT
+
+extern struct static_key_false sched_smt_present;
+
+extern void __update_idle_core(struct rq *rq);
+
+static inline void update_idle_core(struct rq *rq)
+{
+ if (static_branch_unlikely(&sched_smt_present))
+ __update_idle_core(rq);
+}
+
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
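update_idle_core() is gated on the sched_smt_present static key, so non-SMT machines only pay for a patched-out branch. Below is a sketch of how the key could be flipped when a CPU with SMT siblings comes online; the enabling site is an assumption and is not part of the hunks shown here.

#ifdef CONFIG_SCHED_SMT
/* Sketch: enable the key once a CPU reports more than one SMT sibling. */
static void example_enable_smt_key(int cpu)
{
	if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
		static_branch_enable(&sched_smt_present);
}
#endif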
@@ -856,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity {
@@ -869,10 +895,6 @@ struct sched_group_capacity {
unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- /*
- * Number of busy cpus in this group.
- */
- atomic_t nr_busy_cpus;
unsigned long cpumask[0]; /* iteration mask */
};
@@ -999,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ p->cpu = cpu;
+#else
task_thread_info(p)->cpu = cpu;
+#endif
p->wake_cpu = cpu;
#endif
}
@@ -1113,7 +1139,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1246,8 +1272,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group) (struct task_struct *p, int type);
#endif
};
@@ -1256,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
prev->sched_class->put_prev_task(rq, prev);
}
+static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+{
+ curr->sched_class->set_curr_task(rq);
+}
+
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1286,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- WARN_ON(!rcu_read_lock_held());
+ SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
#else
@@ -1706,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 hardirq_time;
+ u64 softirq_time;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
-DECLARE_PER_CPU(u64, cpu_hardirq_time);
-DECLARE_PER_CPU(u64, cpu_softirq_time);
-
-#ifndef CONFIG_64BIT
-DECLARE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
static inline u64 irq_time_read(int cpu)
{
- u64 irq_time;
- unsigned seq;
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->softirq_time + irqtime->hardirq_time;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+ return total;
}
-#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
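IRQ time is now published through u64_stats_sync. The matching writer lives in the cputime code and is not shown here, so its exact shape below is an assumption; the point is that updates are bracketed by u64_stats_update_begin()/end() so irq_time_read() can retry on 32-bit without a hand-rolled seqcount.

/* Sketch of the writer side paired with irq_time_read() above. */
static void example_account_irq_delta(struct irqtime *irqtime, u64 delta, bool hardirq)
{
	u64_stats_update_begin(&irqtime->sync);
	if (hardirq)
		irqtime->hardirq_time += delta;
	else
		irqtime->softirq_time += delta;
	u64_stats_update_end(&irqtime->sync);
}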
#ifdef CONFIG_CPU_FREQ
@@ -1759,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
* cpufreq_update_util - Take a note about CPU utilization changes.
- * @time: Current time.
- * @util: Current utilization.
- * @max: Utilization ceiling.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
*
- * This function is called by the scheduler on every invocation of
- * update_load_avg() on the CPU whose utilization is being updated.
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
*
* It can only be called from RCU-sched read-side critical sections.
- */
-static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
-{
- struct update_util_data *data;
-
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
- if (data)
- data->func(data, time, util, max);
-}
-
-/**
- * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
- * @time: Current time.
*
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
@@ -1793,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
* but that really is a band-aid. Going forward it should be replaced with
* solutions targeted more specifically at RT and DL tasks.
*/
-static inline void cpufreq_trigger_update(u64 time)
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, rq_clock(rq), flags);
+}
+
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
{
- cpufreq_update_util(time, ULONG_MAX, 0);
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_update_util(rq, flags);
}
#else
-static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
-static inline void cpufreq_trigger_update(u64 time) {}
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
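The cpufreq update hook no longer receives util/max; callbacks get the runqueue clock and reason flags instead. A governor-side callback matching the new shape might look like the sketch below; whether a governor filters on SCHED_CPUFREQ_RT this way is an assumption.

/* Sketch: callback signature after the util/max -> flags change. */
static void example_util_hook(struct update_util_data *data, u64 time,
			      unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_RT)
		return;	/* illustrative policy: handle RT kicks elsewhere */

	/* evaluate the frequency from scheduler state sampled at @time */
}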
#ifdef arch_scale_freq_capacity
@@ -1809,16 +1815,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
- rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- rq->prev_steal_time_rq = 0;
-#endif
-}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 70b3b6a20fb0..34659a853505 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,10 +29,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
-# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
-# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
-# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var) (var)
+#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -43,11 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
-# define schedstat_enabled() 0
-# define schedstat_inc(rq, field) do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-# define schedstat_set(var, val) do { } while (0)
-#endif
+#define schedstat_enabled() 0
+#define schedstat_inc(var) do { } while (0)
+#define schedstat_add(var, amt) do { } while (0)
+#define schedstat_set(var, val) do { } while (0)
+#define schedstat_val(var) 0
+#define schedstat_val_or_zero(var) 0
+#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
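schedstat_inc()/schedstat_add() now take the full lvalue, and schedstat_val_or_zero() lets a statistic be read without an #ifdef. A usage sketch; the fields chosen are illustrative only.

/* Sketch: new-style schedstat usage. */
static inline u64 example_account_wait(struct sched_entity *se, u64 now)
{
	schedstat_inc(se->statistics.wait_count);
	return now - schedstat_val_or_zero(se->statistics.wait_start);
}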
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..4f7053579fe3 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+void init_wait_entry(wait_queue_t *wait, int flags)
{
- unsigned long flags;
-
- if (signal_pending_state(state, current))
- return -ERESTARTSYS;
-
+ wait->flags = flags;
wait->private = current;
wait->func = autoremove_wake_function;
+ INIT_LIST_HEAD(&wait->task_list);
+}
+EXPORT_SYMBOL(init_wait_entry);
+
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+ long ret = 0;
spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list)) {
- if (wait->flags & WQ_FLAG_EXCLUSIVE)
- __add_wait_queue_tail(q, wait);
- else
- __add_wait_queue(q, wait);
+ if (unlikely(signal_pending_state(state, current))) {
+ /*
+ * An exclusive waiter must not fail if it was selected by wakeup;
+ * it should "consume" the condition we were waiting for.
+ *
+ * The caller will recheck the condition and return success if
+ * we were already woken up; we cannot miss the event because
+ * wakeup locks/unlocks the same q->lock.
+ *
+ * But we need to ensure that set-condition + wakeup after that
+ * can't see us; it should wake up another exclusive waiter if
+ * we fail.
+ */
+ list_del_init(&wait->task_list);
+ ret = -ERESTARTSYS;
+ } else {
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
}
- set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(prepare_to_wait_event);
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
-/**
- * abort_exclusive_wait - abort exclusive waiting in a queue
- * @q: waitqueue waited on
- * @wait: wait descriptor
- * @mode: runstate of the waiter to be woken
- * @key: key to identify a wait bit queue or %NULL
- *
- * Sets current thread back to running state and removes
- * the wait descriptor from the given waitqueue if still
- * queued.
- *
- * Wakes up the next waiter if the caller is concurrently
- * woken up through the queue.
- *
- * This prevents waiter starvation where an exclusive waiter
- * aborts and is woken up concurrently and no one wakes up
- * the next waiter.
- */
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
- unsigned int mode, void *key)
-{
- unsigned long flags;
-
- __set_current_state(TASK_RUNNING);
- spin_lock_irqsave(&q->lock, flags);
- if (!list_empty(&wait->task_list))
- list_del_init(&wait->task_list);
- else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(abort_exclusive_wait);
-
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
@@ -425,20 +413,29 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
wait_bit_action_f *action, unsigned mode)
{
- do {
- int ret;
+ int ret = 0;
+ for (;;) {
prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (!test_bit(q->key.bit_nr, q->key.flags))
- continue;
- ret = action(&q->key, mode);
- if (!ret)
- continue;
- abort_exclusive_wait(wq, &q->wait, mode, &q->key);
- return ret;
- } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
- finish_wait(wq, &q->wait);
- return 0;
+ if (test_bit(q->key.bit_nr, q->key.flags)) {
+ ret = action(&q->key, mode);
+ /*
+ * See the comment in prepare_to_wait_event().
+ * finish_wait() does not necessarily take wq->lock,
+ * but test_and_set_bit() implies mb() which pairs with
+ * smp_mb__after_atomic() before wake_up_page().
+ */
+ if (ret)
+ finish_wait(wq, &q->wait);
+ }
+ if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
+ if (!ret)
+ finish_wait(wq, &q->wait);
+ return 0;
+ } else if (ret) {
+ return ret;
+ }
+ }
}
EXPORT_SYMBOL(__wait_on_bit_lock);
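init_wait_entry() plus the reworked prepare_to_wait_event() are consumed by the ___wait_event() machinery in include/linux/wait.h; the open-coded loop below is only a sketch of that pattern, with an assumed condition callback.

/* Sketch of a wait loop built on the new entry/prepare pair. */
static long example_wait_for(wait_queue_head_t *wq, bool (*cond)(void))
{
	wait_queue_t wait;
	long ret = 0;

	init_wait_entry(&wait, 0);
	for (;;) {
		long intr = prepare_to_wait_event(wq, &wait, TASK_INTERRUPTIBLE);

		if (cond())
			break;
		if (intr) {
			/* -ERESTARTSYS: signal arrived, entry already removed */
			ret = intr;
			break;
		}
		schedule();
	}
	finish_wait(wq, &wait);
	return ret;
}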
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e1e5a354854e..0db7c8a2afe2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
@@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -513,24 +513,17 @@ static void seccomp_send_sigsys(int syscall, int reason)
* To be fully secure this must be combined with rlimit
* to limit the stack allocations too.
*/
-static int mode1_syscalls[] = {
+static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
0, /* null terminated */
};
-#ifdef CONFIG_COMPAT
-static int mode1_syscalls_32[] = {
- __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
- 0, /* null terminated */
-};
-#endif
-
static void __secure_computing_strict(int this_syscall)
{
- int *syscall_whitelist = mode1_syscalls;
+ const int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- syscall_whitelist = mode1_syscalls_32;
+ syscall_whitelist = get_compat_mode1_syscalls();
#endif
do {
if (*syscall_whitelist == this_syscall)
@@ -549,7 +542,7 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return;
@@ -561,20 +554,10 @@ void secure_computing_strict(int this_syscall)
BUG();
}
#else
-int __secure_computing(void)
-{
- u32 phase1_result = seccomp_phase1(NULL);
-
- if (likely(phase1_result == SECCOMP_PHASE1_OK))
- return 0;
- else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
- return -1;
- else
- return seccomp_phase2(phase1_result);
-}
#ifdef CONFIG_SECCOMP_FILTER
-static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+ const bool recheck_after_trace)
{
u32 filter_ret, action;
int data;
@@ -606,10 +589,50 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
goto skip;
case SECCOMP_RET_TRACE:
- return filter_ret; /* Save the rest for phase 2. */
+ /* We've been put in this state by the ptracer already. */
+ if (recheck_after_trace)
+ return 0;
+
+ /* ENOSYS these calls if there is no tracer attached. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current,
+ task_pt_regs(current),
+ -ENOSYS, 0);
+ goto skip;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification,
+ * which could leave us with a potentially unmodified
+ * syscall that the tracer would have liked to have
+ * changed. Since the process is about to die, we just
+ * force the syscall to be skipped and let the signal
+ * kill the process and correctly handle any tracer exit
+ * notifications.
+ */
+ if (fatal_signal_pending(current))
+ goto skip;
+ /* Check if the tracer forced the syscall to be skipped. */
+ this_syscall = syscall_get_nr(current, task_pt_regs(current));
+ if (this_syscall < 0)
+ goto skip;
+
+ /*
+ * Recheck the syscall, since it may have changed. This
+ * intentionally uses a NULL struct seccomp_data to force
+ * a reload of all registers. This does not goto skip since
+ * a skip would have already been reported.
+ */
+ if (__seccomp_filter(this_syscall, NULL, true))
+ return -1;
+
+ return 0;
case SECCOMP_RET_ALLOW:
- return SECCOMP_PHASE1_OK;
+ return 0;
case SECCOMP_RET_KILL:
default:
@@ -621,96 +644,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
skip:
audit_seccomp(this_syscall, 0, action);
- return SECCOMP_PHASE1_SKIP;
+ return -1;
+}
+#else
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+ const bool recheck_after_trace)
+{
+ BUG();
}
#endif
-/**
- * seccomp_phase1() - run fast path seccomp checks on the current syscall
- * @arg sd: The seccomp_data or NULL
- *
- * This only reads pt_regs via the syscall_xyz helpers. The only change
- * it will make to pt_regs is via syscall_set_return_value, and it will
- * only do that if it returns SECCOMP_PHASE1_SKIP.
- *
- * If sd is provided, it will not read pt_regs at all.
- *
- * It may also call do_exit or force a signal; these actions must be
- * safe.
- *
- * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
- * be processed normally.
- *
- * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
- * invoked. In this case, seccomp_phase1 will have set the return value
- * using syscall_set_return_value.
- *
- * If it returns anything else, then the return value should be passed
- * to seccomp_phase2 from a context in which ptrace hooks are safe.
- */
-u32 seccomp_phase1(struct seccomp_data *sd)
+int __secure_computing(const struct seccomp_data *sd)
{
int mode = current->seccomp.mode;
- int this_syscall = sd ? sd->nr :
- syscall_get_nr(current, task_pt_regs(current));
+ int this_syscall;
- if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
- return SECCOMP_PHASE1_OK;
+ return 0;
+
+ this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
- return SECCOMP_PHASE1_OK;
-#ifdef CONFIG_SECCOMP_FILTER
+ return 0;
case SECCOMP_MODE_FILTER:
- return __seccomp_phase1_filter(this_syscall, sd);
-#endif
+ return __seccomp_filter(this_syscall, sd, false);
default:
BUG();
}
}
-
-/**
- * seccomp_phase2() - finish slow path seccomp work for the current syscall
- * @phase1_result: The return value from seccomp_phase1()
- *
- * This must be called from a context in which ptrace hooks can be used.
- *
- * Returns 0 if the syscall should be processed or -1 to skip the syscall.
- */
-int seccomp_phase2(u32 phase1_result)
-{
- struct pt_regs *regs = task_pt_regs(current);
- u32 action = phase1_result & SECCOMP_RET_ACTION;
- int data = phase1_result & SECCOMP_RET_DATA;
-
- BUG_ON(action != SECCOMP_RET_TRACE);
-
- audit_seccomp(syscall_get_nr(current, regs), 0, action);
-
- /* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
- syscall_set_return_value(current, regs,
- -ENOSYS, 0);
- return -1;
- }
-
- /* Allow the BPF to provide the event message */
- ptrace_event(PTRACE_EVENT_SECCOMP, data);
- /*
- * The delivery of a fatal signal during event
- * notification may silently skip tracer notification.
- * Terminating the task now avoids executing a system
- * call that may not be intended.
- */
- if (fatal_signal_pending(current))
- do_exit(SIGSYS);
- if (syscall_get_nr(current, regs) < 0)
- return -1; /* Explicit request to skip. */
-
- return 0;
-}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
long prctl_get_seccomp(void)
@@ -915,7 +880,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
fprog = filter->prog->orig_prog;
if (!fprog) {
- /* This must be a new non-cBPF filter, since we save every
+ /* This must be a new non-cBPF filter, since we save
* every cBPF filter's orig_prog above when
* CONFIG_CHECKPOINT_RESTORE is enabled.
*/
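With the phase1/phase2 split gone, arch syscall-entry code makes a single call that either allows (0) or skips (-1) the syscall. The surrounding entry code below is heavily simplified and its shape is an assumption.

/* Sketch: single-call seccomp check in an arch entry path. */
static long example_syscall_entry_filter(long syscall_nr)
{
	if (test_thread_flag(TIF_SECCOMP) &&
	    __secure_computing(NULL) == -1)
		return -1L;	/* skip; the return value was already set */

	return syscall_nr;
}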
diff --git a/kernel/signal.c b/kernel/signal.c
index ab122a2cee41..75761acc77cf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig)
if (!__ratelimit(&ratelimit_state))
return;
- printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+ pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
current->comm, current->pid, sig);
}
@@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = signal_pt_regs();
- printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
+ pr_info("potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
- printk(KERN_INFO "code at %08lx: ", regs->ip);
+ pr_info("code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
@@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr)
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
- printk(KERN_CONT "%02x ", insn);
+ pr_cont("%02x ", insn);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
#endif
preempt_disable();
show_regs(regs);
@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* @ts: upper bound on process time suspension
*/
int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
- const struct timespec *ts)
+ const struct timespec *ts)
{
+ ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
struct task_struct *tsk = current;
- long timeout = MAX_SCHEDULE_TIMEOUT;
sigset_t mask = *which;
- int sig;
+ int sig, ret = 0;
if (ts) {
if (!timespec_valid(ts))
return -EINVAL;
- timeout = timespec_to_jiffies(ts);
- /*
- * We can be close to the next tick, add another one
- * to ensure we will wait at least the time asked for.
- */
- if (ts->tv_sec || ts->tv_nsec)
- timeout++;
+ timeout = timespec_to_ktime(*ts);
+ to = &timeout;
}
/*
@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
sig = dequeue_signal(tsk, &mask, info);
- if (!sig && timeout) {
+ if (!sig && timeout.tv64) {
/*
* None ready, temporarily unblock those we're interested
* while we are sleeping in so that we'll be awakened when
@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
- timeout = freezable_schedule_timeout_interruptible(timeout);
-
+ __set_current_state(TASK_INTERRUPTIBLE);
+ ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+ HRTIMER_MODE_REL);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
if (sig)
return sig;
- return timeout ? -EINTR : -EAGAIN;
+ return ret ? -EINTR : -EAGAIN;
}
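do_sigtimedwait() now sleeps on an hrtimer bounded by the task's timer slack instead of a jiffies timeout, so the old "+1 jiffy" fudge is gone. The same pattern in isolation, with illustrative values:

/* Sketch: interruptible, freezable sleep bounded by a ktime timeout. */
static long example_bounded_sleep(const struct timespec *ts)
{
	ktime_t timeout = timespec_to_ktime(*ts);

	__set_current_state(TASK_INTERRUPTIBLE);
	return freezable_schedule_hrtimeout_range(&timeout,
						  current->timer_slack_ns,
						  HRTIMER_MODE_REL);
}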
/**
@@ -3048,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action)
}
EXPORT_SYMBOL(kernel_sigaction);
+void __weak sigaction_compat_abi(struct k_sigaction *act,
+ struct k_sigaction *oact)
+{
+}
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *p = current, *t;
@@ -3063,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
if (oact)
*oact = *k;
+ sigaction_compat_abi(act, oact);
+
if (act) {
sigdelsetmask(&act->sa.sa_mask,
sigmask(SIGKILL) | sigmask(SIGSTOP));
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..bba3b201668d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -14,6 +14,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/hypervisor.h>
#include "smpboot.h"
@@ -33,69 +34,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
-static int
-hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+int smpcfd_prepare_cpu(unsigned int cpu)
{
- long cpu = (long)hcpu;
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
- cpu_to_node(cpu)))
- return notifier_from_errno(-ENOMEM);
- cfd->csd = alloc_percpu(struct call_single_data);
- if (!cfd->csd) {
- free_cpumask_var(cfd->cpumask);
- return notifier_from_errno(-ENOMEM);
- }
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- /* Fall-through to the CPU_DEAD[_FROZEN] case. */
-
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return -ENOMEM;
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
- free_percpu(cfd->csd);
- break;
+ return -ENOMEM;
+ }
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- /*
- * The IPIs for the smp-call-function callbacks queued by other
- * CPUs might arrive late, either due to hardware latencies or
- * because this CPU disabled interrupts (inside stop-machine)
- * before the IPIs were sent. So flush out any pending callbacks
- * explicitly (without waiting for the IPIs to arrive), to
- * ensure that the outgoing CPU doesn't go offline with work
- * still pending.
- */
- flush_smp_call_function_queue(false);
- break;
-#endif
- };
+ return 0;
+}
- return NOTIFY_OK;
+int smpcfd_dead_cpu(unsigned int cpu)
+{
+ struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
+
+ free_cpumask_var(cfd->cpumask);
+ free_percpu(cfd->csd);
+ return 0;
}
-static struct notifier_block hotplug_cfd_notifier = {
- .notifier_call = hotplug_cfd,
-};
+int smpcfd_dying_cpu(unsigned int cpu)
+{
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false);
+ return 0;
+}
void __init call_function_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i)
init_llist_head(&per_cpu(call_single_queue, i));
- hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
- register_cpu_notifier(&hotplug_cfd_notifier);
+ smpcfd_prepare_cpu(smp_processor_id());
}
/*
@@ -107,7 +93,7 @@ void __init call_function_init(void)
*/
static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
- smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
static __always_inline void csd_lock(struct call_single_data *csd)
@@ -739,3 +725,54 @@ void wake_up_all_idle_cpus(void)
preempt_enable();
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
+
+/**
+ * smp_call_on_cpu - Call a function on a specific cpu
+ *
+ * Used to call a function on a specific cpu and wait for it to return.
+ * Optionally make sure the call is done on a specified physical cpu via vcpu
+ * pinning in order to support virtualized environments.
+ */
+struct smp_call_on_cpu_struct {
+ struct work_struct work;
+ struct completion done;
+ int (*func)(void *);
+ void *data;
+ int ret;
+ int cpu;
+};
+
+static void smp_call_on_cpu_callback(struct work_struct *work)
+{
+ struct smp_call_on_cpu_struct *sscs;
+
+ sscs = container_of(work, struct smp_call_on_cpu_struct, work);
+ if (sscs->cpu >= 0)
+ hypervisor_pin_vcpu(sscs->cpu);
+ sscs->ret = sscs->func(sscs->data);
+ if (sscs->cpu >= 0)
+ hypervisor_pin_vcpu(-1);
+
+ complete(&sscs->done);
+}
+
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+ struct smp_call_on_cpu_struct sscs = {
+ .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
+ .func = func,
+ .data = par,
+ .cpu = phys ? cpu : -1,
+ };
+
+ INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
+
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO;
+
+ queue_work_on(cpu, system_wq, &sscs.work);
+ wait_for_completion(&sscs.done);
+
+ return sscs.ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
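Usage sketch for the new helper: run a callback on one CPU and wait for its result. Passing phys == true would additionally request vCPU pinning via hypervisor_pin_vcpu(); the callback and return plumbing below are illustrative.

/* Sketch: synchronous call on a specific CPU. */
static int example_probe(void *data)
{
	*(unsigned int *)data = smp_processor_id();
	return 0;
}

static int example_probe_cpu(unsigned int cpu)
{
	unsigned int seen = UINT_MAX;
	int ret = smp_call_on_cpu(cpu, example_probe, &seen, false);

	return ret ? ret : (int)seen;
}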
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 13bc43d1fb22..fc0d8270f69e 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_park()) {
__set_current_state(TASK_RUNNING);
- preempt_enable();
if (ht->park && td->status == HP_THREAD_ACTIVE) {
BUG_ON(td->cpu != smp_processor_id());
ht->park(td->cpu);
td->status = HP_THREAD_PARKED;
}
+ preempt_enable();
kthread_parkme();
/* We might have been woken for stop */
continue;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 17caf4b63342..66762645f9e8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -78,6 +78,17 @@ static void wakeup_softirqd(void)
}
/*
+ * If ksoftirqd is scheduled, we do not want to process pending softirqs
+ * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ */
+static bool ksoftirqd_running(void)
+{
+ struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+
+ return tsk && (tsk->state == TASK_RUNNING);
+}
+
+/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
* softirq processing.
@@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending)
+ if (pending && !ksoftirqd_running())
do_softirq_own_stack();
local_irq_restore(flags);
@@ -340,6 +351,9 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
+ if (ksoftirqd_running())
+ return;
+
if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
BUG();
}
-static void takeover_tasklets(unsigned int cpu)
+static int takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
local_irq_disable();
@@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu)
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
+ return 0;
}
+#else
+#define takeover_tasklets NULL
#endif /* CONFIG_HOTPLUG_CPU */
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action) {
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- takeover_tasklets((unsigned long)hcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
@@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = {
static __init int spawn_ksoftirqd(void)
{
- register_cpu_notifier(&cpu_nfb);
-
+ cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
+ takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a467e6c28a3b..ec9ab2f01489 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,7 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
-#include <linux/lglock.h>
+#include <linux/nmi.h>
/*
* Structure to determine completion condition and record errors. May
@@ -46,13 +46,9 @@ struct cpu_stopper {
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;
-/*
- * Avoids a race between stop_two_cpus and global stop_cpus, where
- * the stoppers could get queued up in reverse order, leading to
- * system deadlock. Using an lglock means stop_two_cpus remains
- * relatively cheap.
- */
-DEFINE_STATIC_LGLOCK(stop_cpus_lock);
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static bool stop_cpus_in_progress;
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
@@ -125,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
return -ENOENT;
+ /*
+ * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
+ * cycle by doing a preemption:
+ */
+ cond_resched();
wait_for_completion(&done.completion);
return done.ret;
}
@@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data)
break;
}
ack_state(msdata);
+ } else if (curstate > MULTI_STOP_PREPARE) {
+ /*
+ * At this stage all other CPUs we depend on must spin
+ * in the same loop. Any reason for hard-lockup should
+ * be detected and reported on their side.
+ */
+ touch_nmi_watchdog();
}
} while (curstate != MULTI_STOP_EXIT);
@@ -222,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
int err;
-
- lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+retry:
spin_lock_irq(&stopper1->lock);
spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
goto unlock;
+ /*
+ * Ensure that if we race with __stop_cpus() the stoppers won't get
+ * queued up in reverse order leading to system deadlock.
+ *
+ * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has
+ * queued a work on cpu1 but not on cpu2: we hold both locks.
+ *
+ * It can be falsely true but it is safe to spin until it is cleared,
+ * queue_stop_cpus_work() does everything under preempt_disable().
+ */
+ err = -EDEADLK;
+ if (unlikely(stop_cpus_in_progress))
+ goto unlock;
err = 0;
__cpu_stop_queue_work(stopper1, work1);
@@ -237,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
unlock:
spin_unlock(&stopper2->lock);
spin_unlock_irq(&stopper1->lock);
- lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+ if (unlikely(err == -EDEADLK)) {
+ while (stop_cpus_in_progress)
+ cpu_relax();
+ goto retry;
+ }
return err;
}
/**
@@ -308,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
return cpu_stop_queue_work(cpu, work_buf);
}
-/* static data for stop_cpus */
-static DEFINE_MUTEX(stop_cpus_mutex);
-
static bool queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_done *done)
@@ -324,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- lg_global_lock(&stop_cpus_lock);
+ preempt_disable();
+ stop_cpus_in_progress = true;
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -333,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
- lg_global_unlock(&stop_cpus_lock);
+ stop_cpus_in_progress = false;
+ preempt_enable();
return queued;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index cf8ba545c7d3..89d5be418157 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2246,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5)
return -EINVAL;
- down_write(&me->mm->mmap_sem);
+ if (down_write_killable(&me->mm->mmap_sem))
+ return -EINTR;
if (arg2)
me->mm->def_flags |= VM_NOHUGEPAGE;
else
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..a43775c6646c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -65,6 +65,7 @@
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
#include <linux/bpf.h>
+#include <linux/mount.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -814,6 +815,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &ten_thousand,
},
{
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
.procname = "dmesg_restrict",
.data = &dmesg_restrict,
.maxlen = sizeof(int),
@@ -1149,13 +1157,22 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "perf_event_max_stack",
- .data = NULL, /* filled in by handler */
+ .data = &sysctl_perf_event_max_stack,
.maxlen = sizeof(sysctl_perf_event_max_stack),
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
.extra1 = &zero,
.extra2 = &six_hundred_forty_kb,
},
+ {
+ .procname = "perf_event_max_contexts_per_stack",
+ .data = &sysctl_perf_event_max_contexts_per_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ .extra2 = &one_thousand,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
@@ -1196,6 +1213,17 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
@@ -1488,8 +1516,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
@@ -1521,6 +1549,13 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "stat_refresh",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0600,
+ .proc_handler = vmstat_refresh,
+ },
#endif
#ifdef CONFIG_MMU
{
@@ -1804,6 +1839,14 @@ static struct ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+ {
+ .procname = "mount-max",
+ .data = &sysctl_mount_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
{ }
};
@@ -2106,6 +2149,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2225,8 +2283,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- NULL,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
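A table entry wired to the new handler; the variable and procname are illustrative. The point of proc_douintvec() is that a negative write is rejected instead of being stored into an unsigned value.

/* Sketch: sysctl entry using the unsigned-int handler. */
static unsigned int example_limit = 128;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_limit",
		.data		= &example_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
	{ }
};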
/*
@@ -2824,6 +2901,12 @@ int proc_dointvec(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2869,6 +2952,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
* exception granted :-)
*/
EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 10a1d7dc9313..6eb99c17dbd8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
#include <linux/ctype.h>
#include <linux/netdevice.h>
#include <linux/kernel.h>
+#include <linux/uuid.h>
#include <linux/slab.h>
#include <linux/compat.h>
@@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file,
/* Only supports reads */
if (oldval && oldlen) {
- char buf[40], *str = buf;
- unsigned char uuid[16];
- int i;
+ char buf[UUID_STRING_LEN + 1];
+ uuid_be uuid;
result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
@@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file,
buf[result] = '\0';
- /* Convert the uuid to from a string to binary */
- for (i = 0; i < 16; i++) {
- result = -EIO;
- if (!isxdigit(str[0]) || !isxdigit(str[1]))
- goto out;
-
- uuid[i] = (hex_to_bin(str[0]) << 4) |
- hex_to_bin(str[1]);
- str += 2;
- if (*str == '-')
- str++;
- }
+ result = -EIO;
+ if (uuid_be_to_bin(buf, &uuid))
+ goto out;
if (oldlen > 16)
oldlen = 16;
result = -EFAULT;
- if (copy_to_user(oldval, uuid, oldlen))
+ if (copy_to_user(oldval, &uuid, oldlen))
goto out;
copied = oldlen;
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 53fa971d000d..d513051fcca2 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
struct callback_head *head;
do {
- head = ACCESS_ONCE(task->task_works);
+ head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
@@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
unsigned long flags;
+
+ if (likely(!task->task_works))
+ return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
* Either we raced with task_work_add() which added the
@@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = ACCESS_ONCE(*pprev))) {
- smp_read_barrier_depends();
+ while ((work = lockless_dereference(*pprev))) {
if (work->func != func)
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
@@ -95,7 +97,7 @@ void task_work_run(void)
* work_exited unless the list is empty.
*/
do {
- work = ACCESS_ONCE(task->task_works);
+ work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
@@ -108,7 +110,6 @@ void task_work_run(void)
* fail, but it can play with *work and other entries.
*/
raw_spin_unlock_wait(&task->pi_lock);
- smp_mb();
do {
next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e840ed867a5d..c3aad685bbc0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -30,7 +30,6 @@
* struct alarm_base - Alarm timer bases
* @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
- * @timer: hrtimer used to schedule events while running
* @gettime: Function to read the time correlating to the base
* @base_clockid: clockid for the base
*/
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a9b76a40319e..2c5bc77c0bb0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)
#endif
#ifdef CONFIG_SYSFS
-struct bus_type clockevents_subsys = {
+static struct bus_type clockevents_subsys = {
.name = "clockevents",
.dev_name = "clockevent",
};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 56ece145a814..7e4fad75acaa 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur)
*/
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
/* Override clocksource cannot be used. */
- pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
- cs->name);
- override_name[0] = 0;
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+ cs->name);
+ override_name[0] = 0;
+ } else {
+ /*
+ * The override cannot be currently verified.
+ * Deferring to let the watchdog check.
+ */
+ pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
+ cs->name);
+ }
} else
/* Override clocksource can be used. */
best = cs;
@@ -669,10 +678,12 @@ static void clocksource_enqueue(struct clocksource *cs)
struct list_head *entry = &clocksource_list;
struct clocksource *tmp;
- list_for_each_entry(tmp, &clocksource_list, list)
+ list_for_each_entry(tmp, &clocksource_list, list) {
/* Keep track of the place, where to insert */
- if (tmp->rating >= cs->rating)
- entry = &tmp->list;
+ if (tmp->rating < cs->rating)
+ break;
+ entry = &tmp->list;
+ }
list_add(&cs->list, entry);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa0b983290cf..bb5ec425dfe0 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
#endif
}
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
@@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns);
*/
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
- ktime_t res = ktime_add(lhs, rhs);
+ ktime_t res = ktime_add_unsafe(lhs, rhs);
/*
* We use KTIME_SEC_MAX here, the maximum timeout which we can
@@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr)
* fixup_init is called when:
* - an active object is initialized
*/
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
@@ -342,30 +342,25 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_init(timer, &hrtimer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- WARN_ON_ONCE(1);
- return 0;
-
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
default:
- return 0;
+ return false;
}
}
@@ -373,7 +368,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
@@ -381,9 +376,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_free(timer, &hrtimer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -430,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
}
+EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
@@ -707,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work)
static DECLARE_WORK(hrtimer_work, clock_was_set_work);
/*
- * Called from timekeeping and resume code to reprogramm the hrtimer
+ * Called from timekeeping and resume code to reprogram the hrtimer
* interrupt device on all cpus.
*/
void clock_was_set_delayed(void)
@@ -1245,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
/*
* Note: We clear the running state after enqueue_hrtimer and
- * we do not reprogramm the event hardware. Happens either in
+ * we do not reprogram the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
*
* Note: Because we dropped the cpu_base->lock above,
@@ -1594,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
/*
* Functions related to boot-time initialization:
*/
-static void init_hrtimers_cpu(int cpu)
+int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
@@ -1606,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1640,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-static void migrate_hrtimers(int scpu)
+int hrtimers_dead_cpu(unsigned int scpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
int i;
@@ -1669,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
+ return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int hrtimer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- int scpu = (long)hcpu;
-
- switch (action) {
-
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- init_hrtimers_cpu(scpu);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_hrtimers(scpu);
- break;
-#endif
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block hrtimers_nb = {
- .notifier_call = hrtimer_cpu_notify,
-};
-
void __init hrtimers_init(void)
{
- hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- register_cpu_notifier(&hrtimers_nb);
+ hrtimers_prepare_cpu(smp_processor_id());
}
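hrtimers_prepare_cpu() and hrtimers_dead_cpu() replace the CPU notifier and are meant to be registered as a hotplug state pair by the hotplug core. A rough sketch of that wiring; the state constant and registration site are assumptions, since they live outside this file.

/* Sketch: hotplug-state registration replacing the old notifier. */
static int __init example_register_hrtimer_states(void)
{
	return cpuhp_setup_state_nocalls(CPUHP_HRTIMERS_PREPARE,
					 "hrtimers:prepare",
					 hrtimers_prepare_cpu,
					 hrtimers_dead_cpu);
}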
/**
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1cafba860b08..39008d78927a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
timer->it.cpu.expires = 0;
sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
&itp->it_value);
+ return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
unlock_task_sighand(p, &flags);
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index e622ba365a13..b0928ab3270f 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
int allowed_error_ns = usecs * 5;
for (i = 0; i < iters; ++i) {
- struct timespec ts1, ts2;
+ s64 kt1, kt2;
int time_passed;
- ktime_get_ts(&ts1);
+ kt1 = ktime_get_ns();
udelay(usecs);
- ktime_get_ts(&ts2);
- time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
+ kt2 = ktime_get_ns();
+ time_passed = kt2 - kt1;
if (i == 0 || time_passed < min)
min = time_passed;
@@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v)
if (usecs > 0 && iters > 0) {
return udelay_test_single(s, usecs, iters);
} else if (usecs == 0) {
- struct timespec ts;
+ struct timespec64 ts;
- ktime_get_ts(&ts);
- seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
- loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
+ ktime_get_ts64(&ts);
+ seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n",
+ loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec);
seq_puts(s, "usage:\n");
seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 53d7184da0be..690b797f522e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
}
static struct clock_event_device ce_broadcast_hrtimer = {
+ .name = "bc_hrtimer",
.set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 966a5a6fdd0a..f738251000fe 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+void timer_clear_idle(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..3bcb61b52f6c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
#include <trace/events/timer.h>
/*
- * Per cpu nohz control structure
+ * Per-CPU nohz control structure
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)
if (delta.tv64 < tick_period.tv64)
return;
- /* Reevalute with jiffies_lock held */
+ /* Reevaluate with jiffies_lock held */
write_seqlock(&jiffies_lock);
delta = ktime_sub(now, last_jiffies_update);
@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
- * concurrency: This happens only when the cpu in charge went
- * into a long sleep. If two cpus happen to assign themself to
+ * concurrency: This happens only when the CPU in charge went
+ * into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
*/
@@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep)
return false;
}
-static bool can_stop_full_tick(struct tick_sched *ts)
+static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
WARN_ON_ONCE(!irqs_disabled());
+ if (unlikely(!cpu_online(cpu)))
+ return false;
+
if (check_tick_dependency(&tick_dep_mask))
return false;
@@ -349,7 +352,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
/*
* Re-evaluate the need for the tick as we switch the current task.
* It might need the tick due to per task/process properties:
- * perf events, posix cpu timers, ...
+ * perf events, posix CPU timers, ...
*/
void __tick_nohz_task_switch(void)
{
@@ -509,8 +512,8 @@ int tick_nohz_tick_stopped(void)
*
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
- * value. We do this unconditionally on any cpu, as we don't know whether the
- * cpu, which has the update task assigned is in a long sleep.
+ * value. We do this unconditionally on any CPU, as we don't know whether the
+ * CPU, which has the update task assigned is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
@@ -526,7 +529,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
}
/*
- * Updates the per cpu time idle statistics counters
+ * Updates the per-CPU time idle statistics counters
*/
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
@@ -566,12 +569,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
}
/**
- * get_cpu_idle_time_us - get the total idle time of a cpu
+ * get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
- * Return the cummulative idle time (since boot) for a given
+ * Return the cumulative idle time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
@@ -607,12 +610,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
/**
- * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
- * Return the cummulative iowait time (since boot) for a given
+ * Return the cumulative iowait time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
@@ -700,6 +703,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
tick.tv64 = 0;
+
+ /*
+ * Tell the timer code that the base is not idle, i.e. undo
+ * the effect of get_next_timer_interrupt():
+ */
+ timer_clear_idle();
/*
* We've not stopped the tick yet, and there's a timer in the
* next period, so no point in stopping it either, bail.
@@ -726,14 +735,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
}
/*
- * If this cpu is the one which updates jiffies, then give up
- * the assignment and let it be taken by the cpu which runs
- * the tick timer next, which might be this cpu as well. If we
+ * If this CPU is the one which updates jiffies, then give up
+ * the assignment and let it be taken by the CPU which runs
+ * the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
- * was the one which had the do_timer() duty last. If this cpu
+ * was the one which had the do_timer() duty last. If this CPU
* is the one which had the do_timer() duty last, we limit the
- * sleep time to the timekeeping max_deferement value.
+ * sleep time to the timekeeping max_deferment value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
@@ -809,6 +818,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
tick_do_update_jiffies64(now);
cpu_load_update_nohz_stop();
+ /*
+ * Clear the timer idle flag, so we avoid IPIs on remote queueing and
+ * the clock forward checks in the enqueue path:
+ */
+ timer_clear_idle();
+
calc_load_exit_idle();
touch_softlockup_watchdog_sched();
/*
@@ -831,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (can_stop_full_tick(ts))
+ if (can_stop_full_tick(cpu, ts))
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, ktime_get());
@@ -841,9 +856,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
- * If this cpu is offline and it is the one which updates
+ * If this CPU is offline and it is the one which updates
* jiffies, then give up the assignment and let it be taken by
- * the cpu which runs the tick timer next. If we don't drop
+ * the CPU which runs the tick timer next. If we don't drop
* this here the jiffies might be stale and do_timer() never
* invoked.
*/
@@ -933,11 +948,11 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(irqs_disabled());
/*
- * Update the idle state in the scheduler domain hierarchy
- * when tick_nohz_stop_sched_tick() is called from the idle loop.
- * State will be updated to busy during the first busy tick after
- * exiting idle.
- */
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
set_cpu_sd_state_idle();
local_irq_disable();
@@ -1092,35 +1107,6 @@ static void tick_nohz_switch_to_nohz(void)
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
-/*
- * When NOHZ is enabled and the tick is stopped, we need to kick the
- * tick timer from irq_enter() so that the jiffies update is kept
- * alive during long running softirqs. That's ugly as hell, but
- * correctness is key even if we need to fix the offending softirq in
- * the first place.
- *
- * Note, this is different to tick_nohz_restart. We just kick the
- * timer and do not touch the other magic bits which need to be done
- * when idle is left.
- */
-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
-{
-#if 0
- /* Switch back to 2.6.27 behaviour */
- ktime_t delta;
-
- /*
- * Do not touch the tick device, when the next expiry is either
- * already reached or less/equal than the tick period.
- */
- delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
- if (delta.tv64 <= tick_period.tv64)
- return;
-
- tick_nohz_restart(ts, now);
-#endif
-}
-
static inline void tick_nohz_irq_enter(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1117,8 @@ static inline void tick_nohz_irq_enter(void)
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
- if (ts->tick_stopped) {
+ if (ts->tick_stopped)
tick_nohz_update_jiffies(now);
- tick_nohz_kick_tick(ts, now);
- }
}
#else
@@ -1211,7 +1195,7 @@ void tick_setup_sched_timer(void)
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
- /* Get the next period (per cpu) */
+ /* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
/* Offset the tick to avert jiffies_lock contention. */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a4064b612066..bd62fb8e8e77 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,24 @@ struct timespec timespec_add_safe(const struct timespec lhs,
return res;
}
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0) and that each
+ * timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+ const struct timespec64 rhs)
+{
+ struct timespec64 res;
+
+ set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+ res.tv_sec = TIME64_MAX;
+ res.tv_nsec = 0;
+ }
+
+ return res;
+}
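A short usage sketch of the new helper: on overflow the sum saturates at TIME64_MAX instead of wrapping (the values below are illustrative):

	struct timespec64 lhs = { .tv_sec = TIME64_MAX - 1, .tv_nsec = 500000000L };
	struct timespec64 rhs = { .tv_sec = 10, .tv_nsec = 600000000L };
	struct timespec64 sum = timespec64_add_safe(lhs, rhs);

	/* The raw sum would overflow, so the result is clamped: */
	/* sum.tv_sec == TIME64_MAX, sum.tv_nsec == 0 */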
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 86628e755f38..7142580ad94f 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = {
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
/**
- * time_to_tm - converts the calendar time to local broken-down time
+ * time64_to_tm - converts the calendar time to local broken-down time
*
* @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
* Coordinated Universal Time (UTC).
* @offset offset seconds adding to totalsecs.
* @result pointer to struct tm variable to receive broken-down time
*/
-void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
{
long days, rem, y;
+ int remainder;
const unsigned short *ip;
- days = totalsecs / SECS_PER_DAY;
- rem = totalsecs % SECS_PER_DAY;
+ days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
+ rem = remainder;
rem += offset;
while (rem < 0) {
rem += SECS_PER_DAY;
@@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result)
result->tm_mon = y;
result->tm_mday = days + 1;
}
-EXPORT_SYMBOL(time_to_tm);
+EXPORT_SYMBOL(time64_to_tm);
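Usage stays the same as the old time_to_tm(), only with a 64-bit seconds argument; a minimal sketch (the epoch value below is just an example):

	struct tm tm;

	/* 1000000000 seconds after the epoch, no UTC offset applied */
	time64_to_tm(1000000000LL, 0, &tm);
	/* tm.tm_year == 101 (2001), tm.tm_mon == 8 (September), tm.tm_mday == 9 */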
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 479d25cd3d4f..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ now = ktime_to_ns(tkr->base);
+
+ now += clocksource_delta(tkr->read(tkr->clock),
+ tkr->cycle_last, tkr->mask);
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
@@ -480,10 +483,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* users are removed, this can be killed.
*/
remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
- tk->tkr_mono.xtime_nsec -= remainder;
- tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+ if (remainder != 0) {
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+ }
}
#else
#define old_vsyscall_fixup(tk)
@@ -2186,6 +2191,7 @@ struct timespec64 get_monotonic_coarse64(void)
return now;
}
+EXPORT_SYMBOL(get_monotonic_coarse64);
/*
* Must hold jiffies_lock
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..ca9fb800336b 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
#include "timekeeping_internal.h"
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
{
@@ -69,6 +71,11 @@ late_initcall(tk_debug_sleep_time_init);
void tk_debug_account_sleep_time(struct timespec64 *t)
{
- sleep_time_bin[fls(t->tv_sec)]++;
+ /* Cap bin index so we don't overflow the array */
+ int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+ sleep_time_bin[bin]++;
+ pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec,
+ t->tv_nsec / NSEC_PER_MSEC);
}
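The cap matters because fls() on a 32-bit quantity can return 32, which would index one past the 32-entry array; a worked (illustrative) instance of the new expression:

	/* tv_sec large enough that bit 31 is set */
	int bin = min(fls(0x80000000), NUM_BINS - 1);	/* fls() == 32, bin == 31 */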
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 73164c3aa56b..32bf6f75a8fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
/*
- * per-CPU timer vector definitions:
+ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
+ * level has a different granularity.
+ *
+ * The level granularity is: LVL_CLK_DIV ^ lvl
+ * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
+ *
+ * The array level of a newly armed timer depends on the relative expiry
+ * time. The farther away the expiry time is, the higher the array level and
+ * therefore the coarser the granularity becomes.
+ *
+ * Contrary to the original timer wheel implementation, which aims for 'exact'
+ * expiry of the timers, this implementation removes the need for recascading
+ * the timers into the lower array levels. The previous 'classic' timer wheel
+ * implementation of the kernel already violated the 'exact' expiry by adding
+ * slack to the expiry time to provide batched expiration. The granularity
+ * levels provide implicit batching.
+ *
+ * This is an optimization of the original timer wheel implementation for the
+ * majority of the timer wheel use cases: timeouts. The vast majority of
+ * timeout timers (networking, disk I/O ...) are canceled before expiry. If
+ * the timeout expires it indicates that normal operation is disturbed, so it
+ * does not matter much whether the timeout comes with a slight delay.
+ *
+ * The only exception to this are networking timers with a small expiry
+ * time. They rely on the granularity. Those fit into the first wheel level,
+ * which has HZ granularity.
+ *
+ * We don't have cascading anymore. Timers with an expiry time above the
+ * capacity of the last wheel level are force-expired at the maximum timeout
+ * value of the last wheel level. From data sampling we know that the maximum
+ * value observed is 5 days (network connection tracking), so this should not
+ * be an issue.
+ *
+ * The currently chosen array constants are a good compromise between
+ * array size and granularity.
+ *
+ * This results in the following granularity and range levels:
+ *
+ * HZ 1000 steps
+ * Level Offset Granularity Range
+ * 0 0 1 ms 0 ms - 63 ms
+ * 1 64 8 ms 64 ms - 511 ms
+ * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
+ * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
+ * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
+ * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
+ * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
+ * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
+ * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
+ *
+ * HZ 300
+ * Level Offset Granularity Range
+ * 0 0 3 ms 0 ms - 210 ms
+ * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
+ * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
+ * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
+ * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
+ * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
+ * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
+ * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
+ * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
+ *
+ * HZ 250
+ * Level Offset Granularity Range
+ * 0 0 4 ms 0 ms - 255 ms
+ * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
+ * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
+ * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
+ * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
+ * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
+ * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
+ * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
+ * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
+ *
+ * HZ 100
+ * Level Offset Granularity Range
+ * 0 0 10 ms 0 ms - 630 ms
+ * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
+ * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
+ * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
+ * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
+ * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
+ * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
+ * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
*/
-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
-
-struct tvec {
- struct hlist_head vec[TVN_SIZE];
-};
-struct tvec_root {
- struct hlist_head vec[TVR_SIZE];
-};
+/* Clock divisor for the next level */
+#define LVL_CLK_SHIFT 3
+#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
+#define LVL_CLK_MASK (LVL_CLK_DIV - 1)
+#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
+#define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
-struct tvec_base {
- spinlock_t lock;
- struct timer_list *running_timer;
- unsigned long timer_jiffies;
- unsigned long next_timer;
- unsigned long active_timers;
- unsigned long all_timers;
- int cpu;
- bool migration_enabled;
- bool nohz_active;
- struct tvec_root tv1;
- struct tvec tv2;
- struct tvec tv3;
- struct tvec tv4;
- struct tvec tv5;
-} ____cacheline_aligned;
+/*
+ * The time start value for each level to select the bucket at enqueue
+ * time.
+ */
+#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
+
+/* Size of each clock level */
+#define LVL_BITS 6
+#define LVL_SIZE (1UL << LVL_BITS)
+#define LVL_MASK (LVL_SIZE - 1)
+#define LVL_OFFS(n) ((n) * LVL_SIZE)
+
+/* Level depth */
+#if HZ > 100
+# define LVL_DEPTH 9
+# else
+# define LVL_DEPTH 8
+#endif
+
+/* The cutoff (max. capacity of the wheel) */
+#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
+#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
+/*
+ * The resulting wheel size. If NOHZ is configured we allocate two
+ * wheels so we have separate storage for the deferrable timers.
+ */
+#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
+
+#ifdef CONFIG_NO_HZ_COMMON
+# define NR_BASES 2
+# define BASE_STD 0
+# define BASE_DEF 1
+#else
+# define NR_BASES 1
+# define BASE_STD 0
+# define BASE_DEF 0
+#endif
+
+struct timer_base {
+ spinlock_t lock;
+ struct timer_list *running_timer;
+ unsigned long clk;
+ unsigned long next_expiry;
+ unsigned int cpu;
+ bool migration_enabled;
+ bool nohz_active;
+ bool is_idle;
+ DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+ struct hlist_head vectors[WHEEL_SIZE];
+} ____cacheline_aligned;
-static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
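For a quick size check under the constants above (HZ > 100, 64-bit, NO_HZ_COMMON enabled):

	/* LVL_DEPTH = 9, LVL_SIZE = 64  =>  WHEEL_SIZE = 64 * 9 = 576 buckets  */
	/* pending_map: 576 bits = 9 unsigned longs (72 bytes on 64-bit)        */
	/* vectors[]:   576 hlist heads per base                                */
	/* NR_BASES = 2: each CPU carries a standard and a deferrable base      */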
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
unsigned int sysctl_timer_migration = 1;
@@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz)
unsigned int cpu;
/* Avoid the loop, if nothing to update */
- if (this_cpu_read(tvec_bases.migration_enabled) == on)
+ if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
return;
for_each_possible_cpu(cpu) {
- per_cpu(tvec_bases.migration_enabled, cpu) = on;
+ per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
+ per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
if (!update_nohz)
continue;
- per_cpu(tvec_bases.nohz_active, cpu) = true;
+ per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
+ per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
per_cpu(hrtimer_bases.nohz_active, cpu) = true;
}
}
@@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write,
mutex_unlock(&mutex);
return ret;
}
-
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
-{
- if (pinned || !base->migration_enabled)
- return this_cpu_ptr(&tvec_bases);
- return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
-}
-#else
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
-{
- return this_cpu_ptr(&tvec_bases);
-}
#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j)
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
-/**
- * set_timer_slack - set the allowed slack for a timer
- * @timer: the timer to be modified
- * @slack_hz: the amount of time (in jiffies) allowed for rounding
- *
- * Set the amount of time, in jiffies, that a certain timer has
- * in terms of slack. By setting this value, the timer subsystem
- * will schedule the actual timer somewhere between
- * the time mod_timer() asks for, and that time plus the slack.
- *
- * By setting the slack to -1, a percentage of the delay is used
- * instead.
- */
-void set_timer_slack(struct timer_list *timer, int slack_hz)
+
+static inline unsigned int timer_get_idx(struct timer_list *timer)
{
- timer->slack = slack_hz;
+ return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
}
-EXPORT_SYMBOL_GPL(set_timer_slack);
-static void
-__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
- unsigned long expires = timer->expires;
- unsigned long idx = expires - base->timer_jiffies;
- struct hlist_head *vec;
+ timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
+ idx << TIMER_ARRAYSHIFT;
+}
- if (idx < TVR_SIZE) {
- int i = expires & TVR_MASK;
- vec = base->tv1.vec + i;
- } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
- int i = (expires >> TVR_BITS) & TVN_MASK;
- vec = base->tv2.vec + i;
- } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
- int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
- vec = base->tv3.vec + i;
- } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
- int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
- vec = base->tv4.vec + i;
- } else if ((signed long) idx < 0) {
- /*
- * Can happen if you add a timer with expires == jiffies,
- * or you set a timer to go off in the past
- */
- vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+/*
+ * Helper function to calculate the array index for a given expiry
+ * time.
+ */
+static inline unsigned calc_index(unsigned expires, unsigned lvl)
+{
+ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+ return LVL_OFFS(lvl) + (expires & LVL_MASK);
+}
+
+static int calc_wheel_index(unsigned long expires, unsigned long clk)
+{
+ unsigned long delta = expires - clk;
+ unsigned int idx;
+
+ if (delta < LVL_START(1)) {
+ idx = calc_index(expires, 0);
+ } else if (delta < LVL_START(2)) {
+ idx = calc_index(expires, 1);
+ } else if (delta < LVL_START(3)) {
+ idx = calc_index(expires, 2);
+ } else if (delta < LVL_START(4)) {
+ idx = calc_index(expires, 3);
+ } else if (delta < LVL_START(5)) {
+ idx = calc_index(expires, 4);
+ } else if (delta < LVL_START(6)) {
+ idx = calc_index(expires, 5);
+ } else if (delta < LVL_START(7)) {
+ idx = calc_index(expires, 6);
+ } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
+ idx = calc_index(expires, 7);
+ } else if ((long) delta < 0) {
+ idx = clk & LVL_MASK;
} else {
- int i;
- /* If the timeout is larger than MAX_TVAL (on 64-bit
- * architectures or with CONFIG_BASE_SMALL=1) then we
- * use the maximum timeout.
+ /*
+ * Force obscenely large timeouts to expire at the
+ * capacity limit of the wheel.
*/
- if (idx > MAX_TVAL) {
- idx = MAX_TVAL;
- expires = idx + base->timer_jiffies;
- }
- i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
- vec = base->tv5.vec + i;
+ if (expires >= WHEEL_TIMEOUT_CUTOFF)
+ expires = WHEEL_TIMEOUT_MAX;
+
+ idx = calc_index(expires, LVL_DEPTH - 1);
}
+ return idx;
+}
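A worked example of the index selection, assuming HZ=1000 and a timer armed 100 ticks (100 ms) ahead of base->clk:

	/* delta = 100; LVL_START(1) = 63, LVL_START(2) = 63 << 3 = 504          */
	/* 63 <= 100 < 504  =>  level 1, granularity LVL_GRAN(1) = 8 ticks       */
	/* calc_index(expires, 1) rounds the expiry up to the next 8-tick        */
	/* boundary and returns a slot in LVL_OFFS(1)..LVL_OFFS(1)+63 = 64..127, */
	/* so the timer may fire up to ~8 ms late, matching the table above.     */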
- hlist_add_head(&timer->entry, vec);
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx)
+{
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
}
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
- /* Advance base->jiffies, if the base is empty */
- if (!base->all_timers++)
- base->timer_jiffies = jiffies;
+ unsigned int idx;
+
+ idx = calc_wheel_index(timer->expires, base->clk);
+ enqueue_timer(base, timer, idx);
+}
+
+static void
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
+{
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ return;
- __internal_add_timer(base, timer);
/*
- * Update base->active_timers and base->next_timer
+ * TODO: This wants some optimizing similar to the code below, but we
+ * will do that when we switch from push to pull for deferrable timers.
*/
- if (!(timer->flags & TIMER_DEFERRABLE)) {
- if (!base->active_timers++ ||
- time_before(timer->expires, base->next_timer))
- base->next_timer = timer->expires;
+ if (timer->flags & TIMER_DEFERRABLE) {
+ if (tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
+ return;
}
/*
- * Check whether the other CPU is in dynticks mode and needs
- * to be triggered to reevaluate the timer wheel.
- * We are protected against the other CPU fiddling
- * with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to stop its tick can not
- * evaluate the timer wheel.
- *
- * Spare the IPI for deferrable timers on idle targets though.
- * The next busy ticks will take care of it. Except full dynticks
- * require special care against races with idle_cpu(), lets deal
- * with that later.
+ * We might have to IPI the remote CPU if the base is idle and the
+ * timer is not deferrable. If the other CPU is on the way to idle
+ * then it can't set base->is_idle as we hold the base lock:
*/
- if (base->nohz_active) {
- if (!(timer->flags & TIMER_DEFERRABLE) ||
- tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
- }
+ if (!base->is_idle)
+ return;
+
+ /* Check whether this is the new first expiring timer: */
+ if (time_after_eq(timer->expires, base->next_expiry))
+ return;
+
+ /*
+ * Set the next expiry time and kick the CPU so it can reevaluate the
+ * wheel:
+ */
+ base->next_expiry = timer->expires;
+ wake_up_nohz_cpu(base->cpu);
+}
+
+static void
+internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+ __internal_add_timer(base, timer);
+ trigger_dyntick_cpu(base, timer);
}
#ifdef CONFIG_TIMER_STATS
@@ -489,11 +612,19 @@ static void *timer_debug_hint(void *addr)
return ((struct timer_list *) addr)->function;
}
+static bool timer_is_static_object(void *addr)
+{
+ struct timer_list *timer = addr;
+
+ return (timer->entry.pprev == NULL &&
+ timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
*/
-static int timer_fixup_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
@@ -501,9 +632,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_init(timer, &timer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -516,36 +647,22 @@ static void stub_timer(unsigned long data)
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
-
case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. The timer was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- if (timer->entry.pprev == NULL &&
- timer->entry.next == TIMER_ENTRY_STATIC) {
- debug_object_init(timer, &timer_debug_descr);
- debug_object_activate(timer, &timer_debug_descr);
- return 0;
- } else {
- setup_timer(timer, stub_timer, 0);
- return 1;
- }
- return 0;
+ setup_timer(timer, stub_timer, 0);
+ return true;
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
default:
- return 0;
+ return false;
}
}
@@ -553,7 +670,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int timer_fixup_free(void *addr, enum debug_obj_state state)
+static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
@@ -561,9 +678,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_free(timer, &timer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -571,32 +688,23 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
* fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
-static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (timer->entry.next == TIMER_ENTRY_STATIC) {
- /*
- * This is not really a fixup. The timer was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- debug_object_init(timer, &timer_debug_descr);
- return 0;
- } else {
- setup_timer(timer, stub_timer, 0);
- return 1;
- }
+ setup_timer(timer, stub_timer, 0);
+ return true;
default:
- return 0;
+ return false;
}
}
static struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
+ .is_static_object = timer_is_static_object,
.fixup_init = timer_fixup_init,
.fixup_activate = timer_fixup_activate,
.fixup_free = timer_fixup_free,
@@ -681,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
{
timer->entry.pprev = NULL;
timer->flags = flags | raw_smp_processor_id();
- timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
timer->start_pid = -1;
@@ -721,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
entry->next = LIST_POISON2;
}
-static inline void
-detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
-{
- detach_timer(timer, true);
- if (!(timer->flags & TIMER_DEFERRABLE))
- base->active_timers--;
- base->all_timers--;
-}
-
-static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
bool clear_pending)
{
+ unsigned idx = timer_get_idx(timer);
+
if (!timer_pending(timer))
return 0;
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ __clear_bit(idx, base->pending_map);
+
detach_timer(timer, clear_pending);
- if (!(timer->flags & TIMER_DEFERRABLE)) {
- base->active_timers--;
- if (timer->expires == base->next_timer)
- base->next_timer = base->timer_jiffies;
- }
- /* If this was the last timer, advance base->jiffies */
- if (!--base->all_timers)
- base->timer_jiffies = jiffies;
return 1;
}
+static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
+{
+ struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+
+ /*
+ * If the timer is deferrable and nohz is active then we need to use
+ * the deferrable base.
+ */
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+ (tflags & TIMER_DEFERRABLE))
+ base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
+ return base;
+}
+
+static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
+{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+ /*
+ * If the timer is deferrable and nohz is active then we need to use
+ * the deferrable base.
+ */
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+ (tflags & TIMER_DEFERRABLE))
+ base = this_cpu_ptr(&timer_bases[BASE_DEF]);
+ return base;
+}
+
+static inline struct timer_base *get_timer_base(u32 tflags)
+{
+ return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
+{
+#ifdef CONFIG_SMP
+ if ((tflags & TIMER_PINNED) || !base->migration_enabled)
+ return get_timer_this_cpu_base(tflags);
+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
+#else
+ return get_timer_this_cpu_base(tflags);
+#endif
+}
+
+static inline void forward_timer_base(struct timer_base *base)
+{
+ /*
+ * We only forward the base when it's idle and we have a delta between
+ * base clock and jiffies.
+ */
+ if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+ return;
+
+ /*
+ * If the next expiry value is > jiffies, then we fast forward to
+ * jiffies, otherwise we forward to the next expiry value.
+ */
+ if (time_after(base->next_expiry, jiffies))
+ base->clk = jiffies;
+ else
+ base->clk = base->next_expiry;
+}
+#else
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
+{
+ return get_timer_this_cpu_base(tflags);
+}
+
+static inline void forward_timer_base(struct timer_base *base) { }
+#endif
+
+static inline struct timer_base *
+get_target_base(struct timer_base *base, unsigned tflags)
+{
+ struct timer_base *target = __get_target_base(base, tflags);
+
+ forward_timer_base(target);
+ return target;
+}
+
/*
- * We are using hashed locking: holding per_cpu(tvec_bases).lock
- * means that all timers which are tied to this base via timer->base are
- * locked, and the base itself is locked too.
+ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
+ * that all timers which are tied to this base are locked, and the base itself
+ * is locked too.
*
* So __run_timers/migrate_timers can safely modify all timers which could
- * be found on ->tvX lists.
+ * be found in the base->vectors array.
*
- * When the timer's base is locked and removed from the list, the
- * TIMER_MIGRATING flag is set, FIXME
+ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
+ * to wait until the migration is done.
*/
-static struct tvec_base *lock_timer_base(struct timer_list *timer,
- unsigned long *flags)
+static struct timer_base *lock_timer_base(struct timer_list *timer,
+ unsigned long *flags)
__acquires(timer->base->lock)
{
for (;;) {
+ struct timer_base *base;
u32 tf = timer->flags;
- struct tvec_base *base;
if (!(tf & TIMER_MIGRATING)) {
- base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+ base = get_timer_base(tf);
spin_lock_irqsave(&base->lock, *flags);
if (timer->flags == tf)
return base;
@@ -779,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
}
static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires,
- bool pending_only, int pinned)
+__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
{
- struct tvec_base *base, *new_base;
- unsigned long flags;
+ struct timer_base *base, *new_base;
+ unsigned int idx = UINT_MAX;
+ unsigned long clk = 0, flags;
int ret = 0;
+ /*
+ * This is a common optimization triggered by the networking code - if
+ * the timer is re-modified to have the same timeout or ends up in the
+ * same array bucket then just return:
+ */
+ if (timer_pending(timer)) {
+ if (timer->expires == expires)
+ return 1;
+ /*
+ * Take the current base->clk (the base's timer jiffies), but
+ * without holding the lock!
+ */
+ base = get_timer_base(timer->flags);
+ clk = base->clk;
+
+ idx = calc_wheel_index(expires, clk);
+
+ /*
+ * Retrieve and compare the array index of the pending
+ * timer. If it matches set the expiry to the new value so a
+ * subsequent call will exit in the expires check above.
+ */
+ if (idx == timer_get_idx(timer)) {
+ timer->expires = expires;
+ return 1;
+ }
+ }
+
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -797,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- new_base = get_target_base(base, pinned);
+ new_base = get_target_base(base, timer->flags);
if (base != new_base) {
/*
- * We are trying to schedule the timer on the local CPU.
+ * We are trying to schedule the timer on the new base.
* However we can't change timer's base while it is running,
* otherwise del_timer_sync() can't detect that the timer's
- * handler yet has not finished. This also guarantees that
- * the timer is serialized wrt itself.
+ * handler has not yet finished. This also guarantees that the
+ * timer is serialized wrt itself.
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
@@ -820,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
- internal_add_timer(base, timer);
+ /*
+ * If 'idx' was calculated above and the base time did not advance
+ * between calculating 'idx' and taking the lock, only enqueue_timer()
+ * and trigger_dyntick_cpu() is required. Otherwise we need to
+ * (re)calculate the wheel index via internal_add_timer().
+ */
+ if (idx != UINT_MAX && clk == base->clk) {
+ enqueue_timer(base, timer, idx);
+ trigger_dyntick_cpu(base, timer);
+ } else {
+ internal_add_timer(base, timer);
+ }
out_unlock:
spin_unlock_irqrestore(&base->lock, flags);
@@ -840,49 +1057,10 @@ out_unlock:
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
+ return __mod_timer(timer, expires, true);
}
EXPORT_SYMBOL(mod_timer_pending);
-/*
- * Decide where to put the timer while taking the slack into account
- *
- * Algorithm:
- * 1) calculate the maximum (absolute) time
- * 2) calculate the highest bit where the expires and new max are different
- * 3) use this bit to make a mask
- * 4) use the bitmask to round down the maximum time, so that all last
- * bits are zeros
- */
-static inline
-unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
-{
- unsigned long expires_limit, mask;
- int bit;
-
- if (timer->slack >= 0) {
- expires_limit = expires + timer->slack;
- } else {
- long delta = expires - jiffies;
-
- if (delta < 256)
- return expires;
-
- expires_limit = expires + delta / 256;
- }
- mask = expires ^ expires_limit;
- if (mask == 0)
- return expires;
-
- bit = __fls(mask);
-
- mask = (1UL << bit) - 1;
-
- expires_limit = expires_limit & ~(mask);
-
- return expires_limit;
-}
-
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
@@ -905,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
- expires = apply_slack(timer, expires);
-
- /*
- * This is a common optimization triggered by the
- * networking code - if the timer is re-modified
- * to be the same thing then just return:
- */
- if (timer_pending(timer) && timer->expires == expires)
- return 1;
-
- return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+ return __mod_timer(timer, expires, false);
}
EXPORT_SYMBOL(mod_timer);
/**
- * mod_timer_pinned - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer_pinned() is a way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
- * and to ensure that the timer is scheduled on the current CPU.
- *
- * Note that this does not prevent the timer from being migrated
- * when the current CPU goes offline. If this is a problem for
- * you, use CPU-hotplug notifiers to handle it correctly, for
- * example, cancelling the timer when the corresponding CPU goes
- * offline.
- *
- * mod_timer_pinned(timer, expires) is equivalent to:
- *
- * del_timer(timer); timer->expires = expires; add_timer(timer);
- */
-int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
-{
- if (timer->expires == expires && timer_pending(timer))
- return 1;
-
- return __mod_timer(timer, expires, false, TIMER_PINNED);
-}
-EXPORT_SYMBOL(mod_timer_pinned);
-
-/**
* add_timer - start a timer
* @timer: the timer to be added
*
@@ -977,13 +1117,14 @@ EXPORT_SYMBOL(add_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
- struct tvec_base *base;
+ struct timer_base *new_base, *base;
unsigned long flags;
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
+ new_base = get_timer_cpu_base(timer->flags, cpu);
+
/*
* If @timer was on a different CPU, it should be migrated with the
* old base locked to prevent other operations proceeding with the
@@ -1019,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
*/
int del_timer(struct timer_list *timer)
{
- struct tvec_base *base;
+ struct timer_base *base;
unsigned long flags;
int ret = 0;
@@ -1045,7 +1186,7 @@ EXPORT_SYMBOL(del_timer);
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
- struct tvec_base *base;
+ struct timer_base *base;
unsigned long flags;
int ret = -1;
@@ -1129,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static int cascade(struct tvec_base *base, struct tvec *tv, int index)
-{
- /* cascade all the timers from tv up one level */
- struct timer_list *timer;
- struct hlist_node *tmp;
- struct hlist_head tv_list;
-
- hlist_move_list(tv->vec + index, &tv_list);
-
- /*
- * We are removing _all_ timers from the list, so we
- * don't have to detach them individually.
- */
- hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- /* No accounting, while moving them */
- __internal_add_timer(base, timer);
- }
-
- return index;
-}
-
static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
unsigned long data)
{
@@ -1193,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
}
}
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
-
-/**
- * __run_timers - run all expired timers (if any) on this CPU.
- * @base: the timer vector to be processed.
- *
- * This function cascades all vectors and executes all expired timer
- * vectors.
- */
-static inline void __run_timers(struct tvec_base *base)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
- struct timer_list *timer;
+ while (!hlist_empty(head)) {
+ struct timer_list *timer;
+ void (*fn)(unsigned long);
+ unsigned long data;
- spin_lock_irq(&base->lock);
+ timer = hlist_entry(head->first, struct timer_list, entry);
+ timer_stats_account_timer(timer);
- while (time_after_eq(jiffies, base->timer_jiffies)) {
- struct hlist_head work_list;
- struct hlist_head *head = &work_list;
- int index;
+ base->running_timer = timer;
+ detach_timer(timer, true);
- if (!base->all_timers) {
- base->timer_jiffies = jiffies;
- break;
+ fn = timer->function;
+ data = timer->data;
+
+ if (timer->flags & TIMER_IRQSAFE) {
+ spin_unlock(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock(&base->lock);
+ } else {
+ spin_unlock_irq(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock_irq(&base->lock);
}
+ }
+}
- index = base->timer_jiffies & TVR_MASK;
+static int __collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
+{
+ unsigned long clk = base->clk;
+ struct hlist_head *vec;
+ int i, levels = 0;
+ unsigned int idx;
- /*
- * Cascade timers:
- */
- if (!index &&
- (!cascade(base, &base->tv2, INDEX(0))) &&
- (!cascade(base, &base->tv3, INDEX(1))) &&
- !cascade(base, &base->tv4, INDEX(2)))
- cascade(base, &base->tv5, INDEX(3));
- ++base->timer_jiffies;
- hlist_move_list(base->tv1.vec + index, head);
- while (!hlist_empty(head)) {
- void (*fn)(unsigned long);
- unsigned long data;
- bool irqsafe;
-
- timer = hlist_entry(head->first, struct timer_list, entry);
- fn = timer->function;
- data = timer->data;
- irqsafe = timer->flags & TIMER_IRQSAFE;
-
- timer_stats_account_timer(timer);
-
- base->running_timer = timer;
- detach_expired_timer(timer, base);
-
- if (irqsafe) {
- spin_unlock(&base->lock);
- call_timer_fn(timer, fn, data);
- spin_lock(&base->lock);
- } else {
- spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
- spin_lock_irq(&base->lock);
- }
+ for (i = 0; i < LVL_DEPTH; i++) {
+ idx = (clk & LVL_MASK) + i * LVL_SIZE;
+
+ if (__test_and_clear_bit(idx, base->pending_map)) {
+ vec = base->vectors + idx;
+ hlist_move_list(vec, heads++);
+ levels++;
}
+ /* Is it time to look at the next level? */
+ if (clk & LVL_CLK_MASK)
+ break;
+ /* Shift clock for the next level granularity */
+ clk >>= LVL_CLK_SHIFT;
}
- base->running_timer = NULL;
- spin_unlock_irq(&base->lock);
+ return levels;
}
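A worked pass through the levels, assuming base->clk == 192 when the softirq runs:

	/* level 0: idx = (192 & 63) + 0*64 = 0;   192 & 7 == 0, keep going      */
	/* level 1: clk = 192 >> 3 = 24; idx = 24 + 64 = 88;  24 & 7 == 0        */
	/* level 2: clk = 24 >> 3 = 3;   idx = 3 + 128 = 131; 3 & 7 != 0, stop   */
	/* So this tick can only collect buckets 0, 88 and 131 (if pending).     */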
#ifdef CONFIG_NO_HZ_COMMON
/*
- * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a CPU is idle.
- * This function needs to be called with interrupts disabled.
+ * Find the next pending bucket of a level. Search from level start (@offset)
+ * + @clk upwards and if nothing there, search from start of the level
+ * (@offset) up to @offset + clk.
*/
-static unsigned long __next_timer_interrupt(struct tvec_base *base)
-{
- unsigned long timer_jiffies = base->timer_jiffies;
- unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
- int index, slot, array, found = 0;
- struct timer_list *nte;
- struct tvec *varray[4];
-
- /* Look for timer events in tv1. */
- index = slot = timer_jiffies & TVR_MASK;
- do {
- hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
- if (nte->flags & TIMER_DEFERRABLE)
- continue;
-
- found = 1;
- expires = nte->expires;
- /* Look at the cascade bucket(s)? */
- if (!index || slot < index)
- goto cascade;
- return expires;
+static int next_pending_bucket(struct timer_base *base, unsigned offset,
+ unsigned clk)
+{
+ unsigned pos, start = offset + clk;
+ unsigned end = offset + LVL_SIZE;
+
+ pos = find_next_bit(base->pending_map, end, start);
+ if (pos < end)
+ return pos - start;
+
+ pos = find_next_bit(base->pending_map, start, offset);
+ return pos < start ? pos + LVL_SIZE - start : -1;
+}
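A worked example of the wrap-around search, assuming offset = 0, clk = 60 and only bit 2 of this level pending:

	/* forward scan: find_next_bit(map, 64, 60) finds nothing in 60..63      */
	/* wrap scan:    find_next_bit(map, 60, 0)  returns 2                    */
	/* result:       2 + LVL_SIZE - 60 = 6 level-clock ticks until expiry    */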
+
+/*
+ * Search the first expiring timer in the various clock levels. Caller must
+ * hold base->lock.
+ */
+static unsigned long __next_timer_interrupt(struct timer_base *base)
+{
+ unsigned long clk, next, adj;
+ unsigned lvl, offset = 0;
+
+ next = base->clk + NEXT_TIMER_MAX_DELTA;
+ clk = base->clk;
+ for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
+ int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+
+ if (pos >= 0) {
+ unsigned long tmp = clk + (unsigned long) pos;
+
+ tmp <<= LVL_SHIFT(lvl);
+ if (time_before(tmp, next))
+ next = tmp;
}
- slot = (slot + 1) & TVR_MASK;
- } while (slot != index);
-
-cascade:
- /* Calculate the next cascade event */
- if (index)
- timer_jiffies += TVR_SIZE - index;
- timer_jiffies >>= TVR_BITS;
-
- /* Check tv2-tv5. */
- varray[0] = &base->tv2;
- varray[1] = &base->tv3;
- varray[2] = &base->tv4;
- varray[3] = &base->tv5;
-
- for (array = 0; array < 4; array++) {
- struct tvec *varp = varray[array];
-
- index = slot = timer_jiffies & TVN_MASK;
- do {
- hlist_for_each_entry(nte, varp->vec + slot, entry) {
- if (nte->flags & TIMER_DEFERRABLE)
- continue;
-
- found = 1;
- if (time_before(nte->expires, expires))
- expires = nte->expires;
- }
- /*
- * Do we still search for the first timer or are
- * we looking up the cascade buckets ?
- */
- if (found) {
- /* Look at the cascade bucket(s)? */
- if (!index || slot < index)
- break;
- return expires;
- }
- slot = (slot + 1) & TVN_MASK;
- } while (slot != index);
-
- if (index)
- timer_jiffies += TVN_SIZE - index;
- timer_jiffies >>= TVN_BITS;
+ /*
+ * Clock for the next level. If the current level clock lower
+ * bits are zero, we look at the next level as is. If not we
+ * need to advance it by one because that's going to be the
+ * next expiring bucket in that level. base->clk is the next
+ * expiring jiffie. So in case of:
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+ * 0 0 0 0 0 0
+ *
+ * we have to look at all levels @index 0. With
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+ * 0 0 0 0 0 2
+ *
+ * LVL0 has the next expiring bucket @index 2. The upper
+ * levels have the next expiring bucket @index 1.
+ *
+ * In case that the propagation wraps the next level the same
+ * rules apply:
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+ * 0 0 0 0 F 2
+ *
+ * So after looking at LVL0 we get:
+ *
+ * LVL5 LVL4 LVL3 LVL2 LVL1
+ * 0 0 0 1 0
+ *
+ * So no propagation from LVL1 to LVL2 because that happened
+ * with the add already, but then we need to propagate further
+ * from LVL2 to LVL3.
+ *
+ * So the simple check whether the lower bits of the current
+ * level are 0 or not is sufficient for all cases.
+ */
+ adj = clk & LVL_CLK_MASK ? 1 : 0;
+ clk >>= LVL_CLK_SHIFT;
+ clk += adj;
}
- return expires;
+ return next;
}
/*
@@ -1379,9 +1493,10 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
*/
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
- struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
+ bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1391,19 +1506,82 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
spin_lock(&base->lock);
- if (base->active_timers) {
- if (time_before_eq(base->next_timer, base->timer_jiffies))
- base->next_timer = __next_timer_interrupt(base);
- nextevt = base->next_timer;
- if (time_before_eq(nextevt, basej))
- expires = basem;
- else
+ nextevt = __next_timer_interrupt(base);
+ is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
+ base->next_expiry = nextevt;
+ /*
+ * We have a fresh next event. Check whether we can forward the base:
+ */
+ if (time_after(nextevt, jiffies))
+ base->clk = jiffies;
+ else if (time_after(nextevt, base->clk))
+ base->clk = nextevt;
+
+ if (time_before_eq(nextevt, basej)) {
+ expires = basem;
+ base->is_idle = false;
+ } else {
+ if (!is_max_delta)
expires = basem + (nextevt - basej) * TICK_NSEC;
+ /*
+ * If we expect to sleep more than a tick, mark the base idle:
+ */
+ if ((expires - basem) > TICK_NSEC)
+ base->is_idle = true;
}
spin_unlock(&base->lock);
return cmp_next_hrtimer_event(basem, expires);
}
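A worked instance of the new idle marking, assuming basej = 1000, basem = 1000 * TICK_NSEC and the earliest timer at nextevt = 1250:

	/* nextevt (1250) is after basej (1000), so:                             */
	/*   expires = basem + (1250 - 1000) * TICK_NSEC                         */
	/*   expires - basem = 250 ticks > TICK_NSEC  =>  base->is_idle = true   */
	/* A later remote enqueue of an earlier timer now has to IPI this CPU.   */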
+
+/**
+ * timer_clear_idle - Clear the idle state of the timer base
+ *
+ * Called with interrupts disabled
+ */
+void timer_clear_idle(void)
+{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+ /*
+ * We do this unlocked. The worst outcome is a remote enqueue sending
+ * a pointless IPI, but taking the lock would just make the window for
+ * sending the IPI a few instructions smaller for the cost of taking
+ * the lock in the exit from idle path.
+ */
+ base->is_idle = false;
+}
+
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
+{
+ /*
+ * NOHZ optimization. After a long idle sleep we need to forward the
+ * base to current jiffies. Avoid a loop by searching the bitfield for
+ * the next expiring timer.
+ */
+ if ((long)(jiffies - base->clk) > 2) {
+ unsigned long next = __next_timer_interrupt(base);
+
+ /*
+ * If the next timer is ahead of time forward to current
+ * jiffies, otherwise forward to the next expiry time:
+ */
+ if (time_after(next, jiffies)) {
+ /* The call site will increment clock! */
+ base->clk = jiffies - 1;
+ return 0;
+ }
+ base->clk = next;
+ }
+ return __collect_expired_timers(base, heads);
+}
+#else
+static inline int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
+{
+ return __collect_expired_timers(base, heads);
+}
#endif
/*
@@ -1426,15 +1604,42 @@ void update_process_times(int user_tick)
run_posix_cpu_timers(p);
}
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ */
+static inline void __run_timers(struct timer_base *base)
+{
+ struct hlist_head heads[LVL_DEPTH];
+ int levels;
+
+ if (!time_after_eq(jiffies, base->clk))
+ return;
+
+ spin_lock_irq(&base->lock);
+
+ while (time_after_eq(jiffies, base->clk)) {
+
+ levels = collect_expired_timers(base, heads);
+ base->clk++;
+
+ while (levels--)
+ expire_timers(base, heads + levels);
+ }
+ base->running_timer = NULL;
+ spin_unlock_irq(&base->lock);
+}
+
/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
- if (time_after_eq(jiffies, base->timer_jiffies))
- __run_timers(base);
+ __run_timers(base);
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
/*
@@ -1442,7 +1647,18 @@ static void run_timer_softirq(struct softirq_action *h)
*/
void run_local_timers(void)
{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
hrtimer_run_queues();
+ /* Raise the softirq only if required. */
+ if (time_before(jiffies, base->clk)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ return;
+ /* CPU is awake, so check the deferrable base. */
+ base++;
+ if (time_before(jiffies, base->clk))
+ return;
+ }
raise_softirq(TIMER_SOFTIRQ);
}
@@ -1527,7 +1743,7 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
- __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
+ __mod_timer(&timer, expire, false);
schedule();
del_singleshot_timer_sync(&timer);
@@ -1578,87 +1794,62 @@ signed long __sched schedule_timeout_idle(signed long timeout)
EXPORT_SYMBOL(schedule_timeout_idle);
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
struct timer_list *timer;
int cpu = new_base->cpu;
while (!hlist_empty(head)) {
timer = hlist_entry(head->first, struct timer_list, entry);
- /* We ignore the accounting on the dying cpu */
detach_timer(timer, false);
timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
-static void migrate_timers(int cpu)
+int timers_dead_cpu(unsigned int cpu)
{
- struct tvec_base *old_base;
- struct tvec_base *new_base;
- int i;
+ struct timer_base *old_base;
+ struct timer_base *new_base;
+ int b, i;
BUG_ON(cpu_online(cpu));
- old_base = per_cpu_ptr(&tvec_bases, cpu);
- new_base = get_cpu_ptr(&tvec_bases);
- /*
- * The caller is globally serialized and nobody else
- * takes two locks at once, deadlock is not possible.
- */
- spin_lock_irq(&new_base->lock);
- spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
-
- BUG_ON(old_base->running_timer);
-
- for (i = 0; i < TVR_SIZE; i++)
- migrate_timer_list(new_base, old_base->tv1.vec + i);
- for (i = 0; i < TVN_SIZE; i++) {
- migrate_timer_list(new_base, old_base->tv2.vec + i);
- migrate_timer_list(new_base, old_base->tv3.vec + i);
- migrate_timer_list(new_base, old_base->tv4.vec + i);
- migrate_timer_list(new_base, old_base->tv5.vec + i);
- }
- old_base->active_timers = 0;
- old_base->all_timers = 0;
+ for (b = 0; b < NR_BASES; b++) {
+ old_base = per_cpu_ptr(&timer_bases[b], cpu);
+ new_base = get_cpu_ptr(&timer_bases[b]);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, so deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
+ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
- spin_unlock(&old_base->lock);
- spin_unlock_irq(&new_base->lock);
- put_cpu_ptr(&tvec_bases);
-}
+ BUG_ON(old_base->running_timer);
-static int timer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- switch (action) {
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_timers((long)hcpu);
- break;
- default:
- break;
- }
+ for (i = 0; i < WHEEL_SIZE; i++)
+ migrate_timer_list(new_base, old_base->vectors + i);
- return NOTIFY_OK;
+ spin_unlock(&old_base->lock);
+ spin_unlock_irq(&new_base->lock);
+ put_cpu_ptr(&timer_bases);
+ }
+ return 0;
}
-static inline void timer_register_cpu_notifier(void)
-{
- cpu_notifier(timer_cpu_notify, 0);
-}
-#else
-static inline void timer_register_cpu_notifier(void) { }
#endif /* CONFIG_HOTPLUG_CPU */
static void __init init_timer_cpu(int cpu)
{
- struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
-
- base->cpu = cpu;
- spin_lock_init(&base->lock);
+ struct timer_base *base;
+ int i;
- base->timer_jiffies = jiffies;
- base->next_timer = base->timer_jiffies;
+ for (i = 0; i < NR_BASES; i++) {
+ base = per_cpu_ptr(&timer_bases[i], cpu);
+ base->cpu = cpu;
+ spin_lock_init(&base->lock);
+ base->clk = jiffies;
+ }
}
static void __init init_timer_cpus(void)
@@ -1673,7 +1864,6 @@ void __init init_timers(void)
{
init_timer_cpus();
init_timer_stats();
- timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -1717,9 +1907,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
}
/**
- * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * usleep_range - Sleep for an approximate time
* @min: Minimum time in usecs to sleep
* @max: Maximum time in usecs to sleep
+ *
+ * In non-atomic context where the exact wakeup time is flexible, use
+ * usleep_range() instead of udelay(). The sleep improves responsiveness
+ * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
+ * power usage by allowing hrtimers to take advantage of an already-
+ * scheduled interrupt instead of scheduling a new one just for this sleep.
*/
void __sched usleep_range(unsigned long min, unsigned long max)
{
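As an illustration of the guidance in the comment above, a minimal usage sketch; the function name and the 2000-2500 usec bounds are invented for the example and are not part of this patch:

#include <linux/delay.h>

/* Hypothetical example: wait for a device to settle after a reset. */
static void example_wait_for_device_ready(void)
{
	/*
	 * Non-atomic context and a flexible wakeup time: prefer
	 * usleep_range() over udelay() so the CPU is not busy-waiting,
	 * and give hrtimers a 500 usec window to coalesce the wakeup
	 * with an already-scheduled interrupt.
	 */
	usleep_range(2000, 2500);
}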
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1adecb4b87c8..087204c733eb 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr)
static int tstats_show(struct seq_file *m, void *v)
{
- struct timespec period;
+ struct timespec64 period;
struct entry *entry;
unsigned long ms;
long events = 0;
@@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v)
time = ktime_sub(time_stop, time_start);
- period = ktime_to_timespec(time);
+ period = ktime_to_timespec64(time);
ms = period.tv_nsec / 1000000;
seq_puts(m, "Timer Stats Version: v0.3\n");
- seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+ seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
if (atomic_read(&overflow_count))
seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
diff --git a/kernel/torture.c b/kernel/torture.c
index fa0bdeee17ac..0d887eb62856 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -43,6 +43,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/trace_clock.h>
+#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
@@ -82,6 +83,104 @@ static int min_online = -1;
static int max_online;
/*
+ * Attempt to take a CPU offline. Return false if the CPU is already
+ * offline or if it is not subject to CPU-hotplug operations. The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
+ unsigned long *sum_offl, int *min_offl, int *max_offl)
+{
+ unsigned long delta;
+ int ret;
+ unsigned long starttime;
+
+ if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+ return false;
+
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ (*n_offl_attempts)++;
+ ret = cpu_down(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offline %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlined %d\n",
+ torture_type, cpu);
+ (*n_offl_successes)++;
+ delta = jiffies - starttime;
+ *sum_offl += delta;
+ if (*min_offl < 0) {
+ *min_offl = delta;
+ *max_offl = delta;
+ }
+ if (*min_offl > delta)
+ *min_offl = delta;
+ if (*max_offl < delta)
+ *max_offl = delta;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_offline);
+
+/*
+ * Attempt to bring a CPU online. Return false if the CPU is already
+ * online or if it is not subject to CPU-hotplug operations. The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
+ unsigned long *sum_onl, int *min_onl, int *max_onl)
+{
+ unsigned long delta;
+ int ret;
+ unsigned long starttime;
+
+ if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+ return false;
+
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ (*n_onl_attempts)++;
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlined %d\n",
+ torture_type, cpu);
+ (*n_onl_successes)++;
+ delta = jiffies - starttime;
+ *sum_onl += delta;
+ if (*min_onl < 0) {
+ *min_onl = delta;
+ *max_onl = delta;
+ }
+ if (*min_onl > delta)
+ *min_onl = delta;
+ if (*max_onl < delta)
+ *max_onl = delta;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_online);
+
+/*
* Execute random CPU-hotplug operations at the interval specified
* by the onoff_interval.
*/
@@ -89,16 +188,19 @@ static int
torture_onoff(void *arg)
{
int cpu;
- unsigned long delta;
int maxcpu = -1;
DEFINE_TORTURE_RANDOM(rand);
- int ret;
- unsigned long starttime;
VERBOSE_TOROUT_STRING("torture_onoff task started");
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
+
+ if (maxcpu == 0) {
+ VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
+ goto stop;
+ }
+
if (onoff_holdoff > 0) {
VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
schedule_timeout_interruptible(onoff_holdoff);
@@ -106,69 +208,16 @@ torture_onoff(void *arg)
}
while (!torture_must_stop()) {
cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
- if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_offline_attempts++;
- ret = cpu_down(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offline %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offlined %d\n",
- torture_type, cpu);
- n_offline_successes++;
- delta = jiffies - starttime;
- sum_offline += delta;
- if (min_offline < 0) {
- min_offline = delta;
- max_offline = delta;
- }
- if (min_offline > delta)
- min_offline = delta;
- if (max_offline < delta)
- max_offline = delta;
- }
- } else if (cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: onlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_online_attempts++;
- ret = cpu_up(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: online %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: onlined %d\n",
- torture_type, cpu);
- n_online_successes++;
- delta = jiffies - starttime;
- sum_online += delta;
- if (min_online < 0) {
- min_online = delta;
- max_online = delta;
- }
- if (min_online > delta)
- min_online = delta;
- if (max_online < delta)
- max_online = delta;
- }
- }
+ if (!torture_offline(cpu,
+ &n_offline_attempts, &n_offline_successes,
+ &sum_offline, &min_offline, &max_offline))
+ torture_online(cpu,
+ &n_online_attempts, &n_online_successes,
+ &sum_online, &min_online, &max_online);
schedule_timeout_interruptible(onoff_interval);
}
+
+stop:
torture_kthread_stopping("torture_onoff");
return 0;
}
@@ -398,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
* Variables for auto-shutdown. This allows "lights out" torture runs
* to be fully scripted.
*/
-static int shutdown_secs; /* desired test duration in seconds. */
static struct task_struct *shutdown_task;
-static unsigned long shutdown_time; /* jiffies to system shutdown. */
+static ktime_t shutdown_time; /* time to system shutdown. */
static void (*torture_shutdown_hook)(void);
/*
@@ -423,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
*/
static int torture_shutdown(void *arg)
{
- long delta;
- unsigned long jiffies_snap;
+ ktime_t ktime_snap;
VERBOSE_TOROUT_STRING("torture_shutdown task started");
- jiffies_snap = jiffies;
- while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ ktime_snap = ktime_get();
+ while (ktime_before(ktime_snap, shutdown_time) &&
!torture_must_stop()) {
- delta = shutdown_time - jiffies_snap;
if (verbose)
pr_alert("%s" TORTURE_FLAG
- "torture_shutdown task: %lu jiffies remaining\n",
- torture_type, delta);
- schedule_timeout_interruptible(delta);
- jiffies_snap = jiffies;
+ "torture_shutdown task: %llu ms remaining\n",
+ torture_type,
+ ktime_ms_delta(shutdown_time, ktime_snap));
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS);
+ ktime_snap = ktime_get();
}
if (torture_must_stop()) {
torture_kthread_stopping("torture_shutdown");
@@ -463,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
{
int ret = 0;
- shutdown_secs = ssecs;
torture_shutdown_hook = cleanup;
- if (shutdown_secs > 0) {
- shutdown_time = jiffies + shutdown_secs * HZ;
+ if (ssecs > 0) {
+ shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
ret = torture_create_kthread(torture_shutdown, NULL,
shutdown_task);
}
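The jiffies-to-ktime conversion above follows the usual pattern for sleeping until an absolute deadline; a minimal sketch under the same assumptions (the function and variable names are illustrative only, not part of the patch):

#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sleep until an absolute deadline "secs" seconds from now. */
static void example_sleep_until(int secs)
{
	ktime_t deadline = ktime_add(ktime_get(), ktime_set(secs, 0));

	while (ktime_before(ktime_get(), deadline)) {
		set_current_state(TASK_INTERRUPTIBLE);
		/* Absolute expiry; returns early if a signal is pending. */
		schedule_hrtimeout(&deadline, HRTIMER_MODE_ABS);
	}
}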
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e45db6b0d878..2a96b063d659 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.txt
-config HAVE_FUNCTION_GRAPH_FP_TEST
- bool
- help
- See Documentation/trace/ftrace-design.txt
-
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -221,6 +216,41 @@ config SCHED_TRACER
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
+config HWLAT_TRACER
+ bool "Tracer to detect hardware latencies (like SMIs)"
+ select GENERIC_TRACER
+ help
+ This tracer, when enabled, will create one or more kernel threads,
+ depending on what the cpumask file is set to, with each thread
+ spinning in a loop looking for interruptions caused by
+ something other than the kernel. For example, if a
+ System Management Interrupt (SMI) takes a noticeable amount of
+ time, this tracer will detect it. This is useful for testing
+ whether a system is reliable for Real Time tasks.
+
+ Some files are created in the tracing directory when this
+ is enabled:
+
+ hwlat_detector/width - time in usecs for how long to spin
+ hwlat_detector/window - time in usecs between the start of each
+ iteration
+
+ A kernel thread is created that will spin with interrupts disabled
+ for "width" microseconds in every "window" cycle. It will not spin
+ for the remaining "window - width" microseconds, during which the
+ system can continue to operate.
+
+ The output will appear in the trace and trace_pipe files.
+
+ When the tracer is not running, it has no effect on the system,
+ but when it is running, it can cause the system to be
+ periodically non-responsive. Do not run this tracer on a
+ production system.
+
+ To enable this tracer, echo "hwlat" into the current_tracer
+ file. Every time a latency is greater than tracing_thresh, it will
+ be recorded into the ring buffer.
+
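The detection idea described in this help text can be sketched roughly as below. This is a simplified illustration of the technique only, not the actual trace_hwlat.c implementation; the function name and the reporting are invented for the example:

#include <linux/types.h>
#include <linux/ktime.h>
#include <linux/irqflags.h>

/*
 * Spin with interrupts disabled for "width" microseconds and report the
 * largest gap seen between two consecutive timestamp reads.  Any gap much
 * larger than the cost of the reads themselves was stolen by something
 * outside the kernel's control, e.g. an SMI.
 */
static u64 example_hwlat_sample(u64 width_us)
{
	ktime_t start, t1, t2;
	u64 gap, max_gap_ns = 0;

	local_irq_disable();
	start = ktime_get();
	t1 = start;
	do {
		t2 = ktime_get();
		gap = ktime_to_ns(ktime_sub(t2, t1));
		if (gap > max_gap_ns)
			max_gap_ns = gap;
		t1 = t2;
	} while (ktime_to_ns(ktime_sub(t2, start)) < width_us * NSEC_PER_USEC);
	local_irq_enable();

	return max_gap_ns;
}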
config ENABLE_DEFAULT_TRACERS
bool "Trace process context switches and events"
depends on !GENERIC_TRACER
@@ -528,6 +558,33 @@ config MMIOTRACE
See Documentation/trace/mmiotrace.txt.
If you are not helping to develop drivers, say N.
+config TRACING_MAP
+ bool
+ depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+ help
+ tracing_map is a special-purpose lock-free map for tracing,
+ separated out as a stand-alone facility in order to allow it
+ to be shared between multiple tracers. It isn't meant to be
+ generally used outside of that context, and is normally
+ selected by tracers that use it.
+
+config HIST_TRIGGERS
+ bool "Histogram triggers"
+ depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select TRACING_MAP
+ select TRACING
+ default n
+ help
+ Hist triggers allow one or more arbitrary trace event fields
+ to be aggregated into hash tables and dumped to stdout by
+ reading a debugfs/tracefs file. They're useful for
+ gathering quick and dirty (though precise) summaries of
+ event activity as an initial guide for further investigation
+ using more advanced tools.
+
+ See Documentation/trace/events.txt.
+ If in doubt, say N.
+
config MMIOTRACE_TEST
tristate "Test module for mmiotrace"
depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 9b1044e936a6..992ab9d99f35 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,4 +1,8 @@
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
# Do not instrument the tracer itself:
ifdef CONFIG_FUNCTION_TRACER
@@ -31,11 +35,13 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
@@ -53,6 +59,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e90d..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
static void trace_note_time(struct blk_trace *bt)
{
- struct timespec now;
+ struct timespec64 now;
unsigned long flags;
u32 words[2];
- getnstimeofday(&now);
- words[0] = now.tv_sec;
+ /* need to check user space to see if this breaks in y2038 or y2106 */
+ ktime_get_real_ts64(&now);
+ words[0] = (u32)now.tv_sec;
words[1] = now.tv_nsec;
local_irq_save(flags);
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
#define BLK_TC_RAHEAD BLK_TC_AHEAD
+#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
+ int op, int op_flags, u32 what, int error, int pdu_len,
+ void *pdu_data)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
- what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, SYNC);
- what |= MASK_TC_BIT(rw, RAHEAD);
- what |= MASK_TC_BIT(rw, META);
- what |= MASK_TC_BIT(rw, DISCARD);
- what |= MASK_TC_BIT(rw, FLUSH);
- what |= MASK_TC_BIT(rw, FUA);
+ what |= ddir_act[op_is_write(op) ? WRITE : READ];
+ what |= MASK_TC_BIT(op_flags, SYNC);
+ what |= MASK_TC_BIT(op_flags, RAHEAD);
+ what |= MASK_TC_BIT(op_flags, META);
+ what |= MASK_TC_BIT(op_flags, PREFLUSH);
+ what |= MASK_TC_BIT(op_flags, FUA);
+ if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
+ what |= BLK_TC_ACT(BLK_TC_DISCARD);
+ if (op == REQ_OP_FLUSH)
+ what |= BLK_TC_ACT(BLK_TC_FLUSH);
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+ __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
+ NULL);
}
}
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL);
}
}
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
}
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- bio->bi_error, sizeof(rpdu), &rpdu);
+ bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
+ BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+ &rpdu);
}
}
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r);
}
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
sizeof(r), &r);
}
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,
return;
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)
}
}
-void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
{
int i = 0;
- if (rw & REQ_FLUSH)
+ if (rw & REQ_PREFLUSH)
rwbs[i++] = 'F';
- if (rw & WRITE)
+ switch (op) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
- else if (rw & REQ_DISCARD)
+ break;
+ case REQ_OP_DISCARD:
+ rwbs[i++] = 'D';
+ break;
+ case REQ_OP_SECURE_ERASE:
rwbs[i++] = 'D';
- else if (bytes)
+ rwbs[i++] = 'E';
+ break;
+ case REQ_OP_FLUSH:
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_READ:
rwbs[i++] = 'R';
- else
+ break;
+ default:
rwbs[i++] = 'N';
+ }
if (rw & REQ_FUA)
rwbs[i++] = 'F';
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i++] = 'S';
if (rw & REQ_META)
rwbs[i++] = 'M';
- if (rw & REQ_SECURE)
- rwbs[i++] = 'E';
rwbs[i] = '\0';
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 780bcbe1d4de..5dcb99281259 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -8,6 +9,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
@@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
-static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
- void *dst = (void *) (long) r1;
- int ret, size = (int) r2;
- void *unsafe_ptr = (void *) (long) r3;
+ int ret;
ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
@@ -81,13 +81,53 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
+ u32, size)
+{
+ /*
+ * Ensure we're in user context which is safe for the helper to
+ * run. This helper has no business in a kthread.
+ *
+ * access_ok() should prevent writing to non-user memory, but in
+ * some situations (nommu, temporary switch, etc) access_ok() does
+ * not provide enough validation, hence the check on KERNEL_DS.
+ */
+
+ if (unlikely(in_interrupt() ||
+ current->flags & (PF_KTHREAD | PF_EXITING)))
+ return -EPERM;
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ return -EPERM;
+ if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+ return -EPERM;
+
+ return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+ .func = bpf_probe_write_user,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+ pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+ current->comm, task_pid_nr(current));
+
+ return &bpf_probe_write_user_proto;
+}
+
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
*/
-static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
+ u64, arg2, u64, arg3)
{
- char *fmt = (char *) (long) r1;
bool str_seen = false;
int mod[3] = {};
int fmt_cnt = 0;
@@ -133,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
switch (fmt_cnt) {
case 1:
- unsafe_addr = r3;
- r3 = (long) buf;
+ unsafe_addr = arg1;
+ arg1 = (long) buf;
break;
case 2:
- unsafe_addr = r4;
- r4 = (long) buf;
+ unsafe_addr = arg2;
+ arg2 = (long) buf;
break;
case 3:
- unsafe_addr = r5;
- r5 = (long) buf;
+ unsafe_addr = arg3;
+ arg3 = (long) buf;
break;
}
buf[0] = 0;
@@ -164,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
}
return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
- mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
- mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+ mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
+ mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
+ mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
}
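For context, this is how a tracing program typically calls the helper. The snippet below is a hypothetical sample in the style of samples/bpf and assumes the helper declarations from its bpf_helpers.h; it is not part of this patch:

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

SEC("kprobe/sys_write")
int trace_write(struct pt_regs *ctx)
{
	/* The format string must live on the BPF stack. */
	char fmt[] = "write() by tgid %d\n";

	/* Only the conversion specifiers listed above are accepted. */
	bpf_trace_printk(fmt, sizeof(fmt), bpf_get_current_pid_tgid() >> 32);
	return 0;
}

char _license[] SEC("license") = "GPL";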
static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -188,25 +228,32 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
+ u64 index = flags & BPF_F_INDEX_MASK;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (index == BPF_F_CURRENT_CPU)
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = (struct file *)array->ptrs[index];
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
+ event = ee->event;
+ if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+ event->attr.type != PERF_TYPE_RAW))
+ return -EINVAL;
/* make sure event is local and doesn't have pmu::count */
- if (event->oncpu != smp_processor_id() ||
- event->pmu->count)
+ if (unlikely(event->oncpu != cpu || event->pmu->count))
return -EINVAL;
/*
@@ -225,47 +272,56 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+ u64 flags, struct perf_raw_record *raw)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
- struct perf_raw_record raw = {
- .size = size,
- .data = data,
- };
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
if (index == BPF_F_CURRENT_CPU)
- index = raw_smp_processor_id();
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = (struct file *)array->ptrs[index];
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
- if (unlikely(event->oncpu != smp_processor_id()))
+ if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = &raw;
+ sample_data.raw = raw;
perf_event_output(event, &sample_data, regs);
return 0;
}
+BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
+{
+ struct perf_raw_record raw = {
+ .frag = {
+ .size = size,
+ .data = data,
+ },
+ };
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
static const struct bpf_func_proto bpf_perf_event_output_proto = {
.func = bpf_perf_event_output,
.gpl_only = true,
@@ -279,31 +335,66 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+ struct perf_raw_frag frag = {
+ .copy = ctx_copy,
+ .size = ctx_size,
+ .data = ctx,
+ };
+ struct perf_raw_record raw = {
+ .frag = {
+ {
+ .next = ctx_size ? &frag : NULL,
+ },
+ .size = meta_size,
+ .data = meta,
+ },
+ };
perf_fetch_caller_regs(regs);
- return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
+BPF_CALL_0(bpf_get_current_task)
+{
+ return (long) current;
}
-static const struct bpf_func_proto bpf_event_output_proto = {
- .func = bpf_event_output,
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+ .func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
};
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
- return &bpf_event_output_proto;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(in_interrupt()))
+ return -EINVAL;
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
}
+static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -321,6 +412,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -331,6 +424,12 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_probe_write_user:
+ return bpf_get_probe_write_proto();
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
default:
return NULL;
}
@@ -349,20 +448,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
-static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
-
- /* only read is allowed */
if (type != BPF_READ)
return false;
-
- /* disallow misaligned access */
if (off % size != 0)
return false;
-
return true;
}
@@ -376,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = {
.type = BPF_PROG_TYPE_KPROBE,
};
-static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
{
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
/*
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
* from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
- * from there and call the same bpf_perf_event_output() helper
+ * from there and call the same bpf_perf_event_output() helper inline.
*/
- u64 ctx = *(long *)(uintptr_t)r1;
-
- return bpf_perf_event_output(ctx, r2, index, r4, size);
+ return ____bpf_perf_event_output(regs, map, flags, data, size);
}
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
@@ -399,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg5_type = ARG_CONST_STACK_SIZE,
};
-static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
+ u64, flags)
{
- u64 ctx = *(long *)(uintptr_t)r1;
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
- return bpf_get_stackid(ctx, r2, r3, r4, r5);
+ /*
+ * Same comment as in bpf_perf_event_output_tp(), only that this time
+ * the other helper's function body cannot be inlined due to being
+ * external, thus we need to call raw helper function.
+ */
+ return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
}
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
@@ -427,7 +529,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
}
}
-static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
@@ -448,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = {
.type = BPF_PROG_TYPE_TRACEPOINT,
};
+static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+ if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
+ if (size != sizeof(u64))
+ return false;
+ } else {
+ if (size != sizeof(long))
+ return false;
+ }
+ return true;
+}
+
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct bpf_perf_event_data, sample_period):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+ data), dst_reg, src_reg,
+ offsetof(struct bpf_perf_event_data_kern, data));
+ *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+ offsetof(struct perf_sample_data, period));
+ break;
+ default:
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+ regs), dst_reg, src_reg,
+ offsetof(struct bpf_perf_event_data_kern, regs));
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static const struct bpf_verifier_ops perf_event_prog_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = pe_prog_is_valid_access,
+ .convert_ctx_access = pe_prog_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list perf_event_tl = {
+ .ops = &perf_event_prog_ops,
+ .type = BPF_PROG_TYPE_PERF_EVENT,
+};
+
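A hypothetical user of the new BPF_PROG_TYPE_PERF_EVENT type, again in samples/bpf style; the context struct comes from the uapi bpf_perf_event.h header, while the program body and names are assumptions made for illustration:

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include "bpf_helpers.h"

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
	char fmt[] = "long sample period %llu\n";

	/*
	 * Loads of ctx->sample_period (and of ctx->regs fields) are
	 * rewritten by pe_prog_convert_ctx_access() into reads of the
	 * kernel-side bpf_perf_event_data_kern structure.
	 */
	if (ctx->sample_period > 10000)
		bpf_trace_printk(fmt, sizeof(fmt), ctx->sample_period);
	return 0;
}

char _license[] SEC("license") = "GPL";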
static int __init register_kprobe_prog_ops(void)
{
bpf_register_prog_type(&kprobe_tl);
bpf_register_prog_type(&tracepoint_tl);
+ bpf_register_prog_type(&perf_event_tl);
return 0;
}
late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1870fbd2b67..2050a7652a86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-/* List for set_ftrace_pid's pids. */
-LIST_HEAD(ftrace_pids);
-struct ftrace_pid {
- struct list_head list;
- struct pid *pid;
-};
-
-static bool ftrace_pids_enabled(void)
+static bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
- return !list_empty(&ftrace_pids);
+ struct trace_array *tr;
+
+ if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
+ return false;
+
+ tr = ops->private;
+
+ return tr->function_pids != NULL;
}
static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (!test_tsk_trace_trace(current))
+ struct trace_array *tr = op->private;
+
+ if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
return;
op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
- if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+ if (ftrace_pids_enabled(ops))
ops->func = ftrace_pid_func;
ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
static void ftrace_update_pid_func(void)
{
- bool enabled = ftrace_pids_enabled();
struct ftrace_ops *op;
/* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (op->flags & FTRACE_OPS_FL_PID) {
- op->func = enabled ? ftrace_pid_func :
- op->saved_func;
+ op->func = ftrace_pids_enabled(op) ?
+ ftrace_pid_func : op->saved_func;
ftrace_update_trampoline(op);
}
} while_for_each_ftrace_op(op);
@@ -871,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int profile_graph_entry(struct ftrace_graph_ent *trace)
{
+ int index = trace->depth;
+
function_profile_call(trace->func, 0, NULL, NULL);
+
+ if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
+ current->ret_stack[index].subtime = 0;
+
return 1;
}
@@ -1530,7 +1537,19 @@ static int ftrace_cmp_recs(const void *a, const void *b)
return 0;
}
-static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
+/**
+ * ftrace_location_range - return the first address of a traced location
+ * if it touches the given ip range
+ * @start: start of range to search.
+ * @end: end of range to search (inclusive). @end points to the last byte
+ * to check.
+ *
+ * Returns rec->ip if the related ftrace location is at least partly within
+ * the given address range. That is, the first address of the instruction
+ * that is either a NOP or call to the function tracer. It checks the ftrace
+ * internal tables to determine if the address belongs or not.
+ */
+unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3444,11 +3463,23 @@ struct ftrace_glob {
int type;
};
+/*
+ * If symbols in an architecture don't correspond exactly to the user-visible
+ * name of what they represent, it is possible to define this function to
+ * perform the necessary adjustments.
+ */
+char * __weak arch_ftrace_match_adjust(char *str, const char *search)
+{
+ return str;
+}
+
static int ftrace_match(char *str, struct ftrace_glob *g)
{
int matched = 0;
int slen;
+ str = arch_ftrace_match_adjust(str, g->search);
+
switch (g->type) {
case MATCH_FULL:
if (strcmp(str, g->search) == 0)
@@ -5300,179 +5331,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
return ops->func;
}
-static void clear_ftrace_swapper(void)
+static void
+ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
{
- struct task_struct *p;
- int cpu;
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- clear_tsk_trace_trace(p);
- }
- put_online_cpus();
-}
+ pid_list = rcu_dereference_sched(tr->function_pids);
-static void set_ftrace_swapper(void)
-{
- struct task_struct *p;
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- set_tsk_trace_trace(p);
- }
- put_online_cpus();
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, next));
}
-static void clear_ftrace_pid(struct pid *pid)
+static void clear_ftrace_pids(struct trace_array *tr)
{
- struct task_struct *p;
+ struct trace_pid_list *pid_list;
+ int cpu;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- clear_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+ if (!pid_list)
+ return;
- put_pid(pid);
-}
+ unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
-static void set_ftrace_pid(struct pid *pid)
-{
- struct task_struct *p;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- set_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
-}
+ rcu_assign_pointer(tr->function_pids, NULL);
-static void clear_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- clear_ftrace_swapper();
- else
- clear_ftrace_pid(pid);
-}
+ /* Wait till all users are no longer using pid filtering */
+ synchronize_sched();
-static void set_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- set_ftrace_swapper();
- else
- set_ftrace_pid(pid);
+ trace_free_pid_list(pid_list);
}
-static int ftrace_pid_add(int p)
+static void ftrace_pid_reset(struct trace_array *tr)
{
- struct pid *pid;
- struct ftrace_pid *fpid;
- int ret = -EINVAL;
-
mutex_lock(&ftrace_lock);
-
- if (!p)
- pid = ftrace_swapper_pid;
- else
- pid = find_get_pid(p);
-
- if (!pid)
- goto out;
-
- ret = 0;
-
- list_for_each_entry(fpid, &ftrace_pids, list)
- if (fpid->pid == pid)
- goto out_put;
-
- ret = -ENOMEM;
-
- fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
- if (!fpid)
- goto out_put;
-
- list_add(&fpid->list, &ftrace_pids);
- fpid->pid = pid;
-
- set_ftrace_pid_task(pid);
+ clear_ftrace_pids(tr);
ftrace_update_pid_func();
-
ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
- return 0;
-
-out_put:
- if (pid != ftrace_swapper_pid)
- put_pid(pid);
-
-out:
- mutex_unlock(&ftrace_lock);
- return ret;
}
-static void ftrace_pid_reset(void)
-{
- struct ftrace_pid *fpid, *safe;
-
- mutex_lock(&ftrace_lock);
- list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
- struct pid *pid = fpid->pid;
-
- clear_ftrace_pid_task(pid);
-
- list_del(&fpid->list);
- kfree(fpid);
- }
-
- ftrace_update_pid_func();
- ftrace_startup_all(0);
-
- mutex_unlock(&ftrace_lock);
-}
+/* Greater than any max PID */
+#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1)
static void *fpid_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
mutex_lock(&ftrace_lock);
+ rcu_read_lock_sched();
- if (!ftrace_pids_enabled() && (!*pos))
- return (void *) 1;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- return seq_list_start(&ftrace_pids, *pos);
+ if (!pid_list)
+ return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+ return trace_pid_start(pid_list, pos);
}
static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
{
- if (v == (void *)1)
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
+
+ if (v == FTRACE_NO_PIDS)
return NULL;
- return seq_list_next(v, &ftrace_pids, pos);
+ return trace_pid_next(pid_list, v, pos);
}
static void fpid_stop(struct seq_file *m, void *p)
+ __releases(RCU)
{
+ rcu_read_unlock_sched();
mutex_unlock(&ftrace_lock);
}
static int fpid_show(struct seq_file *m, void *v)
{
- const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
-
- if (v == (void *)1) {
+ if (v == FTRACE_NO_PIDS) {
seq_puts(m, "no pid\n");
return 0;
}
- if (fpid->pid == ftrace_swapper_pid)
- seq_puts(m, "swapper tasks\n");
- else
- seq_printf(m, "%u\n", pid_vnr(fpid->pid));
-
- return 0;
+ return trace_pid_show(m, v);
}
static const struct seq_operations ftrace_pid_sops = {
@@ -5485,58 +5436,103 @@ static const struct seq_operations ftrace_pid_sops = {
static int
ftrace_pid_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_pid_reset();
+ ftrace_pid_reset(tr);
- if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &ftrace_pid_sops);
+ ret = seq_open(file, &ftrace_pid_sops);
+ if (ret < 0) {
+ trace_array_put(tr);
+ } else {
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = tr;
+ }
return ret;
}
+static void ignore_task_cpu(void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /*
+ * This function is called by on_each_cpu() while the
+ * event_mutex is held.
+ */
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ mutex_is_locked(&ftrace_lock));
+
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, current));
+}
+
static ssize_t
ftrace_pid_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[64], *tmp;
- long val;
- int ret;
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *pid_list;
+ ssize_t ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
+ if (!cnt)
+ return 0;
+
+ mutex_lock(&ftrace_lock);
+
+ filtered_pids = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
+ goto out;
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+ rcu_assign_pointer(tr->function_pids, pid_list);
- buf[cnt] = 0;
+ if (filtered_pids) {
+ synchronize_sched();
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
+ /* Register a probe to set whether to ignore the tracing of a task */
+ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+ }
/*
- * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
- * to clean the filter quietly.
+ * Ignoring of pids is done at task switch. But we have to
+ * check for those tasks that are currently running.
+ * Always do this in case a pid was appended or removed.
*/
- tmp = strstrip(buf);
- if (strlen(tmp) == 0)
- return 1;
+ on_each_cpu(ignore_task_cpu, tr, 1);
- ret = kstrtol(tmp, 10, &val);
- if (ret < 0)
- return ret;
+ ftrace_update_pid_func();
+ ftrace_startup_all(0);
+ out:
+ mutex_unlock(&ftrace_lock);
- ret = ftrace_pid_add(val);
+ if (ret > 0)
+ *ppos += ret;
- return ret ? ret : cnt;
+ return ret;
}
static int
ftrace_pid_release(struct inode *inode, struct file *file)
{
- if (file->f_mode & FMODE_READ)
- seq_release(inode, file);
+ struct trace_array *tr = inode->i_private;
- return 0;
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
}
static const struct file_operations ftrace_pid_fops = {
@@ -5547,24 +5543,21 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_tracefs(void)
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct dentry *d_tracer;
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ tr, &ftrace_pid_fops);
+}
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
+void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ /* Only the top level directory has the dyn_tracefs and profile */
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
ftrace_init_dyn_tracefs(d_tracer);
-
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
- NULL, &ftrace_pid_fops);
-
ftrace_profile_tracefs(d_tracer);
-
- return 0;
}
-fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
@@ -5713,7 +5706,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
{
int i;
int ret = 0;
- unsigned long flags;
int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
struct task_struct *g, *t;
@@ -5729,7 +5721,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
}
- read_lock_irqsave(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
do_each_thread(g, t) {
if (start == end) {
ret = -EAGAIN;
@@ -5747,7 +5739,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
} while_each_thread(g, t);
unlock:
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_unlock(&tasklist_lock);
free:
for (i = start; i < end; i++)
kfree(ret_stack_list[i]);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2f0b9f33e9b..8696ce6bf2f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec)
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
+/* trace_flags that are default zero for instances */
+#define ZEROED_TRACE_FLAGS \
+ TRACE_ITER_EVENT_FORK
/*
* The global_trace is the descriptor that holds the tracing
@@ -303,33 +306,270 @@ void trace_array_put(struct trace_array *this_tr)
mutex_unlock(&trace_types_lock);
}
-int filter_check_discard(struct trace_event_file *file, void *rec,
- struct ring_buffer *buffer,
- struct ring_buffer_event *event)
+int call_filter_check_discard(struct trace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
{
- if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
- !filter_match_preds(file->filter, rec)) {
- ring_buffer_discard_commit(buffer, event);
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(call->filter, rec)) {
+ __trace_event_discard_commit(buffer, event);
return 1;
}
return 0;
}
-EXPORT_SYMBOL_GPL(filter_check_discard);
-int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct ring_buffer *buffer,
- struct ring_buffer_event *event)
+void trace_free_pid_list(struct trace_pid_list *pid_list)
{
- if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(call->filter, rec)) {
- ring_buffer_discard_commit(buffer, event);
- return 1;
+ vfree(pid_list->pids);
+ kfree(pid_list);
+}
+
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+ /*
+ * If pid_max changed after filtered_pids was created, we
+ * by default ignore all pids greater than the previous pid_max.
+ */
+ if (search_pid >= filtered_pids->pid_max)
+ return false;
+
+ return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ return !trace_find_filtered_pid(filtered_pids, task->pid);
+}
+
+/**
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ if (!pid_list)
+ return;
+
+ /* For forks, we only add if the forking task is listed */
+ if (self) {
+ if (!trace_find_filtered_pid(pid_list, self->pid))
+ return;
}
+ /* Sorry, but we don't support pid_max changing after setting */
+ if (task->pid >= pid_list->pid_max)
+ return;
+
+ /* "self" is set for forks, and NULL for exits */
+ if (self)
+ set_bit(task->pid, pid_list->pids);
+ else
+ clear_bit(task->pid, pid_list->pids);
+}
+
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 of the actual pid, so that zero can be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+ unsigned long pid = (unsigned long)v;
+
+ (*pos)++;
+
+ /* pid already is +1 of the actual previous bit */
+ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+ /* Return pid + 1 to allow zero to be represented */
+ if (pid < pid_list->pid_max)
+ return (void *)(pid + 1);
+
+ return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+ unsigned long pid;
+ loff_t l = 0;
+
+ pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+ if (pid >= pid_list->pid_max)
+ return NULL;
+
+ /* Return pid + 1 so that zero can be the exit value */
+ for (pid++; pid && l < *pos;
+ pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+ ;
+ return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+ unsigned long pid = (unsigned long)v - 1;
+
+ seq_printf(m, "%lu\n", pid);
return 0;
}
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
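
The start/next/show trio above exists so that a seq_file can walk the bitmap; the only subtlety is the pid+1 encoding, since returning NULL (zero) ends the iteration but pid 0 must still be printable. A small userspace sketch of the same convention (bitmap size and helpers are illustrative):

    #include <stdio.h>

    #define PID_MAX 64
    static unsigned char pids[PID_MAX / 8];

    static void set_pid(int pid)
    {
        pids[pid / 8] |= 1u << (pid % 8);
    }

    /* find the first set bit at or after 'from', or PID_MAX if none */
    static unsigned long find_next(unsigned long from)
    {
        while (from < PID_MAX && !(pids[from / 8] & (1u << (from % 8))))
            from++;
        return from;
    }

    /* mirrors trace_pid_next(): takes and returns pid+1, 0 means "stop" */
    static unsigned long pid_next(unsigned long v)
    {
        unsigned long pid = find_next(v);   /* v is already +1 of the last pid */

        return pid < PID_MAX ? pid + 1 : 0;
    }

    int main(void)
    {
        unsigned long v;

        set_pid(0);                         /* pid 0 must still be shown */
        set_pid(5);
        set_pid(42);

        for (v = pid_next(0); v; v = pid_next(v))
            printf("%lu\n", v - 1);         /* undo the +1, as the show step does */
        return 0;
    }
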
+
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE 127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_parser parser;
+ unsigned long val;
+ int nr_pids = 0;
+ ssize_t read = 0;
+ ssize_t ret = 0;
+ loff_t pos;
+ pid_t pid;
+
+ if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ /*
+ * Always create a new array. The write is an all-or-nothing
+ * operation: when the user adds new pids, a new array is built and
+ * only swapped in on success, so a failed write leaves the current
+ * list unmodified.
+ */
+ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return -ENOMEM;
+
+ pid_list->pid_max = READ_ONCE(pid_max);
+
+ /* Only truncating will shrink pid_max */
+ if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+ pid_list->pid_max = filtered_pids->pid_max;
+
+ pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+ if (!pid_list->pids) {
+ kfree(pid_list);
+ return -ENOMEM;
+ }
+
+ if (filtered_pids) {
+ /* copy the current bits to the new max */
+ for_each_set_bit(pid, filtered_pids->pids,
+ filtered_pids->pid_max) {
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+ }
+ }
+
+ while (cnt > 0) {
+
+ pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &pos);
+ if (ret < 0 || !trace_parser_loaded(&parser))
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+ if (val >= pid_list->pid_max)
+ break;
+
+ pid = (pid_t)val;
+
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ trace_free_pid_list(pid_list);
+ return ret;
+ }
+
+ if (!nr_pids) {
+ /* Cleared the list of pids */
+ trace_free_pid_list(pid_list);
+ read = ret;
+ pid_list = NULL;
+ }
+
+ *new_pid_list = pid_list;
+
+ return read;
+}
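
The write path above is deliberately all-or-nothing: a fresh list is built, pre-seeded with the old bits, and only swapped in if every token parses. A simplified userspace sketch of that behaviour (the buffer handling, helper names and fixed PID_MAX are assumptions of the sketch, not the kernel's; the kernel rejects non-numeric tokens via kstrtoul, which is skipped here):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PID_MAX 32768

    static unsigned char *current_pids;       /* the "live" list, may be NULL */

    static int pid_write(const char *buf)
    {
        unsigned char *new_pids = calloc(PID_MAX / 8, 1);
        char *copy, *tok, *save = NULL;
        int nr = 0;

        if (!new_pids)
            return -1;

        /* copy any previously set pids so a write only ever appends */
        if (current_pids)
            memcpy(new_pids, current_pids, PID_MAX / 8);

        copy = strdup(buf);
        if (!copy) {
            free(new_pids);
            return -1;
        }
        for (tok = strtok_r(copy, " \n", &save); tok;
             tok = strtok_r(NULL, " \n", &save)) {
            long pid = strtol(tok, NULL, 0);

            if (pid < 0 || pid >= PID_MAX) {  /* any bad pid aborts the whole write */
                free(copy);
                free(new_pids);
                return -1;
            }
            new_pids[pid / 8] |= 1u << (pid % 8);
            nr++;
        }
        free(copy);

        free(current_pids);                   /* only now replace the live list */
        if (nr) {
            current_pids = new_pids;
        } else {
            free(new_pids);                   /* an empty write clears the list */
            current_pids = NULL;
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", pid_write("1 2 42\n"));    /* succeeds */
        printf("%d\n", pid_write("7 999999\n"));  /* fails, the list above is kept */
        return 0;
    }
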
static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
@@ -807,7 +1047,7 @@ void disable_trace_on_warning(void)
*
* Shows real state of the ring buffer if it is enabled or not.
*/
-static int tracer_tracing_is_on(struct trace_array *tr)
+int tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
return ring_buffer_record_is_on(tr->trace_buffer.buffer);
@@ -1672,6 +1912,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+ int type, unsigned long flags, int pc)
+{
+ struct trace_entry *ent = ring_buffer_event_data(event);
+
+ tracing_generic_entry_update(ent, flags, pc);
+ ent->type = type;
+}
+
struct ring_buffer_event *
trace_buffer_lock_reserve(struct ring_buffer *buffer,
int type,
@@ -1681,34 +1931,137 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(buffer, len);
- if (event != NULL) {
- struct trace_entry *ent = ring_buffer_event_data(event);
+ if (event != NULL)
+ trace_event_setup(event, type, flags, pc);
+
+ return event;
+}
+
+DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DEFINE_PER_CPU(int, trace_buffered_event_cnt);
+static int trace_buffered_event_ref;
- tracing_generic_entry_update(ent, flags, pc);
- ent->type = type;
+/**
+ * trace_buffered_event_enable - enable buffering events
+ *
+ * When events are being filtered, it is quicker to write the event data
+ * into a temporary buffer when there is a good chance the event will not
+ * be committed. Discarding an event from the ring buffer is not as fast
+ * as committing one, and is much slower than copying a buffered event in
+ * as a single commit.
+ *
+ * So when an event may be filtered, per-CPU buffers are allocated to
+ * write the event data into. If the event is then filtered and discarded,
+ * the data is simply dropped; otherwise the entire event is committed to
+ * the ring buffer in one shot.
+ */
+void trace_buffered_event_enable(void)
+{
+ struct ring_buffer_event *event;
+ struct page *page;
+ int cpu;
+
+ WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+ if (trace_buffered_event_ref++)
+ return;
+
+ for_each_tracing_cpu(cpu) {
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY, 0);
+ if (!page)
+ goto failed;
+
+ event = page_address(page);
+ memset(event, 0, sizeof(*event));
+
+ per_cpu(trace_buffered_event, cpu) = event;
+
+ preempt_disable();
+ if (cpu == smp_processor_id() &&
+ this_cpu_read(trace_buffered_event) !=
+ per_cpu(trace_buffered_event, cpu))
+ WARN_ON_ONCE(1);
+ preempt_enable();
}
- return event;
+ return;
+ failed:
+ trace_buffered_event_disable();
}
-void
-__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+static void enable_trace_buffered_event(void *data)
{
- __this_cpu_write(trace_cmdline_save, true);
- ring_buffer_unlock_commit(buffer, event);
+ /* Probably not needed, but do it anyway */
+ smp_rmb();
+ this_cpu_dec(trace_buffered_event_cnt);
}
-void trace_buffer_unlock_commit(struct trace_array *tr,
- struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
+static void disable_trace_buffered_event(void *data)
{
- __buffer_unlock_commit(buffer, event);
+ this_cpu_inc(trace_buffered_event_cnt);
+}
- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
- ftrace_trace_userstack(buffer, flags, pc);
+/**
+ * trace_buffered_event_disable - disable buffering events
+ *
+ * When a filter is removed, it is faster to not use the buffered
+ * events, and to commit directly into the ring buffer. Free up
+ * the temp buffers when there are no more users. This requires
+ * special synchronization with current events.
+ */
+void trace_buffered_event_disable(void)
+{
+ int cpu;
+
+ WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+ if (WARN_ON_ONCE(!trace_buffered_event_ref))
+ return;
+
+ if (--trace_buffered_event_ref)
+ return;
+
+ preempt_disable();
+ /* For each CPU, set the buffer as used. */
+ smp_call_function_many(tracing_buffer_mask,
+ disable_trace_buffered_event, NULL, 1);
+ preempt_enable();
+
+ /* Wait for all current users to finish */
+ synchronize_sched();
+
+ for_each_tracing_cpu(cpu) {
+ free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
+ per_cpu(trace_buffered_event, cpu) = NULL;
+ }
+ /*
+ * Make sure trace_buffered_event is NULL before clearing
+ * trace_buffered_event_cnt.
+ */
+ smp_wmb();
+
+ preempt_disable();
+ /* Do the work on each cpu */
+ smp_call_function_many(tracing_buffer_mask,
+ enable_trace_buffered_event, NULL, 1);
+ preempt_enable();
+}
+
+void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_cmdline_save, true);
+
+ /* If this is the temp buffer, we need to commit fully */
+ if (this_cpu_read(trace_buffered_event) == event) {
+ /* Length is in event->array[0] */
+ ring_buffer_write(buffer, event->array[0], &event->array[1]);
+ /* Release the temp buffer */
+ this_cpu_dec(trace_buffered_event_cnt);
+ } else
+ ring_buffer_unlock_commit(buffer, event);
}
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
static struct ring_buffer *temp_buffer;
@@ -1719,8 +2072,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
unsigned long flags, int pc)
{
struct ring_buffer_event *entry;
+ int val;
*current_rb = trace_file->tr->trace_buffer.buffer;
+
+ if ((trace_file->flags &
+ (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+ (entry = this_cpu_read(trace_buffered_event))) {
+ /* Try to use the per cpu buffer first */
+ val = this_cpu_inc_return(trace_buffered_event_cnt);
+ if (val == 1) {
+ trace_event_setup(entry, type, flags, pc);
+ entry->array[0] = len;
+ return entry;
+ }
+ this_cpu_dec(trace_buffered_event_cnt);
+ }
+
entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
/*
@@ -1738,17 +2106,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
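
Taken together, the buffered-event paths amount to a try-lock on a per-CPU scratch page: the counter is bumped, and only the first (non-nested) user on a CPU gets the scratch buffer; everyone else falls straight back to the ring buffer. The single-threaded userspace sketch below mimics that shape; the buffer sizes and names are invented for the illustration and there is no real per-CPU state here:

    #include <stdio.h>
    #include <string.h>

    static char scratch[4096];        /* stands in for trace_buffered_event */
    static int scratch_cnt;           /* stands in for trace_buffered_event_cnt */
    static char ring[65536];          /* stands in for the real ring buffer */
    static size_t ring_len;

    /* reserve: prefer the scratch page, fall back to the ring buffer */
    static char *event_reserve(size_t len)
    {
        (void)len;
        if (++scratch_cnt == 1)       /* first (non-nested) user gets the scratch page */
            return scratch;
        scratch_cnt--;                /* nested or already in use: fall back */
        return ring + ring_len;       /* pretend this is a direct ring reservation */
    }

    static void event_commit(char *event, size_t len)
    {
        if (event == scratch) {
            memcpy(ring + ring_len, scratch, len);   /* commit is a single copy */
            ring_len += len;
            scratch_cnt--;            /* release the scratch page */
        } else {
            ring_len += len;          /* a direct reservation just gets committed */
        }
    }

    static void event_discard(char *event)
    {
        if (event == scratch)
            scratch_cnt--;            /* discarding scratch data is essentially free */
        /* a real ring-buffer reservation would need ring_buffer_discard_commit() */
    }

    int main(void)
    {
        char *ev = event_reserve(16);
        memcpy(ev, "filtered out", 13);
        event_discard(ev);            /* cheap: nothing ever reached the ring */

        ev = event_reserve(16);
        memcpy(ev, "kept", 5);
        event_commit(ev, 5);
        printf("%zu bytes committed\n", ring_len);
        return 0;
    }
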
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
- int type, unsigned long len,
- unsigned long flags, int pc)
-{
- *current_rb = global_trace.trace_buffer.buffer;
- return trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
-
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -1757,17 +2114,19 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
+ /*
+ * If regs is not set, then skip the following callers:
+ * trace_buffer_unlock_commit_regs
+ * event_trigger_unlock_commit
+ * trace_event_buffer_commit
+ * trace_event_raw_event_sched_switch
+ * Note, we can still get here via blktrace, wakeup tracer
+ * and mmiotrace, but that's ok if they lose a function or
+ * two. They are not that meaningful.
+ */
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
-
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event)
-{
- ring_buffer_discard_commit(buffer, event);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
void
trace_function(struct trace_array *tr,
@@ -1816,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
+ * Add two, for this function and the call to save_stack_trace().
+ * If regs is set, then these functions will not be in the way.
+ */
+ if (!regs)
+ trace.skip += 2;
+
+ /*
* Since events can happen in NMIs there's no safe way to
* use the per cpu ftrace_stacks. We reserve it and if an interrupt
* or NMI comes in, it will just have to use the default
@@ -1986,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
/* created for use with alloc_percpu */
struct trace_buffer_struct {
- char buffer[TRACE_BUF_SIZE];
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
};
static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
/*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
*/
static char *get_trace_buf(void)
{
- struct trace_buffer_struct *percpu_buffer;
-
- /*
- * If we have allocated per cpu buffers, then we do not
- * need to do any locking.
- */
- if (in_nmi())
- percpu_buffer = trace_percpu_nmi_buffer;
- else if (in_irq())
- percpu_buffer = trace_percpu_irq_buffer;
- else if (in_softirq())
- percpu_buffer = trace_percpu_sirq_buffer;
- else
- percpu_buffer = trace_percpu_buffer;
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!percpu_buffer)
+ if (!buffer || buffer->nesting >= 4)
return NULL;
- return this_cpu_ptr(&percpu_buffer->buffer[0]);
+ return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+ this_cpu_dec(trace_percpu_buffer->nesting);
}
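
The reworked trace_printk buffers replace the four context-specific per-CPU buffers with a single per-CPU array indexed by nesting depth, so an interrupt that lands in the middle of a trace_printk() simply takes the next slot. A userspace sketch of the nesting guard (sizes and names are illustrative):

    #include <stdio.h>

    #define TRACE_BUF_SIZE 128

    static struct {
        int nesting;
        char buffer[4][TRACE_BUF_SIZE];
    } buf;                                  /* one of these per CPU in the kernel */

    static char *get_buf(void)
    {
        if (buf.nesting >= 4)               /* too deeply nested: give up */
            return NULL;
        return buf.buffer[buf.nesting++];
    }

    static void put_buf(void)
    {
        buf.nesting--;
    }

    static void vprintk_like(const char *msg)
    {
        char *b = get_buf();

        if (!b)
            return;
        snprintf(b, TRACE_BUF_SIZE, "[%s]", msg);
        /* an interrupt firing here would simply take buffer[nesting] */
        printf("%s\n", b);
        put_buf();
    }

    int main(void)
    {
        vprintk_like("normal context");
        return 0;
    }
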
static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
- struct trace_buffer_struct *sirq_buffers;
- struct trace_buffer_struct *irq_buffers;
- struct trace_buffer_struct *nmi_buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
- if (!buffers)
- goto err_warn;
-
- sirq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!sirq_buffers)
- goto err_sirq;
-
- irq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!irq_buffers)
- goto err_irq;
-
- nmi_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!nmi_buffers)
- goto err_nmi;
+ if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;
trace_percpu_buffer = buffers;
- trace_percpu_sirq_buffer = sirq_buffers;
- trace_percpu_irq_buffer = irq_buffers;
- trace_percpu_nmi_buffer = nmi_buffers;
-
return 0;
-
- err_nmi:
- free_percpu(irq_buffers);
- err_irq:
- free_percpu(sirq_buffers);
- err_sirq:
- free_percpu(buffers);
- err_warn:
- WARN(1, "Could not allocate percpu trace_printk buffer");
- return -ENOMEM;
}
static int buffers_allocated;
@@ -2153,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2179,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}
out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -2210,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2229,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
- out:
+
+out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -3571,6 +3902,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
+ if (mask == TRACE_ITER_EVENT_FORK)
+ trace_event_follow_fork(tr, enabled);
+
if (mask == TRACE_ITER_OVERWRITE) {
ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -3658,7 +3992,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
if (cnt >= sizeof(buf))
return -EINVAL;
- if (copy_from_user(&buf, ubuf, cnt))
+ if (copy_from_user(buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
@@ -3789,6 +4123,30 @@ static const char readme_msg[] =
"\t\t\t traces\n"
#endif
#endif /* CONFIG_STACK_TRACER */
+#ifdef CONFIG_KPROBE_EVENT
+ " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
+ "\t\t\t Write into this file to define/undefine new trace events.\n"
+#endif
+#ifdef CONFIG_UPROBE_EVENT
+ " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
+ "\t\t\t Write into this file to define/undefine new trace events.\n"
+#endif
+#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT)
+ "\t accepts: event-definitions (one definition per line)\n"
+ "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+ "\t -:[<group>/]<event>\n"
+#ifdef CONFIG_KPROBE_EVENT
+ "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+#endif
+#ifdef CONFIG_UPROBE_EVENT
+ "\t place: <path>:<offset>\n"
+#endif
+ "\t args: <name>=fetcharg[:type]\n"
+ "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+ "\t $stack<index>, $stack, $retval, $comm\n"
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n"
+ "\t b<bit-width>@<bit-offset>/<container-size>\n"
+#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
" events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -3804,12 +4162,19 @@ static const char readme_msg[] =
"\t trigger: traceon, traceoff\n"
"\t enable_event:<system>:<event>\n"
"\t disable_event:<system>:<event>\n"
+#ifdef CONFIG_HIST_TRIGGERS
+ "\t enable_hist:<system>:<event>\n"
+ "\t disable_hist:<system>:<event>\n"
+#endif
#ifdef CONFIG_STACKTRACE
"\t\t stacktrace\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
"\t\t snapshot\n"
#endif
+#ifdef CONFIG_HIST_TRIGGERS
+ "\t\t hist (see below)\n"
+#endif
"\t example: echo traceoff > events/block/block_unplug/trigger\n"
"\t echo traceoff:3 > events/block/block_unplug/trigger\n"
"\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
@@ -3825,6 +4190,56 @@ static const char readme_msg[] =
"\t To remove a trigger with a count:\n"
"\t echo '!<trigger>:0 > <system>/<event>/trigger\n"
"\t Filters can be ignored when removing a trigger.\n"
+#ifdef CONFIG_HIST_TRIGGERS
+ " hist trigger\t- If set, event hits are aggregated into a hash table\n"
+ "\t Format: hist:keys=<field1[,field2,...]>\n"
+ "\t [:values=<field1[,field2,...]>]\n"
+ "\t [:sort=<field1[,field2,...]>]\n"
+ "\t [:size=#entries]\n"
+ "\t [:pause][:continue][:clear]\n"
+ "\t [:name=histname1]\n"
+ "\t [if <filter>]\n\n"
+ "\t When a matching event is hit, an entry is added to a hash\n"
+ "\t table using the key(s) and value(s) named, and the value of a\n"
+ "\t sum called 'hitcount' is incremented. Keys and values\n"
+ "\t correspond to fields in the event's format description. Keys\n"
+ "\t can be any field, or the special string 'stacktrace'.\n"
+ "\t Compound keys consisting of up to two fields can be specified\n"
+ "\t by the 'keys' keyword. Values must correspond to numeric\n"
+ "\t fields. Sort keys consisting of up to two fields can be\n"
+ "\t specified using the 'sort' keyword. The sort direction can\n"
+ "\t be modified by appending '.descending' or '.ascending' to a\n"
+ "\t sort field. The 'size' parameter can be used to specify more\n"
+ "\t or fewer than the default 2048 entries for the hashtable size.\n"
+ "\t If a hist trigger is given a name using the 'name' parameter,\n"
+ "\t its histogram data will be shared with other triggers of the\n"
+ "\t same name, and trigger hits will update this common data.\n\n"
+ "\t Reading the 'hist' file for the event will dump the hash\n"
+ "\t table in its entirety to stdout. If there are multiple hist\n"
+ "\t triggers attached to an event, there will be a table for each\n"
+ "\t trigger in the output. The table displayed for a named\n"
+ "\t trigger will be the same as any other instance having the\n"
+ "\t same name. The default format used to display a given field\n"
+ "\t can be modified by appending any of the following modifiers\n"
+ "\t to the field name, as applicable:\n\n"
+ "\t .hex display a number as a hex value\n"
+ "\t .sym display an address as a symbol\n"
+ "\t .sym-offset display an address as a symbol and offset\n"
+ "\t .execname display a common_pid as a program name\n"
+ "\t .syscall display a syscall id as a syscall name\n\n"
+ "\t .log2 display log2 value rather than raw number\n\n"
+ "\t The 'pause' parameter can be used to pause an existing hist\n"
+ "\t trigger or to start a hist trigger but not log any events\n"
+ "\t until told to do so. 'continue' can be used to start or\n"
+ "\t restart a paused hist trigger.\n\n"
+ "\t The 'clear' parameter will clear the contents of a running\n"
+ "\t hist trigger and leave its current paused/active state\n"
+ "\t unchanged.\n\n"
+ "\t The enable_hist and disable_hist triggers can be used to\n"
+ "\t have one event conditionally start and stop another event's\n"
+ "\t already-attached hist trigger. The syntax is analagous to\n"
+ "\t the enable_event and disable_event triggers.\n"
+#endif
;
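
As a concrete example of the hist trigger text above, the sketch below sets a trigger on kmem:kmalloc and reads back the aggregated table. It assumes tracefs is mounted at /sys/kernel/tracing, that the kmem:kmalloc event with call_site and bytes_req fields is available, and that the program runs with sufficient privileges; adjust paths and field names for the system at hand.

    #include <stdio.h>

    #define EVT "/sys/kernel/tracing/events/kmem/kmalloc/"

    int main(void)
    {
        const char *spec =
            "hist:keys=call_site.sym:values=bytes_req:sort=bytes_req.descending";
        FILE *f = fopen(EVT "trigger", "w");    /* typically needs root */
        char line[256];

        if (!f || fputs(spec, f) == EOF)
            return 1;
        fclose(f);

        f = fopen(EVT "hist", "r");             /* dump the aggregated table */
        if (!f)
            return 1;
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }
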
static ssize_t
@@ -4474,7 +4889,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
if (cnt > MAX_TRACER_SIZE)
cnt = MAX_TRACER_SIZE;
- if (copy_from_user(&buf, ubuf, cnt))
+ if (copy_from_user(buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
@@ -4554,7 +4969,7 @@ out:
return ret;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -4733,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
struct trace_iterator *iter = filp->private_data;
ssize_t sret;
- /* return any leftover data */
- sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (sret != -EBUSY)
- return sret;
-
- trace_seq_init(&iter->seq);
-
/*
* Avoid more than one consumer on a single file descriptor
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
mutex_lock(&iter->mutex);
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ goto out;
+
+ trace_seq_init(&iter->seq);
+
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
@@ -5264,7 +5680,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
if (cnt >= sizeof(buf))
return -EINVAL;
- if (copy_from_user(&buf, ubuf, cnt))
+ if (copy_from_user(buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
@@ -5476,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -5772,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
if (*ppos & (PAGE_SIZE - 1))
return -EINVAL;
@@ -5784,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
again:
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5841,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
if (ret)
- return ret;
+ goto out;
+ ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
- return -EAGAIN;
+ goto out;
ret = wait_on_pipe(iter, true);
if (ret)
- return ret;
+ goto out;
goto again;
}
ret = splice_to_pipe(pipe, &spd);
+out:
splice_shrink_spd(&spd);
return ret;
@@ -6650,7 +7068,7 @@ static int instance_mkdir(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
- tr->trace_flags = global_trace.trace_flags;
+ tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -6724,6 +7142,12 @@ static int instance_rmdir(const char *name)
list_del(&tr->list);
+ /* Disable all the flags that were enabled coming in */
+ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
+ if ((1 << i) & ZEROED_TRACE_FLAGS)
+ set_tracer_flag(tr, 1 << i, 0);
+ }
+
tracing_set_nop(tr);
event_trace_del_tracer(tr);
ftrace_destroy_function_files(tr);
@@ -6798,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tr->max_latency, &tracing_max_lat_fops);
#endif
@@ -6814,6 +7238,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
for_each_tracing_cpu(cpu)
tracing_init_tracefs_percpu(tr, cpu);
+ ftrace_init_tracefs(tr, d_tracer);
}
static struct vfsmount *trace_automount(void *ingore)
@@ -6967,6 +7392,7 @@ static __init int tracer_init_tracefs(void)
return 0;
init_tracer_tracefs(&global_trace, d_tracer);
+ ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3fff4adfd431..fd24b1f9ac43 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,6 +38,7 @@ enum trace_type {
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
+ TRACE_HWLAT,
__TRACE_LAST_TYPE,
};
@@ -80,6 +81,12 @@ enum trace_type {
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
+ filter) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter) __packed
+
#include "trace_entries.h"
/*
@@ -156,6 +163,9 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
bool ignore_pid;
+#ifdef CONFIG_FUNCTION_TRACER
+ bool ftrace_ignore_pid;
+#endif
};
struct tracer;
@@ -177,9 +187,8 @@ struct trace_options {
};
struct trace_pid_list {
- unsigned int nr_pids;
- int order;
- pid_t *pids;
+ int pid_max;
+ unsigned long *pids;
};
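
The new layout trades the old grow-as-needed pid_t array for a fixed bitmap of pid_max bits, which is presumably why trace.c switches to vzalloc() for it: with the default pid_max of 32768 the bitmap is 4 KiB, but a large pid_max can push it well past what kmalloc() comfortably hands out. A quick check of the arithmetic (the pid_max values below are just examples):

    #include <stdio.h>

    int main(void)
    {
        int pid_maxes[] = { 32768, 4 * 1024 * 1024 };   /* default and a large setting */
        unsigned int i;

        for (i = 0; i < sizeof(pid_maxes) / sizeof(pid_maxes[0]); i++)
            printf("pid_max %d -> %d bytes of bitmap\n",
                   pid_maxes[i], (pid_maxes[i] + 7) / 8);
        return 0;
    }
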
/*
@@ -205,6 +214,8 @@ struct trace_array {
*/
struct trace_buffer max_buffer;
bool allocated_snapshot;
+#endif
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
unsigned long max_latency;
#endif
struct trace_pid_list __rcu *filtered_pids;
@@ -248,6 +259,7 @@ struct trace_array {
int ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
+ struct trace_pid_list __rcu *function_pids;
/* function tracing enabled */
int function_enabled;
#endif
@@ -317,6 +329,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
+ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -562,6 +575,7 @@ void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
+int tracer_tracing_is_on(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
@@ -629,6 +643,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
+/* PID filtering */
+
+extern int pid_max;
+
+bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
+ pid_t search_pid);
+bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct task_struct *task);
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task);
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
+int trace_pid_show(struct seq_file *m, void *v);
+void trace_free_pid_list(struct trace_pid_list *pid_list);
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt);
+
#ifdef CONFIG_TRACER_MAX_TRACE
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
@@ -656,6 +689,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
@@ -821,12 +855,9 @@ extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
- if (list_empty(&ftrace_pids))
- return 1;
-
- return test_tsk_trace_trace(task);
+ return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
@@ -836,8 +867,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
int using_ftrace_ops_list_func(void);
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+void ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer);
#else
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
return 1;
}
@@ -852,6 +886,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
/* ftace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
@@ -967,6 +1003,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(STOP_ON_FREE, "disable_on_free"), \
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
+ C(EVENT_FORK, "event-fork"), \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
@@ -1064,6 +1101,137 @@ struct trace_subsystem_dir {
int nr_events;
};
+extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
+
+void trace_buffer_unlock_commit_regs(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc,
+ struct pt_regs *regs);
+
+static inline void trace_buffer_unlock_commit(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+}
+
+DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DECLARE_PER_CPU(int, trace_buffered_event_cnt);
+void trace_buffered_event_disable(void);
+void trace_buffered_event_enable(void);
+
+static inline void
+__trace_event_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (this_cpu_read(trace_buffered_event) == event) {
+ /* Simply release the temp buffer */
+ this_cpu_dec(trace_buffered_event_cnt);
+ return;
+ }
+ ring_buffer_discard_commit(buffer, event);
+}
+
+/*
+ * Helper function for event_trigger_unlock_commit{_regs}().
+ * If there are event triggers attached to this event that require
+ * filtering against its fields, then they will be called as the
+ * entry already holds the field information of the current event.
+ *
+ * It also checks if the event should be discarded or not.
+ * It is to be discarded if the event is soft disabled and the
+ * event was only recorded to process triggers, or if the event
+ * filter is active and this event did not match the filters.
+ *
+ * Returns true if the event is discarded, false otherwise.
+ */
+static inline bool
+__event_trigger_test_discard(struct trace_event_file *file,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ void *entry,
+ enum event_trigger_type *tt)
+{
+ unsigned long eflags = file->flags;
+
+ if (eflags & EVENT_FILE_FL_TRIGGER_COND)
+ *tt = event_triggers_call(file, entry);
+
+ if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
+ (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
+ !filter_match_preds(file->filter, entry))) {
+ __trace_event_discard_commit(buffer, event);
+ return true;
+ }
+
+ return false;
+}
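
The discard decision itself is a small piece of boolean logic; the userspace sketch below just restates it so the two reasons for dropping an event are easy to see (the names are invented for the sketch):

    #include <stdio.h>
    #include <stdbool.h>

    static bool test_discard(bool soft_disabled, bool filtered, bool filter_match)
    {
        /* soft-disabled events were only recorded for their triggers;
         * filtered events are dropped when the filter does not match */
        return soft_disabled || (filtered && !filter_match);
    }

    int main(void)
    {
        printf("%d\n", test_discard(true,  false, false));  /* 1: trigger-only event */
        printf("%d\n", test_discard(false, true,  false));  /* 1: failed the filter */
        printf("%d\n", test_discard(false, true,  true));   /* 0: matched, keep it */
        printf("%d\n", test_discard(false, false, false));  /* 0: nothing to check */
        return 0;
    }
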
+
+/**
+ * event_trigger_unlock_commit - handle triggers and finish event commit
+ * @file: The file pointer associated with the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters, and
+ * discards it if the event is soft disabled or does not match.
+ */
+static inline void
+event_trigger_unlock_commit(struct trace_event_file *file,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ void *entry, unsigned long irq_flags, int pc)
+{
+ enum event_trigger_type tt = ETT_NONE;
+
+ if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+ trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+
+ if (tt)
+ event_triggers_post_call(file, tt, entry);
+}
+
+/**
+ * event_trigger_unlock_commit_regs - handle triggers and finish event commit
+ * @file: The file pointer associated with the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters, and
+ * discards it if the event is soft disabled or does not match.
+ *
+ * Same as event_trigger_unlock_commit() but calls
+ * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */
+static inline void
+event_trigger_unlock_commit_regs(struct trace_event_file *file,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ void *entry, unsigned long irq_flags, int pc,
+ struct pt_regs *regs)
+{
+ enum event_trigger_type tt = ETT_NONE;
+
+ if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+ trace_buffer_unlock_commit_regs(file->tr, buffer, event,
+ irq_flags, pc, regs);
+
+ if (tt)
+ event_triggers_post_call(file, tt, entry);
+}
+
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
@@ -1161,6 +1329,15 @@ extern struct mutex event_mutex;
extern struct list_head ftrace_events;
extern const struct file_operations event_trigger_fops;
+extern const struct file_operations event_hist_fops;
+
+#ifdef CONFIG_HIST_TRIGGERS
+extern int register_trigger_hist_cmd(void);
+extern int register_trigger_hist_enable_disable_cmds(void);
+#else
+static inline int register_trigger_hist_cmd(void) { return 0; }
+static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
+#endif
extern int register_trigger_cmds(void);
extern void clear_event_triggers(struct trace_array *tr);
@@ -1174,9 +1351,41 @@ struct event_trigger_data {
char *filter_str;
void *private_data;
bool paused;
+ bool paused_tmp;
struct list_head list;
+ char *name;
+ struct list_head named_list;
+ struct event_trigger_data *named_data;
+};
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR "enable_event"
+#define DISABLE_EVENT_STR "disable_event"
+#define ENABLE_HIST_STR "enable_hist"
+#define DISABLE_HIST_STR "disable_hist"
+
+struct enable_trigger_data {
+ struct trace_event_file *file;
+ bool enable;
+ bool hist;
};
+extern int event_enable_trigger_print(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+extern int event_enable_trigger_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param);
+extern int event_enable_register_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file);
+extern void event_enable_unregister_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct trace_event_file *file);
extern void trigger_data_free(struct event_trigger_data *data);
extern int event_trigger_init(struct event_trigger_ops *ops,
struct event_trigger_data *data);
@@ -1189,7 +1398,18 @@ extern void unregister_trigger(char *glob, struct event_trigger_ops *ops,
extern int set_trigger_filter(char *filter_str,
struct event_trigger_data *trigger_data,
struct trace_event_file *file);
+extern struct event_trigger_data *find_named_trigger(const char *name);
+extern bool is_named_trigger(struct event_trigger_data *test);
+extern int save_named_trigger(const char *name,
+ struct event_trigger_data *data);
+extern void del_named_trigger(struct event_trigger_data *data);
+extern void pause_named_trigger(struct event_trigger_data *data);
+extern void unpause_named_trigger(struct event_trigger_data *data);
+extern void set_named_trigger_data(struct event_trigger_data *data,
+ struct event_trigger_data *named_data);
extern int register_event_command(struct event_command *cmd);
+extern int unregister_event_command(struct event_command *cmd);
+extern int register_trigger_hist_enable_disable_cmds(void);
/**
* struct event_trigger_ops - callbacks for trace event triggers
@@ -1416,6 +1636,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
#include "trace_entries.h"
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..d1cc37e78f99 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
);
/* Function return entry */
-FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch,
FILTER_OTHER
);
+
+FTRACE_ENTRY(hwlat, hwlat_entry,
+
+ TRACE_HWLAT,
+
+ F_STRUCT(
+ __field( u64, duration )
+ __field( u64, outer_duration )
+ __field( u64, nmi_total_ts )
+ __field_struct( struct timespec, timestamp )
+ __field_desc( long, timestamp, tv_sec )
+ __field_desc( long, timestamp, tv_nsec )
+ __field( unsigned int, nmi_count )
+ __field( unsigned int, seqnum )
+ ),
+
+ F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
+ __entry->seqnum,
+ __entry->tv_sec,
+ __entry->tv_nsec,
+ __entry->duration,
+ __entry->outer_duration,
+ __entry->nmi_total_ts,
+ __entry->nmi_count),
+
+ FILTER_OTHER
+);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b7b0760ba6ee..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
-#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/sort.h>
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
+ /*
+ * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, we need to subtract one to offset the increment.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ fbuffer->pc--;
fbuffer->trace_file = trace_file;
fbuffer->event =
@@ -381,6 +388,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
+ unsigned long file_flags = file->flags;
int ret = 0;
int disable;
@@ -463,6 +471,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
}
+ /* Enable or disable use of trace_buffered_event */
+ if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
+ (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
+ if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+ trace_buffered_event_enable();
+ else
+ trace_buffered_event_disable();
+ }
+
return ret;
}
@@ -489,40 +506,41 @@ static void ftrace_clear_events(struct trace_array *tr)
mutex_unlock(&event_mutex);
}
-static int cmp_pid(const void *key, const void *elt)
+static void
+event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
{
- const pid_t *search_pid = key;
- const pid_t *pid = elt;
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = data;
- if (*search_pid == *pid)
- return 0;
- if (*search_pid < *pid)
- return -1;
- return 1;
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ trace_filter_add_remove_task(pid_list, NULL, task);
}
-static bool
-check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+static void
+event_filter_pid_sched_process_fork(void *data,
+ struct task_struct *self,
+ struct task_struct *task)
{
- pid_t search_pid;
- pid_t *pid;
-
- /*
- * Return false, because if filtered_pids does not exist,
- * all pids are good to trace.
- */
- if (!filtered_pids)
- return false;
-
- search_pid = task->pid;
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = data;
- pid = bsearch(&search_pid, filtered_pids->pids,
- filtered_pids->nr_pids, sizeof(pid_t),
- cmp_pid);
- if (!pid)
- return true;
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ trace_filter_add_remove_task(pid_list, self, task);
+}
- return false;
+void trace_event_follow_fork(struct trace_array *tr, bool enable)
+{
+ if (enable) {
+ register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
+ tr, INT_MIN);
+ register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ tr, INT_MAX);
+ } else {
+ unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
+ tr);
+ unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ tr);
+ }
}
static void
@@ -535,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- check_ignore_pid(pid_list, prev) &&
- check_ignore_pid(pid_list, next));
+ trace_ignore_this_task(pid_list, prev) &&
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -549,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- check_ignore_pid(pid_list, next));
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -565,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- check_ignore_pid(pid_list, task));
+ trace_ignore_this_task(pid_list, task));
}
static void
@@ -582,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
/* Set tracing if current is enabled */
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- check_ignore_pid(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -620,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
/* Wait till all users are no longer using pid filtering */
synchronize_sched();
- free_pages((unsigned long)pid_list->pids, pid_list->order);
- kfree(pid_list);
+ trace_free_pid_list(pid_list);
}
static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -964,6 +981,15 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ return trace_pid_next(pid_list, v, pos);
+}
+
static void *p_start(struct seq_file *m, loff_t *pos)
__acquires(RCU)
{
@@ -981,10 +1007,10 @@ static void *p_start(struct seq_file *m, loff_t *pos)
pid_list = rcu_dereference_sched(tr->filtered_pids);
- if (!pid_list || *pos >= pid_list->nr_pids)
+ if (!pid_list)
return NULL;
- return (void *)&pid_list->pids[*pos];
+ return trace_pid_start(pid_list, pos);
}
static void p_stop(struct seq_file *m, void *p)
@@ -994,28 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
-static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct trace_array *tr = m->private;
- struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-
- (*pos)++;
-
- if (*pos >= pid_list->nr_pids)
- return NULL;
-
- return (void *)&pid_list->pids[*pos];
-}
-
-static int p_show(struct seq_file *m, void *v)
-{
- pid_t *pid = v;
-
- seq_printf(m, "%d\n", *pid);
- return 0;
-}
-
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1561,11 +1565,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return r;
}
-static int max_pids(struct trace_pid_list *pid_list)
-{
- return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
-}
-
static void ignore_task_cpu(void *data)
{
struct trace_array *tr = data;
@@ -1579,7 +1578,7 @@ static void ignore_task_cpu(void *data)
mutex_is_locked(&event_mutex));
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- check_ignore_pid(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static ssize_t
@@ -1589,15 +1588,9 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
struct seq_file *m = filp->private_data;
struct trace_array *tr = m->private;
struct trace_pid_list *filtered_pids = NULL;
- struct trace_pid_list *pid_list = NULL;
+ struct trace_pid_list *pid_list;
struct trace_event_file *file;
- struct trace_parser parser;
- unsigned long val;
- loff_t this_pos;
- ssize_t read = 0;
- ssize_t ret = 0;
- pid_t pid;
- int i;
+ ssize_t ret;
if (!cnt)
return 0;
@@ -1606,116 +1599,14 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
- return -ENOMEM;
-
mutex_lock(&event_mutex);
- /*
- * Load as many pids into the array before doing a
- * swap from the tr->filtered_pids to the new list.
- */
- while (cnt > 0) {
-
- this_pos = 0;
-
- ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
- break;
-
- read += ret;
- ubuf += ret;
- cnt -= ret;
-
- parser.buffer[parser.idx] = 0;
-
- ret = -EINVAL;
- if (kstrtoul(parser.buffer, 0, &val))
- break;
- if (val > INT_MAX)
- break;
-
- pid = (pid_t)val;
-
- ret = -ENOMEM;
- if (!pid_list) {
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list)
- break;
-
- filtered_pids = rcu_dereference_protected(tr->filtered_pids,
- lockdep_is_held(&event_mutex));
- if (filtered_pids)
- pid_list->order = filtered_pids->order;
- else
- pid_list->order = 0;
-
- pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
- pid_list->order);
- if (!pid_list->pids)
- break;
-
- if (filtered_pids) {
- pid_list->nr_pids = filtered_pids->nr_pids;
- memcpy(pid_list->pids, filtered_pids->pids,
- pid_list->nr_pids * sizeof(pid_t));
- } else
- pid_list->nr_pids = 0;
- }
-
- if (pid_list->nr_pids >= max_pids(pid_list)) {
- pid_t *pid_page;
-
- pid_page = (void *)__get_free_pages(GFP_KERNEL,
- pid_list->order + 1);
- if (!pid_page)
- break;
- memcpy(pid_page, pid_list->pids,
- pid_list->nr_pids * sizeof(pid_t));
- free_pages((unsigned long)pid_list->pids, pid_list->order);
-
- pid_list->order++;
- pid_list->pids = pid_page;
- }
- pid_list->pids[pid_list->nr_pids++] = pid;
- trace_parser_clear(&parser);
- ret = 0;
- }
- trace_parser_put(&parser);
-
- if (ret < 0) {
- if (pid_list)
- free_pages((unsigned long)pid_list->pids, pid_list->order);
- kfree(pid_list);
- mutex_unlock(&event_mutex);
- return ret;
- }
-
- if (!pid_list) {
- mutex_unlock(&event_mutex);
- return ret;
- }
-
- sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
-
- /* Remove duplicates */
- for (i = 1; i < pid_list->nr_pids; i++) {
- int start = i;
-
- while (i < pid_list->nr_pids &&
- pid_list->pids[i - 1] == pid_list->pids[i])
- i++;
+ filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
- if (start != i) {
- if (i < pid_list->nr_pids) {
- memmove(&pid_list->pids[start], &pid_list->pids[i],
- (pid_list->nr_pids - i) * sizeof(pid_t));
- pid_list->nr_pids -= i - start;
- i = start;
- } else
- pid_list->nr_pids = start;
- }
- }
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
+ goto out;
rcu_assign_pointer(tr->filtered_pids, pid_list);
@@ -1725,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
synchronize_sched();
-
- free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
- kfree(filtered_pids);
- } else {
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
/*
* Register a probe that is called before all other probes
* to set ignore_pid if next or prev do not match.
@@ -1763,10 +1652,11 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
*/
on_each_cpu(ignore_task_cpu, tr, 1);
+ out:
mutex_unlock(&event_mutex);
- ret = read;
- *ppos += read;
+ if (ret > 0)
+ *ppos += ret;
return ret;
}
@@ -1793,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {
static const struct seq_operations show_set_pid_seq_ops = {
.start = p_start,
.next = p_next,
- .show = p_show,
+ .show = trace_pid_show,
.stop = p_stop,
};
@@ -2121,6 +2011,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("trigger", 0644, file->dir, file,
&event_trigger_fops);
+#ifdef CONFIG_HIST_TRIGGERS
+ trace_create_file("hist", 0444, file->dir, file,
+ &event_hist_fops);
+#endif
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
@@ -3368,7 +3262,7 @@ static __init void event_trace_self_tests(void)
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
-static struct trace_array *event_tr;
+static struct trace_event_file event_trace_file __initdata;
static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
@@ -3392,17 +3286,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
local_save_flags(flags);
- event = trace_current_buffer_lock_reserve(&buffer,
- TRACE_FN, sizeof(*entry),
- flags, pc);
+ event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
+ TRACE_FN, sizeof(*entry),
+ flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->parent_ip = parent_ip;
- trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc);
-
+ event_trigger_unlock_commit(&event_trace_file, buffer, event,
+ entry, flags, pc);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
preempt_enable_notrace();
@@ -3417,9 +3311,11 @@ static struct ftrace_ops trace_ops __initdata =
static __init void event_trace_self_test_with_function(void)
{
int ret;
- event_tr = top_trace_array();
- if (WARN_ON(!event_tr))
+
+ event_trace_file.tr = top_trace_array();
+ if (WARN_ON(!event_trace_file.tr))
return;
+
ret = register_ftrace_function(&trace_ops);
if (WARN_ON(ret < 0)) {
pr_info("Failed to enable function tracer for event tests\n");
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b3f5051cd4e9..9daa9b3bc6d9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps,
static inline struct event_filter *event_filter(struct trace_event_file *file)
{
- if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- return file->event_call->filter;
- else
- return file->filter;
+ return file->filter;
}
/* caller must hold event_mutex */
@@ -826,12 +823,12 @@ static void __free_preds(struct event_filter *filter)
static void filter_disable(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
+ unsigned long old_flags = file->flags;
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
- else
- file->flags &= ~EVENT_FILE_FL_FILTERED;
+ file->flags &= ~EVENT_FILE_FL_FILTERED;
+
+ if (old_flags != file->flags)
+ trace_buffered_event_disable();
}
static void __free_filter(struct event_filter *filter)
@@ -883,13 +880,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
static inline void __remove_filter(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
-
filter_disable(file);
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- remove_filter_string(call->filter);
- else
- remove_filter_string(file->filter);
+ remove_filter_string(file->filter);
}
static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
@@ -906,15 +898,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
static inline void __free_subsystem_filter(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
- __free_filter(call->filter);
- call->filter = NULL;
- } else {
- __free_filter(file->filter);
- file->filter = NULL;
- }
+ __free_filter(file->filter);
+ file->filter = NULL;
}
static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
@@ -1718,69 +1703,43 @@ fail:
static inline void event_set_filtered_flag(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
+ unsigned long old_flags = file->flags;
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags |= TRACE_EVENT_FL_FILTERED;
- else
- file->flags |= EVENT_FILE_FL_FILTERED;
+ file->flags |= EVENT_FILE_FL_FILTERED;
+
+ if (old_flags != file->flags)
+ trace_buffered_event_enable();
}
static inline void event_set_filter(struct trace_event_file *file,
struct event_filter *filter)
{
- struct trace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- rcu_assign_pointer(call->filter, filter);
- else
- rcu_assign_pointer(file->filter, filter);
+ rcu_assign_pointer(file->filter, filter);
}
static inline void event_clear_filter(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- RCU_INIT_POINTER(call->filter, NULL);
- else
- RCU_INIT_POINTER(file->filter, NULL);
+ RCU_INIT_POINTER(file->filter, NULL);
}
static inline void
event_set_no_set_filter_flag(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
- else
- file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
+ file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
}
static inline void
event_clear_no_set_filter_flag(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
- else
- file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
+ file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
}
static inline bool
event_no_set_filter_flag(struct trace_event_file *file)
{
- struct trace_event_call *call = file->event_call;
-
if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
return true;
- if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
- (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
- return true;
-
return false;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
new file mode 100644
index 000000000000..f3a960ed75a1
--- /dev/null
+++ b/kernel/trace/trace_events_hist.c
@@ -0,0 +1,1755 @@
+/*
+ * trace_events_hist - trace event hist triggers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+struct hist_field;
+
+typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+
+struct hist_field {
+ struct ftrace_event_field *field;
+ unsigned long flags;
+ hist_field_fn_t fn;
+ unsigned int size;
+ unsigned int offset;
+};
+
+static u64 hist_field_none(struct hist_field *field, void *event)
+{
+ return 0;
+}
+
+static u64 hist_field_counter(struct hist_field *field, void *event)
+{
+ return 1;
+}
+
+static u64 hist_field_string(struct hist_field *hist_field, void *event)
+{
+ char *addr = (char *)(event + hist_field->field->offset);
+
+ return (u64)(unsigned long)addr;
+}
+
+static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
+{
+ u32 str_item = *(u32 *)(event + hist_field->field->offset);
+ int str_loc = str_item & 0xffff;
+ char *addr = (char *)(event + str_loc);
+
+ return (u64)(unsigned long)addr;
+}
+
+static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
+{
+ char **addr = (char **)(event + hist_field->field->offset);
+
+ return (u64)(unsigned long)*addr;
+}
+
+static u64 hist_field_log2(struct hist_field *hist_field, void *event)
+{
+ u64 val = *(u64 *)(event + hist_field->field->offset);
+
+ return (u64) ilog2(roundup_pow_of_two(val));
+}
+
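The .log2 key modifier handled above buckets a value by the power of two it rounds up to. A minimal userspace sketch of the same computation, assuming a hypothetical sample value and using plain bit operations in place of the kernel's roundup_pow_of_two()/ilog2() helpers:

#include <stdint.h>
#include <stdio.h>

/* equivalent of ilog2(roundup_pow_of_two(val)) for val > 0 */
static unsigned int log2_bucket(uint64_t val)
{
	unsigned int bits = 0;

	while ((1ULL << bits) < val)	/* smallest power of two >= val */
		bits++;
	return bits;
}

int main(void)
{
	/* e.g. a 1500-byte request lands in the 2^11 (2048) bucket */
	printf("bucket: 2^%u\n", log2_bucket(1500));
	return 0;
}
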
+#define DEFINE_HIST_FIELD_FN(type) \
+static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+{ \
+ type *addr = (type *)(event + hist_field->field->offset); \
+ \
+ return (u64)(unsigned long)*addr; \
+}
+
+DEFINE_HIST_FIELD_FN(s64);
+DEFINE_HIST_FIELD_FN(u64);
+DEFINE_HIST_FIELD_FN(s32);
+DEFINE_HIST_FIELD_FN(u32);
+DEFINE_HIST_FIELD_FN(s16);
+DEFINE_HIST_FIELD_FN(u16);
+DEFINE_HIST_FIELD_FN(s8);
+DEFINE_HIST_FIELD_FN(u8);
+
+#define for_each_hist_field(i, hist_data) \
+ for ((i) = 0; (i) < (hist_data)->n_fields; (i)++)
+
+#define for_each_hist_val_field(i, hist_data) \
+ for ((i) = 0; (i) < (hist_data)->n_vals; (i)++)
+
+#define for_each_hist_key_field(i, hist_data) \
+ for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
+
+#define HIST_STACKTRACE_DEPTH 16
+#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
+#define HIST_STACKTRACE_SKIP 5
+
+#define HITCOUNT_IDX 0
+#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
+
+enum hist_field_flags {
+ HIST_FIELD_FL_HITCOUNT = 1,
+ HIST_FIELD_FL_KEY = 2,
+ HIST_FIELD_FL_STRING = 4,
+ HIST_FIELD_FL_HEX = 8,
+ HIST_FIELD_FL_SYM = 16,
+ HIST_FIELD_FL_SYM_OFFSET = 32,
+ HIST_FIELD_FL_EXECNAME = 64,
+ HIST_FIELD_FL_SYSCALL = 128,
+ HIST_FIELD_FL_STACKTRACE = 256,
+ HIST_FIELD_FL_LOG2 = 512,
+};
+
+struct hist_trigger_attrs {
+ char *keys_str;
+ char *vals_str;
+ char *sort_key_str;
+ char *name;
+ bool pause;
+ bool cont;
+ bool clear;
+ unsigned int map_bits;
+};
+
+struct hist_trigger_data {
+ struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
+ unsigned int n_vals;
+ unsigned int n_keys;
+ unsigned int n_fields;
+ unsigned int key_size;
+ struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
+ unsigned int n_sort_keys;
+ struct trace_event_file *event_file;
+ struct hist_trigger_attrs *attrs;
+ struct tracing_map *map;
+};
+
+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+{
+ hist_field_fn_t fn = NULL;
+
+ switch (field_size) {
+ case 8:
+ if (field_is_signed)
+ fn = hist_field_s64;
+ else
+ fn = hist_field_u64;
+ break;
+ case 4:
+ if (field_is_signed)
+ fn = hist_field_s32;
+ else
+ fn = hist_field_u32;
+ break;
+ case 2:
+ if (field_is_signed)
+ fn = hist_field_s16;
+ else
+ fn = hist_field_u16;
+ break;
+ case 1:
+ if (field_is_signed)
+ fn = hist_field_s8;
+ else
+ fn = hist_field_u8;
+ break;
+ }
+
+ return fn;
+}
+
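select_value_fn() resolves a fetch handler purely from the field's size and signedness, so the per-event path never has to branch on the field layout again. A small userspace sketch of the same dispatch pattern; the names and the reduced set of sizes are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t (*fetch_fn_t)(const void *addr);

static uint64_t fetch_u32(const void *addr) { return *(const uint32_t *)addr; }
static uint64_t fetch_s32(const void *addr) { return (uint64_t)(int64_t)*(const int32_t *)addr; }

static fetch_fn_t select_fetch_fn(int size, int is_signed)
{
	if (size == 4)
		return is_signed ? fetch_s32 : fetch_u32;
	return NULL;			/* other sizes elided */
}

int main(void)
{
	int32_t v = -5;
	fetch_fn_t fn = select_fetch_fn(sizeof(v), 1);

	printf("%lld\n", (long long)(int64_t)fn(&v));	/* prints -5 */
	return 0;
}
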
+static int parse_map_size(char *str)
+{
+ unsigned long size, map_bits;
+ int ret;
+
+ strsep(&str, "=");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = kstrtoul(str, 0, &size);
+ if (ret)
+ goto out;
+
+ map_bits = ilog2(roundup_pow_of_two(size));
+ if (map_bits < TRACING_MAP_BITS_MIN ||
+ map_bits > TRACING_MAP_BITS_MAX)
+ ret = -EINVAL;
+ else
+ ret = map_bits;
+ out:
+ return ret;
+}
+
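For example, a hypothetical size=2048 option yields roundup_pow_of_two(2048) == 2048 and ilog2(2048) == 11, so the map is built with 11 bits (2048 entries); size=3000 would round up to 4096 and give 12 bits, and the result is only accepted while it stays within TRACING_MAP_BITS_MIN..TRACING_MAP_BITS_MAX.
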
+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
+{
+ if (!attrs)
+ return;
+
+ kfree(attrs->name);
+ kfree(attrs->sort_key_str);
+ kfree(attrs->keys_str);
+ kfree(attrs->vals_str);
+ kfree(attrs);
+}
+
+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
+{
+ struct hist_trigger_attrs *attrs;
+ int ret = 0;
+
+ attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return ERR_PTR(-ENOMEM);
+
+ while (trigger_str) {
+ char *str = strsep(&trigger_str, ":");
+
+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+ (strncmp(str, "keys=", strlen("keys=")) == 0))
+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+ (strncmp(str, "values=", strlen("values=")) == 0))
+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ else if (strncmp(str, "sort=", strlen("sort=")) == 0)
+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ else if (strncmp(str, "name=", strlen("name=")) == 0)
+ attrs->name = kstrdup(str, GFP_KERNEL);
+ else if (strcmp(str, "pause") == 0)
+ attrs->pause = true;
+ else if ((strcmp(str, "cont") == 0) ||
+ (strcmp(str, "continue") == 0))
+ attrs->cont = true;
+ else if (strcmp(str, "clear") == 0)
+ attrs->clear = true;
+ else if (strncmp(str, "size=", strlen("size=")) == 0) {
+ int map_bits = parse_map_size(str);
+
+ if (map_bits < 0) {
+ ret = map_bits;
+ goto free;
+ }
+ attrs->map_bits = map_bits;
+ } else {
+ ret = -EINVAL;
+ goto free;
+ }
+ }
+
+ if (!attrs->keys_str) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ return attrs;
+ free:
+ destroy_hist_trigger_attrs(attrs);
+
+ return ERR_PTR(ret);
+}
+
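parse_hist_trigger_attrs() walks a colon-separated option string and dispatches on each token's prefix. A self-contained userspace sketch of the same strsep()-based loop, fed a made-up trigger string; the kernel version additionally kstrdup()s the values, validates size=, and rejects unknown tokens with -EINVAL:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char trigger[] = "keys=call_site.sym:vals=bytes_req:sort=hitcount:pause";
	char *rest = trigger, *tok;

	while ((tok = strsep(&rest, ":")) != NULL) {
		if (strncmp(tok, "keys=", 5) == 0)
			printf("keys string : %s\n", tok);
		else if (strncmp(tok, "vals=", 5) == 0)
			printf("vals string : %s\n", tok);
		else if (strncmp(tok, "sort=", 5) == 0)
			printf("sort string : %s\n", tok);
		else if (strcmp(tok, "pause") == 0)
			printf("flag        : pause\n");
		else
			printf("unknown     : %s\n", tok);
	}
	return 0;
}
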
+static inline void save_comm(char *comm, struct task_struct *task)
+{
+ if (!task->pid) {
+ strcpy(comm, "<idle>");
+ return;
+ }
+
+ if (WARN_ON_ONCE(task->pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
+ memcpy(comm, task->comm, TASK_COMM_LEN);
+}
+
+static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
+{
+ kfree((char *)elt->private_data);
+}
+
+static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
+{
+ struct hist_trigger_data *hist_data = elt->map->private_data;
+ struct hist_field *key_field;
+ unsigned int i;
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+
+ if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+ unsigned int size = TASK_COMM_LEN + 1;
+
+ elt->private_data = kzalloc(size, GFP_KERNEL);
+ if (!elt->private_data)
+ return -ENOMEM;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
+ struct tracing_map_elt *from)
+{
+ char *comm_from = from->private_data;
+ char *comm_to = to->private_data;
+
+ if (comm_from)
+ memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
+}
+
+static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
+{
+ char *comm = elt->private_data;
+
+ if (comm)
+ save_comm(comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
+ .elt_alloc = hist_trigger_elt_comm_alloc,
+ .elt_copy = hist_trigger_elt_comm_copy,
+ .elt_free = hist_trigger_elt_comm_free,
+ .elt_init = hist_trigger_elt_comm_init,
+};
+
+static void destroy_hist_field(struct hist_field *hist_field)
+{
+ kfree(hist_field);
+}
+
+static struct hist_field *create_hist_field(struct ftrace_event_field *field,
+ unsigned long flags)
+{
+ struct hist_field *hist_field;
+
+ if (field && is_function_field(field))
+ return NULL;
+
+ hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+ if (!hist_field)
+ return NULL;
+
+ if (flags & HIST_FIELD_FL_HITCOUNT) {
+ hist_field->fn = hist_field_counter;
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_STACKTRACE) {
+ hist_field->fn = hist_field_none;
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_LOG2) {
+ hist_field->fn = hist_field_log2;
+ goto out;
+ }
+
+ if (WARN_ON_ONCE(!field))
+ goto out;
+
+ if (is_string_field(field)) {
+ flags |= HIST_FIELD_FL_STRING;
+
+ if (field->filter_type == FILTER_STATIC_STRING)
+ hist_field->fn = hist_field_string;
+ else if (field->filter_type == FILTER_DYN_STRING)
+ hist_field->fn = hist_field_dynstring;
+ else
+ hist_field->fn = hist_field_pstring;
+ } else {
+ hist_field->fn = select_value_fn(field->size,
+ field->is_signed);
+ if (!hist_field->fn) {
+ destroy_hist_field(hist_field);
+ return NULL;
+ }
+ }
+ out:
+ hist_field->field = field;
+ hist_field->flags = flags;
+
+ return hist_field;
+}
+
+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+ if (hist_data->fields[i]) {
+ destroy_hist_field(hist_data->fields[i]);
+ hist_data->fields[i] = NULL;
+ }
+ }
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+ hist_data->fields[HITCOUNT_IDX] =
+ create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
+ if (!hist_data->fields[HITCOUNT_IDX])
+ return -ENOMEM;
+
+ hist_data->n_vals++;
+
+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int create_val_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *field_str)
+{
+ struct ftrace_event_field *field = NULL;
+ unsigned long flags = 0;
+ char *field_name;
+ int ret = 0;
+
+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+ return -EINVAL;
+
+ field_name = strsep(&field_str, ".");
+ if (field_str) {
+ if (strcmp(field_str, "hex") == 0)
+ flags |= HIST_FIELD_FL_HEX;
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ hist_data->fields[val_idx] = create_hist_field(field, flags);
+ if (!hist_data->fields[val_idx]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ++hist_data->n_vals;
+
+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ ret = -EINVAL;
+ out:
+ return ret;
+}
+
+static int create_val_fields(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ char *fields_str, *field_str;
+ unsigned int i, j;
+ int ret;
+
+ ret = create_hitcount_val(hist_data);
+ if (ret)
+ goto out;
+
+ fields_str = hist_data->attrs->vals_str;
+ if (!fields_str)
+ goto out;
+
+ strsep(&fields_str, "=");
+ if (!fields_str)
+ goto out;
+
+ for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX &&
+ j < TRACING_MAP_VALS_MAX; i++) {
+ field_str = strsep(&fields_str, ",");
+ if (!field_str)
+ break;
+ if (strcmp(field_str, "hitcount") == 0)
+ continue;
+ ret = create_val_field(hist_data, j++, file, field_str);
+ if (ret)
+ goto out;
+ }
+ if (fields_str && (strcmp(fields_str, "hitcount") != 0))
+ ret = -EINVAL;
+ out:
+ return ret;
+}
+
+static int create_key_field(struct hist_trigger_data *hist_data,
+ unsigned int key_idx,
+ unsigned int key_offset,
+ struct trace_event_file *file,
+ char *field_str)
+{
+ struct ftrace_event_field *field = NULL;
+ unsigned long flags = 0;
+ unsigned int key_size;
+ int ret = 0;
+
+ if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+ return -EINVAL;
+
+ flags |= HIST_FIELD_FL_KEY;
+
+ if (strcmp(field_str, "stacktrace") == 0) {
+ flags |= HIST_FIELD_FL_STACKTRACE;
+ key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
+ } else {
+ char *field_name = strsep(&field_str, ".");
+
+ if (field_str) {
+ if (strcmp(field_str, "hex") == 0)
+ flags |= HIST_FIELD_FL_HEX;
+ else if (strcmp(field_str, "sym") == 0)
+ flags |= HIST_FIELD_FL_SYM;
+ else if (strcmp(field_str, "sym-offset") == 0)
+ flags |= HIST_FIELD_FL_SYM_OFFSET;
+ else if ((strcmp(field_str, "execname") == 0) &&
+ (strcmp(field_name, "common_pid") == 0))
+ flags |= HIST_FIELD_FL_EXECNAME;
+ else if (strcmp(field_str, "syscall") == 0)
+ flags |= HIST_FIELD_FL_SYSCALL;
+ else if (strcmp(field_str, "log2") == 0)
+ flags |= HIST_FIELD_FL_LOG2;
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (is_string_field(field))
+ key_size = MAX_FILTER_STR_VAL;
+ else
+ key_size = field->size;
+ }
+
+ hist_data->fields[key_idx] = create_hist_field(field, flags);
+ if (!hist_data->fields[key_idx]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key_size = ALIGN(key_size, sizeof(u64));
+ hist_data->fields[key_idx]->size = key_size;
+ hist_data->fields[key_idx]->offset = key_offset;
+ hist_data->key_size += key_size;
+ if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ hist_data->n_keys++;
+
+ if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
+ return -EINVAL;
+
+ ret = key_size;
+ out:
+ return ret;
+}
+
+static int create_key_fields(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ unsigned int i, key_offset = 0, n_vals = hist_data->n_vals;
+ char *fields_str, *field_str;
+ int ret = -EINVAL;
+
+ fields_str = hist_data->attrs->keys_str;
+ if (!fields_str)
+ goto out;
+
+ strsep(&fields_str, "=");
+ if (!fields_str)
+ goto out;
+
+ for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) {
+ field_str = strsep(&fields_str, ",");
+ if (!field_str)
+ break;
+ ret = create_key_field(hist_data, i, key_offset,
+ file, field_str);
+ if (ret < 0)
+ goto out;
+ key_offset += ret;
+ }
+ if (fields_str) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = 0;
+ out:
+ return ret;
+}
+
+static int create_hist_fields(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ int ret;
+
+ ret = create_val_fields(hist_data, file);
+ if (ret)
+ goto out;
+
+ ret = create_key_fields(hist_data, file);
+ if (ret)
+ goto out;
+
+ hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+ out:
+ return ret;
+}
+
+static int is_descending(const char *str)
+{
+ if (!str)
+ return 0;
+
+ if (strcmp(str, "descending") == 0)
+ return 1;
+
+ if (strcmp(str, "ascending") == 0)
+ return 0;
+
+ return -EINVAL;
+}
+
+static int create_sort_keys(struct hist_trigger_data *hist_data)
+{
+ char *fields_str = hist_data->attrs->sort_key_str;
+ struct ftrace_event_field *field = NULL;
+ struct tracing_map_sort_key *sort_key;
+ int descending, ret = 0;
+ unsigned int i, j;
+
+ hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
+
+ if (!fields_str)
+ goto out;
+
+ strsep(&fields_str, "=");
+ if (!fields_str) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+ char *field_str, *field_name;
+
+ sort_key = &hist_data->sort_keys[i];
+
+ field_str = strsep(&fields_str, ",");
+ if (!field_str) {
+ if (i == 0)
+ ret = -EINVAL;
+ break;
+ }
+
+ if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) {
+ ret = -EINVAL;
+ break;
+ }
+
+ field_name = strsep(&field_str, ".");
+ if (!field_name) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (strcmp(field_name, "hitcount") == 0) {
+ descending = is_descending(field_str);
+ if (descending < 0) {
+ ret = descending;
+ break;
+ }
+ sort_key->descending = descending;
+ continue;
+ }
+
+ for (j = 1; j < hist_data->n_fields; j++) {
+ field = hist_data->fields[j]->field;
+ if (field && (strcmp(field_name, field->name) == 0)) {
+ sort_key->field_idx = j;
+ descending = is_descending(field_str);
+ if (descending < 0) {
+ ret = descending;
+ goto out;
+ }
+ sort_key->descending = descending;
+ break;
+ }
+ }
+ if (j == hist_data->n_fields) {
+ ret = -EINVAL;
+ break;
+ }
+ }
+ hist_data->n_sort_keys = i;
+ out:
+ return ret;
+}
+
+static void destroy_hist_data(struct hist_trigger_data *hist_data)
+{
+ destroy_hist_trigger_attrs(hist_data->attrs);
+ destroy_hist_fields(hist_data);
+ tracing_map_destroy(hist_data->map);
+ kfree(hist_data);
+}
+
+static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
+{
+ struct tracing_map *map = hist_data->map;
+ struct ftrace_event_field *field;
+ struct hist_field *hist_field;
+ int i, idx;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (hist_field->flags & HIST_FIELD_FL_KEY) {
+ tracing_map_cmp_fn_t cmp_fn;
+
+ field = hist_field->field;
+
+ if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
+ cmp_fn = tracing_map_cmp_none;
+ else if (is_string_field(field))
+ cmp_fn = tracing_map_cmp_string;
+ else
+ cmp_fn = tracing_map_cmp_num(field->size,
+ field->is_signed);
+ idx = tracing_map_add_key_field(map,
+ hist_field->offset,
+ cmp_fn);
+
+ } else
+ idx = tracing_map_add_sum_field(map);
+
+ if (idx < 0)
+ return idx;
+ }
+
+ return 0;
+}
+
+static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
+{
+ struct hist_field *key_field;
+ unsigned int i;
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+
+ if (key_field->flags & HIST_FIELD_FL_EXECNAME)
+ return true;
+ }
+
+ return false;
+}
+
+static struct hist_trigger_data *
+create_hist_data(unsigned int map_bits,
+ struct hist_trigger_attrs *attrs,
+ struct trace_event_file *file)
+{
+ const struct tracing_map_ops *map_ops = NULL;
+ struct hist_trigger_data *hist_data;
+ int ret = 0;
+
+ hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL);
+ if (!hist_data)
+ return ERR_PTR(-ENOMEM);
+
+ hist_data->attrs = attrs;
+
+ ret = create_hist_fields(hist_data, file);
+ if (ret)
+ goto free;
+
+ ret = create_sort_keys(hist_data);
+ if (ret)
+ goto free;
+
+ if (need_tracing_map_ops(hist_data))
+ map_ops = &hist_trigger_elt_comm_ops;
+
+ hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
+ map_ops, hist_data);
+ if (IS_ERR(hist_data->map)) {
+ ret = PTR_ERR(hist_data->map);
+ hist_data->map = NULL;
+ goto free;
+ }
+
+ ret = create_tracing_map_fields(hist_data);
+ if (ret)
+ goto free;
+
+ ret = tracing_map_init(hist_data->map);
+ if (ret)
+ goto free;
+
+ hist_data->event_file = file;
+ out:
+ return hist_data;
+ free:
+ hist_data->attrs = NULL;
+
+ destroy_hist_data(hist_data);
+
+ hist_data = ERR_PTR(ret);
+
+ goto out;
+}
+
+static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ void *rec)
+{
+ struct hist_field *hist_field;
+ unsigned int i;
+ u64 hist_val;
+
+ for_each_hist_val_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ hist_val = hist_field->fn(hist_field, rec);
+ tracing_map_update_sum(elt, i, hist_val);
+ }
+}
+
+static inline void add_to_key(char *compound_key, void *key,
+ struct hist_field *key_field, void *rec)
+{
+ size_t size = key_field->size;
+
+ if (key_field->flags & HIST_FIELD_FL_STRING) {
+ struct ftrace_event_field *field;
+
+ field = key_field->field;
+ if (field->filter_type == FILTER_DYN_STRING)
+ size = *(u32 *)(rec + field->offset) >> 16;
+ else if (field->filter_type == FILTER_PTR_STRING)
+ size = strlen(key);
+ else if (field->filter_type == FILTER_STATIC_STRING)
+ size = field->size;
+
+ /* ensure NULL-termination */
+ if (size > key_field->size - 1)
+ size = key_field->size - 1;
+ }
+
+ memcpy(compound_key + key_field->offset, key, size);
+}
+
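add_to_key() copies each key field into a flat buffer at the offset assigned in create_key_field(), clamping variable-length strings so the compound key stays fixed-size and keeps a trailing NUL. A minimal userspace sketch of that packing, with made-up offsets and sizes:

#include <stdio.h>
#include <string.h>

#define PID_OFF   0
#define PID_SIZE  8			/* u64-aligned, as in create_key_field() */
#define COMM_OFF  PID_SIZE
#define COMM_SIZE 16

int main(void)
{
	unsigned char key[PID_SIZE + COMM_SIZE] = { 0 };
	unsigned long long pid = 1234, pid_out;
	const char *comm = "some-very-long-task-name";
	size_t len = strlen(comm);

	memcpy(key + PID_OFF, &pid, sizeof(pid));

	if (len > COMM_SIZE - 1)	/* clamp, preserving the trailing NUL */
		len = COMM_SIZE - 1;
	memcpy(key + COMM_OFF, comm, len);

	memcpy(&pid_out, key + PID_OFF, sizeof(pid_out));
	printf("pid %llu, comm \"%s\"\n", pid_out, (char *)(key + COMM_OFF));
	return 0;
}
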
+static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ bool use_compound_key = (hist_data->n_keys > 1);
+ unsigned long entries[HIST_STACKTRACE_DEPTH];
+ char compound_key[HIST_KEY_SIZE_MAX];
+ struct stack_trace stacktrace;
+ struct hist_field *key_field;
+ struct tracing_map_elt *elt;
+ u64 field_contents;
+ void *key = NULL;
+ unsigned int i;
+
+ memset(compound_key, 0, hist_data->key_size);
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+
+ if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+ stacktrace.max_entries = HIST_STACKTRACE_DEPTH;
+ stacktrace.entries = entries;
+ stacktrace.nr_entries = 0;
+ stacktrace.skip = HIST_STACKTRACE_SKIP;
+
+ memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
+ save_stack_trace(&stacktrace);
+
+ key = entries;
+ } else {
+ field_contents = key_field->fn(key_field, rec);
+ if (key_field->flags & HIST_FIELD_FL_STRING) {
+ key = (void *)(unsigned long)field_contents;
+ use_compound_key = true;
+ } else
+ key = (void *)&field_contents;
+ }
+
+ if (use_compound_key)
+ add_to_key(compound_key, key, key_field, rec);
+ }
+
+ if (use_compound_key)
+ key = compound_key;
+
+ elt = tracing_map_insert(hist_data->map, key);
+ if (elt)
+ hist_trigger_elt_update(hist_data, elt, rec);
+}
+
+static void hist_trigger_stacktrace_print(struct seq_file *m,
+ unsigned long *stacktrace_entries,
+ unsigned int max_entries)
+{
+ char str[KSYM_SYMBOL_LEN];
+ unsigned int spaces = 8;
+ unsigned int i;
+
+ for (i = 0; i < max_entries; i++) {
+ if (stacktrace_entries[i] == ULONG_MAX)
+ return;
+
+ seq_printf(m, "%*c", 1 + spaces, ' ');
+ sprint_symbol(str, stacktrace_entries[i]);
+ seq_printf(m, "%s\n", str);
+ }
+}
+
+static void
+hist_trigger_entry_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data, void *key,
+ struct tracing_map_elt *elt)
+{
+ struct hist_field *key_field;
+ char str[KSYM_SYMBOL_LEN];
+ bool multiline = false;
+ unsigned int i;
+ u64 uval;
+
+ seq_puts(m, "{ ");
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+
+ if (i > hist_data->n_vals)
+ seq_puts(m, ", ");
+
+ if (key_field->flags & HIST_FIELD_FL_HEX) {
+ uval = *(u64 *)(key + key_field->offset);
+ seq_printf(m, "%s: %llx",
+ key_field->field->name, uval);
+ } else if (key_field->flags & HIST_FIELD_FL_SYM) {
+ uval = *(u64 *)(key + key_field->offset);
+ sprint_symbol_no_offset(str, uval);
+ seq_printf(m, "%s: [%llx] %-45s",
+ key_field->field->name, uval, str);
+ } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
+ uval = *(u64 *)(key + key_field->offset);
+ sprint_symbol(str, uval);
+ seq_printf(m, "%s: [%llx] %-55s",
+ key_field->field->name, uval, str);
+ } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+ char *comm = elt->private_data;
+
+ uval = *(u64 *)(key + key_field->offset);
+ seq_printf(m, "%s: %-16s[%10llu]",
+ key_field->field->name, comm, uval);
+ } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
+ const char *syscall_name;
+
+ uval = *(u64 *)(key + key_field->offset);
+ syscall_name = get_syscall_name(uval);
+ if (!syscall_name)
+ syscall_name = "unknown_syscall";
+
+ seq_printf(m, "%s: %-30s[%3llu]",
+ key_field->field->name, syscall_name, uval);
+ } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+ seq_puts(m, "stacktrace:\n");
+ hist_trigger_stacktrace_print(m,
+ key + key_field->offset,
+ HIST_STACKTRACE_DEPTH);
+ multiline = true;
+ } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
+ seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+ *(u64 *)(key + key_field->offset));
+ } else if (key_field->flags & HIST_FIELD_FL_STRING) {
+ seq_printf(m, "%s: %-50s", key_field->field->name,
+ (char *)(key + key_field->offset));
+ } else {
+ uval = *(u64 *)(key + key_field->offset);
+ seq_printf(m, "%s: %10llu", key_field->field->name,
+ uval);
+ }
+ }
+
+ if (!multiline)
+ seq_puts(m, " ");
+
+ seq_puts(m, "}");
+
+ seq_printf(m, " hitcount: %10llu",
+ tracing_map_read_sum(elt, HITCOUNT_IDX));
+
+ for (i = 1; i < hist_data->n_vals; i++) {
+ if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
+ seq_printf(m, " %s: %10llx",
+ hist_data->fields[i]->field->name,
+ tracing_map_read_sum(elt, i));
+ } else {
+ seq_printf(m, " %s: %10llu",
+ hist_data->fields[i]->field->name,
+ tracing_map_read_sum(elt, i));
+ }
+ }
+
+ seq_puts(m, "\n");
+}
+
+static int print_entries(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ struct tracing_map_sort_entry **sort_entries = NULL;
+ struct tracing_map *map = hist_data->map;
+ int i, n_entries;
+
+ n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
+ hist_data->n_sort_keys,
+ &sort_entries);
+ if (n_entries < 0)
+ return n_entries;
+
+ for (i = 0; i < n_entries; i++)
+ hist_trigger_entry_print(m, hist_data,
+ sort_entries[i]->key,
+ sort_entries[i]->elt);
+
+ tracing_map_destroy_sort_entries(sort_entries, n_entries);
+
+ return n_entries;
+}
+
+static void hist_trigger_show(struct seq_file *m,
+ struct event_trigger_data *data, int n)
+{
+ struct hist_trigger_data *hist_data;
+ int n_entries, ret = 0;
+
+ if (n > 0)
+ seq_puts(m, "\n\n");
+
+ seq_puts(m, "# event histogram\n#\n# trigger info: ");
+ data->ops->print(m, data->ops, data);
+ seq_puts(m, "#\n\n");
+
+ hist_data = data->private_data;
+ n_entries = print_entries(m, hist_data);
+ if (n_entries < 0) {
+ ret = n_entries;
+ n_entries = 0;
+ }
+
+ seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n",
+ (u64)atomic64_read(&hist_data->map->hits),
+ n_entries, (u64)atomic64_read(&hist_data->map->drops));
+}
+
+static int hist_show(struct seq_file *m, void *v)
+{
+ struct event_trigger_data *data;
+ struct trace_event_file *event_file;
+ int n = 0, ret = 0;
+
+ mutex_lock(&event_mutex);
+
+ event_file = event_file_data(m->private);
+ if (unlikely(!event_file)) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ list_for_each_entry_rcu(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
+ hist_trigger_show(m, data, n++);
+ }
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int event_hist_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hist_show, file);
+}
+
+const struct file_operations event_hist_fops = {
+ .open = event_hist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+ const char *flags_str = NULL;
+
+ if (hist_field->flags & HIST_FIELD_FL_HEX)
+ flags_str = "hex";
+ else if (hist_field->flags & HIST_FIELD_FL_SYM)
+ flags_str = "sym";
+ else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+ flags_str = "sym-offset";
+ else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+ flags_str = "execname";
+ else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+ flags_str = "syscall";
+ else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+ flags_str = "log2";
+
+ return flags_str;
+}
+
+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
+{
+ seq_printf(m, "%s", hist_field->field->name);
+ if (hist_field->flags) {
+ const char *flags_str = get_hist_field_flags(hist_field);
+
+ if (flags_str)
+ seq_printf(m, ".%s", flags_str);
+ }
+}
+
+static int event_hist_trigger_print(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct hist_field *key_field;
+ unsigned int i;
+
+ seq_puts(m, "hist:");
+
+ if (data->name)
+ seq_printf(m, "%s:", data->name);
+
+ seq_puts(m, "keys=");
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+
+ if (i > hist_data->n_vals)
+ seq_puts(m, ",");
+
+ if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
+ seq_puts(m, "stacktrace");
+ else
+ hist_field_print(m, key_field);
+ }
+
+ seq_puts(m, ":vals=");
+
+ for_each_hist_val_field(i, hist_data) {
+ if (i == HITCOUNT_IDX)
+ seq_puts(m, "hitcount");
+ else {
+ seq_puts(m, ",");
+ hist_field_print(m, hist_data->fields[i]);
+ }
+ }
+
+ seq_puts(m, ":sort=");
+
+ for (i = 0; i < hist_data->n_sort_keys; i++) {
+ struct tracing_map_sort_key *sort_key;
+
+ sort_key = &hist_data->sort_keys[i];
+
+ if (i > 0)
+ seq_puts(m, ",");
+
+ if (sort_key->field_idx == HITCOUNT_IDX)
+ seq_puts(m, "hitcount");
+ else {
+ unsigned int idx = sort_key->field_idx;
+
+ if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
+ return -EINVAL;
+
+ hist_field_print(m, hist_data->fields[idx]);
+ }
+
+ if (sort_key->descending)
+ seq_puts(m, ".descending");
+ }
+
+ seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+
+ if (data->filter_str)
+ seq_printf(m, " if %s", data->filter_str);
+
+ if (data->paused)
+ seq_puts(m, " [paused]");
+ else
+ seq_puts(m, " [active]");
+
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int event_hist_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+
+ if (!data->ref && hist_data->attrs->name)
+ save_named_trigger(hist_data->attrs->name, data);
+
+ data->ref++;
+
+ return 0;
+}
+
+static void event_hist_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref) {
+ if (data->name)
+ del_named_trigger(data);
+ trigger_data_free(data);
+ destroy_hist_data(hist_data);
+ }
+}
+
+static struct event_trigger_ops event_hist_trigger_ops = {
+ .func = event_hist_trigger,
+ .print = event_hist_trigger_print,
+ .init = event_hist_trigger_init,
+ .free = event_hist_trigger_free,
+};
+
+static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ data->ref++;
+
+ save_named_trigger(data->named_data->name, data);
+
+ event_hist_trigger_init(ops, data->named_data);
+
+ return 0;
+}
+
+static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ event_hist_trigger_free(ops, data->named_data);
+
+ data->ref--;
+ if (!data->ref) {
+ del_named_trigger(data);
+ trigger_data_free(data);
+ }
+}
+
+static struct event_trigger_ops event_hist_trigger_named_ops = {
+ .func = event_hist_trigger,
+ .print = event_hist_trigger_print,
+ .init = event_hist_trigger_named_init,
+ .free = event_hist_trigger_named_free,
+};
+
+static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
+ char *param)
+{
+ return &event_hist_trigger_ops;
+}
+
+static void hist_clear(struct event_trigger_data *data)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+
+ if (data->name)
+ pause_named_trigger(data);
+
+ synchronize_sched();
+
+ tracing_map_clear(hist_data->map);
+
+ if (data->name)
+ unpause_named_trigger(data);
+}
+
+static bool compatible_field(struct ftrace_event_field *field,
+ struct ftrace_event_field *test_field)
+{
+ if (field == test_field)
+ return true;
+ if (field == NULL || test_field == NULL)
+ return false;
+ if (strcmp(field->name, test_field->name) != 0)
+ return false;
+ if (strcmp(field->type, test_field->type) != 0)
+ return false;
+ if (field->size != test_field->size)
+ return false;
+ if (field->is_signed != test_field->is_signed)
+ return false;
+
+ return true;
+}
+
+static bool hist_trigger_match(struct event_trigger_data *data,
+ struct event_trigger_data *data_test,
+ struct event_trigger_data *named_data,
+ bool ignore_filter)
+{
+ struct tracing_map_sort_key *sort_key, *sort_key_test;
+ struct hist_trigger_data *hist_data, *hist_data_test;
+ struct hist_field *key_field, *key_field_test;
+ unsigned int i;
+
+ if (named_data && (named_data != data_test) &&
+ (named_data != data_test->named_data))
+ return false;
+
+ if (!named_data && is_named_trigger(data_test))
+ return false;
+
+ hist_data = data->private_data;
+ hist_data_test = data_test->private_data;
+
+ if (hist_data->n_vals != hist_data_test->n_vals ||
+ hist_data->n_fields != hist_data_test->n_fields ||
+ hist_data->n_sort_keys != hist_data_test->n_sort_keys)
+ return false;
+
+ if (!ignore_filter) {
+ if ((data->filter_str && !data_test->filter_str) ||
+ (!data->filter_str && data_test->filter_str))
+ return false;
+ }
+
+ for_each_hist_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+ key_field_test = hist_data_test->fields[i];
+
+ if (key_field->flags != key_field_test->flags)
+ return false;
+ if (!compatible_field(key_field->field, key_field_test->field))
+ return false;
+ if (key_field->offset != key_field_test->offset)
+ return false;
+ }
+
+ for (i = 0; i < hist_data->n_sort_keys; i++) {
+ sort_key = &hist_data->sort_keys[i];
+ sort_key_test = &hist_data_test->sort_keys[i];
+
+ if (sort_key->field_idx != sort_key_test->field_idx ||
+ sort_key->descending != sort_key_test->descending)
+ return false;
+ }
+
+ if (!ignore_filter && data->filter_str &&
+ (strcmp(data->filter_str, data_test->filter_str) != 0))
+ return false;
+
+ return true;
+}
+
+static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ int ret = 0;
+
+ if (hist_data->attrs->name) {
+ named_data = find_named_trigger(hist_data->attrs->name);
+ if (named_data) {
+ if (!hist_trigger_match(data, named_data, named_data,
+ true)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ if (hist_data->attrs->name && !named_data)
+ goto new;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ if (hist_data->attrs->pause)
+ test->paused = true;
+ else if (hist_data->attrs->cont)
+ test->paused = false;
+ else if (hist_data->attrs->clear)
+ hist_clear(test);
+ else
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ new:
+ if (hist_data->attrs->cont || hist_data->attrs->clear) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (hist_data->attrs->pause)
+ data->paused = true;
+
+ if (named_data) {
+ destroy_hist_data(data->private_data);
+ data->private_data = named_data->private_data;
+ set_named_trigger_data(data, named_data);
+ data->ops = &event_hist_trigger_named_ops;
+ }
+
+ if (data->ops->init) {
+ ret = data->ops->init(data->ops, data);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_add_rcu(&data->list, &file->triggers);
+ ret++;
+
+ update_cond_flag(file);
+
+ if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ list_del_rcu(&data->list);
+ update_cond_flag(file);
+ ret--;
+ }
+ out:
+ return ret;
+}
+
+static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ bool unregistered = false;
+
+ if (hist_data->attrs->name)
+ named_data = find_named_trigger(hist_data->attrs->name);
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ unregistered = true;
+ list_del_rcu(&test->list);
+ trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
+ break;
+ }
+ }
+
+ if (unregistered && test->ops->free)
+ test->ops->free(test->ops, test);
+}
+
+static void hist_unreg_all(struct trace_event_file *file)
+{
+ struct event_trigger_data *test, *n;
+
+ list_for_each_entry_safe(test, n, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ list_del_rcu(&test->list);
+ trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
+ if (test->ops->free)
+ test->ops->free(test->ops, test);
+ }
+ }
+}
+
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
+ struct event_trigger_data *trigger_data;
+ struct hist_trigger_attrs *attrs;
+ struct event_trigger_ops *trigger_ops;
+ struct hist_trigger_data *hist_data;
+ char *trigger;
+ int ret = 0;
+
+ if (!param)
+ return -EINVAL;
+
+ /* separate the trigger from the filter (k:v [if filter]) */
+ trigger = strsep(&param, " \t");
+ if (!trigger)
+ return -EINVAL;
+
+ attrs = parse_hist_trigger_attrs(trigger);
+ if (IS_ERR(attrs))
+ return PTR_ERR(attrs);
+
+ if (attrs->map_bits)
+ hist_trigger_bits = attrs->map_bits;
+
+ hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+ if (IS_ERR(hist_data)) {
+ destroy_hist_trigger_attrs(attrs);
+ return PTR_ERR(hist_data);
+ }
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+ ret = -ENOMEM;
+ trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ if (!trigger_data)
+ goto out_free;
+
+ trigger_data->count = -1;
+ trigger_data->ops = trigger_ops;
+ trigger_data->cmd_ops = cmd_ops;
+
+ INIT_LIST_HEAD(&trigger_data->list);
+ RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+ trigger_data->private_data = hist_data;
+
+ /* if param is non-empty, it's supposed to be a filter */
+ if (param && cmd_ops->set_filter) {
+ ret = cmd_ops->set_filter(param, trigger_data, file);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ if (glob[0] == '!') {
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ ret = 0;
+ goto out_free;
+ }
+
+ ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ /*
+ * The above returns on success the # of triggers registered,
+ * but if it didn't register any it returns zero. Consider no
+ * triggers registered a failure too.
+ */
+ if (!ret) {
+ if (!(attrs->pause || attrs->cont || attrs->clear))
+ ret = -ENOENT;
+ goto out_free;
+ } else if (ret < 0)
+ goto out_free;
+ /* Just return zero, not the number of registered triggers */
+ ret = 0;
+ out:
+ return ret;
+ out_free:
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+
+ kfree(trigger_data);
+
+ destroy_hist_data(hist_data);
+ goto out;
+}
+
+static struct event_command trigger_hist_cmd = {
+ .name = "hist",
+ .trigger_type = ETT_EVENT_HIST,
+ .flags = EVENT_CMD_FL_NEEDS_REC,
+ .func = event_hist_trigger_func,
+ .reg = hist_register_trigger,
+ .unreg = hist_unregister_trigger,
+ .unreg_all = hist_unreg_all,
+ .get_trigger_ops = event_hist_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+__init int register_trigger_hist_cmd(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_hist_cmd);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+
+static void
+hist_enable_trigger(struct event_trigger_data *data, void *rec)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+ struct event_trigger_data *test;
+
+ list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (enable_data->enable)
+ test->paused = false;
+ else
+ test->paused = true;
+ }
+ }
+}
+
+static void
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+{
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ hist_enable_trigger(data, rec);
+}
+
+static struct event_trigger_ops hist_enable_trigger_ops = {
+ .func = hist_enable_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_enable_count_trigger_ops = {
+ .func = hist_enable_count_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_trigger_ops = {
+ .func = hist_enable_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_count_trigger_ops = {
+ .func = hist_enable_count_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops *
+hist_enable_get_trigger_ops(char *cmd, char *param)
+{
+ struct event_trigger_ops *ops;
+ bool enable;
+
+ enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
+
+ if (enable)
+ ops = param ? &hist_enable_count_trigger_ops :
+ &hist_enable_trigger_ops;
+ else
+ ops = param ? &hist_disable_count_trigger_ops :
+ &hist_disable_trigger_ops;
+
+ return ops;
+}
+
+static void hist_enable_unreg_all(struct trace_event_file *file)
+{
+ struct event_trigger_data *test, *n;
+
+ list_for_each_entry_safe(test, n, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
+ list_del_rcu(&test->list);
+ update_cond_flag(file);
+ trace_event_trigger_enable_disable(file, 0);
+ if (test->ops->free)
+ test->ops->free(test->ops, test);
+ }
+ }
+}
+
+static struct event_command trigger_hist_enable_cmd = {
+ .name = ENABLE_HIST_STR,
+ .trigger_type = ETT_HIST_ENABLE,
+ .func = event_enable_trigger_func,
+ .reg = event_enable_register_trigger,
+ .unreg = event_enable_unregister_trigger,
+ .unreg_all = hist_enable_unreg_all,
+ .get_trigger_ops = hist_enable_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static struct event_command trigger_hist_disable_cmd = {
+ .name = DISABLE_HIST_STR,
+ .trigger_type = ETT_HIST_ENABLE,
+ .func = event_enable_trigger_func,
+ .reg = event_enable_register_trigger,
+ .unreg = event_enable_unregister_trigger,
+ .unreg_all = hist_enable_unreg_all,
+ .get_trigger_ops = hist_enable_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init void unregister_trigger_hist_enable_disable_cmds(void)
+{
+ unregister_event_command(&trigger_hist_enable_cmd);
+ unregister_event_command(&trigger_hist_disable_cmd);
+}
+
+__init int register_trigger_hist_enable_disable_cmds(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_hist_enable_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_event_command(&trigger_hist_disable_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_trigger_hist_enable_disable_cmds();
+
+ return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d67992f3bb0e..6721a1e89f39 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd)
* Currently we only unregister event commands from __init, so mark
* this __init too.
*/
-static __init int unregister_event_command(struct event_command *cmd)
+__init int unregister_event_command(struct event_command *cmd)
{
struct event_command *p, *n;
int ret = -ENODEV;
@@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops,
trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
INIT_LIST_HEAD(&trigger_data->list);
+ INIT_LIST_HEAD(&trigger_data->named_list);
if (glob[0] == '!') {
cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
@@ -764,6 +765,148 @@ int set_trigger_filter(char *filter_str,
return ret;
}
+static LIST_HEAD(named_triggers);
+
+/**
+ * find_named_trigger - Find the common named trigger associated with @name
+ * @name: The name of the set of named triggers to find the common data for
+ *
+ * Named triggers are sets of triggers that share a common set of
+ * trigger data. The first named trigger registered with a given name
+ * owns the common trigger data that the others subsequently
+ * registered with the same name will reference. This function
+ * returns the common trigger data associated with that first
+ * registered instance.
+ *
+ * Return: the common trigger data for the given named trigger on
+ * success, NULL otherwise.
+ */
+struct event_trigger_data *find_named_trigger(const char *name)
+{
+ struct event_trigger_data *data;
+
+ if (!name)
+ return NULL;
+
+ list_for_each_entry(data, &named_triggers, named_list) {
+ if (data->named_data)
+ continue;
+ if (strcmp(data->name, name) == 0)
+ return data;
+ }
+
+ return NULL;
+}
+
+/**
+ * is_named_trigger - determine if a given trigger is a named trigger
+ * @test: The trigger data to test
+ *
+ * Return: true if 'test' is a named trigger, false otherwise.
+ */
+bool is_named_trigger(struct event_trigger_data *test)
+{
+ struct event_trigger_data *data;
+
+ list_for_each_entry(data, &named_triggers, named_list) {
+ if (test == data)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * save_named_trigger - save the trigger in the named trigger list
+ * @name: The name of the named trigger set
+ * @data: The trigger data to save
+ *
+ * Return: 0 if successful, negative error otherwise.
+ */
+int save_named_trigger(const char *name, struct event_trigger_data *data)
+{
+ data->name = kstrdup(name, GFP_KERNEL);
+ if (!data->name)
+ return -ENOMEM;
+
+ list_add(&data->named_list, &named_triggers);
+
+ return 0;
+}
+
+/**
+ * del_named_trigger - delete a trigger from the named trigger list
+ * @data: The trigger data to delete
+ */
+void del_named_trigger(struct event_trigger_data *data)
+{
+ kfree(data->name);
+ data->name = NULL;
+
+ list_del(&data->named_list);
+}
+
+static void __pause_named_trigger(struct event_trigger_data *data, bool pause)
+{
+ struct event_trigger_data *test;
+
+ list_for_each_entry(test, &named_triggers, named_list) {
+ if (strcmp(test->name, data->name) == 0) {
+ if (pause) {
+ test->paused_tmp = test->paused;
+ test->paused = true;
+ } else {
+ test->paused = test->paused_tmp;
+ }
+ }
+ }
+}
+
+/**
+ * pause_named_trigger - Pause all named triggers with the same name
+ * @data: The trigger data of a named trigger to pause
+ *
+ * Pauses a named trigger along with all other triggers having the
+ * same name. Because named triggers share a common set of data,
+ * pausing only one is meaningless, so pausing one named trigger needs
+ * to pause all triggers with the same name.
+ */
+void pause_named_trigger(struct event_trigger_data *data)
+{
+ __pause_named_trigger(data, true);
+}
+
+/**
+ * unpause_named_trigger - Un-pause all named triggers with the same name
+ * @data: The trigger data of a named trigger to unpause
+ *
+ * Un-pauses a named trigger along with all other triggers having the
+ * same name. Because named triggers share a common set of data,
+ * unpausing only one is meaningless, so unpausing one named trigger
+ * needs to unpause all triggers with the same name.
+ */
+void unpause_named_trigger(struct event_trigger_data *data)
+{
+ __pause_named_trigger(data, false);
+}
+
+/**
+ * set_named_trigger_data - Associate common named trigger data
+ * @data: The trigger data to associate with the common data
+ * @named_data: The first-registered trigger that owns the common data
+ *
+ * Named triggers are sets of triggers that share a common set of
+ * trigger data. The first named trigger registered with a given name
+ * owns the common trigger data that the others subsequently
+ * registered with the same name will reference. This function
+ * associates the common trigger data from the first trigger with the
+ * given trigger.
+ */
+void set_named_trigger_data(struct event_trigger_data *data,
+ struct event_trigger_data *named_data)
+{
+ data->named_data = named_data;
+}
+
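The helpers added above implement a simple ownership convention: the first trigger saved under a name owns the shared data, and every later registration with that name only references it through named_data. A compact userspace sketch of that convention, using illustrative structures rather than the kernel's:

#include <stdio.h>
#include <string.h>

struct trig {
	const char *name;
	struct trig *named_data;	/* NULL for the owner */
	int hits;			/* stands in for the shared state */
};

/* the owner is the entry whose named_data is NULL, as in find_named_trigger() */
static struct trig *find_owner(struct trig **all, int n, const char *name)
{
	for (int i = 0; i < n; i++)
		if (!all[i]->named_data && strcmp(all[i]->name, name) == 0)
			return all[i];
	return NULL;
}

int main(void)
{
	struct trig a = { .name = "foo" };	/* registered first: owner */
	struct trig b = { .name = "foo" };	/* registered second: follower */
	struct trig *all[] = { &a };		/* only 'a' existed when 'b' arrived */

	b.named_data = find_owner(all, 1, "foo");
	b.named_data->hits++;			/* updates the shared state */

	printf("owner hits: %d\n", a.hits);	/* prints 1 */
	return 0;
}
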
static void
traceon_trigger(struct event_trigger_data *data, void *rec)
{
@@ -885,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {
static struct event_command trigger_traceoff_cmd = {
.name = "traceoff",
.trigger_type = ETT_TRACE_ONOFF,
+ .flags = EVENT_CMD_FL_POST_TRIGGER,
.func = event_trigger_callback,
.reg = register_trigger,
.unreg = unregister_trigger,
@@ -1062,15 +1206,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
unregister_event_command(&trigger_traceoff_cmd);
}
-/* Avoid typos */
-#define ENABLE_EVENT_STR "enable_event"
-#define DISABLE_EVENT_STR "disable_event"
-
-struct enable_trigger_data {
- struct trace_event_file *file;
- bool enable;
-};
-
static void
event_enable_trigger(struct event_trigger_data *data, void *rec)
{
@@ -1100,14 +1235,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
event_enable_trigger(data, rec);
}
-static int
-event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+int event_enable_trigger_print(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
seq_printf(m, "%s:%s:%s",
- enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ enable_data->hist ?
+ (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) :
+ (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR),
enable_data->file->event_call->class->system,
trace_event_name(enable_data->file->event_call));
@@ -1124,9 +1261,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
return 0;
}
-static void
-event_enable_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1171,10 +1307,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
.free = event_enable_trigger_free,
};
-static int
-event_enable_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+int event_enable_trigger_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
{
struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
@@ -1183,6 +1318,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
struct trace_array *tr = file->tr;
const char *system;
const char *event;
+ bool hist = false;
char *trigger;
char *number;
bool enable;
@@ -1207,8 +1343,15 @@ event_enable_trigger_func(struct event_command *cmd_ops,
if (!event_enable_file)
goto out;
- enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#ifdef CONFIG_HIST_TRIGGERS
+ hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
+ (strcmp(cmd, DISABLE_HIST_STR) == 0));
+ enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+ (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#endif
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
ret = -ENOMEM;
@@ -1228,6 +1371,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
INIT_LIST_HEAD(&trigger_data->list);
RCU_INIT_POINTER(trigger_data->filter, NULL);
+ enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
trigger_data->private_data = enable_data;
@@ -1305,10 +1449,10 @@ event_enable_trigger_func(struct event_command *cmd_ops,
goto out;
}
-static int event_enable_register_trigger(char *glob,
- struct event_trigger_ops *ops,
- struct event_trigger_data *data,
- struct trace_event_file *file)
+int event_enable_register_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
{
struct enable_trigger_data *enable_data = data->private_data;
struct enable_trigger_data *test_enable_data;
@@ -1318,6 +1462,8 @@ static int event_enable_register_trigger(char *glob,
list_for_each_entry_rcu(test, &file->triggers, list) {
test_enable_data = test->private_data;
if (test_enable_data &&
+ (test->cmd_ops->trigger_type ==
+ data->cmd_ops->trigger_type) &&
(test_enable_data->file == enable_data->file)) {
ret = -EEXIST;
goto out;
@@ -1343,10 +1489,10 @@ out:
return ret;
}
-static void event_enable_unregister_trigger(char *glob,
- struct event_trigger_ops *ops,
- struct event_trigger_data *test,
- struct trace_event_file *file)
+void event_enable_unregister_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
{
struct enable_trigger_data *test_enable_data = test->private_data;
struct enable_trigger_data *enable_data;
@@ -1356,6 +1502,8 @@ static void event_enable_unregister_trigger(char *glob,
list_for_each_entry_rcu(data, &file->triggers, list) {
enable_data = data->private_data;
if (enable_data &&
+ (data->cmd_ops->trigger_type ==
+ test->cmd_ops->trigger_type) &&
(enable_data->file == test_enable_data->file)) {
unregistered = true;
list_del_rcu(&data->list);
@@ -1375,8 +1523,12 @@ event_enable_get_trigger_ops(char *cmd, char *param)
struct event_trigger_ops *ops;
bool enable;
+#ifdef CONFIG_HIST_TRIGGERS
+ enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+ (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
-
+#endif
if (enable)
ops = param ? &event_enable_count_trigger_ops :
&event_enable_trigger_ops;
@@ -1447,6 +1599,8 @@ __init int register_trigger_cmds(void)
register_trigger_snapshot_cmd();
register_trigger_stacktrace_cmd();
register_trigger_enable_disable_cmds();
+ register_trigger_hist_enable_disable_cmds();
+ register_trigger_hist_cmd();
return 0;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
/* Currently only the non-stack version is supported */
ops->func = function_trace_call;
- ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
tr->ops = ops;
ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..4e480e870474 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
/* Add a function return address to the trace stack on thread info.*/
int
ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
- unsigned long frame_pointer)
+ unsigned long frame_pointer, unsigned long *retp)
{
unsigned long long calltime;
int index;
@@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = calltime;
- current->ret_stack[index].subtime = 0;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+ current->ret_stack[index].retp = retp;
+#endif
*depth = current->curr_ret_stack;
return 0;
@@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
return;
}
-#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
/*
* The arch may choose to record the frame pointer used
* and check it here to make sure that it is what we expect it
@@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ * to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'. If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack. It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+ unsigned long ret, unsigned long *retp)
+{
+ int index = task->curr_ret_stack;
+ int i;
+
+ if (ret != (unsigned long)return_to_handler)
+ return ret;
+
+ if (index < -1)
+ index += FTRACE_NOTRACE_DEPTH;
+
+ if (index < 0)
+ return ret;
+
+ for (i = 0; i <= index; i++)
+ if (task->ret_stack[i].retp == retp)
+ return task->ret_stack[i].ret;
+
+ return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+ unsigned long ret, unsigned long *retp)
+{
+ int task_idx;
+
+ if (ret != (unsigned long)return_to_handler)
+ return ret;
+
+ task_idx = task->curr_ret_stack;
+
+ if (!task->ret_stack || task_idx < *idx)
+ return ret;
+
+ task_idx -= *idx;
+ (*idx)++;
+
+ return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+
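The fallback (!HAVE_FUNCTION_GRAPH_RET_ADDR_PTR) version above can only assume that return_to_handler addresses are encountered in the same order they were pushed, so it hands back ret_stack entries top-down and advances *idx once per hit. A minimal userspace mock of that state-variable walk; the structure and names here are stand-ins, not kernel API:

#include <stdio.h>

#define RETURN_TO_HANDLER 0xdeadbeefUL

struct fake_ret_stack { unsigned long ret; };

static unsigned long mock_graph_ret_addr(struct fake_ret_stack *stack,
					 int curr, int *idx, unsigned long ret)
{
	if (ret != RETURN_TO_HANDLER)
		return ret;			/* address was never patched */

	if (curr - *idx < 0)
		return ret;			/* ran out of saved entries */

	return stack[curr - (*idx)++].ret;	/* top-down, one entry per call */
}

int main(void)
{
	struct fake_ret_stack stack[] = { { 0x1000 }, { 0x2000 }, { 0x3000 } };
	int curr = 2, idx = 0;

	/* two consecutive patched frames resolve to 0x3000, then 0x2000 */
	printf("%lx\n", mock_graph_ret_addr(stack, curr, &idx, RETURN_TO_HANDLER));
	printf("%lx\n", mock_graph_ret_addr(stack, curr, &idx, RETURN_TO_HANDLER));
	return 0;
}
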
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned long flags,
@@ -319,7 +381,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int cpu;
int pc;
- if (!ftrace_trace_task(current))
+ if (!ftrace_trace_task(tr))
return 0;
/* trace it when it is-nested-in or is a function enabled. */
@@ -338,6 +400,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
+ /*
+ * Stop here if tracing_threshold is set. We only write function return
+ * events to the ring buffer.
+ */
+ if (tracing_thresh)
+ return 1;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +424,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
-{
- if (tracing_thresh)
- return 1;
- else
- return trace_graph_entry(trace);
-}
-
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +518,7 @@ static int graph_trace_init(struct trace_array *tr)
set_graph_array(tr);
if (tracing_thresh)
ret = register_ftrace_graph(&trace_graph_thresh_return,
- &trace_graph_thresh_entry);
+ &trace_graph_entry);
else
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
@@ -1121,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
trace_seq_puts(s, "/* ");
switch (iter->ent->type) {
+ case TRACE_BPUTS:
+ ret = trace_print_bputs_msg_only(iter);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ break;
case TRACE_BPRINT:
ret = trace_print_bprintk_msg_only(iter);
if (ret != TRACE_TYPE_HANDLED)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644
index 000000000000..b97286c48735
--- /dev/null
+++ b/kernel/trace/trace_hwlat.c
@@ -0,0 +1,633 @@
+/*
+ * trace_hwlat.c - A simple Hardware Latency detector.
+ *
+ * Use this tracer to detect large system latencies induced by the behavior of
+ * certain underlying system hardware or firmware, independent of Linux itself.
+ * The code was developed originally to detect the presence of SMIs on Intel
+ * and AMD systems, although there is no dependency upon x86 herein.
+ *
+ * The classical example usage of this tracer is in detecting the presence of
+ * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
+ * somewhat special form of hardware interrupt spawned from earlier CPU debug
+ * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
+ * LPC (or other device) to generate a special interrupt under certain
+ * circumstances, for example, upon expiration of a special SMI timer device,
+ * due to certain external thermal readings, on certain I/O address accesses,
+ * and other situations. An SMI hits a special CPU pin, triggers a special
+ * SMI mode (complete with special memory map), and the OS is unaware.
+ *
+ * Although certain hardware-inducing latencies are necessary (for example,
+ * a modern system often requires an SMI handler for correct thermal control
+ * and remote management) they can wreak havoc upon any OS-level performance
+ * guarantees toward low-latency, especially when the OS is not even made
+ * aware of the presence of these interrupts. For this reason, we need a
+ * somewhat brute force mechanism to detect these interrupts. In this case,
+ * we do it by hogging all of the CPU(s) for configurable timer intervals,
+ * sampling the built-in CPU timer, looking for discontiguous readings.
+ *
+ * WARNING: This implementation necessarily introduces latencies. Therefore,
+ * you should NEVER use this tracer while running in a production
+ * environment requiring any kind of low-latency performance
+ * guarantee(s).
+ *
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ *
+ * Includes useful feedback from Clark Williams <clark@redhat.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
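/*
 * Editor's illustrative note (not part of this patch): once this tracer is
 * available, typical use via tracefs (assuming it is mounted at
 * /sys/kernel/tracing) would look roughly like:
 *
 *	echo hwlat > current_tracer
 *	echo 1000000 > hwlat_detector/window	# total window, in usecs
 *	echo 500000 > hwlat_detector/width	# active sampling time, in usecs
 *	cat trace
 *
 * The reporting threshold comes from the global tracing_thresh value, which
 * this tracer defaults to 10 usecs when it is otherwise unset.
 */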
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include "trace.h"
+
+static struct trace_array *hwlat_trace;
+
+#define U64STR_SIZE 22 /* 20 digits max */
+
+#define BANNER "hwlat_detector: "
+#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */
+#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
+#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
+
+/* sampling thread */
+static struct task_struct *hwlat_kthread;
+
+static struct dentry *hwlat_sample_width; /* sample width us */
+static struct dentry *hwlat_sample_window; /* sample window us */
+
+/* Save the previous tracing_thresh value */
+static unsigned long save_tracing_thresh;
+
+/* NMI timestamp counters */
+static u64 nmi_ts_start;
+static u64 nmi_total_ts;
+static int nmi_count;
+static int nmi_cpu;
+
+/* Tells NMIs to call back to the hwlat tracer to record timestamps */
+bool trace_hwlat_callback_enabled;
+
+/* If the user changed threshold, remember it */
+static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
+
+/* Individual latency samples are stored here when detected. */
+struct hwlat_sample {
+ u64 seqnum; /* unique sequence */
+ u64 duration; /* delta */
+ u64 outer_duration; /* delta (outer loop) */
+ u64 nmi_total_ts; /* Total time spent in NMIs */
+ struct timespec timestamp; /* wall time */
+ int nmi_count; /* # NMIs during this sample */
+};
+
+/* keep the global state somewhere. */
+static struct hwlat_data {
+
+ struct mutex lock; /* protect changes */
+
+ u64 count; /* total since reset */
+
+ u64 sample_window; /* total sampling window (on+off) */
+ u64 sample_width; /* active sampling portion of window */
+
+} hwlat_data = {
+ .sample_window = DEFAULT_SAMPLE_WINDOW,
+ .sample_width = DEFAULT_SAMPLE_WIDTH,
+};
+
+static void trace_hwlat_sample(struct hwlat_sample *sample)
+{
+ struct trace_array *tr = hwlat_trace;
+ struct trace_event_call *call = &event_hwlat;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct hwlat_entry *entry;
+ unsigned long flags;
+ int pc;
+
+ pc = preempt_count();
+ local_save_flags(flags);
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
+ flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->seqnum = sample->seqnum;
+ entry->duration = sample->duration;
+ entry->outer_duration = sample->outer_duration;
+ entry->timestamp = sample->timestamp;
+ entry->nmi_total_ts = sample->nmi_total_ts;
+ entry->nmi_count = sample->nmi_count;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+}
+
+/* Macros to encapsulate the time capturing infrastructure */
+#define time_type u64
+#define time_get() trace_clock_local()
+#define time_to_us(x) div_u64(x, 1000)
+#define time_sub(a, b) ((a) - (b))
+#define init_time(a, b) (a = b)
+#define time_u64(a) a
+
+void trace_hwlat_callback(bool enter)
+{
+ if (smp_processor_id() != nmi_cpu)
+ return;
+
+ /*
+ * Currently trace_clock_local() calls sched_clock() and the
+ * generic version is not NMI safe.
+ */
+ if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+ if (enter)
+ nmi_ts_start = time_get();
+ else
+ nmi_total_ts = time_get() - nmi_ts_start;
+ }
+
+ if (enter)
+ nmi_count++;
+}
+
+/**
+ * get_sample - sample the CPU TSC and look for likely hardware latencies
+ *
+ * Used to repeatedly capture the CPU TSC (or similar), looking for potential
+ * hardware-induced latency. Called with interrupts disabled and with
+ * hwlat_data.lock held.
+ */
+static int get_sample(void)
+{
+ struct trace_array *tr = hwlat_trace;
+ time_type start, t1, t2, last_t2;
+ s64 diff, total, last_total = 0;
+ u64 sample = 0;
+ u64 thresh = tracing_thresh;
+ u64 outer_sample = 0;
+ int ret = -1;
+
+ do_div(thresh, NSEC_PER_USEC); /* modifies thresh value */
+
+ nmi_cpu = smp_processor_id();
+ nmi_total_ts = 0;
+ nmi_count = 0;
+ /* Make sure NMIs see this first */
+ barrier();
+
+ trace_hwlat_callback_enabled = true;
+
+ init_time(last_t2, 0);
+ start = time_get(); /* start timestamp */
+
+ do {
+
+ t1 = time_get(); /* we'll look for a discontinuity */
+ t2 = time_get();
+
+ if (time_u64(last_t2)) {
+ /* Check the delta from outer loop (t2 to next t1) */
+ diff = time_to_us(time_sub(t1, last_t2));
+ /* This shouldn't happen */
+ if (diff < 0) {
+ pr_err(BANNER "time running backwards\n");
+ goto out;
+ }
+ if (diff > outer_sample)
+ outer_sample = diff;
+ }
+ last_t2 = t2;
+
+ total = time_to_us(time_sub(t2, start)); /* sample width */
+
+ /* Check for possible overflows */
+ if (total < last_total) {
+ pr_err("Time total overflowed\n");
+ break;
+ }
+ last_total = total;
+
+ /* This checks the inner loop (t1 to t2) */
+ diff = time_to_us(time_sub(t2, t1)); /* current diff */
+
+ /* This shouldn't happen */
+ if (diff < 0) {
+ pr_err(BANNER "time running backwards\n");
+ goto out;
+ }
+
+ if (diff > sample)
+ sample = diff; /* only want highest value */
+
+ } while (total <= hwlat_data.sample_width);
+
+ barrier(); /* finish the above in the view for NMIs */
+ trace_hwlat_callback_enabled = false;
+ barrier(); /* Make sure nmi_total_ts is no longer updated */
+
+ ret = 0;
+
+ /* If we exceed the threshold value, we have found a hardware latency */
+ if (sample > thresh || outer_sample > thresh) {
+ struct hwlat_sample s;
+
+ ret = 1;
+
+ /* We read in microseconds */
+ if (nmi_total_ts)
+ do_div(nmi_total_ts, NSEC_PER_USEC);
+
+ hwlat_data.count++;
+ s.seqnum = hwlat_data.count;
+ s.duration = sample;
+ s.outer_duration = outer_sample;
+ s.timestamp = CURRENT_TIME;
+ s.nmi_total_ts = nmi_total_ts;
+ s.nmi_count = nmi_count;
+ trace_hwlat_sample(&s);
+
+ /* Keep a running maximum ever recorded hardware latency */
+ if (sample > tr->max_latency)
+ tr->max_latency = sample;
+ }
+
+out:
+ return ret;
+}
+
+static struct cpumask save_cpumask;
+static bool disable_migrate;
+
+static void move_to_next_cpu(void)
+{
+ static struct cpumask *current_mask;
+ int next_cpu;
+
+ if (disable_migrate)
+ return;
+
+ /* Just pick the first CPU on first iteration */
+ if (!current_mask) {
+ current_mask = &save_cpumask;
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ put_online_cpus();
+ next_cpu = cpumask_first(current_mask);
+ goto set_affinity;
+ }
+
+ /*
+ * If for some reason the user modifies the CPU affinity
+ * of this thread, then stop migrating for the duration
+ * of the current test.
+ */
+ if (!cpumask_equal(current_mask, &current->cpus_allowed))
+ goto disable;
+
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ next_cpu = cpumask_next(smp_processor_id(), current_mask);
+ put_online_cpus();
+
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(current_mask);
+
+ set_affinity:
+ if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
+ goto disable;
+
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+
+ sched_setaffinity(0, current_mask);
+ return;
+
+ disable:
+ disable_migrate = true;
+}
+
+/*
+ * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
+ *
+ * Used to periodically sample the CPU TSC via a call to get_sample. We
+ * disable interrupts, which does (intentionally) introduce latency since we
+ * need to ensure nothing else might be running (and thus preempting).
+ * Obviously this should never be used in production environments.
+ *
+ * Currently this runs on whichever CPU it was scheduled on, but most
+ * real-world hardware latency situations occur across several CPUs,
+ * and we might later generalize this if we find there are any actual
+ * systems with alternate SMI delivery or other hardware latencies.
+ */
+static int kthread_fn(void *data)
+{
+ u64 interval;
+
+ while (!kthread_should_stop()) {
+
+ move_to_next_cpu();
+
+ local_irq_disable();
+ get_sample();
+ local_irq_enable();
+
+ mutex_lock(&hwlat_data.lock);
+ interval = hwlat_data.sample_window - hwlat_data.sample_width;
+ mutex_unlock(&hwlat_data.lock);
+
+ do_div(interval, USEC_PER_MSEC); /* modifies interval value */
+
+ /* Always sleep for at least 1ms */
+ if (interval < 1)
+ interval = 1;
+
+ if (msleep_interruptible(interval))
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * start_kthread - Kick off the hardware latency sampling/detector kthread
+ *
+ * This starts the kernel thread that will sit and sample the CPU timestamp
+ * counter (TSC or similar) and look for potential hardware latencies.
+ */
+static int start_kthread(struct trace_array *tr)
+{
+ struct task_struct *kthread;
+
+ kthread = kthread_create(kthread_fn, NULL, "hwlatd");
+ if (IS_ERR(kthread)) {
+ pr_err(BANNER "could not start sampling thread\n");
+ return -ENOMEM;
+ }
+ hwlat_kthread = kthread;
+ wake_up_process(kthread);
+
+ return 0;
+}
+
+/**
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_kthread(void)
+{
+ if (!hwlat_kthread)
+ return;
+ kthread_stop(hwlat_kthread);
+ hwlat_kthread = NULL;
+}
+
+/*
+ * hwlat_read - Wrapper read function for reading both window and width
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function provides a generic read implementation for the global state
+ * "hwlat_data" structure filesystem entries.
+ */
+static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[U64STR_SIZE];
+ u64 *entry = filp->private_data;
+ u64 val;
+ int len;
+
+ if (!entry)
+ return -EFAULT;
+
+ if (cnt > sizeof(buf))
+ cnt = sizeof(buf);
+
+ val = *entry;
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+/**
+ * hwlat_width_write - Write function for "width" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @filp
+ *
+ * This function provides a write implementation for the "width" interface
+ * to the hardware latency detector. It can be used to configure how
+ * many us of the total window we will actively sample for any
+ * hardware-induced latency periods. Obviously, it is not possible to
+ * sample constantly and still have the system respond to a sample
+ * reader, or, worse, not have the system appear to have gone out to
+ * lunch. It is enforced that the width is less than the total window
+ * size.
+ */
+static ssize_t
+hwlat_width_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ u64 val;
+ int err;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ mutex_lock(&hwlat_data.lock);
+ if (val < hwlat_data.sample_window)
+ hwlat_data.sample_width = val;
+ else
+ err = -EINVAL;
+ mutex_unlock(&hwlat_data.lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+/**
+ * hwlat_window_write - Write function for "window" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @filp
+ *
+ * This function provides a write implementation for the "window" interface
+ * to the hardware latency detector. The window is the total time
+ * in us that will be considered one sample period. Conceptually, windows
+ * occur back-to-back and contain a sample width period during which
+ * actual sampling occurs. It can be used to write a new total window
+ * size. It is enforced that any value written must be greater than the
+ * sample width size, or an error results.
+ */
+static ssize_t
+hwlat_window_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ u64 val;
+ int err;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ mutex_lock(&hwlat_data.lock);
+ if (hwlat_data.sample_width < val)
+ hwlat_data.sample_window = val;
+ else
+ err = -EINVAL;
+ mutex_unlock(&hwlat_data.lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+static const struct file_operations width_fops = {
+ .open = tracing_open_generic,
+ .read = hwlat_read,
+ .write = hwlat_width_write,
+};
+
+static const struct file_operations window_fops = {
+ .open = tracing_open_generic,
+ .read = hwlat_read,
+ .write = hwlat_window_write,
+};
+
+/**
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "hwlat_detector".
+ * It creates the hwlat_detector directory in the tracing directory,
+ * and within that directory are the width and window files used to
+ * change and view those values.
+ */
+static int init_tracefs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *top_dir;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return -ENOMEM;
+
+ top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
+ if (!top_dir)
+ return -ENOMEM;
+
+ hwlat_sample_window = tracefs_create_file("window", 0640,
+ top_dir,
+ &hwlat_data.sample_window,
+ &window_fops);
+ if (!hwlat_sample_window)
+ goto err;
+
+ hwlat_sample_width = tracefs_create_file("width", 0644,
+ top_dir,
+ &hwlat_data.sample_width,
+ &width_fops);
+ if (!hwlat_sample_width)
+ goto err;
+
+ return 0;
+
+ err:
+ tracefs_remove_recursive(top_dir);
+ return -ENOMEM;
+}
+
+static void hwlat_tracer_start(struct trace_array *tr)
+{
+ int err;
+
+ err = start_kthread(tr);
+ if (err)
+ pr_err(BANNER "Cannot start hwlat kthread\n");
+}
+
+static void hwlat_tracer_stop(struct trace_array *tr)
+{
+ stop_kthread();
+}
+
+static bool hwlat_busy;
+
+static int hwlat_tracer_init(struct trace_array *tr)
+{
+ /* Only allow one instance to enable this */
+ if (hwlat_busy)
+ return -EBUSY;
+
+ hwlat_trace = tr;
+
+ disable_migrate = false;
+ hwlat_data.count = 0;
+ tr->max_latency = 0;
+ save_tracing_thresh = tracing_thresh;
+
+ /* tracing_thresh is in nsecs, we speak in usecs */
+ if (!tracing_thresh)
+ tracing_thresh = last_tracing_thresh;
+
+ if (tracer_tracing_is_on(tr))
+ hwlat_tracer_start(tr);
+
+ hwlat_busy = true;
+
+ return 0;
+}
+
+static void hwlat_tracer_reset(struct trace_array *tr)
+{
+ stop_kthread();
+
+ /* the tracing threshold is static between runs */
+ last_tracing_thresh = tracing_thresh;
+
+ tracing_thresh = save_tracing_thresh;
+ hwlat_busy = false;
+}
+
+static struct tracer hwlat_tracer __read_mostly =
+{
+ .name = "hwlat",
+ .init = hwlat_tracer_init,
+ .reset = hwlat_tracer_reset,
+ .start = hwlat_tracer_start,
+ .stop = hwlat_tracer_stop,
+ .allow_instances = true,
+};
+
+__init static int init_hwlat_tracer(void)
+{
+ int ret;
+
+ mutex_init(&hwlat_data.lock);
+
+ ret = register_tracer(&hwlat_tracer);
+ if (ret)
+ return ret;
+
+ init_tracefs();
+
+ return 0;
+}
+late_initcall(init_hwlat_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..eb6c9f1d3a93 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = {
ASSIGN_FETCH_TYPE(s16, u16, 1),
ASSIGN_FETCH_TYPE(s32, u32, 1),
ASSIGN_FETCH_TYPE(s64, u64, 1),
+ ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
ASSIGN_FETCH_TYPE_END
};
@@ -587,6 +591,7 @@ static int create_trace_kprobe(int argc, char **argv)
* $retval : fetch return value
* $stack : fetch stack address
* $stackN : fetch Nth of stack (N:0-)
+ * $comm : fetch current task comm
* @ADDR : fetch memory at ADDR (ADDR should be in kernel)
* @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
* %REG : fetch register REG
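/*
 * Editor's illustrative note (not part of this patch): with the $comm
 * source and the new hexadecimal types, a kprobe event definition could
 * look like (probe name and register choice are hypothetical, x86_64
 * calling convention assumed):
 *
 *	p:myprobe do_sys_open comm=$comm flags=%dx:x32
 *
 * $comm defaults to the "string" type and cannot be dereferenced.
 */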
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
dev->bus->number, dev->devfn,
dev->vendor, dev->device, dev->irq);
- /*
- * XXX: is pci_resource_to_user() appropriate, since we are
- * supposed to interpret the __ioremap() phys_addr argument based on
- * these printed values?
- */
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
+ end = dev->resource[i].end;
trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0bb9cf2d53e6..3fc20422c166 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {
.funcs = &trace_user_stack_funcs,
};
+/* TRACE_HWLAT */
+static enum print_line_t
+trace_hwlat_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct hwlat_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld",
+ field->seqnum,
+ field->duration,
+ field->outer_duration,
+ field->timestamp.tv_sec,
+ field->timestamp.tv_nsec);
+
+ if (field->nmi_count) {
+ /*
+ * The generic sched_clock() is not NMI safe, thus
+ * we only record the count and not the time.
+ */
+ if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK))
+ trace_seq_printf(s, " nmi-total:%llu",
+ field->nmi_total_ts);
+ trace_seq_printf(s, " nmi-count:%u",
+ field->nmi_count);
+ }
+
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+
+static enum print_line_t
+trace_hwlat_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct hwlat_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
+ field->duration,
+ field->outer_duration,
+ field->timestamp.tv_sec,
+ field->timestamp.tv_nsec,
+ field->seqnum);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_hwlat_funcs = {
+ .trace = trace_hwlat_print,
+ .raw = trace_hwlat_raw,
+};
+
+static struct trace_event trace_hwlat_event = {
+ .type = TRACE_HWLAT,
+ .funcs = &trace_hwlat_funcs,
+};
+
/* TRACE_BPUTS */
static enum print_line_t
trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = {
&trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
+ &trace_hwlat_event,
NULL
};
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f96f0383f6c6..ad1d6164e946 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
{
struct trace_bprintk_fmt *pos;
+
+ if (!fmt)
+ return ERR_PTR(-EINVAL);
+
list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
if (!strcmp(pos->fmt, fmt))
return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
for (iter = start; iter < end; iter++) {
struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
if (tb_fmt) {
- *iter = tb_fmt->fmt;
+ if (!IS_ERR(tb_fmt))
+ *iter = tb_fmt->fmt;
continue;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..8c0553d9afd3 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -36,24 +36,28 @@ const char *reserved_field_names[] = {
};
/* Printing in basic type function template */
-#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
-int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \
+int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
return !trace_seq_has_overflowed(s); \
} \
-const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
-NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
-
-DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
+const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname));
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
/* Print type function for string type */
int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
@@ -218,6 +222,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
kfree(data);
}
+void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ long ret;
+
+ if (!maxlen)
+ return;
+
+ ret = strlcpy(dst, current->comm, maxlen);
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
+
+void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ *(u32 *)dest = strlen(current->comm) + 1;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
+
static const struct fetch_type *find_fetch_type(const char *type,
const struct fetch_type *ftbl)
{
@@ -348,6 +374,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
ret = -EINVAL;
+ } else if (strcmp(arg, "comm") == 0) {
+ if (strcmp(t->name, "string") != 0 &&
+ strcmp(t->name, "string_size") != 0)
+ return -EINVAL;
+ f->fn = t->fetch[FETCH_MTD_comm];
} else
ret = -EINVAL;
@@ -522,6 +553,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
arg[t - parg->comm] = '\0';
t++;
}
+ /*
+ * The default type of $comm should be "string", and it can't be
+ * dereferenced.
+ */
+ if (!t && strcmp(arg, "$comm") == 0)
+ t = "string";
parg->type = find_fetch_type(t, ftbl);
if (!parg->type) {
pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..0c0ae54d44c6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
FETCH_MTD_reg = 0,
FETCH_MTD_stack,
FETCH_MTD_retval,
+ FETCH_MTD_comm,
FETCH_MTD_memory,
FETCH_MTD_symbol,
FETCH_MTD_deref,
@@ -148,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
@@ -183,6 +189,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
+/* comm only makes sense as a string */
+#define fetch_comm_u8 NULL
+#define fetch_comm_u16 NULL
+#define fetch_comm_u32 NULL
+#define fetch_comm_u64 NULL
+DECLARE_FETCH_FUNC(comm, string);
+DECLARE_FETCH_FUNC(comm, string_size);
+
/*
* Define macro for basic types - we don't need to define s* types, because
* we have to care only about bitwidth at recording time.
@@ -194,7 +208,7 @@ DEFINE_FETCH_##method(u32) \
DEFINE_FETCH_##method(u64)
/* Default (unsigned long) fetch type */
-#define __DEFAULT_FETCH_TYPE(t) u##t
+#define __DEFAULT_FETCH_TYPE(t) x##t
#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
@@ -213,6 +227,7 @@ DEFINE_FETCH_##method(u64)
ASSIGN_FETCH_FUNC(reg, ftype), \
ASSIGN_FETCH_FUNC(stack, ftype), \
ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(comm, ftype), \
ASSIGN_FETCH_FUNC(memory, ftype), \
ASSIGN_FETCH_FUNC(symbol, ftype), \
ASSIGN_FETCH_FUNC(deref, ftype), \
@@ -224,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+/* If ptype is an alias of atype, use this macro (show atype in format) */
+#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype)
+
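/*
 * Editor's illustrative note (not part of this patch): for example,
 * ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0) expands to
 * __ASSIGN_FETCH_TYPE("x32", x32, u32, sizeof(u32), 0, "u32"), so the
 * value is fetched and printed as the hexadecimal x32 type while the
 * event format file still advertises it as "u32".
 */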
#define ASSIGN_FETCH_TYPE_END {}
#define FETCH_TYPE_STRING 0
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c53485441c88..0913693caf6e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = {
ASSIGN_FETCH_TYPE(s16, u16, 1),
ASSIGN_FETCH_TYPE(s32, u32, 1),
ASSIGN_FETCH_TYPE(s64, u64, 1),
+ ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
ASSIGN_FETCH_TYPE_END
};
@@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- pr_info("probe point must be have a filename.\n");
- return -EINVAL;
- }
arg = strchr(argv[1], ':');
if (!arg) {
ret = -EINVAL;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
new file mode 100644
index 000000000000..0a689bbb78ef
--- /dev/null
+++ b/kernel/trace/tracing_map.c
@@ -0,0 +1,1062 @@
+/*
+ * tracing_map - lock-free map for tracing
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ *
+ * tracing_map implementation inspired by lock-free map algorithms
+ * originated by Dr. Cliff Click:
+ *
+ * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
+ * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+/*
+ * NOTE: For a detailed description of the data structures used by
+ * these functions (such as tracing_map_elt) please see the overview
+ * of tracing_map data structures at the beginning of tracing_map.h.
+ */
+
+/**
+ * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ * @n: The value to add to the sum
+ *
+ * Add n to sum i associated with the specified tracing_map_elt
+ * instance. The index i is the index returned by the call to
+ * tracing_map_add_sum_field() when the tracing map was set up.
+ */
+void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+ atomic64_add(n, &elt->fields[i].sum);
+}
+
+/**
+ * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ *
+ * Retrieve the value of the sum i associated with the specified
+ * tracing_map_elt instance. The index i is the index returned by the
+ * call to tracing_map_add_sum_field() when the tracing map was set
+ * up.
+ *
+ * Return: The sum associated with field i for elt.
+ */
+u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
+{
+ return (u64)atomic64_read(&elt->fields[i].sum);
+}
+
+int tracing_map_cmp_string(void *val_a, void *val_b)
+{
+ char *a = val_a;
+ char *b = val_b;
+
+ return strcmp(a, b);
+}
+
+int tracing_map_cmp_none(void *val_a, void *val_b)
+{
+ return 0;
+}
+
+static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
+{
+ u64 a = atomic64_read((atomic64_t *)val_a);
+ u64 b = atomic64_read((atomic64_t *)val_b);
+
+ return (a > b) ? 1 : ((a < b) ? -1 : 0);
+}
+
+#define DEFINE_TRACING_MAP_CMP_FN(type) \
+static int tracing_map_cmp_##type(void *val_a, void *val_b) \
+{ \
+ type a = *(type *)val_a; \
+ type b = *(type *)val_b; \
+ \
+ return (a > b) ? 1 : ((a < b) ? -1 : 0); \
+}
+
+DEFINE_TRACING_MAP_CMP_FN(s64);
+DEFINE_TRACING_MAP_CMP_FN(u64);
+DEFINE_TRACING_MAP_CMP_FN(s32);
+DEFINE_TRACING_MAP_CMP_FN(u32);
+DEFINE_TRACING_MAP_CMP_FN(s16);
+DEFINE_TRACING_MAP_CMP_FN(u16);
+DEFINE_TRACING_MAP_CMP_FN(s8);
+DEFINE_TRACING_MAP_CMP_FN(u8);
+
+tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+ int field_is_signed)
+{
+ tracing_map_cmp_fn_t fn = tracing_map_cmp_none;
+
+ switch (field_size) {
+ case 8:
+ if (field_is_signed)
+ fn = tracing_map_cmp_s64;
+ else
+ fn = tracing_map_cmp_u64;
+ break;
+ case 4:
+ if (field_is_signed)
+ fn = tracing_map_cmp_s32;
+ else
+ fn = tracing_map_cmp_u32;
+ break;
+ case 2:
+ if (field_is_signed)
+ fn = tracing_map_cmp_s16;
+ else
+ fn = tracing_map_cmp_u16;
+ break;
+ case 1:
+ if (field_is_signed)
+ fn = tracing_map_cmp_s8;
+ else
+ fn = tracing_map_cmp_u8;
+ break;
+ }
+
+ return fn;
+}
+
+static int tracing_map_add_field(struct tracing_map *map,
+ tracing_map_cmp_fn_t cmp_fn)
+{
+ int ret = -EINVAL;
+
+ if (map->n_fields < TRACING_MAP_FIELDS_MAX) {
+ ret = map->n_fields;
+ map->fields[map->n_fields++].cmp_fn = cmp_fn;
+ }
+
+ return ret;
+}
+
+/**
+ * tracing_map_add_sum_field - Add a field describing a tracing_map sum
+ * @map: The tracing_map
+ *
+ * Add a sum field to the key and return the index identifying it in
+ * the map and associated tracing_map_elts. This is the index used
+ * for instance to update a sum for a particular tracing_map_elt using
+ * tracing_map_update_sum() or reading it via tracing_map_read_sum().
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts, or -EINVAL on error.
+ */
+int tracing_map_add_sum_field(struct tracing_map *map)
+{
+ return tracing_map_add_field(map, tracing_map_cmp_atomic64);
+}
+
+/**
+ * tracing_map_add_key_field - Add a field describing a tracing_map key
+ * @map: The tracing_map
+ * @offset: The offset within the key
+ * @cmp_fn: The comparison function that will be used to sort on the key
+ *
+ * Let the map know there is a key and that if it's used as a sort key
+ * to use cmp_fn.
+ *
+ * A key can be a subset of a compound key; for that purpose, the
+ * offset param is used to describe where within the compound key
+ * the key referenced by this key field resides.
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts, or -EINVAL on error.
+ */
+int tracing_map_add_key_field(struct tracing_map *map,
+ unsigned int offset,
+ tracing_map_cmp_fn_t cmp_fn)
+
+{
+ int idx = tracing_map_add_field(map, cmp_fn);
+
+ if (idx < 0)
+ return idx;
+
+ map->fields[idx].offset = offset;
+
+ map->key_idx[map->n_keys++] = idx;
+
+ return idx;
+}
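/*
 * Editor's illustrative sketch (not part of this patch): describing a
 * compound key made of a numeric pid and a comm string.  The struct and
 * function names are hypothetical.
 */
struct example_key {
	int	pid;
	char	comm[16];
};

static int example_add_key_fields(struct tracing_map *map)
{
	int idx;

	idx = tracing_map_add_key_field(map, offsetof(struct example_key, pid),
					tracing_map_cmp_num(sizeof(int), 1));
	if (idx < 0)
		return idx;

	return tracing_map_add_key_field(map,
					 offsetof(struct example_key, comm),
					 tracing_map_cmp_string);
}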
+
+void tracing_map_array_clear(struct tracing_map_array *a)
+{
+ unsigned int i;
+
+ if (!a->pages)
+ return;
+
+ for (i = 0; i < a->n_pages; i++)
+ memset(a->pages[i], 0, PAGE_SIZE);
+}
+
+void tracing_map_array_free(struct tracing_map_array *a)
+{
+ unsigned int i;
+
+ if (!a)
+ return;
+
+ if (!a->pages) {
+ kfree(a);
+ return;
+ }
+
+ for (i = 0; i < a->n_pages; i++) {
+ if (!a->pages[i])
+ break;
+ free_page((unsigned long)a->pages[i]);
+ }
+}
+
+struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
+ unsigned int entry_size)
+{
+ struct tracing_map_array *a;
+ unsigned int i;
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a)
+ return NULL;
+
+ a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
+ a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift);
+ a->n_pages = n_elts / a->entries_per_page;
+ if (!a->n_pages)
+ a->n_pages = 1;
+ a->entry_shift = fls(a->entries_per_page) - 1;
+ a->entry_mask = (1 << a->entry_shift) - 1;
+
+ a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL);
+ if (!a->pages)
+ goto free;
+
+ for (i = 0; i < a->n_pages; i++) {
+ a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!a->pages[i])
+ goto free;
+ }
+ out:
+ return a;
+ free:
+ tracing_map_array_free(a);
+ a = NULL;
+
+ goto out;
+}
+
+static void tracing_map_elt_clear(struct tracing_map_elt *elt)
+{
+ unsigned i;
+
+ for (i = 0; i < elt->map->n_fields; i++)
+ if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
+ atomic64_set(&elt->fields[i].sum, 0);
+
+ if (elt->map->ops && elt->map->ops->elt_clear)
+ elt->map->ops->elt_clear(elt);
+}
+
+static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
+{
+ unsigned int i;
+
+ tracing_map_elt_clear(elt);
+
+ for (i = 0; i < elt->map->n_fields; i++) {
+ elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn;
+
+ if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64)
+ elt->fields[i].offset = elt->map->fields[i].offset;
+ }
+}
+
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+ if (!elt)
+ return;
+
+ if (elt->map->ops && elt->map->ops->elt_free)
+ elt->map->ops->elt_free(elt);
+ kfree(elt->fields);
+ kfree(elt->key);
+ kfree(elt);
+}
+
+static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
+{
+ struct tracing_map_elt *elt;
+ int err = 0;
+
+ elt = kzalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return ERR_PTR(-ENOMEM);
+
+ elt->map = map;
+
+ elt->key = kzalloc(map->key_size, GFP_KERNEL);
+ if (!elt->key) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL);
+ if (!elt->fields) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ tracing_map_elt_init_fields(elt);
+
+ if (map->ops && map->ops->elt_alloc) {
+ err = map->ops->elt_alloc(elt);
+ if (err)
+ goto free;
+ }
+ return elt;
+ free:
+ tracing_map_elt_free(elt);
+
+ return ERR_PTR(err);
+}
+
+static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
+{
+ struct tracing_map_elt *elt = NULL;
+ int idx;
+
+ idx = atomic_inc_return(&map->next_elt);
+ if (idx < map->max_elts) {
+ elt = *(TRACING_MAP_ELT(map->elts, idx));
+ if (map->ops && map->ops->elt_init)
+ map->ops->elt_init(elt);
+ }
+
+ return elt;
+}
+
+static void tracing_map_free_elts(struct tracing_map *map)
+{
+ unsigned int i;
+
+ if (!map->elts)
+ return;
+
+ for (i = 0; i < map->max_elts; i++) {
+ tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i)));
+ *(TRACING_MAP_ELT(map->elts, i)) = NULL;
+ }
+
+ tracing_map_array_free(map->elts);
+ map->elts = NULL;
+}
+
+static int tracing_map_alloc_elts(struct tracing_map *map)
+{
+ unsigned int i;
+
+ map->elts = tracing_map_array_alloc(map->max_elts,
+ sizeof(struct tracing_map_elt *));
+ if (!map->elts)
+ return -ENOMEM;
+
+ for (i = 0; i < map->max_elts; i++) {
+ *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map);
+ if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) {
+ *(TRACING_MAP_ELT(map->elts, i)) = NULL;
+ tracing_map_free_elts(map);
+
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static inline bool keys_match(void *key, void *test_key, unsigned key_size)
+{
+ bool match = true;
+
+ if (memcmp(key, test_key, key_size))
+ match = false;
+
+ return match;
+}
+
+static inline struct tracing_map_elt *
+__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+{
+ u32 idx, key_hash, test_key;
+ struct tracing_map_entry *entry;
+
+ key_hash = jhash(key, map->key_size, 0);
+ if (key_hash == 0)
+ key_hash = 1;
+ idx = key_hash >> (32 - (map->map_bits + 1));
+
+ while (1) {
+ idx &= (map->map_size - 1);
+ entry = TRACING_MAP_ENTRY(map->map, idx);
+ test_key = entry->key;
+
+ if (test_key && test_key == key_hash && entry->val &&
+ keys_match(key, entry->val->key, map->key_size)) {
+ atomic64_inc(&map->hits);
+ return entry->val;
+ }
+
+ if (!test_key) {
+ if (lookup_only)
+ break;
+
+ if (!cmpxchg(&entry->key, 0, key_hash)) {
+ struct tracing_map_elt *elt;
+
+ elt = get_free_elt(map);
+ if (!elt) {
+ atomic64_inc(&map->drops);
+ entry->key = 0;
+ break;
+ }
+
+ memcpy(elt->key, key, map->key_size);
+ entry->val = elt;
+ atomic64_inc(&map->hits);
+
+ return entry->val;
+ }
+ }
+
+ idx++;
+ }
+
+ return NULL;
+}
+
+/**
+ * tracing_map_insert - Insert key and/or retrieve val from a tracing_map
+ * @map: The tracing_map to insert into
+ * @key: The key to insert
+ *
+ * Inserts a key into a tracing_map and creates and returns a new
+ * tracing_map_elt for it, or if the key has already been inserted by
+ * a previous call, returns the tracing_map_elt already associated
+ * with it. When the map was created, the number of elements to be
+ * allocated for the map was specified (internally maintained as
+ * 'max_elts' in struct tracing_map), and that number of
+ * tracing_map_elts was created by tracing_map_init(). This is the
+ * pre-allocated pool of tracing_map_elts that tracing_map_insert()
+ * will allocate from when adding new keys. Once that pool is
+ * exhausted, tracing_map_insert() is useless and will return NULL to
+ * signal that state. There are two user-visible tracing_map
+ * variables, 'hits' and 'drops', which are updated by this function.
+ * Every time an element is either successfully inserted or retrieved,
+ * the 'hits' value is incremented. Every time an element insertion
+ * fails, the 'drops' value is incremented.
+ *
+ * This is a lock-free tracing map insertion function implementing a
+ * modified form of Cliff Click's basic insertion algorithm. It
+ * requires the table size be a power of two. To prevent any
+ * possibility of an infinite loop we always make the internal table
+ * size double the size of the requested table size (max_elts * 2).
+ * Likewise, we never reuse a slot or resize or delete elements - when
+ * we've reached max_elts entries, we simply return NULL once we've
+ * run out of entries. Readers can at any point in time traverse the
+ * tracing map and safely access the key/val pairs.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If this was a newly inserted key, the val will be a newly allocated
+ * and associated tracing_map_elt pointer val. If the key wasn't
+ * found and the pool of tracing_map_elts has been exhausted, NULL is
+ * returned and no further insertions will succeed.
+ */
+struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key)
+{
+ return __tracing_map_insert(map, key, false);
+}
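/*
 * Editor's illustrative sketch (not part of this patch): typical use from a
 * tracing callback, counting hits per key.  'hits_idx' is assumed to be the
 * index previously returned by tracing_map_add_sum_field().
 */
static void example_account_hit(struct tracing_map *map, void *key,
				unsigned int hits_idx)
{
	struct tracing_map_elt *elt;

	elt = tracing_map_insert(map, key);
	if (!elt)
		return;	/* element pool exhausted; counted in 'drops' */

	tracing_map_update_sum(elt, hits_idx, 1);
}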
+
+/**
+ * tracing_map_lookup - Retrieve val from a tracing_map
+ * @map: The tracing_map to perform the lookup on
+ * @key: The key to look up
+ *
+ * Looks up key in tracing_map and if found returns the matching
+ * tracing_map_elt. This is a lock-free lookup; see
+ * tracing_map_insert() for details on tracing_map and how it works.
+ * There is one user-visible tracing_map variable, 'hits', which is
+ * updated by this function: every time an element is successfully
+ * retrieved, the 'hits' value is incremented. The 'drops' value is
+ * never updated by this function.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If the key wasn't found, NULL is returned.
+ */
+struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key)
+{
+ return __tracing_map_insert(map, key, true);
+}
+
+/**
+ * tracing_map_destroy - Destroy a tracing_map
+ * @map: The tracing_map to destroy
+ *
+ * Frees a tracing_map along with its associated array of
+ * tracing_map_elts.
+ *
+ * Callers should make sure there are no readers or writers actively
+ * reading or inserting into the map before calling this.
+ */
+void tracing_map_destroy(struct tracing_map *map)
+{
+ if (!map)
+ return;
+
+ tracing_map_free_elts(map);
+
+ tracing_map_array_free(map->map);
+ kfree(map);
+}
+
+/**
+ * tracing_map_clear - Clear a tracing_map
+ * @map: The tracing_map to clear
+ *
+ * Resets the tracing map to a cleared or initial state. The
+ * tracing_map_elts are all cleared, and the array of struct
+ * tracing_map_entry is reset to an initialized state.
+ *
+ * Callers should make sure there are no writers actively inserting
+ * into the map before calling this.
+ */
+void tracing_map_clear(struct tracing_map *map)
+{
+ unsigned int i;
+
+ atomic_set(&map->next_elt, -1);
+ atomic64_set(&map->hits, 0);
+ atomic64_set(&map->drops, 0);
+
+ tracing_map_array_clear(map->map);
+
+ for (i = 0; i < map->max_elts; i++)
+ tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i)));
+}
+
+static void set_sort_key(struct tracing_map *map,
+ struct tracing_map_sort_key *sort_key)
+{
+ map->sort_key = *sort_key;
+}
+
+/**
+ * tracing_map_create - Create a lock-free map and element pool
+ * @map_bits: The size of the map (2 ** map_bits)
+ * @key_size: The size of the key for the map in bytes
+ * @ops: Optional client-defined tracing_map_ops instance
+ * @private_data: Client data associated with the map
+ *
+ * Creates and sets up a map to contain 2 ** map_bits number of
+ * elements (internally maintained as 'max_elts' in struct
+ * tracing_map). Before using, map fields should be added to the map
+ * with tracing_map_add_sum_field() and tracing_map_add_key_field().
+ * tracing_map_init() should then be called to allocate the array of
+ * tracing_map_elts, in order to avoid allocating anything in the map
+ * insertion path. The user-specified map size reflects the maximum
+ * number of elements that can be contained in the table requested by
+ * the user - internally we double that in order to keep the table
+ * sparse and keep collisions manageable.
+ *
+ * A tracing_map is a special-purpose map designed to aggregate or
+ * 'sum' one or more values associated with a specific object of type
+ * tracing_map_elt, which is attached by the map to a given key.
+ *
+ * tracing_map_create() sets up the map itself, and provides
+ * operations for inserting tracing_map_elts, but doesn't allocate the
+ * tracing_map_elts themselves, or provide a means for describing the
+ * keys or sums associated with the tracing_map_elts. All
+ * tracing_map_elts for a given map have the same set of sums and
+ * keys, which are defined by the client using the functions
+ * tracing_map_add_key_field() and tracing_map_add_sum_field(). Once
+ * the fields are defined, the pool of elements allocated for the map
+ * can be created, which occurs when the client code calls
+ * tracing_map_init().
+ *
+ * When tracing_map_init() returns, tracing_map_elt elements can be
+ * inserted into the map using tracing_map_insert(). When called,
+ * tracing_map_insert() grabs a free tracing_map_elt from the pool, or
+ * finds an existing match in the map and in either case returns it.
+ * The client can then use tracing_map_update_sum() and
+ * tracing_map_read_sum() to update or read a given sum field for the
+ * tracing_map_elt.
+ *
+ * The client can at any point retrieve and traverse the current set
+ * of inserted tracing_map_elts in a tracing_map, via
+ * tracing_map_sort_entries(). Sorting can be done on any field,
+ * including keys.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+struct tracing_map *tracing_map_create(unsigned int map_bits,
+ unsigned int key_size,
+ const struct tracing_map_ops *ops,
+ void *private_data)
+{
+ struct tracing_map *map;
+ unsigned int i;
+
+ if (map_bits < TRACING_MAP_BITS_MIN ||
+ map_bits > TRACING_MAP_BITS_MAX)
+ return ERR_PTR(-EINVAL);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->map_bits = map_bits;
+ map->max_elts = (1 << map_bits);
+ atomic_set(&map->next_elt, -1);
+
+ map->map_size = (1 << (map_bits + 1));
+ map->ops = ops;
+
+ map->private_data = private_data;
+
+ map->map = tracing_map_array_alloc(map->map_size,
+ sizeof(struct tracing_map_entry));
+ if (!map->map)
+ goto free;
+
+ map->key_size = key_size;
+ for (i = 0; i < TRACING_MAP_KEYS_MAX; i++)
+ map->key_idx[i] = -1;
+ out:
+ return map;
+ free:
+ tracing_map_destroy(map);
+ map = ERR_PTR(-ENOMEM);
+
+ goto out;
+}
+
+/**
+ * tracing_map_init - Allocate and clear a map's tracing_map_elts
+ * @map: The tracing_map to initialize
+ *
+ * Allocates and clears a pool of tracing_map_elts equal to the
+ * user-specified size of 2 ** map_bits (internally maintained as
+ * 'max_elts' in struct tracing_map). Before using, the map fields
+ * should be added to the map with tracing_map_add_sum_field() and
+ * tracing_map_add_key_field(). tracing_map_init() should then be
+ * called to allocate the array of tracing_map_elts, in order to avoid
+ * allocating anything in the map insertion path. The user-specified
+ * map size reflects the max number of elements requested by the user
+ * - internally we double that in order to keep the table sparse and
+ * keep collisions manageable.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+int tracing_map_init(struct tracing_map *map)
+{
+ int err;
+
+ if (map->n_fields < 2)
+ return -EINVAL; /* need at least 1 key and 1 val */
+
+ err = tracing_map_alloc_elts(map);
+ if (err)
+ return err;
+
+ tracing_map_clear(map);
+
+ return err;
+}
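/*
 * Editor's illustrative sketch (not part of this patch): the setup sequence
 * described above, tied together for a map keyed by a single u64 with one
 * sum field.  Error handling is abbreviated; names are hypothetical.
 */
static struct tracing_map *example_map_setup(unsigned int *hits_idx)
{
	struct tracing_map *map;
	int idx;

	map = tracing_map_create(TRACING_MAP_BITS_DEFAULT, sizeof(u64),
				 NULL, NULL);
	if (IS_ERR(map))
		return map;

	idx = tracing_map_add_sum_field(map);
	if (idx < 0)
		goto free;
	*hits_idx = idx;

	idx = tracing_map_add_key_field(map, 0,
					tracing_map_cmp_num(sizeof(u64), 0));
	if (idx < 0)
		goto free;

	if (tracing_map_init(map) < 0)
		goto free;

	return map;
 free:
	tracing_map_destroy(map);
	return ERR_PTR(-EINVAL);
}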
+
+static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
+ const struct tracing_map_sort_entry **b)
+{
+ int ret = 0;
+
+ if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
+ ret = 1;
+
+ return ret;
+}
+
+static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
+ const struct tracing_map_sort_entry **b)
+{
+ const struct tracing_map_elt *elt_a, *elt_b;
+ struct tracing_map_sort_key *sort_key;
+ struct tracing_map_field *field;
+ tracing_map_cmp_fn_t cmp_fn;
+ void *val_a, *val_b;
+ int ret = 0;
+
+ elt_a = (*a)->elt;
+ elt_b = (*b)->elt;
+
+ sort_key = &elt_a->map->sort_key;
+
+ field = &elt_a->fields[sort_key->field_idx];
+ cmp_fn = field->cmp_fn;
+
+ val_a = &elt_a->fields[sort_key->field_idx].sum;
+ val_b = &elt_b->fields[sort_key->field_idx].sum;
+
+ ret = cmp_fn(val_a, val_b);
+ if (sort_key->descending)
+ ret = -ret;
+
+ return ret;
+}
+
+static int cmp_entries_key(const struct tracing_map_sort_entry **a,
+ const struct tracing_map_sort_entry **b)
+{
+ const struct tracing_map_elt *elt_a, *elt_b;
+ struct tracing_map_sort_key *sort_key;
+ struct tracing_map_field *field;
+ tracing_map_cmp_fn_t cmp_fn;
+ void *val_a, *val_b;
+ int ret = 0;
+
+ elt_a = (*a)->elt;
+ elt_b = (*b)->elt;
+
+ sort_key = &elt_a->map->sort_key;
+
+ field = &elt_a->fields[sort_key->field_idx];
+
+ cmp_fn = field->cmp_fn;
+
+ val_a = elt_a->key + field->offset;
+ val_b = elt_b->key + field->offset;
+
+ ret = cmp_fn(val_a, val_b);
+ if (sort_key->descending)
+ ret = -ret;
+
+ return ret;
+}
+
+static void destroy_sort_entry(struct tracing_map_sort_entry *entry)
+{
+ if (!entry)
+ return;
+
+ if (entry->elt_copied)
+ tracing_map_elt_free(entry->elt);
+
+ kfree(entry);
+}
+
+/**
+ * tracing_map_destroy_sort_entries - Destroy an array of sort entries
+ * @entries: The entries to destroy
+ * @n_entries: The number of entries in the array
+ *
+ * Destroy the elements returned by a tracing_map_sort_entries() call.
+ */
+void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+ unsigned int n_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < n_entries; i++)
+ destroy_sort_entry(entries[i]);
+
+ vfree(entries);
+}
+
+static struct tracing_map_sort_entry *
+create_sort_entry(void *key, struct tracing_map_elt *elt)
+{
+ struct tracing_map_sort_entry *sort_entry;
+
+ sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
+ if (!sort_entry)
+ return NULL;
+
+ sort_entry->key = key;
+ sort_entry->elt = elt;
+
+ return sort_entry;
+}
+
+static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
+{
+ struct tracing_map_elt *dup_elt;
+ unsigned int i;
+
+ dup_elt = tracing_map_elt_alloc(elt->map);
+ if (IS_ERR(dup_elt))
+ return NULL;
+
+ if (elt->map->ops && elt->map->ops->elt_copy)
+ elt->map->ops->elt_copy(dup_elt, elt);
+
+ dup_elt->private_data = elt->private_data;
+ memcpy(dup_elt->key, elt->key, elt->map->key_size);
+
+ for (i = 0; i < elt->map->n_fields; i++) {
+ atomic64_set(&dup_elt->fields[i].sum,
+ atomic64_read(&elt->fields[i].sum));
+ dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
+ }
+
+ return dup_elt;
+}
+
+static int merge_dup(struct tracing_map_sort_entry **sort_entries,
+ unsigned int target, unsigned int dup)
+{
+ struct tracing_map_elt *target_elt, *elt;
+ bool first_dup = (target - dup) == 1;
+ int i;
+
+ if (first_dup) {
+ elt = sort_entries[target]->elt;
+ target_elt = copy_elt(elt);
+ if (!target_elt)
+ return -ENOMEM;
+ sort_entries[target]->elt = target_elt;
+ sort_entries[target]->elt_copied = true;
+ } else
+ target_elt = sort_entries[target]->elt;
+
+ elt = sort_entries[dup]->elt;
+
+ for (i = 0; i < elt->map->n_fields; i++)
+ atomic64_add(atomic64_read(&elt->fields[i].sum),
+ &target_elt->fields[i].sum);
+
+ sort_entries[dup]->dup = true;
+
+ return 0;
+}
+
+static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+ int n_entries, unsigned int key_size)
+{
+ unsigned int dups = 0, total_dups = 0;
+ int err, i, j;
+ void *key;
+
+ if (n_entries < 2)
+ return total_dups;
+
+ sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+ (int (*)(const void *, const void *))cmp_entries_dup, NULL);
+
+ key = sort_entries[0]->key;
+ for (i = 1; i < n_entries; i++) {
+ if (!memcmp(sort_entries[i]->key, key, key_size)) {
+ dups++; total_dups++;
+ err = merge_dup(sort_entries, i - dups, i);
+ if (err)
+ return err;
+ continue;
+ }
+ key = sort_entries[i]->key;
+ dups = 0;
+ }
+
+ if (!total_dups)
+ return total_dups;
+
+ for (i = 0, j = 0; i < n_entries; i++) {
+ if (!sort_entries[i]->dup) {
+ sort_entries[j] = sort_entries[i];
+ if (j++ != i)
+ sort_entries[i] = NULL;
+ } else {
+ destroy_sort_entry(sort_entries[i]);
+ sort_entries[i] = NULL;
+ }
+ }
+
+ return total_dups;
+}
+
+static bool is_key(struct tracing_map *map, unsigned int field_idx)
+{
+ unsigned int i;
+
+ for (i = 0; i < map->n_keys; i++)
+ if (map->key_idx[i] == field_idx)
+ return true;
+ return false;
+}
+
+static void sort_secondary(struct tracing_map *map,
+ const struct tracing_map_sort_entry **entries,
+ unsigned int n_entries,
+ struct tracing_map_sort_key *primary_key,
+ struct tracing_map_sort_key *secondary_key)
+{
+ int (*primary_fn)(const struct tracing_map_sort_entry **,
+ const struct tracing_map_sort_entry **);
+ int (*secondary_fn)(const struct tracing_map_sort_entry **,
+ const struct tracing_map_sort_entry **);
+ unsigned i, start = 0, n_sub = 1;
+
+ if (is_key(map, primary_key->field_idx))
+ primary_fn = cmp_entries_key;
+ else
+ primary_fn = cmp_entries_sum;
+
+ if (is_key(map, secondary_key->field_idx))
+ secondary_fn = cmp_entries_key;
+ else
+ secondary_fn = cmp_entries_sum;
+
+ for (i = 0; i < n_entries - 1; i++) {
+ const struct tracing_map_sort_entry **a = &entries[i];
+ const struct tracing_map_sort_entry **b = &entries[i + 1];
+
+ if (primary_fn(a, b) == 0) {
+ n_sub++;
+ if (i < n_entries - 2)
+ continue;
+ }
+
+ if (n_sub < 2) {
+ start = i + 1;
+ n_sub = 1;
+ continue;
+ }
+
+ set_sort_key(map, secondary_key);
+ sort(&entries[start], n_sub,
+ sizeof(struct tracing_map_sort_entry *),
+ (int (*)(const void *, const void *))secondary_fn, NULL);
+ set_sort_key(map, primary_key);
+
+ start = i + 1;
+ n_sub = 1;
+ }
+}
+
+/**
+ * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
+ * @map: The tracing_map
+ * @sort_keys: The sort key(s) to use for sorting; the primary key comes first
+ * @n_sort_keys: The number of sort keys in the sort_keys array (1 or 2)
+ * @sort_entries: outval: pointer to allocated and sorted array of entries
+ *
+ * tracing_map_sort_entries() sorts the current set of entries in the
+ * map and returns the list of tracing_map_sort_entries containing
+ * them to the client in the sort_entries param. The client can
+ * access the struct tracing_map_elt element of interest directly as
+ * the 'elt' field of a returned struct tracing_map_sort_entry object.
+ *
+ * Each sort key has only two fields: 'field_idx' and 'descending'.
+ * 'field_idx' refers to the index of the field added via
+ * tracing_map_add_sum_field() or tracing_map_add_key_field() when the
+ * tracing_map was initialized.
+ * 'descending' is a flag that if set reverses the sort order, which
+ * by default is ascending.
+ *
+ * The client should not hold on to the returned array but should use
+ * it and call tracing_map_destroy_sort_entries() when done.
+ *
+ * Return: the number of sort_entries in the struct tracing_map_sort_entry
+ * array, negative on error
+ */
+int tracing_map_sort_entries(struct tracing_map *map,
+ struct tracing_map_sort_key *sort_keys,
+ unsigned int n_sort_keys,
+ struct tracing_map_sort_entry ***sort_entries)
+{
+ int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
+ const struct tracing_map_sort_entry **);
+ struct tracing_map_sort_entry *sort_entry, **entries;
+ int i, n_entries, ret;
+
+ entries = vmalloc(map->max_elts * sizeof(sort_entry));
+ if (!entries)
+ return -ENOMEM;
+
+ for (i = 0, n_entries = 0; i < map->map_size; i++) {
+ struct tracing_map_entry *entry;
+
+ entry = TRACING_MAP_ENTRY(map->map, i);
+
+ if (!entry->key || !entry->val)
+ continue;
+
+ entries[n_entries] = create_sort_entry(entry->val->key,
+ entry->val);
+ if (!entries[n_entries++]) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
+ if (n_entries == 0) {
+ ret = 0;
+ goto free;
+ }
+
+ if (n_entries == 1) {
+ *sort_entries = entries;
+ return 1;
+ }
+
+ ret = merge_dups(entries, n_entries, map->key_size);
+ if (ret < 0)
+ goto free;
+ n_entries -= ret;
+
+ if (is_key(map, sort_keys[0].field_idx))
+ cmp_entries_fn = cmp_entries_key;
+ else
+ cmp_entries_fn = cmp_entries_sum;
+
+ set_sort_key(map, &sort_keys[0]);
+
+ sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+ (int (*)(const void *, const void *))cmp_entries_fn, NULL);
+
+ if (n_sort_keys > 1)
+ sort_secondary(map,
+ (const struct tracing_map_sort_entry **)entries,
+ n_entries,
+ &sort_keys[0],
+ &sort_keys[1]);
+
+ *sort_entries = entries;
+
+ return n_entries;
+ free:
+ tracing_map_destroy_sort_entries(entries, n_entries);
+
+ return ret;
+}
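
A minimal usage sketch of the sorting API above, assuming a map and field indices obtained earlier from tracing_map_create() and the tracing_map_add_*_field() calls; the helper itself is hypothetical, and only the tracing_map_sort_entries()/tracing_map_destroy_sort_entries() calls and the struct tracing_map_sort_key fields are taken from this patch.

#include <linux/err.h>
#include <linux/printk.h>
#include "tracing_map.h"

/*
 * Hypothetical helper: sort an existing map by a key field, break ties
 * with a sum field (largest sums first), then report and free the result.
 */
static int example_dump_sorted(struct tracing_map *map,
                               unsigned int key_field_idx,
                               unsigned int sum_field_idx)
{
        struct tracing_map_sort_entry **entries;
        struct tracing_map_sort_key sort_keys[2];
        int i, n;

        sort_keys[0].field_idx = key_field_idx;         /* primary sort key */
        sort_keys[0].descending = false;
        sort_keys[1].field_idx = sum_field_idx;         /* secondary sort key */
        sort_keys[1].descending = true;

        n = tracing_map_sort_entries(map, sort_keys, 2, &entries);
        if (n <= 0)             /* negative on error, 0 if the map was empty */
                return n;

        for (i = 0; i < n; i++) {
                u64 sum = tracing_map_read_sum(entries[i]->elt, sum_field_idx);

                pr_info("entry %d: sum %llu\n", i, (unsigned long long)sum);
        }

        tracing_map_destroy_sort_entries(entries, n);
        return 0;
}
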
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
new file mode 100644
index 000000000000..618838f5f30a
--- /dev/null
+++ b/kernel/trace/tracing_map.h
@@ -0,0 +1,283 @@
+#ifndef __TRACING_MAP_H
+#define __TRACING_MAP_H
+
+#define TRACING_MAP_BITS_DEFAULT 11
+#define TRACING_MAP_BITS_MAX 17
+#define TRACING_MAP_BITS_MIN 7
+
+#define TRACING_MAP_KEYS_MAX 2
+#define TRACING_MAP_VALS_MAX 3
+#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
+ TRACING_MAP_VALS_MAX)
+#define TRACING_MAP_SORT_KEYS_MAX 2
+
+typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
+
+/*
+ * This is an overview of the tracing_map data structures and how they
+ * relate to the tracing_map API. The details of the algorithms
+ * aren't discussed here - this is just a general overview of the data
+ * structures and how they interact with the API.
+ *
+ * The central data structure of the tracing_map is an initially
+ * zeroed array of struct tracing_map_entry (stored in the map field
+ * of struct tracing_map). tracing_map_entry is a very simple data
+ * structure containing only two fields: a 32-bit unsigned 'key'
+ * variable and a pointer named 'val'. This array of struct
+ * tracing_map_entry is essentially a hash table which will be
+ * modified by a single function, tracing_map_insert(), but which can
+ * be traversed and read by a user at any time (though the user does
+ * this indirectly via an array of tracing_map_sort_entry - see the
+ * explanation of that data structure in the discussion of the
+ * sorting-related data structures below).
+ *
+ * The central function of the tracing_map API is
+ * tracing_map_insert(). tracing_map_insert() hashes the
+ * arbitrarily-sized key passed into it into a 32-bit unsigned key.
+ * It then uses this key, truncated to the array size, as an index
+ * into the array of tracing_map_entries. If the value of the 'key'
+ * field of the tracing_map_entry found at that location is 0, then
+ * that entry is considered to be free and can be claimed, by
+ * replacing the 0 in the 'key' field of the tracing_map_entry with
+ * the new 32-bit hashed key. Once claimed, that tracing_map_entry's
+ * 'val' field is then used to store a unique element which will be
+ * forever associated with that 32-bit hashed key in the
+ * tracing_map_entry.
+ *
+ * That unique element now in the tracing_map_entry's 'val' field is
+ * an instance of tracing_map_elt, where 'elt' in the latter part of
+ * that variable name is short for 'element'. The purpose of a
+ * tracing_map_elt is to hold values specific to the particular
+ * 32-bit hashed key it's associated with: things such as the unique
+ * set of aggregated sums associated with the 32-bit hashed key, along
+ * with a copy of the full key associated with the entry, which was
+ * used to produce the 32-bit hashed key.
+ *
+ * When tracing_map_create() is called to create the tracing map, the
+ * user specifies (indirectly via the map_bits param, the details are
+ * unimportant for this discussion) the maximum number of elements
+ * that the map can hold (stored in the max_elts field of struct
+ * tracing_map). This is the maximum possible number of
+ * tracing_map_entries in the tracing_map_entry array which can be
+ * 'claimed' as described in the above discussion, and therefore is
+ * also the maximum number of tracing_map_elts that can be associated
+ * with the tracing_map_entry array in the tracing_map. Because of
+ * the way the insertion algorithm works, the size of the allocated
+ * tracing_map_entry array is always twice the maximum number of
+ * elements (2 * max_elts). This value is stored in the map_size
+ * field of struct tracing_map.
+ *
+ * Because tracing_map_insert() needs to work from any context,
+ * including from within the memory allocation functions themselves,
+ * both the tracing_map_entry array and a pool of max_elts
+ * tracing_map_elts are pre-allocated before any call is made to
+ * tracing_map_insert().
+ *
+ * The tracing_map_entry array is allocated as a single block by
+ * tracing_map_create().
+ *
+ * Because the tracing_map_elts are much larger objects and can't
+ * generally be allocated together as a single large array without
+ * failure, they're allocated individually, by tracing_map_init().
+ *
+ * The pool of tracing_map_elts are allocated by tracing_map_init()
+ * rather than by tracing_map_create() because at the time
+ * tracing_map_create() is called, there isn't enough information to
+ * create the tracing_map_elts. Specifically, the user first needs to
+ * tell the tracing_map implementation how many fields the
+ * tracing_map_elts contain, and which types of fields they are (key
+ * or sum). The user does this via the tracing_map_add_sum_field()
+ * and tracing_map_add_key_field() functions, following which the user
+ * calls tracing_map_init() to finish up the tracing map setup. The
+ * array holding the pointers which make up the pre-allocated pool of
+ * tracing_map_elts is allocated as a single block and is stored in
+ * the elts field of struct tracing_map.
+ *
+ * There is also a set of structures used for sorting that might
+ * benefit from some minimal explanation.
+ *
+ * struct tracing_map_sort_key is used to drive the sort at any given
+ * time. By 'any given time' we mean that a different
+ * tracing_map_sort_key will be used at different times depending on
+ * whether the sort currently being performed is a primary or a
+ * secondary sort.
+ *
+ * The sort key is very simple, consisting of the field index of the
+ * tracing_map_elt field to sort on (which the user saved when adding
+ * the field), and whether the sort should be done in an ascending or
+ * descending order.
+ *
+ * For the convenience of the sorting code, a tracing_map_sort_entry
+ * is created for each tracing_map_elt, again individually allocated
+ * to avoid failures that might be expected if allocated as a single
+ * large array of struct tracing_map_sort_entry.
+ * tracing_map_sort_entry instances are the objects expected by the
+ * various internal sorting functions, and are also what the user
+ * ultimately receives after calling tracing_map_sort_entries().
+ * Because it doesn't make sense for users to access an unordered and
+ * sparsely populated tracing_map directly, the
+ * tracing_map_sort_entries() function is provided so that users can
+ * retrieve a sorted list of all existing elements. In addition to
+ * the associated tracing_map_elt 'elt' field contained within the
+ * tracing_map_sort_entry, which is the object of interest to the
+ * user, tracing_map_sort_entry objects contain a number of additional
+ * fields which are used for caching and internal purposes and can
+ * safely be ignored.
+ */
+
+struct tracing_map_field {
+ tracing_map_cmp_fn_t cmp_fn;
+ union {
+ atomic64_t sum;
+ unsigned int offset;
+ };
+};
+
+struct tracing_map_elt {
+ struct tracing_map *map;
+ struct tracing_map_field *fields;
+ void *key;
+ void *private_data;
+};
+
+struct tracing_map_entry {
+ u32 key;
+ struct tracing_map_elt *val;
+};
+
+struct tracing_map_sort_key {
+ unsigned int field_idx;
+ bool descending;
+};
+
+struct tracing_map_sort_entry {
+ void *key;
+ struct tracing_map_elt *elt;
+ bool elt_copied;
+ bool dup;
+};
+
+struct tracing_map_array {
+ unsigned int entries_per_page;
+ unsigned int entry_size_shift;
+ unsigned int entry_shift;
+ unsigned int entry_mask;
+ unsigned int n_pages;
+ void **pages;
+};
+
+#define TRACING_MAP_ARRAY_ELT(array, idx) \
+ (array->pages[idx >> array->entry_shift] + \
+ ((idx & array->entry_mask) << array->entry_size_shift))
+
+#define TRACING_MAP_ENTRY(array, idx) \
+ ((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx))
+
+#define TRACING_MAP_ELT(array, idx) \
+ ((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx))
+
+struct tracing_map {
+ unsigned int key_size;
+ unsigned int map_bits;
+ unsigned int map_size;
+ unsigned int max_elts;
+ atomic_t next_elt;
+ struct tracing_map_array *elts;
+ struct tracing_map_array *map;
+ const struct tracing_map_ops *ops;
+ void *private_data;
+ struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX];
+ unsigned int n_fields;
+ int key_idx[TRACING_MAP_KEYS_MAX];
+ unsigned int n_keys;
+ struct tracing_map_sort_key sort_key;
+ atomic64_t hits;
+ atomic64_t drops;
+};
+
+/**
+ * struct tracing_map_ops - callbacks for tracing_map
+ *
+ * The methods in this structure define callback functions for various
+ * operations on a tracing_map or objects related to a tracing_map.
+ *
+ * For a detailed description of tracing_map_elt objects please see
+ * the overview of tracing_map data structures at the beginning of
+ * this file.
+ *
+ * All the methods below are optional.
+ *
+ * @elt_alloc: When a tracing_map_elt is allocated, this function, if
+ * defined, will be called and gives clients the opportunity to
+ * allocate additional data and attach it to the element
+ * (tracing_map_elt->private_data is meant for that purpose).
+ * Element allocation occurs before tracing begins, when the
+ * tracing_map_init() call is made by client code.
+ *
+ * @elt_copy: At certain points in the lifetime of an element, it may
+ * need to be copied. The copy should include a copy of the
+ * client-allocated data, which can be copied into the 'to'
+ * element from the 'from' element.
+ *
+ * @elt_free: When a tracing_map_elt is freed, this function is called
+ * and allows client-allocated per-element data to be freed.
+ *
+ * @elt_clear: This callback allows per-element client-defined data to
+ * be cleared, if applicable.
+ *
+ * @elt_init: This callback allows per-element client-defined data to
+ * be initialized when used, i.e. when the element is actually
+ * claimed by tracing_map_insert() in the context of the map
+ * insertion.
+ */
+struct tracing_map_ops {
+ int (*elt_alloc)(struct tracing_map_elt *elt);
+ void (*elt_copy)(struct tracing_map_elt *to,
+ struct tracing_map_elt *from);
+ void (*elt_free)(struct tracing_map_elt *elt);
+ void (*elt_clear)(struct tracing_map_elt *elt);
+ void (*elt_init)(struct tracing_map_elt *elt);
+};
+
+extern struct tracing_map *
+tracing_map_create(unsigned int map_bits,
+ unsigned int key_size,
+ const struct tracing_map_ops *ops,
+ void *private_data);
+extern int tracing_map_init(struct tracing_map *map);
+
+extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_key_field(struct tracing_map *map,
+ unsigned int offset,
+ tracing_map_cmp_fn_t cmp_fn);
+
+extern void tracing_map_destroy(struct tracing_map *map);
+extern void tracing_map_clear(struct tracing_map *map);
+
+extern struct tracing_map_elt *
+tracing_map_insert(struct tracing_map *map, void *key);
+extern struct tracing_map_elt *
+tracing_map_lookup(struct tracing_map *map, void *key);
+
+extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+ int field_is_signed);
+extern int tracing_map_cmp_string(void *val_a, void *val_b);
+extern int tracing_map_cmp_none(void *val_a, void *val_b);
+
+extern void tracing_map_update_sum(struct tracing_map_elt *elt,
+ unsigned int i, u64 n);
+extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern void tracing_map_set_field_descr(struct tracing_map *map,
+ unsigned int i,
+ unsigned int key_offset,
+ tracing_map_cmp_fn_t cmp_fn);
+extern int
+tracing_map_sort_entries(struct tracing_map *map,
+ struct tracing_map_sort_key *sort_keys,
+ unsigned int n_sort_keys,
+ struct tracing_map_sort_entry ***sort_entries);
+
+extern void
+tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+ unsigned int n_entries);
+#endif /* __TRACING_MAP_H */
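
The overview comment in this header describes the setup sequence in prose; the following is a hedged sketch of that create -> add fields -> init -> insert lifecycle. The example key layout, field usage and error handling (ERR_PTR-style returns from tracing_map_create() are assumed) are invented; only the tracing_map_* calls come from the declarations above.

#include <linux/kernel.h>
#include <linux/err.h>
#include "tracing_map.h"

struct example_key {
        u64 pid;                        /* hypothetical key layout */
};

static struct tracing_map *example_map_setup(int *hits_idx)
{
        struct tracing_map *map;
        int key_idx;

        /* default map_bits; max_elts/map_size are derived from it */
        map = tracing_map_create(TRACING_MAP_BITS_DEFAULT,
                                 sizeof(struct example_key), NULL, NULL);
        if (IS_ERR(map))
                return map;

        /* describe the element layout before the elt pool is allocated */
        *hits_idx = tracing_map_add_sum_field(map);
        key_idx = tracing_map_add_key_field(map,
                                            offsetof(struct example_key, pid),
                                            tracing_map_cmp_num(sizeof(u64), 0));
        if (*hits_idx < 0 || key_idx < 0)
                goto fail;

        if (tracing_map_init(map))      /* pre-allocates max_elts elements */
                goto fail;

        return map;
fail:
        tracing_map_destroy(map);
        return ERR_PTR(-EINVAL);
}

static void example_map_hit(struct tracing_map *map, u64 pid, int hits_idx)
{
        struct example_key key = { .pid = pid };
        struct tracing_map_elt *elt;

        /* uses the pre-allocated element pool; no allocation on this path */
        elt = tracing_map_insert(map, &key);
        if (elt)
                tracing_map_update_sum(elt, hits_idx, 1);
}
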
diff --git a/kernel/ucount.c b/kernel/ucount.c
new file mode 100644
index 000000000000..9d20d5dd298a
--- /dev/null
+++ b/kernel/ucount.c
@@ -0,0 +1,235 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/user_namespace.h>
+
+#define UCOUNTS_HASHTABLE_BITS 10
+static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+static DEFINE_SPINLOCK(ucounts_lock);
+
+#define ucounts_hashfn(ns, uid) \
+ hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
+ UCOUNTS_HASHTABLE_BITS)
+#define ucounts_hashentry(ns, uid) \
+ (ucounts_hashtable + ucounts_hashfn(ns, uid))
+
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+ return &current_user_ns()->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return &current_user_ns()->set == set;
+}
+
+static int set_permissions(struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ struct user_namespace *user_ns =
+ container_of(head->set, struct user_namespace, set);
+ int mode;
+
+ /* Allow users with CAP_SYS_RESOURCE unrestrained access */
+ if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+ mode = (table->mode & S_IRWXU) >> 6;
+ else
+ /* Allow all others at most read-only access */
+ mode = table->mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static struct ctl_table_root set_root = {
+ .lookup = set_lookup,
+ .permissions = set_permissions,
+};
+
+static int zero = 0;
+static int int_max = INT_MAX;
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(int), \
+ .mode = 0644, \
+ .proc_handler = proc_dointvec_minmax, \
+ .extra1 = &zero, \
+ .extra2 = &int_max, \
+ }
+static struct ctl_table user_table[] = {
+ UCOUNT_ENTRY("max_user_namespaces"),
+ UCOUNT_ENTRY("max_pid_namespaces"),
+ UCOUNT_ENTRY("max_uts_namespaces"),
+ UCOUNT_ENTRY("max_ipc_namespaces"),
+ UCOUNT_ENTRY("max_net_namespaces"),
+ UCOUNT_ENTRY("max_mnt_namespaces"),
+ UCOUNT_ENTRY("max_cgroup_namespaces"),
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+bool setup_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+ setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+ tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
+ if (tbl) {
+ int i;
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ tbl[i].data = &ns->ucount_max[i];
+ }
+ ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+ }
+ if (!ns->sysctls) {
+ kfree(tbl);
+ retire_sysctl_set(&ns->set);
+ return false;
+ }
+#endif
+ return true;
+}
+
+void retire_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ tbl = ns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(ns->sysctls);
+ retire_sysctl_set(&ns->set);
+ kfree(tbl);
+#endif
+}
+
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+{
+ struct ucounts *ucounts;
+
+ hlist_for_each_entry(ucounts, hashent, node) {
+ if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
+ return ucounts;
+ }
+ return NULL;
+}
+
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+{
+ struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+ struct ucounts *ucounts, *new;
+
+ spin_lock(&ucounts_lock);
+ ucounts = find_ucounts(ns, uid, hashent);
+ if (!ucounts) {
+ spin_unlock(&ucounts_lock);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->ns = ns;
+ new->uid = uid;
+ atomic_set(&new->count, 0);
+
+ spin_lock(&ucounts_lock);
+ ucounts = find_ucounts(ns, uid, hashent);
+ if (ucounts) {
+ kfree(new);
+ } else {
+ hlist_add_head(&new->node, hashent);
+ ucounts = new;
+ }
+ }
+ if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+ ucounts = NULL;
+ spin_unlock(&ucounts_lock);
+ return ucounts;
+}
+
+static void put_ucounts(struct ucounts *ucounts)
+{
+ if (atomic_dec_and_test(&ucounts->count)) {
+ spin_lock(&ucounts_lock);
+ hlist_del_init(&ucounts->node);
+ spin_unlock(&ucounts_lock);
+
+ kfree(ucounts);
+ }
+}
+
+static inline bool atomic_inc_below(atomic_t *v, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ for (;;) {
+ if (unlikely(c >= u))
+ return false;
+ old = atomic_cmpxchg(v, c, c+1);
+ if (likely(old == c))
+ return true;
+ c = old;
+ }
+}
+
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
+ enum ucount_type type)
+{
+ struct ucounts *ucounts, *iter, *bad;
+ struct user_namespace *tns;
+ ucounts = get_ucounts(ns, uid);
+ for (iter = ucounts; iter; iter = tns->ucounts) {
+ int max;
+ tns = iter->ns;
+ max = READ_ONCE(tns->ucount_max[type]);
+ if (!atomic_inc_below(&iter->ucount[type], max))
+ goto fail;
+ }
+ return ucounts;
+fail:
+ bad = iter;
+ for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
+ atomic_dec(&iter->ucount[type]);
+
+ put_ucounts(ucounts);
+ return NULL;
+}
+
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
+{
+ struct ucounts *iter;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ int dec = atomic_dec_if_positive(&iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ }
+ put_ucounts(ucounts);
+}
+
+static __init int user_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ static struct ctl_table_header *user_header;
+ static struct ctl_table empty[1];
+ /*
+ * It is necessary to register the user directory in the
+ * default set so that registrations in the child sets work
+ * properly.
+ */
+ user_header = register_sysctl("user", empty);
+ BUG_ON(!user_header);
+ BUG_ON(!setup_userns_sysctls(&init_user_ns));
+#endif
+ return 0;
+}
+subsys_initcall(user_namespace_sysctl_init);
+
+
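
The inc_ucount()/dec_ucount() pair above is the interface individual namespace types build on. A condensed, hypothetical sketch of that pattern follows; the 'foo' namespace and its UCOUNT_FOO_NAMESPACES entry are invented, and the user_namespace.c and utsname.c hunks below apply the same pattern for real.

#include <linux/cred.h>
#include <linux/user_namespace.h>

/* Hypothetical wrappers mirroring the pattern used by real namespace types. */
static struct ucounts *inc_foo_namespaces(struct user_namespace *ns)
{
        /* charge every user namespace level; NULL if any level is at its max */
        return inc_ucount(ns, current_euid(), UCOUNT_FOO_NAMESPACES);
}

static void dec_foo_namespaces(struct ucounts *ucounts)
{
        /* undo the charge at every level and drop the ucounts reference */
        dec_ucount(ucounts, UCOUNT_FOO_NAMESPACES);
}

The struct ucounts pointer returned on the creation path is stored in the new namespace and handed back to the dec_ wrapper when the namespace is freed.
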
diff --git a/kernel/up.c b/kernel/up.c
index 1760bf3d1463..ee81ac9af4ca 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/smp.h>
+#include <linux/hypervisor.h>
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
@@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond);
+
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+ int ret;
+
+ if (cpu != 0)
+ return -ENXIO;
+
+ if (phys)
+ hypervisor_pin_vcpu(0);
+ ret = func(par);
+ if (phys)
+ hypervisor_pin_vcpu(-1);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
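
A minimal, hypothetical caller of the UP stub added above. The callback and its argument are invented; on UP only cpu 0 is accepted, and phys=true brackets the call with hypervisor_pin_vcpu().

#include <linux/smp.h>

static int example_read_counter(void *arg)
{
        unsigned long *out = arg;

        *out = 42;      /* stand-in for some CPU- or firmware-local access */
        return 0;
}

static int example_caller(void)
{
        unsigned long val;

        /* cpu must be 0 on UP; phys=true pins vCPU 0 for the duration */
        return smp_call_on_cpu(0, example_read_counter, &val, true);
}
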
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..86b7854fec8e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
+static void free_user_ns(struct work_struct *work);
+
+static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
+{
+ return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
+}
+
+static void dec_user_namespaces(struct ucounts *ucounts)
+{
+ return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
+}
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new)
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid;
kgid_t group = new->egid;
- int ret;
+ struct ucounts *ucounts;
+ int ret, i;
+ ret = -ENOSPC;
if (parent_ns->level > 32)
- return -EUSERS;
+ goto fail;
+
+ ucounts = inc_user_namespaces(parent_ns, owner);
+ if (!ucounts)
+ goto fail;
/*
* Verify that we can not violate the policy of which files
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new)
* by verifing that the root directory is at the root of the
* mount namespace which allows all files to be accessed.
*/
+ ret = -EPERM;
if (current_chrooted())
- return -EPERM;
+ goto fail_dec;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
*/
+ ret = -EPERM;
if (!kuid_has_mapping(parent_ns, owner) ||
!kgid_has_mapping(parent_ns, group))
- return -EPERM;
+ goto fail_dec;
+ ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
- return -ENOMEM;
+ goto fail_dec;
ret = ns_alloc_inum(&ns->ns);
- if (ret) {
- kmem_cache_free(user_ns_cachep, ns);
- return ret;
- }
+ if (ret)
+ goto fail_free;
ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new)
ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
+ INIT_WORK(&ns->work, free_user_ns);
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ ns->ucount_max[i] = INT_MAX;
+ }
+ ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
mutex_lock(&userns_state_mutex);
ns->flags = parent_ns->flags;
mutex_unlock(&userns_state_mutex);
- set_cred_user_ns(new, ns);
-
#ifdef CONFIG_PERSISTENT_KEYRINGS
init_rwsem(&ns->persistent_keyring_register_sem);
#endif
+ ret = -ENOMEM;
+ if (!setup_userns_sysctls(ns))
+ goto fail_keyring;
+
+ set_cred_user_ns(new, ns);
return 0;
+fail_keyring:
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ key_put(ns->persistent_keyring_register);
+#endif
+ ns_free_inum(&ns->ns);
+fail_free:
+ kmem_cache_free(user_ns_cachep, ns);
+fail_dec:
+ dec_user_namespaces(ucounts);
+fail:
+ return ret;
}
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
return err;
}
-void free_user_ns(struct user_namespace *ns)
+static void free_user_ns(struct work_struct *work)
{
- struct user_namespace *parent;
+ struct user_namespace *parent, *ns =
+ container_of(work, struct user_namespace, work);
do {
+ struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ retire_userns_sysctls(ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
+ dec_user_namespaces(ucounts);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
}
-EXPORT_SYMBOL(free_user_ns);
+
+void __put_user_ns(struct user_namespace *ns)
+{
+ schedule_work(&ns->work);
+}
+EXPORT_SYMBOL(__put_user_ns);
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
@@ -938,6 +984,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
return allowed;
}
+/*
+ * Returns true if the caller's user namespace is the same as, or a
+ * descendant of, @target_ns.
+ */
+bool current_in_userns(const struct user_namespace *target_ns)
+{
+ struct user_namespace *ns;
+ for (ns = current_user_ns(); ns; ns = ns->parent) {
+ if (ns == target_ns)
+ return true;
+ }
+ return false;
+}
+
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
@@ -990,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return commit_creds(cred);
}
+struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+ struct user_namespace *my_user_ns = current_user_ns();
+ struct user_namespace *owner, *p;
+
+ /* See if the owner is in the current user namespace */
+ owner = p = ns->ops->owner(ns);
+ for (;;) {
+ if (!p)
+ return ERR_PTR(-EPERM);
+ if (p == my_user_ns)
+ break;
+ p = p->parent;
+ }
+
+ return &get_user_ns(owner)->ns;
+}
+
+static struct user_namespace *userns_owner(struct ns_common *ns)
+{
+ return to_user_ns(ns)->parent;
+}
+
const struct proc_ns_operations userns_operations = {
.name = "user",
.type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
+ .owner = userns_owner,
+ .get_parent = ns_get_owner,
};
static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 831ea7108232..6976cd47dcf6 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -17,6 +17,16 @@
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
+}
+
+static void dec_uts_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
+}
+
static struct uts_namespace *create_uts_ns(void)
{
struct uts_namespace *uts_ns;
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
+ struct ucounts *ucounts;
int err;
+ err = -ENOSPC;
+ ucounts = inc_uts_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
ns = create_uts_ns();
if (!ns)
- return ERR_PTR(-ENOMEM);
+ goto fail_dec;
err = ns_alloc_inum(&ns->ns);
- if (err) {
- kfree(ns);
- return ERR_PTR(err);
- }
+ if (err)
+ goto fail_free;
+ ns->ucounts = ucounts;
ns->ns.ops = &utsns_operations;
down_read(&uts_sem);
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
return ns;
+
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_uts_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
}
/*
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref)
struct uts_namespace *ns;
ns = container_of(kref, struct uts_namespace, kref);
+ dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
@@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
return 0;
}
+static struct user_namespace *utsns_owner(struct ns_common *ns)
+{
+ return to_uts_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
+ .owner = utsns_owner,
};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f5068e94003..ef071ca73fc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,54 +433,28 @@ static void *work_debug_hint(void *addr)
return ((struct work_struct *) addr)->func;
}
-/*
- * fixup_init is called when:
- * - an active object is initialized
- */
-static int work_fixup_init(void *addr, enum debug_obj_state state)
+static bool work_is_static_object(void *addr)
{
struct work_struct *work = addr;
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- cancel_work_sync(work);
- debug_object_init(work, &work_debug_descr);
- return 1;
- default:
- return 0;
- }
+ return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}
/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * fixup_init is called when:
+ * - an active object is initialized
*/
-static int work_fixup_activate(void *addr, enum debug_obj_state state)
+static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. The work struct was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
- debug_object_init(work, &work_debug_descr);
- debug_object_activate(work, &work_debug_descr);
- return 0;
- }
- WARN_ON_ONCE(1);
- return 0;
-
case ODEBUG_STATE_ACTIVE:
- WARN_ON(1);
-
+ cancel_work_sync(work);
+ debug_object_init(work, &work_debug_descr);
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -488,7 +462,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int work_fixup_free(void *addr, enum debug_obj_state state)
+static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
@@ -496,17 +470,17 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
cancel_work_sync(work);
debug_object_free(work, &work_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
static struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
.debug_hint = work_debug_hint,
+ .is_static_object = work_is_static_object,
.fixup_init = work_fixup_init,
- .fixup_activate = work_fixup_activate,
.fixup_free = work_fixup_free,
};
@@ -4395,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
/**
* show_workqueue_state - dump workqueue state
*
- * Called from a sysrq handler and prints out all busy workqueues and
- * pools.
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
*/
void show_workqueue_state(void)
{
@@ -4626,95 +4600,72 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
return;
- /* is @cpu the only online CPU? */
cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
- if (cpumask_weight(&cpumask) != 1)
- return;
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int workqueue_cpu_up_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+int workqueue_prepare_cpu(unsigned int cpu)
+{
+ struct worker_pool *pool;
+
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (pool->nr_workers)
+ continue;
+ if (!create_worker(pool))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int workqueue_online_cpu(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct worker_pool *pool;
struct workqueue_struct *wq;
int pi;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- for_each_cpu_worker_pool(pool, cpu) {
- if (pool->nr_workers)
- continue;
- if (!create_worker(pool))
- return NOTIFY_BAD;
- }
- break;
-
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- mutex_lock(&wq_pool_mutex);
+ mutex_lock(&wq_pool_mutex);
- for_each_pool(pool, pi) {
- mutex_lock(&pool->attach_mutex);
+ for_each_pool(pool, pi) {
+ mutex_lock(&pool->attach_mutex);
- if (pool->cpu == cpu)
- rebind_workers(pool);
- else if (pool->cpu < 0)
- restore_unbound_workers_cpumask(pool, cpu);
+ if (pool->cpu == cpu)
+ rebind_workers(pool);
+ else if (pool->cpu < 0)
+ restore_unbound_workers_cpumask(pool, cpu);
- mutex_unlock(&pool->attach_mutex);
- }
+ mutex_unlock(&pool->attach_mutex);
+ }
- /* update NUMA affinity of unbound workqueues */
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, true);
+ /* update NUMA affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, true);
- mutex_unlock(&wq_pool_mutex);
- break;
- }
- return NOTIFY_OK;
+ mutex_unlock(&wq_pool_mutex);
+ return 0;
}
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int workqueue_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+int workqueue_offline_cpu(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct work_struct unbind_work;
struct workqueue_struct *wq;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- /* unbinding per-cpu workers should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
- queue_work_on(cpu, system_highpri_wq, &unbind_work);
-
- /* update NUMA affinity of unbound workqueues */
- mutex_lock(&wq_pool_mutex);
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, false);
- mutex_unlock(&wq_pool_mutex);
-
- /* wait for per-cpu unbinding to finish */
- flush_work(&unbind_work);
- destroy_work_on_stack(&unbind_work);
- break;
- }
- return NOTIFY_OK;
+ /* unbinding per-cpu workers should happen on the local CPU */
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+ queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+ /* update NUMA affinity of unbound workqueues */
+ mutex_lock(&wq_pool_mutex);
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, false);
+ mutex_unlock(&wq_pool_mutex);
+
+ /* wait for per-cpu unbinding to finish */
+ flush_work(&unbind_work);
+ destroy_work_on_stack(&unbind_work);
+ return 0;
}
#ifdef CONFIG_SMP
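
The three callbacks above are written for the CPU hotplug state machine rather than notifier chains; their actual registration happens elsewhere in the series and is not shown here. Purely as an illustration, and assuming the dynamic cpuhp_setup_state() API, wiring up the online/offline pair might look roughly like this:

#include <linux/cpuhotplug.h>

static int __init example_wq_hotplug_init(void)
{
        int ret;

        /*
         * Dynamic AP state: workqueue_online_cpu() runs as a CPU comes up,
         * workqueue_offline_cpu() as it goes down.  workqueue_prepare_cpu()
         * would be hooked to a prepare-stage state in the same way.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/workqueue:online",
                                workqueue_online_cpu, workqueue_offline_cpu);

        return ret < 0 ? ret : 0;       /* dynamic states return the state number */
}
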
@@ -5516,9 +5467,6 @@ static int __init init_workqueues(void)
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
- hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
-
wq_numa_init();
/* initialize CPU pools */