summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorStefan Richter <stefanr@s5r6.in-berlin.de>2011-05-10 20:52:07 +0200
committerStefan Richter <stefanr@s5r6.in-berlin.de>2011-05-10 22:50:41 +0200
commit020abf03cd659388f94cb328e1e1df0656e0d7ff (patch)
tree40d05011708ad1b4a05928d167eb120420581aa6 /kernel
parent0ff8fbc61727c926883eec381fbd3d32d1fab504 (diff)
parent693d92a1bbc9e42681c42ed190bd42b636ca876f (diff)
Merge tag 'v2.6.39-rc7'
in order to pull in changes in drivers/media/dvb/firewire/ and sound/firewire/.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/audit.c10
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c85
-rw-r--r--kernel/auditfilter.c10
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/capability.c96
-rw-r--r--kernel/cgroup.c125
-rw-r--r--kernel/compat.c136
-rw-r--r--kernel/cpu.c42
-rw-r--r--kernel/cpuset.c87
-rw-r--r--kernel/crash_dump.c34
-rw-r--r--kernel/cred.c24
-rw-r--r--kernel/debug/debug_core.c2
-rw-r--r--kernel/debug/gdbstub.c30
-rw-r--r--kernel/debug/kdb/kdb_main.c12
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/exit.c21
-rw-r--r--kernel/fork.c201
-rw-r--r--kernel/freezer.c9
-rw-r--r--kernel/futex.c455
-rw-r--r--kernel/futex_compat.c11
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/gcov/Makefile2
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hrtimer.c177
-rw-r--r--kernel/hw_breakpoint.c2
-rw-r--r--kernel/irq/Kconfig49
-rw-r--r--kernel/irq/autoprobe.c52
-rw-r--r--kernel/irq/chip.c696
-rw-r--r--kernel/irq/debug.h44
-rw-r--r--kernel/irq/dummychip.c9
-rw-r--r--kernel/irq/handle.c226
-rw-r--r--kernel/irq/internals.h161
-rw-r--r--kernel/irq/irqdesc.c92
-rw-r--r--kernel/irq/manage.c637
-rw-r--r--kernel/irq/migration.c37
-rw-r--r--kernel/irq/pm.c30
-rw-r--r--kernel/irq/proc.c90
-rw-r--r--kernel/irq/resend.c18
-rw-r--r--kernel/irq/settings.h125
-rw-r--r--kernel/irq/spurious.c162
-rw-r--r--kernel/irq_work.c18
-rw-r--r--kernel/kallsyms.c58
-rw-r--r--kernel/kexec.c20
-rw-r--r--kernel/kprobes.c573
-rw-r--r--kernel/kthread.c46
-rw-r--r--kernel/latencytop.c25
-rw-r--r--kernel/lockdep.c22
-rw-r--r--kernel/lockdep_proc.c25
-rw-r--r--kernel/module.c197
-rw-r--r--kernel/mutex.c4
-rw-r--r--kernel/nsproxy.c4
-rw-r--r--kernel/padata.c8
-rw-r--r--kernel/panic.c11
-rw-r--r--kernel/params.c67
-rw-r--r--kernel/perf_event.c1827
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/pid_namespace.c11
-rw-r--r--kernel/pm_qos_params.c24
-rw-r--r--kernel/posix-cpu-timers.c112
-rw-r--r--kernel/posix-timers.c354
-rw-r--r--kernel/power/Kconfig246
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/block_io.c2
-rw-r--r--kernel/power/hibernate.c26
-rw-r--r--kernel/power/main.c7
-rw-r--r--kernel/power/nvs.c136
-rw-r--r--kernel/power/process.c14
-rw-r--r--kernel/power/snapshot.c15
-rw-r--r--kernel/power/suspend.c16
-rw-r--r--kernel/power/swap.c9
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/printk.c370
-rw-r--r--kernel/ptrace.c52
-rw-r--r--kernel/rcupdate.c10
-rw-r--r--kernel/rcutiny.c106
-rw-r--r--kernel/rcutiny_plugin.h435
-rw-r--r--kernel/rcutorture.c269
-rw-r--r--kernel/rcutree.c160
-rw-r--r--kernel/rcutree.h61
-rw-r--r--kernel/rcutree_plugin.h135
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/res_counter.c14
-rw-r--r--kernel/resource.c104
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c40
-rw-r--r--kernel/rtmutex.c318
-rw-r--r--kernel/rtmutex_common.h16
-rw-r--r--kernel/sched.c1375
-rw-r--r--kernel/sched_autogroup.c275
-rw-r--r--kernel/sched_autogroup.h41
-rw-r--r--kernel/sched_clock.c2
-rw-r--r--kernel/sched_debug.c125
-rw-r--r--kernel/sched_fair.c778
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_idletask.c28
-rw-r--r--kernel/sched_rt.c63
-rw-r--r--kernel/sched_stoptask.c9
-rw-r--r--kernel/signal.c201
-rw-r--r--kernel/smp.c197
-rw-r--r--kernel/softirq.c96
-rw-r--r--kernel/srcu.c19
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c90
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c141
-rw-r--r--kernel/sysctl_binary.c22
-rw-r--r--kernel/sysctl_check.c10
-rw-r--r--kernel/taskstats.c64
-rw-r--r--kernel/time.c39
-rw-r--r--kernel/time/Makefile3
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/clocksource.c11
-rw-r--r--kernel/time/jiffies.c22
-rw-r--r--kernel/time/ntp.c440
-rw-r--r--kernel/time/posix-clock.c445
-rw-r--r--kernel/time/tick-broadcast.c11
-rw-r--r--kernel/time/tick-common.c9
-rw-r--r--kernel/time/tick-internal.h12
-rw-r--r--kernel/time/tick-oneshot.c5
-rw-r--r--kernel/time/tick-sched.c8
-rw-r--r--kernel/time/timecompare.c5
-rw-r--r--kernel/time/timekeeping.c224
-rw-r--r--kernel/time/timer_list.c12
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/timer.c90
-rw-r--r--kernel/trace/Kconfig21
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c108
-rw-r--r--kernel/trace/ftrace.c57
-rw-r--r--kernel/trace/power-traces.c5
-rw-r--r--kernel/trace/ring_buffer.c39
-rw-r--r--kernel/trace/trace.c57
-rw-r--r--kernel/trace/trace.h41
-rw-r--r--kernel/trace/trace_clock.c2
-rw-r--r--kernel/trace/trace_entries.h10
-rw-r--r--kernel/trace/trace_event_perf.c31
-rw-r--r--kernel/trace/trace_events.c21
-rw-r--r--kernel/trace/trace_events_filter.c885
-rw-r--r--kernel/trace/trace_export.c20
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_irqsoff.c10
-rw-r--r--kernel/trace/trace_kprobe.c113
-rw-r--r--kernel/trace/trace_output.c36
-rw-r--r--kernel/trace/trace_sched_switch.c48
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/trace/trace_syscalls.c92
-rw-r--r--kernel/tracepoint.c31
-rw-r--r--kernel/uid16.c2
-rw-r--r--kernel/user-return-notifier.c2
-rw-r--r--kernel/user.c9
-rw-r--r--kernel/user_namespace.c15
-rw-r--r--kernel/utsname.c12
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/watchdog.c125
-rw-r--r--kernel/workqueue.c148
158 files changed, 11360 insertions, 5487 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa2..85cbfb31e73 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
@@ -100,12 +100,14 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -121,7 +123,7 @@ $(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d5..93950031706 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
int audit_enabled;
int audit_ever_enabled;
+EXPORT_SYMBOL_GPL(audit_enabled);
+
/* Default state when kernel boots without any parameters. */
static int audit_default;
@@ -400,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd dissapeared\n");
+ audit_log_lost("auditd disappeared\n");
audit_pid = 0;
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
pid = NETLINK_CREDS(skb)->pid;
uid = NETLINK_CREDS(skb)->uid;
- loginuid = NETLINK_CB(skb).loginuid;
- sessionid = NETLINK_CB(skb).sessionid;
- sid = NETLINK_CB(skb).sid;
+ loginuid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ security_task_getsecid(current, &sid);
seq = nlh->nlmsg_seq;
data = NLMSG_DATA(nlh);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c..e99dda04b12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
- /* this could be NULL if the watch is dieing else where... */
+ /* this could be NULL if the watch is dying else where... */
struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c786646..e683869365d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+static struct audit_parent *audit_init_parent(struct path *path)
{
- struct inode *inode = ndp->path.dentry->d_inode;
+ struct inode *inode = path->dentry->d_inode;
struct audit_parent *parent;
int ret;
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
}
/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct nameidata *ndparent, *ndwatch;
+ struct nameidata nd;
+ struct dentry *d;
int err;
- ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
- if (unlikely(!ndparent))
- return -ENOMEM;
+ err = kern_path_parent(watch->path, &nd);
+ if (err)
+ return err;
- ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
- if (unlikely(!ndwatch)) {
- kfree(ndparent);
- return -ENOMEM;
+ if (nd.last_type != LAST_NORM) {
+ path_put(&nd.path);
+ return -EINVAL;
}
- err = path_lookup(path, LOOKUP_PARENT, ndparent);
- if (err) {
- kfree(ndparent);
- kfree(ndwatch);
- return err;
+ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+ if (IS_ERR(d)) {
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ path_put(&nd.path);
+ return PTR_ERR(d);
}
-
- err = path_lookup(path, 0, ndwatch);
- if (err) {
- kfree(ndwatch);
- ndwatch = NULL;
+ if (d->d_inode) {
+ /* update watch filter fields */
+ watch->dev = d->d_inode->i_sb->s_dev;
+ watch->ino = d->d_inode->i_ino;
}
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- *ndp = ndparent;
- *ndw = ndwatch;
-
+ *parent = nd.path;
+ dput(d);
return 0;
}
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
- if (ndp) {
- path_put(&ndp->path);
- kfree(ndp);
- }
- if (ndw) {
- path_put(&ndw->path);
- kfree(ndw);
- }
-}
-
/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent;
- struct nameidata *ndp = NULL, *ndw = NULL;
+ struct path parent_path;
int h, ret = 0;
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
- ret = audit_get_nd(watch->path, &ndp, &ndw);
- if (ret) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
- goto error;
- }
+ ret = audit_get_nd(watch, &parent_path);
+ /* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- /* update watch filter fields */
- if (ndw) {
- watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->path.dentry->d_inode->i_ino;
- }
+ if (ret)
+ return ret;
/* either find an old parent or attach a new one */
- parent = audit_find_parent(ndp->path.dentry->d_inode);
+ parent = audit_find_parent(parent_path.dentry->d_inode);
if (!parent) {
- parent = audit_init_parent(ndp);
+ parent = audit_init_parent(&parent_path);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
- audit_put_nd(ndp, ndw); /* NULL args OK */
+ path_put(&parent_path);
return ret;
-
}
void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71..f8277c80d67 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
int result = 0;
+ u32 sid;
switch (f->type) {
case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
result = audit_comparator(cb->creds.gid, f->op, f->val);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(cb->loginuid, f->op, f->val);
+ result = audit_comparator(audit_get_loginuid(current),
+ f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
- if (f->lsm_rule)
- result = security_audit_rule_match(cb->sid,
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
f->type,
f->op,
f->lsm_rule,
NULL);
+ }
break;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2e..b33513a08be 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
/*
* to_send and len_sent accounting are very loose estimates. We aren't
* really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundry)
+ * within about 500 bytes (next page boundary)
*
* why snprintf? an int is up to 12 digits long. if we just assumed when
* logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c13..0c9b862292b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
void foo(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a..bf0c734d0c1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/*
@@ -290,6 +291,60 @@ error:
}
/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+ int ret = security_real_capable(t, &init_user_ns, cap);
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret = security_real_capable(t, ns, cap);
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+ return (ret == 0);
+}
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -299,17 +354,48 @@ error:
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
-int capable(int cap)
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
BUG();
}
- if (security_capable(cap) == 0) {
+ if (security_capable(ns, current_cred(), cap) == 0) {
current->flags |= PF_SUPERPRIV;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+ return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c1..25c7eb52de1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id {
};
/*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
/*
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
*/
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
iput(inode);
}
+static int cgroup_delete(const struct dentry *d)
+{
+ return 1;
+}
+
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
struct list_head *node;
BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(node);
if (d->d_inode) {
/* This should never be called on a cgroup
* directory with child cgroups */
BUG_ON(d->d_inode->i_mode & S_IFDIR);
- d = dget_locked(d);
- spin_unlock(&dcache_lock);
+ dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ spin_unlock(&dentry->d_lock);
d_delete(d);
simple_unlink(dentry->d_inode, d);
dput(d);
- spin_lock(&dcache_lock);
- }
+ spin_lock(&dentry->d_lock);
+ } else
+ spin_unlock(&d->d_lock);
node = dentry->d_subdirs.next;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
}
/*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
*/
static void cgroup_d_remove_dir(struct dentry *dentry)
{
+ struct dentry *parent;
+
cgroup_clear_directory(dentry);
- spin_lock(&dcache_lock);
+ parent = dentry->d_parent;
+ spin_lock(&parent->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&parent->d_lock);
remove_dir(dentry);
}
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
static int cgroup_get_rootdir(struct super_block *sb)
{
+ static const struct dentry_operations cgroup_dops = {
+ .d_iput = cgroup_diput,
+ .d_delete = cgroup_delete,
+ };
+
struct inode *inode =
cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
return -ENOMEM;
}
sb->s_root = dentry;
+ /* for everything else we want ->d_op set */
+ sb->s_d_op = &cgroup_dops;
return 0;
}
@@ -1791,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
/* Update the css_set linked lists if we're using them */
write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
@@ -2180,12 +2200,20 @@ static const struct file_operations cgroup_file_operations = {
};
static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
+ .lookup = cgroup_lookup,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
};
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+
/*
* Check if a file is a control file
*/
@@ -2199,10 +2227,6 @@ static inline struct cftype *__file_cft(struct file *file)
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- };
-
struct inode *inode;
if (!dentry)
@@ -2228,7 +2252,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
}
- dentry->d_op = &cgroup_dops;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
return 0;
@@ -3630,17 +3653,15 @@ again:
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
+ list_del_init(&cgrp->release_list);
spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
+ list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
- spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
dput(d);
@@ -3856,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
- list_del(&ss->sibling);
+ list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
@@ -4207,20 +4228,8 @@ void cgroup_post_fork(struct task_struct *child)
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
- int i;
struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
+ int i;
/*
* Unlink from the css_set task list if necessary.
@@ -4230,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
+ list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
@@ -4238,7 +4247,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
+
+ if (run_callbacks && need_forkexit_callback) {
+ /*
+ * modular subsystems can't use callbacks, so no need to lock
+ * the subsys array
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit) {
+ struct cgroup *old_cgrp =
+ rcu_dereference_raw(cg->subsys[i])->cgroup;
+ struct cgroup *cgrp = task_cgroup(tsk, i);
+ ss->exit(ss, cgrp, old_cgrp, tsk);
+ }
+ }
+ }
task_unlock(tsk);
+
if (cg)
put_css_set_taskexit(cg);
}
@@ -4790,6 +4816,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+ struct cgroup *cgrp;
+ struct inode *inode;
+ struct cgroup_subsys_state *css;
+
+ inode = f->f_dentry->d_inode;
+ /* check in cgroup filesystem dir */
+ if (inode->i_op != &cgroup_dir_inode_operations)
+ return ERR_PTR(-EBADF);
+
+ if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+ return ERR_PTR(-EINVAL);
+
+ /* get cgroup */
+ cgrp = __d_cgrp(f->f_dentry);
+ css = cgrp->subsys[id];
+ return css ? css : ERR_PTR(-ENOENT);
+}
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont)
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a..38b1d2c1cbe 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
}
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+{
+ memset(txc, 0, sizeof(struct timex));
+
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc->modes, &utp->modes) ||
+ __get_user(txc->offset, &utp->offset) ||
+ __get_user(txc->freq, &utp->freq) ||
+ __get_user(txc->maxerror, &utp->maxerror) ||
+ __get_user(txc->esterror, &utp->esterror) ||
+ __get_user(txc->status, &utp->status) ||
+ __get_user(txc->constant, &utp->constant) ||
+ __get_user(txc->precision, &utp->precision) ||
+ __get_user(txc->tolerance, &utp->tolerance) ||
+ __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc->tick, &utp->tick) ||
+ __get_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc->jitter, &utp->jitter) ||
+ __get_user(txc->shift, &utp->shift) ||
+ __get_user(txc->stabil, &utp->stabil) ||
+ __get_user(txc->jitcnt, &utp->jitcnt) ||
+ __get_user(txc->calcnt, &utp->calcnt) ||
+ __get_user(txc->errcnt, &utp->errcnt) ||
+ __get_user(txc->stbcnt, &utp->stbcnt))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
+{
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc->modes, &utp->modes) ||
+ __put_user(txc->offset, &utp->offset) ||
+ __put_user(txc->freq, &utp->freq) ||
+ __put_user(txc->maxerror, &utp->maxerror) ||
+ __put_user(txc->esterror, &utp->esterror) ||
+ __put_user(txc->status, &utp->status) ||
+ __put_user(txc->constant, &utp->constant) ||
+ __put_user(txc->precision, &utp->precision) ||
+ __put_user(txc->tolerance, &utp->tolerance) ||
+ __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc->tick, &utp->tick) ||
+ __put_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc->jitter, &utp->jitter) ||
+ __put_user(txc->shift, &utp->shift) ||
+ __put_user(txc->stabil, &utp->stabil) ||
+ __put_user(txc->jitcnt, &utp->jitcnt) ||
+ __put_user(txc->calcnt, &utp->calcnt) ||
+ __put_user(txc->errcnt, &utp->errcnt) ||
+ __put_user(txc->stbcnt, &utp->stbcnt) ||
+ __put_user(txc->tai, &utp->tai))
+ return -EFAULT;
+ return 0;
+}
+
asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
struct timezone __user *tz)
{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
return err;
}
+long compat_sys_clock_adjtime(clockid_t which_clock,
+ struct compat_timex __user *utp)
+{
+ struct timex txc;
+ mm_segment_t oldfs;
+ int err, ret;
+
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+ set_fs(oldfs);
+
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
+
+ return ret;
+}
+
long compat_sys_clock_getres(clockid_t which_clock,
struct compat_timespec __user *tp)
{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
{
struct timex txc;
- int ret;
-
- memset(&txc, 0, sizeof(struct timex));
+ int err, ret;
- if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
- __get_user(txc.modes, &utp->modes) ||
- __get_user(txc.offset, &utp->offset) ||
- __get_user(txc.freq, &utp->freq) ||
- __get_user(txc.maxerror, &utp->maxerror) ||
- __get_user(txc.esterror, &utp->esterror) ||
- __get_user(txc.status, &utp->status) ||
- __get_user(txc.constant, &utp->constant) ||
- __get_user(txc.precision, &utp->precision) ||
- __get_user(txc.tolerance, &utp->tolerance) ||
- __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __get_user(txc.tick, &utp->tick) ||
- __get_user(txc.ppsfreq, &utp->ppsfreq) ||
- __get_user(txc.jitter, &utp->jitter) ||
- __get_user(txc.shift, &utp->shift) ||
- __get_user(txc.stabil, &utp->stabil) ||
- __get_user(txc.jitcnt, &utp->jitcnt) ||
- __get_user(txc.calcnt, &utp->calcnt) ||
- __get_user(txc.errcnt, &utp->errcnt) ||
- __get_user(txc.stbcnt, &utp->stbcnt))
- return -EFAULT;
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
ret = do_adjtimex(&txc);
- if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
- __put_user(txc.modes, &utp->modes) ||
- __put_user(txc.offset, &utp->offset) ||
- __put_user(txc.freq, &utp->freq) ||
- __put_user(txc.maxerror, &utp->maxerror) ||
- __put_user(txc.esterror, &utp->esterror) ||
- __put_user(txc.status, &utp->status) ||
- __put_user(txc.constant, &utp->constant) ||
- __put_user(txc.precision, &utp->precision) ||
- __put_user(txc.tolerance, &utp->tolerance) ||
- __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __put_user(txc.tick, &utp->tick) ||
- __put_user(txc.ppsfreq, &utp->ppsfreq) ||
- __put_user(txc.jitter, &utp->jitter) ||
- __put_user(txc.shift, &utp->shift) ||
- __put_user(txc.stabil, &utp->stabil) ||
- __put_user(txc.jitcnt, &utp->jitcnt) ||
- __put_user(txc.calcnt, &utp->calcnt) ||
- __put_user(txc.errcnt, &utp->errcnt) ||
- __put_user(txc.stbcnt, &utp->stbcnt) ||
- __put_user(txc.tai, &utp->tai))
- ret = -EFAULT;
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
return ret;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f1849..12b7458f23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
#else /* #if CONFIG_HOTPLUG_CPU */
static void cpu_hotplug_begin(void) {}
static void cpu_hotplug_done(void) {}
-#endif /* #esle #if CONFIG_HOTPLUG_CPU */
+#endif /* #else #if CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
{
BUG_ON(cpu_notify(val, v));
}
-
EXPORT_SYMBOL(register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -189,7 +188,6 @@ static inline void check_for_tasks(int cpu)
}
struct take_cpu_down_param {
- struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
@@ -198,7 +196,6 @@ struct take_cpu_down_param {
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
- unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
@@ -207,12 +204,6 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
- if (task_cpu(param->caller) == cpu)
- move_task_off_dead_cpu(cpu, param->caller);
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
return 0;
}
@@ -223,7 +214,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
- .caller = current,
.mod = mod,
.hcpu = hcpu,
};
@@ -235,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
+
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
nr_calls--;
@@ -253,9 +244,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
}
BUG_ON(cpu_online(cpu));
- /* Wait for it to sleep (leaving idle task). */
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
while (!idle_cpu(cpu))
- yield();
+ cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);
@@ -306,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- printk("%s: attempt to bring up CPU %u failed\n",
+ printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
goto out_notify;
}
@@ -386,6 +383,14 @@ out:
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
@@ -397,6 +402,7 @@ int disable_nonboot_cpus(void)
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
+ arch_disable_nonboot_cpus_begin();
printk("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
@@ -412,6 +418,8 @@ int disable_nonboot_cpus(void)
}
}
+ arch_disable_nonboot_cpus_end();
+
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
@@ -441,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk("Enabling non-boot CPUs ...\n");
+ printk(KERN_INFO "Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
error = _cpu_up(cpu, 1);
if (!error) {
- printk("CPU%d is up\n", cpu);
+ printk(KERN_INFO "CPU%d is up\n", cpu);
continue;
}
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -500,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad..33eee16addb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;
+ static nodemask_t newmems; /* protected by cgroup_mutex */
cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
-
- cpuset_change_task_nodemask(p, newmems);
+ guarantee_online_mems(cs, &newmems);
- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &newmems);
mm = get_task_mm(p);
if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
- if (from == NULL || to == NULL)
- goto alloc_fail;
+ static nodemask_t to; /* protected by cgroup_mutex */
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
} else {
guarantee_online_cpus(cs, cpus_attach);
}
- guarantee_online_mems(cs, to);
+ guarantee_online_mems(cs, &to);
/* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
+ cpuset_attach_task(tsk, &to, cs);
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
+ cpuset_attach_task(c, &to, cs);
}
rcu_read_unlock();
}
/* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
- *to = cs->mems_allowed;
+ to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(from);
- NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
@@ -1575,8 +1561,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
return -ENODEV;
trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out;
+ }
switch (cft->private) {
case FILE_CPULIST:
@@ -1591,6 +1579,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
+out:
cgroup_unlock();
return retval;
}
@@ -1607,34 +1596,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
* across a page fault.
*/
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
- int ret;
+ size_t count;
mutex_lock(&callback_mutex);
- ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+ count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
- return ret;
+ return count;
}
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
- NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
- int retval;
-
- if (mask == NULL)
- return -ENOMEM;
+ size_t count;
mutex_lock(&callback_mutex);
- *mask = cs->mems_allowed;
+ count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
- retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
- NODEMASK_FREE(mask);
-
- return retval;
+ return count;
}
static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1859,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
cs = cgroup_cs(cgroup);
parent_cs = cgroup_cs(parent);
+ mutex_lock(&callback_mutex);
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+ mutex_unlock(&callback_mutex);
return;
}
@@ -2063,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
@@ -2083,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
- *oldmems = cp->mems_allowed;
+ oldmems = cp->mems_allowed;
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
@@ -2099,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, oldmems, NULL);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
}
/*
@@ -2144,19 +2123,16 @@ void cpuset_update_active_cpus(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
cgroup_lock();
switch (action) {
case MEM_ONLINE:
- *oldmems = top_cpuset.mems_allowed;
+ oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
break;
case MEM_OFFLINE:
/*
@@ -2170,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
}
cgroup_unlock();
- NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 00000000000..5f85690285d
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e37..5557b55048d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
static struct thread_group_cred init_tgcred = {
.usage = ATOMIC_INIT(2),
.tgid = 0,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
};
#endif
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
#endif
atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
goto error;
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
return new;
error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_uid(new->user);
get_group_info(new->group_info);
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
put_cred(old);
validate_creds(new);
return new;
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
}
EXPORT_SYMBOL(set_create_files_as);
+struct user_namespace *current_user_ns(void)
+{
+ return _current_user_ns();
+}
+EXPORT_SYMBOL(current_user_ns);
+
#ifdef CONFIG_DEBUG_CREDENTIALS
bool creds_are_invalid(const struct cred *cred)
@@ -748,7 +754,11 @@ bool creds_are_invalid(const struct cred *cred)
if (cred->magic != CRED_MAGIC)
return true;
#ifdef CONFIG_SECURITY_SELINUX
- if (selinux_is_enabled()) {
+ /*
+ * cred->security == NULL if security_cred_alloc_blank() or
+ * security_prepare_creds() returned an error.
+ */
+ if (selinux_is_enabled() && cred->security) {
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d..bad6786dee8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
/*
* For single stepping, try to only enter on the processor
- * that was single stepping. To gaurd against a deadlock, the
+ * that was single stepping. To guard against a deadlock, the
* kernel will only try for the value of sstep_tries before
* giving up and continuing on.
*/
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe..a11db956dd6 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
put_packet(remcom_out_buffer);
return 0;
}
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+ unsigned char checksum, ch, buffer[3];
+ int loop;
+
+ buffer[0] = 'W';
+ buffer[1] = hex_asc_hi(status);
+ buffer[2] = hex_asc_lo(status);
+
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+
+ for (loop = 0; loop < 3; loop++) {
+ ch = buffer[loop];
+ checksum += ch;
+ dbg_io_ops->write_char(ch);
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+ /* make sure the output is flushed, lest the bootloader clobber it */
+ dbg_io_ops->flush();
+}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e72976682..be14779bcef 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic;
static kdbtab_t *kdb_commands;
#define KDB_BASE_CMD_MAX 50
static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[50];
+static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
#define for_each_kdbcmd(cmd, num) \
for ((cmd) = kdb_base_commands, (num) = 0; \
num < kdb_max_commands; \
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
* symbol name, and offset to the caller.
*
* The argument may consist of a numeric value (decimal or
- * hexidecimal), a symbol name, a register name (preceeded by the
+ * hexidecimal), a symbol name, a register name (preceded by the
* percent sign), an environment variable with a numeric value
- * (preceeded by a dollar sign) or a simple arithmetic expression
+ * (preceded by a dollar sign) or a simple arithmetic expression
* consisting of a symbol name, +/-, and a numeric constant value
* (offset).
* Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
* error The hardware-defined error code
* reason2 kdb's current reason code.
* Initially error but can change
- * acording to kdb state.
+ * according to kdb state.
* db_result Result code from break or debug point.
* regs The exception frame at time of fault/breakpoint.
* should always be valid.
@@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void)
"Send a signal to a process", 0, KDB_REPEAT_NONE);
kdb_register_repeat("summary", kdb_summary, "",
"Summarize the system", 4, KDB_REPEAT_NONE);
- kdb_register_repeat("per_cpu", kdb_per_cpu, "",
+ kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
"Display per_cpu variables", 3, KDB_REPEAT_NONE);
kdb_register_repeat("grephelp", kdb_grep_help, "",
"Display help on | grep", 0, KDB_REPEAT_NONE);
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
}
}
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
void __init kdb_init(int lvl)
{
static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb05..5532dd37aa8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
* Mask for process state.
* Notes:
* The mask folds data from several sources into a single long value, so
- * be carefull not to overlap the bits. TASK_* bits are in the LSB,
+ * be careful not to overlap the bits. TASK_* bits are in the LSB,
* special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
* is no overlap between TASK_* and EXIT_* but that may not always be
* true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5..8dd87418154 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
- __get_cpu_var(process_counts)--;
+ __this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
}
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* Let father know we died
*
* Thread signals are configurable, but you aren't going to use
- * that to send signals to arbitary processes.
+ * that to send signals to arbitrary processes.
* That stops right now.
*
* If the parent exec id doesn't match the exec id we saved
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl));
+ WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
@@ -994,6 +995,15 @@ NORET_TYPE void do_exit(long code)
exit_fs(tsk);
check_stack_usage();
exit_thread();
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ *
+ * because of cgroup mode, must be called before cgroup_exit()
+ */
+ perf_event_exit_task(tsk);
+
cgroup_exit(tsk, 1);
if (group_dead)
@@ -1006,12 +1016,7 @@ NORET_TYPE void do_exit(long code)
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
- flush_ptrace_hw_breakpoint(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_event_exit_task(tsk);
+ ptrace_put_breakpoints(tsk);
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b..e7548dee636 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
+#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
@@ -66,6 +67,7 @@
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
+#include <linux/khugepaged.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -108,20 +110,25 @@ int nr_processes(void)
}
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node) \
+ kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk) \
+ kmem_cache_free(task_struct_cachep, (tsk))
static struct kmem_cache *task_struct_cachep;
#endif
#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
gfp_t mask = GFP_KERNEL | __GFP_ZERO;
#else
gfp_t mask = GFP_KERNEL;
#endif
- return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+ return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
@@ -169,6 +176,7 @@ EXPORT_SYMBOL(free_task);
static inline void free_signal_struct(struct signal_struct *sig)
{
taskstats_tgid_free(sig);
+ sched_autogroup_exit(sig);
kmem_cache_free(signal_cachep, sig);
}
@@ -191,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL_GPL(__put_task_struct);
/*
* macro override instead of weak attribute alias, to workaround
@@ -246,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
struct task_struct *tsk;
struct thread_info *ti;
unsigned long *stackend;
-
+ int node = tsk_fork_get_node(orig);
int err;
prepare_to_copy(orig);
- tsk = alloc_task_struct();
+ tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
- ti = alloc_thread_info(tsk);
+ ti = alloc_thread_info_node(tsk, node);
if (!ti) {
free_task_struct(tsk);
return NULL;
@@ -273,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
+ clear_tsk_need_resched(tsk);
stackend = end_of_stack(tsk);
*stackend = STACK_END_MAGIC; /* for overflow detection */
@@ -328,6 +338,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
+ retval = khugepaged_fork(mm, oldmm);
+ if (retval)
+ goto out;
prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -527,6 +540,9 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(mm->pmd_huge_pte);
+#endif
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -541,6 +557,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
@@ -667,6 +684,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->token_priority = 0;
mm->last_interval = 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ mm->pmd_huge_pte = NULL;
+#endif
+
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -904,9 +925,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
posix_cpu_timers_init_group(sig);
tty_audit_fork(sig);
+ sched_autogroup_fork(sig);
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
+ sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
@@ -1164,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}
p->pid = pid_nr(pid);
@@ -1188,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+ p->plug = NULL;
+#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
@@ -1273,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
tracehook_finish_clone(p, clone_flags, trace);
if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
+ if (is_child_reaper(pid))
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
@@ -1282,7 +1302,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
- __get_cpu_var(process_counts)++;
+ __this_cpu_inc(process_counts);
}
attach_pid(p, PIDTYPE_PID, pid);
nr_threads++;
@@ -1407,23 +1427,6 @@ long do_fork(unsigned long clone_flags,
}
/*
- * We hope to recycle these flags after 2.6.26
- */
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- static int __read_mostly count = 100;
-
- if (count > 0 && printk_ratelimit()) {
- char comm[TASK_COMM_LEN];
-
- count--;
- printk(KERN_INFO "fork(): process `%s' used deprecated "
- "clone flags 0x%lx\n",
- get_task_comm(comm, current),
- clone_flags & CLONE_STOPPED);
- }
- }
-
- /*
* When called from kernel_thread, don't do user tracing stuff.
*/
if (likely(user_mode(regs)))
@@ -1461,16 +1464,7 @@ long do_fork(unsigned long clone_flags,
*/
p->flags &= ~PF_STARTING;
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- /*
- * We'll start up with an immediate SIGSTOP.
- */
- sigaddset(&p->pending.signal, SIGSTOP);
- set_tsk_thread_flag(p, TIF_SIGPENDING);
- __set_task_state(p, TASK_STOPPED);
- } else {
- wake_up_new_task(p, clone_flags);
- }
+ wake_up_new_task(p, clone_flags);
tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);
@@ -1522,38 +1516,24 @@ void __init proc_caches_init(void)
}
/*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
*/
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
{
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ return -EINVAL;
/*
- * If unsharing a thread from a thread group, must also
- * unshare vm.
- */
- if (*flags_ptr & CLONE_THREAD)
- *flags_ptr |= CLONE_VM;
-
- /*
- * If unsharing vm, must also unshare signal handlers.
- */
- if (*flags_ptr & CLONE_VM)
- *flags_ptr |= CLONE_SIGHAND;
-
- /*
- * If unsharing namespace, must also unshare filesystem information.
+ * Not implemented, but pretend it works if there is nothing to
+ * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+ * needs to unshare vm.
*/
- if (*flags_ptr & CLONE_NEWNS)
- *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
- if (unshare_flags & CLONE_THREAD)
- return -EINVAL;
+ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+ /* FIXME: get_task_mm() increments ->mm_users */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+ }
return 0;
}
@@ -1580,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
- struct sighand_struct *sigh = current->sighand;
-
- if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
- return -EINVAL;
- else
- return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
- struct mm_struct *mm = current->mm;
-
- if ((unshare_flags & CLONE_VM) &&
- (mm && atomic_read(&mm->mm_users) > 1)) {
- return -EINVAL;
- }
-
- return 0;
-}
-
-/*
* Unshare file descriptor table if it is being shared
*/
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1635,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
*/
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
- int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct sighand_struct *new_sigh = NULL;
- struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
+ int err;
- check_unshare_flags(&unshare_flags);
-
- /* Return -EINVAL for all unsupported flags */
- err = -EINVAL;
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ err = check_unshare_flags(unshare_flags);
+ if (err)
goto bad_unshare_out;
/*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (unshare_flags & CLONE_NEWNS)
+ unshare_flags |= CLONE_FS;
+ /*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable.
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- if ((err = unshare_thread(unshare_flags)))
- goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
- goto bad_unshare_cleanup_thread;
- if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_fs;
- if ((err = unshare_vm(unshare_flags, &new_mm)))
- goto bad_unshare_cleanup_sigh;
+ goto bad_unshare_out;
if ((err = unshare_fd(unshare_flags, &new_fd)))
- goto bad_unshare_cleanup_vm;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
new_fs)))
goto bad_unshare_cleanup_fd;
- if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1699,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_mm) {
- mm = current->mm;
- active_mm = current->active_mm;
- current->mm = new_mm;
- current->active_mm = new_mm;
- if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- atomic_dec(&mm->oom_disable_count);
- atomic_inc(&new_mm->oom_disable_count);
- }
- activate_mm(active_mm, new_mm);
- new_mm = mm;
- }
-
if (new_fd) {
fd = current->files;
current->files = new_fd;
@@ -1728,20 +1659,10 @@ bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
-bad_unshare_cleanup_vm:
- if (new_mm)
- mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
- if (new_sigh)
- if (atomic_dec_and_test(&new_sigh->count))
- kmem_cache_free(sighand_cachep, new_sigh);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);
-bad_unshare_cleanup_thread:
bad_unshare_out:
return err;
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb..66ecd2ead21 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
}
if (should_send_signal(p)) {
- if (!signal_pending(p))
- fake_signal_wake_up(p);
+ fake_signal_wake_up(p);
+ /*
+ * fake_signal_wake_up() goes through p's scheduler
+ * lock and guarantees that TASK_STOPPED/TRACED ->
+ * TASK_RUNNING transition can't race with task state
+ * testing in try_to_freeze_tasks().
+ */
} else if (sig_only) {
return false;
} else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d..fe28dc282ea 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED 0x01
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
+/*
* Priority Inheritance state:
*/
struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
u32 bitset;
};
+static const struct futex_q futex_q_init = {
+ /* list gets initialized in queue_me()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
@@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
+ struct page *page, *page_head;
int err;
/*
@@ -251,11 +265,46 @@ again:
if (err < 0)
return err;
- page = compound_head(page);
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page_head = page;
+ if (unlikely(PageTail(page))) {
put_page(page);
+ /* serialize against __split_huge_page_splitting() */
+ local_irq_disable();
+ if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ page_head = compound_head(page);
+ /*
+ * page_head is valid pointer but we must pin
+ * it before taking the PG_lock and/or
+ * PG_compound_lock. The moment we re-enable
+ * irqs __split_huge_page_splitting() can
+ * return and the head page can be freed from
+ * under us. We can't take the PG_lock and/or
+ * PG_compound_lock on a page that could be
+ * freed from under us.
+ */
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ goto again;
+ }
+ }
+#else
+ page_head = compound_head(page);
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+#endif
+
+ lock_page(page_head);
+ if (!page_head->mapping) {
+ unlock_page(page_head);
+ put_page(page_head);
goto again;
}
@@ -266,25 +315,24 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (PageAnon(page_head)) {
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page->mapping->host;
- key->shared.pgoff = page->index;
+ key->shared.inode = page_head->mapping->host;
+ key->shared.pgoff = page_head->index;
}
get_futex_key_refs(key);
- unlock_page(page);
- put_page(page);
+ unlock_page(page_head);
+ put_page(page_head);
return 0;
}
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
{
drop_futex_key_refs(key);
}
@@ -333,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
return NULL;
}
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
{
- u32 curval;
+ int ret;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
- return curval;
+ return ret;
}
static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -626,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
int lock_taken, ret, ownerdied = 0;
- u32 uval, newval, curval;
+ u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
ret = lock_taken = 0;
@@ -636,19 +685,17 @@ retry:
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = task_pid_vnr(task);
+ newval = vpid;
if (set_waiters)
newval |= FUTEX_WAITERS;
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
@@ -675,14 +722,12 @@ retry:
*/
if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
/* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ newval = (curval & ~FUTEX_TID_MASK) | vpid;
ownerdied = 0;
lock_taken = 1;
}
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
if (unlikely(curval != uval))
goto retry;
@@ -727,6 +772,24 @@ retry:
return ret;
}
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+ || WARN_ON(plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
@@ -744,7 +807,7 @@ static void wake_futex(struct futex_q *q)
*/
get_task_struct(p);
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
* q->lock_ptr = NULL is written, without taking any locks. A
@@ -778,10 +841,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
- * This happens when we have stolen the lock and the original
- * pending owner did not enqueue itself back on the rt_mutex.
- * Thats not a tragedy. We know that way, that a lock waiter
- * is on the fly. We make the futex_q waiter the pending owner.
+ * It is possible that the next waiter (the one that brought
+ * this owner to the kernel) timed out and is no longer
+ * waiting on the lock.
*/
if (!new_owner)
new_owner = this->task;
@@ -796,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
ret = -EINVAL;
@@ -833,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
- if (oldval == -EFAULT)
- return oldval;
+ if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+ return -EFAULT;
if (oldval != uval)
return -EAGAIN;
@@ -870,7 +928,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
@@ -881,7 +940,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
if (unlikely(ret != 0))
goto out;
@@ -907,7 +966,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
}
@@ -917,7 +976,7 @@ out:
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +986,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -962,11 +1021,11 @@ retry_private:
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
@@ -996,9 +1055,9 @@ retry_private:
double_unlock_hb(hb1, hb2);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
return ret;
}
@@ -1023,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
plist_del(&q->list, &hb1->chain);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb2->lock;
-#endif
}
get_futex_key_refs(key2);
q->key = *key2;
@@ -1052,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
get_futex_key_refs(key);
q->key = *key;
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1133,13 +1185,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL)
* @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
+ * pi futex (pi to pi requeue is not supported)
*
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1200,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* >=0 - on success, the number of tasks requeued or woken
* <0 - on error
*/
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval,
- int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1243,10 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1216,11 +1268,11 @@ retry_private:
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
if (curval != *cmpval) {
@@ -1260,8 +1312,8 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
@@ -1269,8 +1321,8 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
cond_resched();
goto retry;
default:
@@ -1352,9 +1404,9 @@ out_unlock:
drop_futex_key_refs(&key1);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
if (pi_state != NULL)
free_pi_state(pi_state);
@@ -1409,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
@@ -1456,8 +1505,7 @@ retry:
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(q->pi_state);
@@ -1477,8 +1525,7 @@ retry:
static void unqueue_me_pi(struct futex_q *q)
__releases(q->lock_ptr)
{
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
@@ -1494,7 +1541,7 @@ static void unqueue_me_pi(struct futex_q *q)
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner, int fshared)
+ struct task_struct *newowner)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
@@ -1508,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
- * pending owner or we are the pending owner which failed to
- * get the rtmutex. We have to replace the pending owner TID
- * in the user space variable. This must be atomic as we have
- * to preserve the owner died bit here.
+ * previous highest priority waiter or we are the highest priority
+ * waiter but failed to get the rtmutex the first time.
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
* because we can fault here. Imagine swapped out pages or a fork
@@ -1530,9 +1577,7 @@ retry:
while (1) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
goto handle_fault;
if (curval == uval)
break;
@@ -1560,8 +1605,8 @@ retry:
/*
* To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the pending
- * owner itself or the task which stole the rtmutex) the
+ * lock here. That gives the other task (either the highest priority
+ * waiter itself or the task which stole the rtmutex) the
* chance to try the fixup of the pi_state. So once we are
* back from handling the fault we need to check the pi_state
* after reacquiring the hash bucket lock and before trying to
@@ -1587,20 +1632,11 @@ handle_fault:
goto retry;
}
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED 0x01
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
static long futex_wait_restart(struct restart_block *restart);
/**
* fixup_owner() - Post lock pi_state and corner case management
* @uaddr: user address of the futex
- * @fshared: whether the futex is shared (1) or not (0)
* @q: futex_q (contains pi_state and access to the rt_mutex)
* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
*
@@ -1613,8 +1649,7 @@ static long futex_wait_restart(struct restart_block *restart);
* 0 - success, lock not taken
* <0 - on error (-EFAULT)
*/
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
- int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
struct task_struct *owner;
int ret = 0;
@@ -1625,7 +1660,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
* did a lock-steal - fix up the PI-state in that case:
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}
@@ -1647,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
/*
* pi_state is incorrect, some other task did a lock steal and
* we returned due to timeout or signal without taking the
- * rt_mutex. Too late. We can access the rt_mutex_owner without
- * locking, as the other task is now blocked on the hash bucket
- * lock. Fix the state up.
+ * rt_mutex. Too late.
*/
+ raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+ if (!owner)
+ owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+ raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
/*
* Paranoia check. If we did not take the lock, then we should not be
- * the owner, nor the pending owner, of the rt_mutex.
+ * the owner of the rt_mutex.
*/
if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1715,7 +1752,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* futex_wait_setup() - Prepare to wait on a futex
* @uaddr: the futex userspace address
* @val: the expected value
- * @fshared: whether the futex is shared (1) or not (0)
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
* @hb: storage for hash_bucket pointer to be returned to caller
*
@@ -1728,7 +1765,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* 0 - uaddr contains val and hb has been locked
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
*/
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
{
u32 uval;
@@ -1743,17 +1780,17 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
- q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
if (unlikely(ret != 0))
return ret;
@@ -1769,10 +1806,10 @@ retry_private:
if (ret)
goto out;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
goto retry;
}
@@ -1783,32 +1820,29 @@ retry_private:
out:
if (ret)
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
return ret;
}
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct restart_block *restart;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
-
- q.pi_state = NULL;
q.bitset = bitset;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
@@ -1819,7 +1853,7 @@ retry:
* Prepare to wait on uaddr. On success, holds hb lock and increments
* q.key refs.
*/
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out;
@@ -1852,12 +1886,7 @@ retry:
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
ret = -ERESTART_RESTARTBLOCK;
@@ -1873,7 +1902,6 @@ out:
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = restart->futex.uaddr;
- int fshared = 0;
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart)
tp = &t;
}
restart->fn = do_no_restart_syscall;
- if (restart->futex.flags & FLAGS_SHARED)
- fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
- restart->futex.bitset,
- restart->futex.flags & FLAGS_CLOCKRT);
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
}
@@ -1895,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
- int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+ ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int res, ret;
if (refill_pi_state_cache())
@@ -1914,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
hrtimer_set_expires(&to->timer, *time);
}
- q.pi_state = NULL;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
if (unlikely(ret != 0))
goto out;
@@ -1941,7 +1963,7 @@ retry_private:
* exit to complete.
*/
queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
cond_resched();
goto retry;
default:
@@ -1971,7 +1993,7 @@ retry_private:
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr, fshared, &q, !ret);
+ res = fixup_owner(uaddr, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +2017,7 @@ out_unlock_put_key:
queue_unlock(&q, hb);
out_put_key:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +2030,10 @@ uaddr_faulted:
if (ret)
goto out_put_key;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
goto retry;
}
@@ -2020,13 +2042,13 @@ uaddr_faulted:
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
+ u32 uval, vpid = task_pid_vnr(current);
int ret;
retry:
@@ -2035,10 +2057,10 @@ retry:
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
if (unlikely(ret != 0))
goto out;
@@ -2050,17 +2072,14 @@ retry:
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
- if (unlikely(uval == -EFAULT))
+ if (!(uval & FUTEX_OWNER_DIED) &&
+ cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == task_pid_vnr(current)))
+ if (unlikely(uval == vpid))
goto out_unlock;
/*
@@ -2093,14 +2112,14 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
pi_faulted:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
if (!ret)
@@ -2145,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We were woken prior to requeue by a timeout or a signal.
* Unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &q->list.plist);
+ plist_del(&q->list, &hb->chain);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2160,7 +2179,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
- * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
@@ -2198,16 +2217,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* 0 - On success
* <0 - On error
*/
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
- int clockrt, u32 __user *uaddr2)
+ u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
- union futex_key key2;
- struct futex_q q;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
int res, ret;
if (!bitset)
@@ -2215,8 +2234,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
@@ -2229,12 +2249,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out;
- q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
@@ -2243,7 +2261,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* Prepare to wait on uaddr. On success, increments q.key (key1) ref
* count.
*/
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out_key2;
@@ -2273,8 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current,
- fshared);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
spin_unlock(q.lock_ptr);
}
} else {
@@ -2293,7 +2310,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr2, fshared, &q, !ret);
+ res = fixup_owner(uaddr2, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it
* acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2341,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
}
out_put_keys:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out_key2:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out:
if (to) {
@@ -2401,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
+ /* If victim is in different user_ns, then uids are not
+ comparable, so we must have CAP_SYS_PTRACE */
+ if (cred->user->user_ns != pcred->user->user_ns) {
+ if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+ goto err_unlock;
+ goto ok;
+ }
+ /* If victim is in same user_ns, then uids are comparable */
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
+ !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
goto err_unlock;
+ok:
head = p->robust_list;
rcu_read_unlock();
}
@@ -2443,11 +2469,20 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
- if (nval == -EFAULT)
- return -1;
-
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
if (nval != uval)
goto retry;
@@ -2551,58 +2586,57 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int clockrt, ret = -ENOSYS;
- int cmd = op & FUTEX_CMD_MASK;
- int fshared = 0;
+ int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
- fshared = 1;
+ flags |= FLAGS_SHARED;
- clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ return -ENOSYS;
+ }
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+ ret = futex_wait(uaddr, flags, val, timeout, val3);
break;
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, fshared, val, val3);
+ ret = futex_wake(uaddr, flags, val, val3);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+ ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, fshared);
+ ret = futex_unlock_pi(uaddr, flags);
break;
case FUTEX_TRYLOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
break;
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
- clockrt, uaddr2);
+ ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
break;
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 1);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
break;
default:
ret = -ENOSYS;
@@ -2659,8 +2693,7 @@ static int __init futex_init(void)
* implementation, the non-functional ones will return
* -ENOSYS.
*/
- curval = cmpxchg_futex_value_locked(NULL, 0, 0);
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5..5f9e689dc8f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
+ /* If victim is in different user_ns, then uids are not
+ comparable, so we must have CAP_SYS_PTRACE */
+ if (cred->user->user_ns != pcred->user->user_ns) {
+ if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+ goto err_unlock;
+ goto ok;
+ }
+ /* If victim is in same user_ns, then uids are comparable */
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
+ !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
goto err_unlock;
+ok:
head = p->compat_robust_list;
rcu_read_unlock();
}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da7..b8cadf70b1f 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d51..e97ca59e252 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
-EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf..1cc476d52dd 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!nsown_capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6c..87fdb3f8db1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
/*
* The timer bases:
*
- * Note: If we want to add new timer bases, we have to skip the two
- * clock ids captured by the cpu-timers. We do this by holding empty
- * entries rather than doing math adjustment of the clock ids.
- * This ensures that we capture erroneous accesses to these clock ids
- * rather than moving them into the range of valid clock id's.
+ * There are more clockids then hrtimer bases. Thus, we index
+ * into the timer bases by the hrtimer_base_type enum. When trying
+ * to reach a base using a clockid, hrtimer_clockid_to_base()
+ * is used to convert from clockid to the proper hrtimer_base_type.
*/
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
@@ -74,30 +73,43 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.get_time = &ktime_get,
.resolution = KTIME_LOW_RES,
},
+ {
+ .index = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
+ .resolution = KTIME_LOW_RES,
+ },
}
};
+static int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
+ [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
+ [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+};
+
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+ return hrtimer_clock_to_base_table[clock_id];
+}
+
+
/*
* Get the coarse grained time at the softirq based on xtime and
* wall_to_monotonic.
*/
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
- ktime_t xtim, tomono;
- struct timespec xts, tom;
- unsigned long seq;
+ ktime_t xtim, mono, boot;
+ struct timespec xts, tom, slp;
- do {
- seq = read_seqbegin(&xtime_lock);
- xts = __current_kernel_time();
- tom = __get_wall_to_monotonic();
- } while (read_seqretry(&xtime_lock, seq));
+ get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
xtim = timespec_to_ktime(xts);
- tomono = timespec_to_ktime(tom);
- base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
- base->clock_base[CLOCK_MONOTONIC].softirq_time =
- ktime_add(xtim, tomono);
+ mono = ktime_add(xtim, timespec_to_ktime(tom));
+ boot = ktime_add(mono, timespec_to_ktime(slp));
+ base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
+ base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
+ base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
}
/*
@@ -184,10 +196,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
struct hrtimer_cpu_base *new_cpu_base;
int this_cpu = smp_processor_id();
int cpu = hrtimer_get_target(this_cpu, pinned);
+ int basenum = hrtimer_clockid_to_base(base->index);
again:
new_cpu_base = &per_cpu(hrtimer_bases, cpu);
- new_base = &new_cpu_base->clock_base[base->index];
+ new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) {
/*
@@ -334,6 +347,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
static struct debug_obj_descr hrtimer_debug_descr;
+static void *hrtimer_debug_hint(void *addr)
+{
+ return ((struct hrtimer *) addr)->function;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
@@ -393,6 +411,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
static struct debug_obj_descr hrtimer_debug_descr = {
.name = "hrtimer",
+ .debug_hint = hrtimer_debug_hint,
.fixup_init = hrtimer_fixup_init,
.fixup_activate = hrtimer_fixup_activate,
.fixup_free = hrtimer_fixup_free,
@@ -497,7 +516,7 @@ static inline int hrtimer_is_hres_enabled(void)
*/
static inline int hrtimer_hres_active(void)
{
- return __get_cpu_var(hrtimer_bases).hres_active;
+ return __this_cpu_read(hrtimer_bases.hres_active);
}
/*
@@ -516,10 +535,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
struct hrtimer *timer;
+ struct timerqueue_node *next;
- if (!base->first)
+ next = timerqueue_getnext(&base->active);
+ if (!next)
continue;
- timer = rb_entry(base->first, struct hrtimer, node);
+ timer = container_of(next, struct hrtimer, node);
+
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
/*
* clock_was_set() has changed base->offset so the
@@ -608,24 +630,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base;
- struct timespec realtime_offset, wtm;
- unsigned long seq;
+ struct timespec realtime_offset, wtm, sleep;
if (!hrtimer_hres_active())
return;
- do {
- seq = read_seqbegin(&xtime_lock);
- wtm = __get_wall_to_monotonic();
- } while (read_seqretry(&xtime_lock, seq));
+ get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
+ &sleep);
set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
base = &__get_cpu_var(hrtimer_bases);
/* Adjust CLOCK_REALTIME offset */
raw_spin_lock(&base->lock);
- base->clock_base[CLOCK_REALTIME].offset =
+ base->clock_base[HRTIMER_BASE_REALTIME].offset =
timespec_to_ktime(realtime_offset);
+ base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+ timespec_to_ktime(sleep);
hrtimer_force_reprogram(base, 0);
raw_spin_unlock(&base->lock);
@@ -670,14 +691,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
}
/*
- * Initialize the high resolution related parts of a hrtimer
- */
-static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
-{
-}
-
-
-/*
* When High resolution timers are active, try to reprogram. Note, that in case
* the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
* check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -722,8 +735,9 @@ static int hrtimer_switch_to_hres(void)
return 0;
}
base->hres_active = 1;
- base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
- base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
+ base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
+ base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
+ base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
@@ -747,7 +761,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
return 0;
}
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
@@ -840,48 +853,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
static int enqueue_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
- struct rb_node **link = &base->active.rb_node;
- struct rb_node *parent = NULL;
- struct hrtimer *entry;
- int leftmost = 1;
-
debug_activate(timer);
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct hrtimer, node);
- /*
- * We dont care about collisions. Nodes with
- * the same expiry time stay together.
- */
- if (hrtimer_get_expires_tv64(timer) <
- hrtimer_get_expires_tv64(entry)) {
- link = &(*link)->rb_left;
- } else {
- link = &(*link)->rb_right;
- leftmost = 0;
- }
- }
-
- /*
- * Insert the timer to the rbtree and check whether it
- * replaces the first pending timer
- */
- if (leftmost)
- base->first = &timer->node;
+ timerqueue_add(&base->active, &timer->node);
- rb_link_node(&timer->node, parent, link);
- rb_insert_color(&timer->node, &base->active);
/*
* HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
* state of a possibly running callback.
*/
timer->state |= HRTIMER_STATE_ENQUEUED;
- return leftmost;
+ return (&timer->node == base->active.next);
}
/*
@@ -901,12 +883,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
goto out;
- /*
- * Remove the timer from the rbtree and replace the first
- * entry pointer if necessary.
- */
- if (base->first == &timer->node) {
- base->first = rb_next(&timer->node);
+ if (&timer->node == timerqueue_getnext(&base->active)) {
#ifdef CONFIG_HIGH_RES_TIMERS
/* Reprogram the clock event device. if enabled */
if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +896,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
}
#endif
}
- rb_erase(&timer->node, &base->active);
+ timerqueue_del(&base->active, &timer->node);
out:
timer->state = newstate;
}
@@ -1128,11 +1105,13 @@ ktime_t hrtimer_get_next_event(void)
if (!hrtimer_hres_active()) {
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
struct hrtimer *timer;
+ struct timerqueue_node *next;
- if (!base->first)
+ next = timerqueue_getnext(&base->active);
+ if (!next)
continue;
- timer = rb_entry(base->first, struct hrtimer, node);
+ timer = container_of(next, struct hrtimer, node);
delta.tv64 = hrtimer_get_expires_tv64(timer);
delta = ktime_sub(delta, base->get_time());
if (delta.tv64 < mindelta.tv64)
@@ -1152,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
struct hrtimer_cpu_base *cpu_base;
+ int base;
memset(timer, 0, sizeof(struct hrtimer));
@@ -1160,8 +1140,9 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
clock_id = CLOCK_MONOTONIC;
- timer->base = &cpu_base->clock_base[clock_id];
- hrtimer_init_timer_hres(timer);
+ base = hrtimer_clockid_to_base(clock_id);
+ timer->base = &cpu_base->clock_base[base];
+ timerqueue_init(&timer->node);
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
@@ -1195,9 +1176,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
{
struct hrtimer_cpu_base *cpu_base;
+ int base = hrtimer_clockid_to_base(which_clock);
cpu_base = &__raw_get_cpu_var(hrtimer_bases);
- *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
+ *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
return 0;
}
@@ -1278,14 +1260,14 @@ retry:
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
ktime_t basenow;
- struct rb_node *node;
+ struct timerqueue_node *node;
basenow = ktime_add(now, base->offset);
- while ((node = base->first)) {
+ while ((node = timerqueue_getnext(&base->active))) {
struct hrtimer *timer;
- timer = rb_entry(node, struct hrtimer, node);
+ timer = container_of(node, struct hrtimer, node);
/*
* The immediate goal for using the softexpires is
@@ -1441,7 +1423,7 @@ void hrtimer_run_pending(void)
*/
void hrtimer_run_queues(void)
{
- struct rb_node *node;
+ struct timerqueue_node *node;
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
int index, gettime = 1;
@@ -1451,8 +1433,7 @@ void hrtimer_run_queues(void)
for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
base = &cpu_base->clock_base[index];
-
- if (!base->first)
+ if (!timerqueue_getnext(&base->active))
continue;
if (gettime) {
@@ -1462,10 +1443,10 @@ void hrtimer_run_queues(void)
raw_spin_lock(&cpu_base->lock);
- while ((node = base->first)) {
+ while ((node = timerqueue_getnext(&base->active))) {
struct hrtimer *timer;
- timer = rb_entry(node, struct hrtimer, node);
+ timer = container_of(node, struct hrtimer, node);
if (base->softirq_time.tv64 <=
hrtimer_get_expires_tv64(timer))
break;
@@ -1630,8 +1611,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
raw_spin_lock_init(&cpu_base->lock);
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
+ timerqueue_init_head(&cpu_base->clock_base[i].active);
+ }
hrtimer_init_hres(cpu_base);
}
@@ -1642,10 +1625,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base)
{
struct hrtimer *timer;
- struct rb_node *node;
+ struct timerqueue_node *node;
- while ((node = rb_first(&old_base->active))) {
- timer = rb_entry(node, struct hrtimer, node);
+ while ((node = timerqueue_getnext(&old_base->active))) {
+ timer = container_of(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_deactivate(timer);
@@ -1774,7 +1757,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
}
/*
- * A NULL parameter means "inifinte"
+ * A NULL parameter means "infinite"
*/
if (!expires) {
schedule();
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e5325825aeb..086adf25a55 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void)
constraints_initialized = 1;
- perf_pmu_register(&perf_breakpoint);
+ perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2..c574f9a12c4 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
+# Select this to activate the generic irq options below
config HAVE_GENERIC_HARDIRQS
- def_bool n
+ bool
if HAVE_GENERIC_HARDIRQS
menu "IRQ subsystem"
@@ -9,31 +10,47 @@ menu "IRQ subsystem"
config GENERIC_HARDIRQS
def_bool y
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
-# Select this to disable the deprecated stuff
-config GENERIC_HARDIRQS_NO_DEPRECATED
- def_bool n
-
# Options selectable by the architecture code
+
+# Make sparse irq Kconfig switch below available
config HAVE_SPARSE_IRQ
- def_bool n
+ bool
+# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
- def_bool n
+ bool
+
+# Use the generic /proc/interrupts implementation
+config GENERIC_IRQ_SHOW
+ bool
+# Print level/edge extra information
+config GENERIC_IRQ_SHOW_LEVEL
+ bool
+
+# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
- def_bool n
+ bool
+# Alpha specific irq affinity mechanism
config AUTO_IRQ_AFFINITY
- def_bool n
-
-config IRQ_PER_CPU
- def_bool n
+ bool
+# Tasklet based software resend for pending interrupts on enable_irq()
config HARDIRQS_SW_RESEND
- def_bool n
+ bool
+
+# Preflow handler support for fasteoi (sparc64)
+config IRQ_PREFLOW_FASTEOI
+ bool
+
+# Edge style eoi based handler (cell)
+config IRQ_EDGE_EOI_HANDLER
+ bool
+
+# Support forced irq threading
+config IRQ_FORCED_THREADING
+ bool
config SPARSE_IRQ
bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c3..342d8f44e40 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
/*
* Autodetection depends on the fact that any interrupt that
* comes in on to an unassigned handler will get stuck with
- * "IRQ_WAITING" cleared and the interrupt disabled.
+ * "IRQS_WAITING" cleared and the interrupt disabled.
*/
static DEFINE_MUTEX(probing_active);
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
{
struct irq_desc *desc;
unsigned long mask = 0;
- unsigned int status;
int i;
/*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
- if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
- /*
- * An old-style architecture might still have
- * the handle_bad_irq handler there:
- */
- compat_irq_chip_set_default_handler(desc);
-
+ if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_startup(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -75,10 +68,10 @@ unsigned long probe_irq_on(void)
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
- if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
- desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
- if (desc->irq_data.chip->irq_startup(&desc->irq_data))
- desc->status |= IRQ_PENDING;
+ if (!desc->action && irq_settings_can_probe(desc)) {
+ desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
+ if (irq_startup(desc))
+ desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -93,13 +86,12 @@ unsigned long probe_irq_on(void)
*/
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
- status = desc->status;
- if (status & IRQ_AUTODETECT) {
+ if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
- if (!(status & IRQ_WAITING)) {
- desc->status = status & ~IRQ_AUTODETECT;
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ if (!(desc->istate & IRQS_WAITING)) {
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
} else
if (i < 32)
mask |= 1 << i;
@@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
*/
unsigned int probe_irq_mask(unsigned long val)
{
- unsigned int status, mask = 0;
+ unsigned int mask = 0;
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
- status = desc->status;
-
- if (status & IRQ_AUTODETECT) {
- if (i < 16 && !(status & IRQ_WAITING))
+ if (desc->istate & IRQS_AUTODETECT) {
+ if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
- desc->status = status & ~IRQ_AUTODETECT;
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val)
{
int i, irq_found = 0, nr_of_irqs = 0;
struct irq_desc *desc;
- unsigned int status;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
- status = desc->status;
- if (status & IRQ_AUTODETECT) {
- if (!(status & IRQ_WAITING)) {
+ if (desc->istate & IRQS_AUTODETECT) {
+ if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
irq_found = i;
nr_of_irqs++;
}
- desc->status = status & ~IRQ_AUTODETECT;
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad8..4af1e2b244c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,115 @@
#include "internals.h"
/**
- * set_irq_chip - set the irq chip for an irq
+ * irq_set_chip - set the irq chip for an irq
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, struct irq_chip *chip)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+ if (!desc)
return -EINVAL;
- }
if (!chip)
chip = &no_irq_chip;
- raw_spin_lock_irqsave(&desc->lock, flags);
- irq_chip_set_defaults(chip);
desc->irq_data.chip = chip;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ irq_put_desc_unlock(desc, flags);
+ /*
+ * For !CONFIG_SPARSE_IRQ make the irq show up in
+ * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
+ * already marked, and this call is harmless.
+ */
+ irq_reserve_irq(irq);
return 0;
}
-EXPORT_SYMBOL(set_irq_chip);
+EXPORT_SYMBOL(irq_set_chip);
/**
- * set_irq_type - set the irq trigger type for an irq
+ * irq_set_type - set the irq trigger type for an irq
* @irq: irq number
* @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
-int set_irq_type(unsigned int irq, unsigned int type)
+int irq_set_irq_type(unsigned int irq, unsigned int type)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- int ret = -ENXIO;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ int ret = 0;
- if (!desc) {
- printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
- return -ENODEV;
- }
+ if (!desc)
+ return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
- if (type == IRQ_TYPE_NONE)
- return 0;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = __irq_set_trigger(desc, irq, type);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (type != IRQ_TYPE_NONE)
+ ret = __irq_set_trigger(desc, irq, type);
+ irq_put_desc_busunlock(desc, flags);
return ret;
}
-EXPORT_SYMBOL(set_irq_type);
+EXPORT_SYMBOL(irq_set_irq_type);
/**
- * set_irq_data - set irq type data for an irq
+ * irq_set_handler_data - set irq handler data for an irq
* @irq: Interrupt number
* @data: Pointer to interrupt specific data
*
* Set the hardware irq controller data for an irq
*/
-int set_irq_data(unsigned int irq, void *data)
+int irq_set_handler_data(unsigned int irq, void *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install controller data for IRQ%d\n", irq);
+ if (!desc)
return -EINVAL;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
desc->irq_data.handler_data = data;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ irq_put_desc_unlock(desc, flags);
return 0;
}
-EXPORT_SYMBOL(set_irq_data);
+EXPORT_SYMBOL(irq_set_handler_data);
/**
- * set_irq_msi - set MSI descriptor data for an irq
+ * irq_set_msi_desc - set MSI descriptor data for an irq
* @irq: Interrupt number
* @entry: Pointer to MSI descriptor data
*
* Set the MSI descriptor entry for an irq
*/
-int set_irq_msi(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install msi data for IRQ%d\n", irq);
+ if (!desc)
return -EINVAL;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
desc->irq_data.msi_desc = entry;
if (entry)
entry->irq = irq;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ irq_put_desc_unlock(desc, flags);
return 0;
}
/**
- * set_irq_chip_data - set irq chip data for an irq
+ * irq_set_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
*
* Set the hardware irq chip data for an irq
*/
-int set_irq_chip_data(unsigned int irq, void *data)
+int irq_set_chip_data(unsigned int irq, void *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install chip data for IRQ%d\n", irq);
- return -EINVAL;
- }
-
- if (!desc->irq_data.chip) {
- printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+ if (!desc)
return -EINVAL;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
desc->irq_data.chip_data = data;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ irq_put_desc_unlock(desc, flags);
return 0;
}
-EXPORT_SYMBOL(set_irq_chip_data);
+EXPORT_SYMBOL(irq_set_chip_data);
struct irq_data *irq_get_irq_data(unsigned int irq)
{
@@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
}
EXPORT_SYMBOL_GPL(irq_get_irq_data);
-/**
- * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
- *
- * @irq: Interrupt number
- * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
- *
- * The IRQ_NESTED_THREAD flag indicates that on
- * request_threaded_irq() no separate interrupt thread should be
- * created for the irq as the handler are called nested in the
- * context of a demultiplexing interrupt handler thread.
- */
-void set_irq_nested_thread(unsigned int irq, int nest)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- if (!desc)
- return;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (nest)
- desc->status |= IRQ_NESTED_THREAD;
- else
- desc->status &= ~IRQ_NESTED_THREAD;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-EXPORT_SYMBOL_GPL(set_irq_nested_thread);
-
-/*
- * default enable function
- */
-static void default_enable(struct irq_data *data)
+static void irq_state_clr_disabled(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_data_to_desc(data);
-
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
- desc->status &= ~IRQ_MASKED;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-/*
- * default disable function
- */
-static void default_disable(struct irq_data *data)
-{
-}
-
-/*
- * default startup function
- */
-static unsigned int default_startup(struct irq_data *data)
+static void irq_state_set_disabled(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_data_to_desc(data);
-
- desc->irq_data.chip->irq_enable(data);
- return 0;
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-/*
- * default shutdown function
- */
-static void default_shutdown(struct irq_data *data)
+static void irq_state_clr_masked(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_data_to_desc(data);
-
- desc->irq_data.chip->irq_mask(&desc->irq_data);
- desc->status |= IRQ_MASKED;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
}
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-/* Temporary migration helpers */
-static void compat_irq_mask(struct irq_data *data)
+static void irq_state_set_masked(struct irq_desc *desc)
{
- data->chip->mask(data->irq);
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
}
-static void compat_irq_unmask(struct irq_data *data)
+int irq_startup(struct irq_desc *desc)
{
- data->chip->unmask(data->irq);
-}
+ irq_state_clr_disabled(desc);
+ desc->depth = 0;
-static void compat_irq_ack(struct irq_data *data)
-{
- data->chip->ack(data->irq);
-}
-
-static void compat_irq_mask_ack(struct irq_data *data)
-{
- data->chip->mask_ack(data->irq);
-}
-
-static void compat_irq_eoi(struct irq_data *data)
-{
- data->chip->eoi(data->irq);
-}
-
-static void compat_irq_enable(struct irq_data *data)
-{
- data->chip->enable(data->irq);
-}
-
-static void compat_irq_disable(struct irq_data *data)
-{
- data->chip->disable(data->irq);
-}
-
-static void compat_irq_shutdown(struct irq_data *data)
-{
- data->chip->shutdown(data->irq);
-}
-
-static unsigned int compat_irq_startup(struct irq_data *data)
-{
- return data->chip->startup(data->irq);
-}
-
-static int compat_irq_set_affinity(struct irq_data *data,
- const struct cpumask *dest, bool force)
-{
- return data->chip->set_affinity(data->irq, dest);
-}
-
-static int compat_irq_set_type(struct irq_data *data, unsigned int type)
-{
- return data->chip->set_type(data->irq, type);
-}
-
-static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
-{
- return data->chip->set_wake(data->irq, on);
-}
+ if (desc->irq_data.chip->irq_startup) {
+ int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ return ret;
+ }
-static int compat_irq_retrigger(struct irq_data *data)
-{
- return data->chip->retrigger(data->irq);
+ irq_enable(desc);
+ return 0;
}
-static void compat_bus_lock(struct irq_data *data)
+void irq_shutdown(struct irq_desc *desc)
{
- data->chip->bus_lock(data->irq);
+ irq_state_set_disabled(desc);
+ desc->depth = 1;
+ if (desc->irq_data.chip->irq_shutdown)
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ if (desc->irq_data.chip->irq_disable)
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_state_set_masked(desc);
}
-static void compat_bus_sync_unlock(struct irq_data *data)
+void irq_enable(struct irq_desc *desc)
{
- data->chip->bus_sync_unlock(data->irq);
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable)
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
}
-#endif
-/*
- * Fixup enable/disable function pointers
- */
-void irq_chip_set_defaults(struct irq_chip *chip)
+void irq_disable(struct irq_desc *desc)
{
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
- /*
- * Compat fixup functions need to be before we set the
- * defaults for enable/disable/startup/shutdown
- */
- if (chip->enable)
- chip->irq_enable = compat_irq_enable;
- if (chip->disable)
- chip->irq_disable = compat_irq_disable;
- if (chip->shutdown)
- chip->irq_shutdown = compat_irq_shutdown;
- if (chip->startup)
- chip->irq_startup = compat_irq_startup;
-#endif
- /*
- * The real defaults
- */
- if (!chip->irq_enable)
- chip->irq_enable = default_enable;
- if (!chip->irq_disable)
- chip->irq_disable = default_disable;
- if (!chip->irq_startup)
- chip->irq_startup = default_startup;
- /*
- * We use chip->irq_disable, when the user provided its own. When
- * we have default_disable set for chip->irq_disable, then we need
- * to use default_shutdown, otherwise the irq line is not
- * disabled on free_irq():
- */
- if (!chip->irq_shutdown)
- chip->irq_shutdown = chip->irq_disable != default_disable ?
- chip->irq_disable : default_shutdown;
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
- if (!chip->end)
- chip->end = dummy_irq_chip.end;
-
- /*
- * Now fix up the remaining compat handlers
- */
- if (chip->bus_lock)
- chip->irq_bus_lock = compat_bus_lock;
- if (chip->bus_sync_unlock)
- chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
- if (chip->mask)
- chip->irq_mask = compat_irq_mask;
- if (chip->unmask)
- chip->irq_unmask = compat_irq_unmask;
- if (chip->ack)
- chip->irq_ack = compat_irq_ack;
- if (chip->mask_ack)
- chip->irq_mask_ack = compat_irq_mask_ack;
- if (chip->eoi)
- chip->irq_eoi = compat_irq_eoi;
- if (chip->set_affinity)
- chip->irq_set_affinity = compat_irq_set_affinity;
- if (chip->set_type)
- chip->irq_set_type = compat_irq_set_type;
- if (chip->set_wake)
- chip->irq_set_wake = compat_irq_set_wake;
- if (chip->retrigger)
- chip->irq_retrigger = compat_irq_retrigger;
-#endif
+ irq_state_set_disabled(desc);
+ if (desc->irq_data.chip->irq_disable) {
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ irq_state_set_masked(desc);
+ }
}
static inline void mask_ack_irq(struct irq_desc *desc)
@@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
if (desc->irq_data.chip->irq_ack)
desc->irq_data.chip->irq_ack(&desc->irq_data);
}
- desc->status |= IRQ_MASKED;
+ irq_state_set_masked(desc);
}
-static inline void mask_irq(struct irq_desc *desc)
+void mask_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_mask) {
desc->irq_data.chip->irq_mask(&desc->irq_data);
- desc->status |= IRQ_MASKED;
+ irq_state_set_masked(desc);
}
}
-static inline void unmask_irq(struct irq_desc *desc)
+void unmask_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_unmask) {
desc->irq_data.chip->irq_unmask(&desc->irq_data);
- desc->status &= ~IRQ_MASKED;
+ irq_state_clr_masked(desc);
}
}
@@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
goto out_unlock;
- desc->status |= IRQ_INPROGRESS;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
note_interrupt(irq, desc, action_ret);
raw_spin_lock_irq(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
out_unlock:
raw_spin_unlock_irq(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
+static bool irq_check_poll(struct irq_desc *desc)
+{
+ if (!(desc->istate & IRQS_POLL_INPROGRESS))
+ return false;
+ return irq_wait_for_poll(desc);
+}
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -461,29 +293,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
void
handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
- struct irqaction *action;
- irqreturn_t action_ret;
-
raw_spin_lock(&desc->lock);
- if (unlikely(desc->status & IRQ_INPROGRESS))
- goto out_unlock;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
+ if (!irq_check_poll(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
- action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
goto out_unlock;
- desc->status |= IRQ_INPROGRESS;
- raw_spin_unlock(&desc->lock);
+ handle_irq_event(desc);
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
out_unlock:
raw_spin_unlock(&desc->lock);
}
@@ -501,42 +324,42 @@ out_unlock:
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
- struct irqaction *action;
- irqreturn_t action_ret;
-
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
- if (unlikely(desc->status & IRQ_INPROGRESS))
- goto out_unlock;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
+ if (!irq_check_poll(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If its disabled or no action available
* keep it masked and get out of here
*/
- action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
goto out_unlock;
- desc->status |= IRQ_INPROGRESS;
- raw_spin_unlock(&desc->lock);
-
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
+ handle_irq_event(desc);
- if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
+ if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
unmask_irq(desc);
out_unlock:
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
+#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
+static inline void preflow_handler(struct irq_desc *desc)
+{
+ if (desc->preflow_handler)
+ desc->preflow_handler(&desc->irq_data);
+}
+#else
+static inline void preflow_handler(struct irq_desc *desc) { }
+#endif
+
/**
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
@@ -550,42 +373,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
- struct irqaction *action;
- irqreturn_t action_ret;
-
raw_spin_lock(&desc->lock);
- if (unlikely(desc->status & IRQ_INPROGRESS))
- goto out;
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
+ if (!irq_check_poll(desc))
+ goto out;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If its disabled or no action available
* then mask it and get out of here:
*/
- action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
- desc->status |= IRQ_PENDING;
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
mask_irq(desc);
goto out;
}
- desc->status |= IRQ_INPROGRESS;
- desc->status &= ~IRQ_PENDING;
- raw_spin_unlock(&desc->lock);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ preflow_handler(desc);
+ handle_irq_event(desc);
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
-out:
+out_eoi:
desc->irq_data.chip->irq_eoi(&desc->irq_data);
-
+out_unlock:
raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ goto out_eoi;
+ goto out_unlock;
}
/**
@@ -594,7 +415,7 @@ out:
* @desc: the interrupt description structure for this irq
*
* Interrupt occures on the falling and/or rising edge of a hardware
- * signal. The occurence is latched into the irq controller hardware
+ * signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be reenabled. After the ack another
* interrupt can happen on the same source even before the first one
* is handled by the associated event handler. If this happens it
@@ -609,32 +430,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
/*
* If we're currently running this IRQ, or its disabled,
* we shouldn't process the IRQ. Mark it pending, handle
* the necessary masking and go out
*/
- if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
- !desc->action)) {
- desc->status |= (IRQ_PENDING | IRQ_MASKED);
- mask_ack_irq(desc);
- goto out_unlock;
+ if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
+ irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
+ if (!irq_check_poll(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
- /* Mark the IRQ currently in progress.*/
- desc->status |= IRQ_INPROGRESS;
-
do {
- struct irqaction *action = desc->action;
- irqreturn_t action_ret;
-
- if (unlikely(!action)) {
+ if (unlikely(!desc->action)) {
mask_irq(desc);
goto out_unlock;
}
@@ -644,26 +460,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
* one, we could have masked the irq.
* Renable it, if it was not disabled in meantime.
*/
- if (unlikely((desc->status &
- (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
- (IRQ_PENDING | IRQ_MASKED))) {
- unmask_irq(desc);
+ if (unlikely(desc->istate & IRQS_PENDING)) {
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data))
+ unmask_irq(desc);
}
- desc->status &= ~IRQ_PENDING;
- raw_spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
- raw_spin_lock(&desc->lock);
+ handle_irq_event(desc);
- } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+ } while ((desc->istate & IRQS_PENDING) &&
+ !irqd_irq_disabled(&desc->irq_data));
- desc->status &= ~IRQ_INPROGRESS;
out_unlock:
raw_spin_unlock(&desc->lock);
}
+#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
+/**
+ * handle_edge_eoi_irq - edge eoi type IRQ handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar as the above handle_edge_irq, but using eoi and w/o the
+ * mask/unmask logic.
+ */
+void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+
+ raw_spin_lock(&desc->lock);
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ /*
+ * If we're currently running this IRQ, or its disabled,
+ * we shouldn't process the IRQ. Mark it pending, handle
+ * the necessary masking and go out
+ */
+ if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
+ irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
+ if (!irq_check_poll(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+ }
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ do {
+ if (unlikely(!desc->action))
+ goto out_eoi;
+
+ handle_irq_event(desc);
+
+ } while ((desc->istate & IRQS_PENDING) &&
+ !irqd_irq_disabled(&desc->irq_data));
+
+out_eoi:
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+#endif
+
/**
* handle_percpu_irq - Per CPU local irq handler
* @irq: the interrupt number
@@ -674,103 +530,145 @@ out_unlock:
void
handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
- irqreturn_t action_ret;
+ struct irq_chip *chip = irq_desc_get_chip(desc);
kstat_incr_irqs_this_cpu(irq, desc);
- if (desc->irq_data.chip->irq_ack)
- desc->irq_data.chip->irq_ack(&desc->irq_data);
+ if (chip->irq_ack)
+ chip->irq_ack(&desc->irq_data);
- action_ret = handle_IRQ_event(irq, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ handle_irq_event_percpu(desc, desc->action);
- if (desc->irq_data.chip->irq_eoi)
- desc->irq_data.chip->irq_eoi(&desc->irq_data);
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
}
void
-__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install type control for IRQ%d\n", irq);
+ if (!desc)
return;
- }
- if (!handle)
+ if (!handle) {
handle = handle_bad_irq;
- else if (desc->irq_data.chip == &no_irq_chip) {
- printk(KERN_WARNING "Trying to install %sinterrupt handler "
- "for IRQ%d\n", is_chained ? "chained " : "", irq);
- /*
- * Some ARM implementations install a handler for really dumb
- * interrupt hardware without setting an irq_chip. This worked
- * with the ARM no_irq_chip but the check in setup_irq would
- * prevent us to setup the interrupt at all. Switch it to
- * dummy_irq_chip for easy transition.
- */
- desc->irq_data.chip = &dummy_irq_chip;
+ } else {
+ if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
+ goto out;
}
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
-
/* Uninstall? */
if (handle == handle_bad_irq) {
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
- desc->status |= IRQ_DISABLED;
+ irq_state_set_disabled(desc);
desc->depth = 1;
}
desc->handle_irq = handle;
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
- desc->status &= ~IRQ_DISABLED;
- desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
- desc->depth = 0;
- desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_settings_set_noprobe(desc);
+ irq_settings_set_norequest(desc);
+ irq_startup(desc);
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
-}
-EXPORT_SYMBOL_GPL(__set_irq_handler);
-
-void
-set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
- irq_flow_handler_t handle)
-{
- set_irq_chip(irq, chip);
- __set_irq_handler(irq, handle, 0, NULL);
+out:
+ irq_put_desc_busunlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(__irq_set_handler);
void
-set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
- set_irq_chip(irq, chip);
- __set_irq_handler(irq, handle, 0, name);
+ irq_set_chip(irq, chip);
+ __irq_set_handler(irq, handle, 0, name);
}
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
if (!desc)
return;
+ irq_settings_clr_and_set(desc, clr, set);
+
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_can_move_pcntxt(desc))
+ irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
+
+ irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+
+ irq_put_desc_unlock(desc, flags);
+}
+
+/**
+ * irq_cpu_online - Invoke all irq_cpu_online functions.
+ *
+ * Iterate through all irqs and invoke the chip.irq_cpu_online()
+ * for each.
+ */
+void irq_cpu_online(void)
+{
+ struct irq_desc *desc;
+ struct irq_chip *chip;
+ unsigned long flags;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
- /* Sanitize flags */
- set &= IRQF_MODIFY_MASK;
- clr &= IRQF_MODIFY_MASK;
+ raw_spin_lock_irqsave(&desc->lock, flags);
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->status &= ~clr;
- desc->status |= set;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (chip && chip->irq_cpu_online &&
+ (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
+ !irqd_irq_disabled(&desc->irq_data)))
+ chip->irq_cpu_online(&desc->irq_data);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+
+/**
+ * irq_cpu_offline - Invoke all irq_cpu_offline functions.
+ *
+ * Iterate through all irqs and invoke the chip.irq_cpu_offline()
+ * for each.
+ */
+void irq_cpu_offline(void)
+{
+ struct irq_desc *desc;
+ struct irq_chip *chip;
+ unsigned long flags;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (chip && chip->irq_cpu_offline &&
+ (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
+ !irqd_irq_disabled(&desc->irq_data)))
+ chip->irq_cpu_offline(&desc->irq_data);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 00000000000..306cba37e9a
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,44 @@
+/*
+ * Debugging printout:
+ */
+
+#include <linux/kallsyms.h>
+
+#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
+#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
+/* FIXME */
+#define PD(f) do { } while (0)
+
+static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+ printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
+ irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
+ printk("->handle_irq(): %p, ", desc->handle_irq);
+ print_symbol("%s\n", (unsigned long)desc->handle_irq);
+ printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
+ print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
+ printk("->action(): %p\n", desc->action);
+ if (desc->action) {
+ printk("->action->handler(): %p, ", desc->action->handler);
+ print_symbol("%s\n", (unsigned long)desc->action->handler);
+ }
+
+ P(IRQ_LEVEL);
+ P(IRQ_PER_CPU);
+ P(IRQ_NOPROBE);
+ P(IRQ_NOREQUEST);
+ P(IRQ_NOAUTOEN);
+
+ PS(IRQS_AUTODETECT);
+ PS(IRQS_REPLAY);
+ PS(IRQS_WAITING);
+ PS(IRQS_PENDING);
+
+ PD(IRQS_INPROGRESS);
+ PD(IRQS_DISABLED);
+ PD(IRQS_MASKED);
+}
+
+#undef P
+#undef PS
+#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 20dc5474947..b5fcd96c710 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data)
return 0;
}
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-static void compat_noop(unsigned int irq) { }
-#define END_INIT .end = compat_noop
-#else
-#define END_INIT
-#endif
-
/*
* Generic no controller implementation
*/
@@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = {
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = ack_bad,
- END_INIT
};
/*
@@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = {
.irq_ack = noop,
.irq_mask = noop,
.irq_unmask = noop,
- END_INIT
};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb6330..90cb55f6d7e 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
-/**
- * handle_IRQ_event - irq action chain handler
- * @irq: the interrupt number
- * @action: the interrupt action chain for this irq
- *
- * Handles the action chain of an irq event
- */
-irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
+static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
+{
+ /*
+ * Wake up the handler thread for this action. In case the
+ * thread crashed and was killed we just pretend that we
+ * handled the interrupt. The hardirq handler has disabled the
+ * device interrupt, so no irq storm is lurking. If the
+ * RUNTHREAD bit is already set, nothing to do.
+ */
+ if (test_bit(IRQTF_DIED, &action->thread_flags) ||
+ test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ return;
+
+ /*
+ * It's safe to OR the mask lockless here. We have only two
+ * places which write to threads_oneshot: This code and the
+ * irq thread.
+ *
+ * This code is the hard irq context and can never run on two
+ * cpus in parallel. If it ever does we have more serious
+ * problems than this bitmask.
+ *
+ * The irq threads of this irq which clear their "running" bit
+ * in threads_oneshot are serialized via desc->lock against
+ * each other and they are serialized against this code by
+ * IRQS_INPROGRESS.
+ *
+ * Hard irq handler:
+ *
+ * spin_lock(desc->lock);
+ * desc->state |= IRQS_INPROGRESS;
+ * spin_unlock(desc->lock);
+ * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+ * desc->threads_oneshot |= mask;
+ * spin_lock(desc->lock);
+ * desc->state &= ~IRQS_INPROGRESS;
+ * spin_unlock(desc->lock);
+ *
+ * irq thread:
+ *
+ * again:
+ * spin_lock(desc->lock);
+ * if (desc->state & IRQS_INPROGRESS) {
+ * spin_unlock(desc->lock);
+ * while(desc->state & IRQS_INPROGRESS)
+ * cpu_relax();
+ * goto again;
+ * }
+ * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ * desc->threads_oneshot &= ~mask;
+ * spin_unlock(desc->lock);
+ *
+ * So either the thread waits for us to clear IRQS_INPROGRESS
+ * or we are waiting in the flow handler for desc->lock to be
+ * released before we reach this point. The thread also checks
+ * IRQTF_RUNTHREAD under desc->lock. If set it leaves
+ * threads_oneshot untouched and runs the thread another time.
+ */
+ desc->threads_oneshot |= action->thread_mask;
+ wake_up_process(action->thread);
+}
+
+irqreturn_t
+handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
- irqreturn_t ret, retval = IRQ_NONE;
- unsigned int status = 0;
+ irqreturn_t retval = IRQ_NONE;
+ unsigned int random = 0, irq = desc->irq_data.irq;
do {
+ irqreturn_t res;
+
trace_irq_handler_entry(irq, action);
- ret = action->handler(irq, action->dev_id);
- trace_irq_handler_exit(irq, action, ret);
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+ irq, action->handler))
+ local_irq_disable();
- switch (ret) {
+ switch (res) {
case IRQ_WAKE_THREAD:
/*
* Set result to handled so the spurious check
* does not trigger.
*/
- ret = IRQ_HANDLED;
+ res = IRQ_HANDLED;
/*
* Catch drivers which return WAKE_THREAD but
@@ -85,147 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
break;
}
- /*
- * Wake up the handler thread for this
- * action. In case the thread crashed and was
- * killed we just pretend that we handled the
- * interrupt. The hardirq handler above has
- * disabled the device interrupt, so no irq
- * storm is lurking.
- */
- if (likely(!test_bit(IRQTF_DIED,
- &action->thread_flags))) {
- set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
- wake_up_process(action->thread);
- }
+ irq_wake_thread(desc, action);
/* Fall through to add to randomness */
case IRQ_HANDLED:
- status |= action->flags;
+ random |= action->flags;
break;
default:
break;
}
- retval |= ret;
+ retval |= res;
action = action->next;
} while (action);
- if (status & IRQF_SAMPLE_RANDOM)
+ if (random & IRQF_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
- local_irq_disable();
+ if (!noirqdebug)
+ note_interrupt(irq, desc, retval);
return retval;
}
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-
-#ifdef CONFIG_ENABLE_WARN_DEPRECATED
-# warning __do_IRQ is deprecated. Please convert to proper flow handlers
-#endif
-
-/**
- * __do_IRQ - original all in one highlevel IRQ handler
- * @irq: the interrupt number
- *
- * __do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- *
- * This is the original x86 implementation which is used for every
- * interrupt type.
- */
-unsigned int __do_IRQ(unsigned int irq)
+irqreturn_t handle_irq_event(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- unsigned int status;
+ struct irqaction *action = desc->action;
+ irqreturn_t ret;
- kstat_incr_irqs_this_cpu(irq, desc);
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock(&desc->lock);
- if (CHECK_IRQ_PER_CPU(desc->status)) {
- irqreturn_t action_ret;
-
- /*
- * No locking required for CPU-local interrupts:
- */
- if (desc->irq_data.chip->ack)
- desc->irq_data.chip->ack(irq);
- if (likely(!(desc->status & IRQ_DISABLED))) {
- action_ret = handle_IRQ_event(irq, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
- }
- desc->irq_data.chip->end(irq);
- return 1;
- }
+ ret = handle_irq_event_percpu(desc, action);
raw_spin_lock(&desc->lock);
- if (desc->irq_data.chip->ack)
- desc->irq_data.chip->ack(irq);
- /*
- * REPLAY is when Linux resends an IRQ that was dropped earlier
- * WAITING is used by probe to mark irqs that are being tested
- */
- status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
- status |= IRQ_PENDING; /* we _want_ to handle it */
-
- /*
- * If the IRQ is disabled for whatever reason, we cannot
- * use the action we have.
- */
- action = NULL;
- if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
- action = desc->action;
- status &= ~IRQ_PENDING; /* we commit to handling */
- status |= IRQ_INPROGRESS; /* we are handling it */
- }
- desc->status = status;
-
- /*
- * If there is no IRQ handler or it was disabled, exit early.
- * Since we set PENDING, if another processor is handling
- * a different instance of this same irq, the other processor
- * will take care of it.
- */
- if (unlikely(!action))
- goto out;
-
- /*
- * Edge triggered interrupts need to remember
- * pending events.
- * This applies to any hw interrupts that allow a second
- * instance of the same irq to arrive while we are in do_IRQ
- * or in the handler. But the code here only handles the _second_
- * instance of the irq, not the third or fourth. So it is mostly
- * useful for irq hardware that does not mask cleanly in an
- * SMP environment.
- */
- for (;;) {
- irqreturn_t action_ret;
-
- raw_spin_unlock(&desc->lock);
-
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- if (likely(!(desc->status & IRQ_PENDING)))
- break;
- desc->status &= ~IRQ_PENDING;
- }
- desc->status &= ~IRQ_INPROGRESS;
-
-out:
- /*
- * The ->end() handler has to deal with interrupts which got
- * disabled while the handler was running.
- */
- desc->irq_data.chip->end(irq);
- raw_spin_unlock(&desc->lock);
-
- return 1;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ return ret;
}
-#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085..6546431447d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,27 +1,87 @@
/*
* IRQ subsystem internal functions and variables:
+ *
+ * Do not ever include this file from anything else than
+ * kernel/irq/. Do not even think about using any information outside
+ * of this file for your non core code.
*/
#include <linux/irqdesc.h>
+#ifdef CONFIG_SPARSE_IRQ
+# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+#else
+# define IRQ_BITMAP_BITS NR_IRQS
+#endif
+
+#define istate core_internal_state__do_not_mess_with_it
+
extern int noirqdebug;
-#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
+/*
+ * Bits used by threaded handlers:
+ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
+ * IRQTF_DIED - handler thread died
+ * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
+ * IRQTF_AFFINITY - irq thread is requested to adjust affinity
+ * IRQTF_FORCED_THREAD - irq action is force threaded
+ */
+enum {
+ IRQTF_RUNTHREAD,
+ IRQTF_DIED,
+ IRQTF_WARNED,
+ IRQTF_AFFINITY,
+ IRQTF_FORCED_THREAD,
+};
-/* Set default functions for irq_chip structures: */
-extern void irq_chip_set_defaults(struct irq_chip *chip);
+/*
+ * Bit masks for desc->state
+ *
+ * IRQS_AUTODETECT - autodetection in progress
+ * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
+ * detection
+ * IRQS_POLL_INPROGRESS - polling in progress
+ * IRQS_ONESHOT - irq is not unmasked in primary handler
+ * IRQS_REPLAY - irq is replayed
+ * IRQS_WAITING - irq is waiting
+ * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_SUSPENDED - irq is suspended
+ */
+enum {
+ IRQS_AUTODETECT = 0x00000001,
+ IRQS_SPURIOUS_DISABLED = 0x00000002,
+ IRQS_POLL_INPROGRESS = 0x00000008,
+ IRQS_ONESHOT = 0x00000020,
+ IRQS_REPLAY = 0x00000040,
+ IRQS_WAITING = 0x00000080,
+ IRQS_PENDING = 0x00000200,
+ IRQS_SUSPENDED = 0x00000800,
+};
+
+#include "debug.h"
+#include "settings.h"
-/* Set default handler: */
-extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
+#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
+extern int irq_startup(struct irq_desc *desc);
+extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_enable(struct irq_desc *desc);
+extern void irq_disable(struct irq_desc *desc);
+extern void mask_irq(struct irq_desc *desc);
+extern void unmask_irq(struct irq_desc *desc);
+
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
+irqreturn_t handle_irq_event(struct irq_desc *desc);
+
/* Resending of interrupts :*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+bool irq_wait_for_poll(struct irq_desc *desc);
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -37,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
-extern int irq_select_affinity_usr(unsigned int irq);
+extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-static inline void irq_end(unsigned int irq, struct irq_desc *desc)
-{
- if (desc->irq_data.chip && desc->irq_data.chip->end)
- desc->irq_data.chip->end(irq);
-}
-#else
-static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
-#endif
-
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -64,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
}
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
+
+static inline struct irq_desc *
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
+{
+ return __irq_get_desc_lock(irq, flags, true);
+}
+
+static inline void
+irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+{
+ __irq_put_desc_unlock(desc, flags, true);
+}
+
+static inline struct irq_desc *
+irq_get_desc_lock(unsigned int irq, unsigned long *flags)
+{
+ return __irq_get_desc_lock(irq, flags, false);
+}
+
+static inline void
+irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
+{
+ __irq_put_desc_unlock(desc, flags, false);
+}
+
/*
- * Debugging printout:
+ * Manipulation functions for irq_data.state
*/
+static inline void irqd_set_move_pending(struct irq_data *d)
+{
+ d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+}
-#include <linux/kallsyms.h>
-
-#define P(f) if (desc->status & f) printk("%14s set\n", #f)
+static inline void irqd_clr_move_pending(struct irq_data *d)
+{
+ d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+}
-static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
- printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
- irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
- printk("->handle_irq(): %p, ", desc->handle_irq);
- print_symbol("%s\n", (unsigned long)desc->handle_irq);
- printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
- print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
- printk("->action(): %p\n", desc->action);
- if (desc->action) {
- printk("->action->handler(): %p, ", desc->action->handler);
- print_symbol("%s\n", (unsigned long)desc->action->handler);
- }
-
- P(IRQ_INPROGRESS);
- P(IRQ_DISABLED);
- P(IRQ_PENDING);
- P(IRQ_REPLAY);
- P(IRQ_AUTODETECT);
- P(IRQ_WAITING);
- P(IRQ_LEVEL);
- P(IRQ_MASKED);
-#ifdef CONFIG_IRQ_PER_CPU
- P(IRQ_PER_CPU);
-#endif
- P(IRQ_NOPROBE);
- P(IRQ_NOREQUEST);
- P(IRQ_NOAUTOEN);
+ d->state_use_accessors &= ~mask;
}
-#undef P
+static inline void irqd_set(struct irq_data *d, unsigned int mask)
+{
+ d->state_use_accessors |= mask;
+}
+static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
+{
+ return d->state_use_accessors & mask;
+}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f..2c039c9b938 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,18 +72,22 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
{
+ int cpu;
+
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
desc->irq_data.handler_data = NULL;
desc->irq_data.msi_desc = NULL;
- desc->status = IRQ_DEFAULT_INIT_FLAGS;
+ irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
desc->handle_irq = handle_bad_irq;
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
desc->name = NULL;
- memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
desc_smp_init(desc, node);
}
@@ -91,7 +95,7 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
#ifdef CONFIG_SPARSE_IRQ
@@ -133,8 +137,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
if (!desc)
return NULL;
/* allocate based on nr_cpu_ids */
- desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
- gfp, node);
+ desc->kstat_irqs = alloc_percpu(unsigned int);
if (!desc->kstat_irqs)
goto err_desc;
@@ -149,7 +152,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
return desc;
err_kstat:
- kfree(desc->kstat_irqs);
+ free_percpu(desc->kstat_irqs);
err_desc:
kfree(desc);
return NULL;
@@ -166,7 +169,7 @@ static void free_desc(unsigned int irq)
mutex_unlock(&sparse_irq_lock);
free_masks(desc);
- kfree(desc->kstat_irqs);
+ free_percpu(desc->kstat_irqs);
kfree(desc);
}
@@ -195,13 +198,12 @@ err:
return -ENOMEM;
}
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
+static int irq_expand_nr_irqs(unsigned int nr)
{
- int res = irq_alloc_descs(irq, irq, 1, node);
-
- if (res == -EEXIST || res == irq)
- return irq_to_desc(irq);
- return NULL;
+ if (nr > IRQ_BITMAP_BITS)
+ return -ENOMEM;
+ nr_irqs = nr;
+ return 0;
}
int __init early_irq_init(void)
@@ -215,6 +217,15 @@ int __init early_irq_init(void)
initcnt = arch_probe_nr_irqs();
printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+ if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
+ nr_irqs = IRQ_BITMAP_BITS;
+
+ if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
+ initcnt = IRQ_BITMAP_BITS;
+
+ if (initcnt > nr_irqs)
+ nr_irqs = initcnt;
+
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node);
set_bit(i, allocated_irqs);
@@ -227,14 +238,12 @@ int __init early_irq_init(void)
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
- .status = IRQ_DEFAULT_INIT_FLAGS,
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
}
};
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
@@ -250,7 +259,8 @@ int __init early_irq_init(void)
for (i = 0; i < count; i++) {
desc[i].irq_data.irq = i;
desc[i].irq_data.chip = &no_irq_chip;
- desc[i].kstat_irqs = kstat_irqs_all[i];
+ desc[i].kstat_irqs = alloc_percpu(unsigned int);
+ irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
alloc_masks(desc + i, GFP_KERNEL, node);
desc_smp_init(desc + i, node);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -263,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
- return irq_to_desc(irq);
-}
-
static void free_desc(unsigned int irq)
{
dynamic_irq_cleanup(irq);
@@ -277,6 +282,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
{
return start;
}
+
+static int irq_expand_nr_irqs(unsigned int nr)
+{
+ return -ENOMEM;
+}
+
#endif /* !CONFIG_SPARSE_IRQ */
/* Dynamic interrupt handling */
@@ -320,14 +331,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+ start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
+ from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
goto err;
- ret = -ENOMEM;
- if (start >= nr_irqs)
- goto err;
+ if (start + cnt > nr_irqs) {
+ ret = irq_expand_nr_irqs(start + cnt);
+ if (ret)
+ goto err;
+ }
bitmap_set(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
@@ -374,6 +388,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
return find_next_bit(allocated_irqs, nr_irqs, offset);
}
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+ }
+ return desc;
+}
+
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+{
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (bus)
+ chip_bus_sync_unlock(desc);
+}
+
/**
* dynamic_irq_cleanup - cleanup a dynamically allocated irq
* @irq: irq number to initialize
@@ -391,7 +425,9 @@ void dynamic_irq_cleanup(unsigned int irq)
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc ? desc->kstat_irqs[cpu] : 0;
+
+ return desc && desc->kstat_irqs ?
+ *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +437,10 @@ unsigned int kstat_irqs(unsigned int irq)
int cpu;
int sum = 0;
- if (!desc)
+ if (!desc || !desc->kstat_irqs)
return 0;
for_each_possible_cpu(cpu)
- sum += desc->kstat_irqs[cpu];
+ sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f95..07c1611f389 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
#include "internals.h"
+#ifdef CONFIG_IRQ_FORCED_THREADING
+__read_mostly bool force_irqthreads;
+
+static int __init setup_forced_irqthreads(char *arg)
+{
+ force_irqthreads = true;
+ return 0;
+}
+early_param("threadirqs", setup_forced_irqthreads);
+#endif
+
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned int status;
+ bool inprogress;
if (!desc)
return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
* Wait until we're out of the critical section. This might
* give the wrong answer due to the lack of memory barriers.
*/
- while (desc->status & IRQ_INPROGRESS)
+ while (irqd_irq_inprogress(&desc->irq_data))
cpu_relax();
/* Ok, that indicated we're done: double-check carefully. */
raw_spin_lock_irqsave(&desc->lock, flags);
- status = desc->status;
+ inprogress = irqd_irq_inprogress(&desc->irq_data);
raw_spin_unlock_irqrestore(&desc->lock, flags);
/* Oops, that failed? */
- } while (status & IRQ_INPROGRESS);
+ } while (inprogress);
/*
* We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
- !desc->irq_data.chip->irq_set_affinity)
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
return 0;
return 1;
@@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
}
}
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+ return irqd_can_move_in_process_context(data);
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+ return irqd_is_setaffinity_pending(data);
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+ cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+ cpumask_copy(mask, desc->pending_mask);
+}
+#else
+static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
+static inline bool irq_move_pending(struct irq_data *data) { return false; }
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
+#endif
+
+int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
+{
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ struct irq_desc *desc = irq_data_to_desc(data);
+ int ret = 0;
+
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
+ if (irq_can_move_pcntxt(data)) {
+ ret = chip->irq_set_affinity(data, mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(data->affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ ret = 0;
+ }
+ } else {
+ irqd_set_move_pending(data);
+ irq_copy_pending(desc, mask);
+ }
+
+ if (desc->affinity_notify) {
+ kref_get(&desc->affinity_notify->kref);
+ schedule_work(&desc->affinity_notify->work);
+ }
+ irqd_set(data, IRQD_AFFINITY_SET);
+
+ return ret;
+}
+
/**
* irq_set_affinity - Set the irq affinity of a given irq
* @irq: Interrupt to set affinity
- * @cpumask: cpumask
+ * @mask: cpumask
*
*/
-int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irq_chip *chip = desc->irq_data.chip;
unsigned long flags;
+ int ret;
- if (!chip->irq_set_affinity)
+ if (!desc)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
-
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PCNTXT) {
- if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
- cpumask_copy(desc->irq_data.affinity, cpumask);
- irq_set_thread_affinity(desc);
- }
- }
- else {
- desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(desc->pending_mask, cpumask);
- }
-#else
- if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
- cpumask_copy(desc->irq_data.affinity, cpumask);
- irq_set_thread_affinity(desc);
- }
-#endif
- desc->status |= IRQ_AFFINITY_SET;
+ ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- return 0;
+ return ret;
}
int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+
+ if (!desc)
+ return -EINVAL;
+ desc->affinity_hint = m;
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+
+static void irq_affinity_notify(struct work_struct *work)
+{
+ struct irq_affinity_notify *notify =
+ container_of(work, struct irq_affinity_notify, work);
+ struct irq_desc *desc = irq_to_desc(notify->irq);
+ cpumask_var_t cpumask;
+ unsigned long flags;
+
+ if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ goto out;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_data.affinity);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ notify->notify(notify, cpumask);
+
+ free_cpumask_var(cpumask);
+out:
+ kref_put(&notify->kref, notify->release);
+}
+
+/**
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is
+ * freed using free_irq().
+ */
+int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_affinity_notify *old_notify;
unsigned long flags;
+ /* The release function is promised process context */
+ might_sleep();
+
if (!desc)
return -EINVAL;
+ /* Complete initialisation of *notify */
+ if (notify) {
+ notify->irq = irq;
+ kref_init(&notify->kref);
+ INIT_WORK(&notify->work, irq_affinity_notify);
+ }
+
raw_spin_lock_irqsave(&desc->lock, flags);
- desc->affinity_hint = m;
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (old_notify)
+ kref_put(&old_notify->kref, old_notify->release);
+
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
*/
-static int setup_affinity(unsigned int irq, struct irq_desc *desc)
+static int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct cpumask *set = irq_default_affinity;
+ int ret;
+
+ /* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
return 0;
@@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
* Preserve an userspace affinity setup, but make sure that
* one of the targets is online.
*/
- if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
- < nr_cpu_ids)
- goto set_affinity;
+ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (cpumask_intersects(desc->irq_data.affinity,
+ cpu_online_mask))
+ set = desc->irq_data.affinity;
else
- desc->status &= ~IRQ_AFFINITY_SET;
+ irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
}
- cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
-set_affinity:
- desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
-
+ cpumask_and(mask, cpu_online_mask, set);
+ ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(desc->irq_data.affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ }
return 0;
}
#else
-static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
{
return irq_select_affinity(irq);
}
@@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
/*
* Called when affinity is set via /proc/irq
*/
-int irq_select_affinity_usr(unsigned int irq)
+int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = setup_affinity(irq, desc);
- if (!ret)
- irq_set_thread_affinity(desc);
+ ret = setup_affinity(irq, desc, mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
-
return ret;
}
#else
-static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
return 0;
}
@@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
if (suspend) {
if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
return;
- desc->status |= IRQ_SUSPENDED;
+ desc->istate |= IRQS_SUSPENDED;
}
- if (!desc->depth++) {
- desc->status |= IRQ_DISABLED;
- desc->irq_data.chip->irq_disable(&desc->irq_data);
- }
+ if (!desc->depth++)
+ irq_disable(desc);
+}
+
+static int __disable_irq_nosync(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+
+ if (!desc)
+ return -EINVAL;
+ __disable_irq(desc, irq, false);
+ irq_put_desc_busunlock(desc, flags);
+ return 0;
}
/**
@@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
*/
void disable_irq_nosync(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- if (!desc)
- return;
-
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
- __disable_irq(desc, irq, false);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
+ __disable_irq_nosync(irq);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
*/
void disable_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc)
- return;
-
- disable_irq_nosync(irq);
- if (desc->action)
+ if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
EXPORT_SYMBOL(disable_irq);
void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
{
- if (resume)
- desc->status &= ~IRQ_SUSPENDED;
+ if (resume) {
+ if (!(desc->istate & IRQS_SUSPENDED)) {
+ if (!desc->action)
+ return;
+ if (!(desc->action->flags & IRQF_FORCE_RESUME))
+ return;
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+ }
+ desc->istate &= ~IRQS_SUSPENDED;
+ }
switch (desc->depth) {
case 0:
@@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
break;
case 1: {
- unsigned int status = desc->status & ~IRQ_DISABLED;
-
- if (desc->status & IRQ_SUSPENDED)
+ if (desc->istate & IRQS_SUSPENDED)
goto err_out;
/* Prevent probing on this irq: */
- desc->status = status | IRQ_NOPROBE;
+ irq_settings_set_noprobe(desc);
+ irq_enable(desc);
check_irq_resend(desc, irq);
/* fall-through */
}
@@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
*/
void enable_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
if (!desc)
return;
+ if (WARN(!desc->irq_data.chip,
+ KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
+ goto out;
- if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
- KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
- return;
-
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, false);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
+out:
+ irq_put_desc_busunlock(desc, flags);
}
EXPORT_SYMBOL(enable_irq);
@@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
}
/**
- * set_irq_wake - control irq power management wakeup
+ * irq_set_irq_wake - control irq power management wakeup
* @irq: interrupt to control
* @on: enable/disable power management wakeup
*
@@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
* Wakeup mode lets this IRQ wake the system from sleep
* states like "suspend to RAM".
*/
-int set_irq_wake(unsigned int irq, unsigned int on)
+int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
int ret = 0;
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
if (on) {
if (desc->wake_depth++ == 0) {
ret = set_irq_wake_real(irq, on);
if (ret)
desc->wake_depth = 0;
else
- desc->status |= IRQ_WAKEUP;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
}
} else {
if (desc->wake_depth == 0) {
@@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
if (ret)
desc->wake_depth = 1;
else
- desc->status &= ~IRQ_WAKEUP;
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
}
}
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ irq_put_desc_busunlock(desc, flags);
return ret;
}
-EXPORT_SYMBOL(set_irq_wake);
+EXPORT_SYMBOL(irq_set_irq_wake);
/*
* Internal function that tells the architecture code whether a
@@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake);
*/
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ int canrequest = 0;
if (!desc)
return 0;
- if (desc->status & IRQ_NOREQUEST)
- return 0;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- action = desc->action;
- if (action)
- if (irqflags & action->flags & IRQF_SHARED)
- action = NULL;
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return !action;
-}
-
-void compat_irq_chip_set_default_handler(struct irq_desc *desc)
-{
- /*
- * If the architecture still has not overriden
- * the flow handler then zap the default. This
- * should catch incorrect flow-type setting.
- */
- if (desc->handle_irq == &handle_bad_irq)
- desc->handle_irq = NULL;
+ if (irq_settings_can_request(desc)) {
+ if (desc->action)
+ if (irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest =1;
+ }
+ irq_put_desc_unlock(desc, flags);
+ return canrequest;
}
int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags)
{
- int ret;
struct irq_chip *chip = desc->irq_data.chip;
+ int ret, unmask = 0;
if (!chip || !chip->irq_set_type) {
/*
@@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return 0;
}
+ flags &= IRQ_TYPE_SENSE_MASK;
+
+ if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
+ if (!irqd_irq_masked(&desc->irq_data))
+ mask_irq(desc);
+ if (!irqd_irq_disabled(&desc->irq_data))
+ unmask = 1;
+ }
+
/* caller masked out all except trigger mode flags */
ret = chip->irq_set_type(&desc->irq_data, flags);
- if (ret)
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
+ irqd_set(&desc->irq_data, flags);
+
+ case IRQ_SET_MASK_OK_NOCOPY:
+ flags = irqd_get_trigger_type(&desc->irq_data);
+ irq_settings_set_trigger_mask(desc, flags);
+ irqd_clear(&desc->irq_data, IRQD_LEVEL);
+ irq_settings_clr_level(desc);
+ if (flags & IRQ_TYPE_LEVEL_MASK) {
+ irq_settings_set_level(desc);
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
+ }
+
+ ret = 0;
+ break;
+ default:
pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
flags, irq, chip->irq_set_type);
- else {
- if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
- flags |= IRQ_LEVEL;
- /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
- desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
- desc->status |= flags;
-
- if (chip != desc->irq_data.chip)
- irq_chip_set_defaults(desc->irq_data.chip);
}
-
+ if (unmask)
+ unmask_irq(desc);
return ret;
}
@@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
* handler finished. unmask if the interrupt has not been disabled and
* is marked MASKED.
*/
-static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+static void irq_finalize_oneshot(struct irq_desc *desc,
+ struct irqaction *action, bool force)
{
+ if (!(desc->istate & IRQS_ONESHOT))
+ return;
again:
chip_bus_lock(desc);
raw_spin_lock_irq(&desc->lock);
@@ -522,26 +651,42 @@ again:
* The thread is faster done than the hard interrupt handler
* on the other CPU. If we unmask the irq line then the
* interrupt can come in again and masks the line, leaves due
- * to IRQ_INPROGRESS and the irq line is masked forever.
+ * to IRQS_INPROGRESS and the irq line is masked forever.
+ *
+ * This also serializes the state of shared oneshot handlers
+ * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * irq_wake_thread(). See the comment there which explains the
+ * serialization.
*/
- if (unlikely(desc->status & IRQ_INPROGRESS)) {
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
raw_spin_unlock_irq(&desc->lock);
chip_bus_sync_unlock(desc);
cpu_relax();
goto again;
}
- if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
- desc->status &= ~IRQ_MASKED;
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
- }
+ /*
+ * Now check again, whether the thread should run. Otherwise
+ * we would clear the threads_oneshot bit of this thread which
+ * was just set.
+ */
+ if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ goto out_unlock;
+
+ desc->threads_oneshot &= ~action->thread_mask;
+
+ if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data))
+ unmask_irq(desc);
+
+out_unlock:
raw_spin_unlock_irq(&desc->lock);
chip_bus_sync_unlock(desc);
}
#ifdef CONFIG_SMP
/*
- * Check whether we need to change the affinity of the interrupt thread.
+ * Check whether we need to chasnge the affinity of the interrupt thread.
*/
static void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,14 +718,49 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
#endif
/*
+ * Interrupts which are not explicitely requested as threaded
+ * interrupts rely on the implicit bh/preempt disable of the hard irq
+ * context. So we need to disable bh here to avoid deadlocks and other
+ * side effects.
+ */
+static void
+irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+ local_bh_disable();
+ action->thread_fn(action->irq, action->dev_id);
+ irq_finalize_oneshot(desc, action, false);
+ local_bh_enable();
+}
+
+/*
+ * Interrupts explicitely requested as threaded interupts want to be
+ * preemtible - many of them need to sleep and wait for slow busses to
+ * complete.
+ */
+static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+ action->thread_fn(action->irq, action->dev_id);
+ irq_finalize_oneshot(desc, action, false);
+}
+
+/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
{
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+ static const struct sched_param param = {
+ .sched_priority = MAX_USER_RT_PRIO/2,
+ };
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
- int wake, oneshot = desc->status & IRQ_ONESHOT;
+ void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+ int wake;
+
+ if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
+ handler_fn = irq_forced_thread_fn;
+ else
+ handler_fn = irq_thread_fn;
sched_setscheduler(current, SCHED_FIFO, &param);
current->irqaction = action;
@@ -592,23 +772,19 @@ static int irq_thread(void *data)
atomic_inc(&desc->threads_active);
raw_spin_lock_irq(&desc->lock);
- if (unlikely(desc->status & IRQ_DISABLED)) {
+ if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
/*
* CHECKME: We might need a dedicated
* IRQ_THREAD_PENDING flag here, which
* retriggers the thread in check_irq_resend()
- * but AFAICT IRQ_PENDING should be fine as it
+ * but AFAICT IRQS_PENDING should be fine as it
* retriggers the interrupt itself --- tglx
*/
- desc->status |= IRQ_PENDING;
+ desc->istate |= IRQS_PENDING;
raw_spin_unlock_irq(&desc->lock);
} else {
raw_spin_unlock_irq(&desc->lock);
-
- action->thread_fn(action->irq, action->dev_id);
-
- if (oneshot)
- irq_finalize_oneshot(action->irq, desc);
+ handler_fn(desc, action);
}
wake = atomic_dec_and_test(&desc->threads_active);
@@ -617,6 +793,9 @@ static int irq_thread(void *data)
wake_up(&desc->wait_for_threads);
}
+ /* Prevent a stale desc->threads_oneshot */
+ irq_finalize_oneshot(desc, action, true);
+
/*
* Clear irqaction. Otherwise exit_irq_thread() would make
* fuzz about an active irq thread going into nirvana.
@@ -631,6 +810,7 @@ static int irq_thread(void *data)
void exit_irq_thread(void)
{
struct task_struct *tsk = current;
+ struct irq_desc *desc;
if (!tsk->irqaction)
return;
@@ -639,6 +819,14 @@ void exit_irq_thread(void)
"exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+ desc = irq_to_desc(tsk->irqaction->irq);
+
+ /*
+ * Prevent a stale desc->threads_oneshot. Must be called
+ * before setting the IRQTF_DIED flag.
+ */
+ irq_finalize_oneshot(desc, tsk->irqaction, true);
+
/*
* Set the THREAD DIED flag to prevent further wakeups of the
* soon to be gone threaded handler.
@@ -646,6 +834,22 @@ void exit_irq_thread(void)
set_bit(IRQTF_DIED, &tsk->irqaction->flags);
}
+static void irq_setup_forced_threading(struct irqaction *new)
+{
+ if (!force_irqthreads)
+ return;
+ if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
+ return;
+
+ new->flags |= IRQF_ONESHOT;
+
+ if (!new->thread_fn) {
+ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
+ new->thread_fn = new->handler;
+ new->handler = irq_default_primary_handler;
+ }
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -655,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
const char *old_name = NULL;
- unsigned long flags;
- int nested, shared = 0;
- int ret;
+ unsigned long flags, thread_mask = 0;
+ int ret, nested, shared = 0;
+ cpumask_var_t mask;
if (!desc)
return -EINVAL;
@@ -681,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
rand_initialize_irq(irq);
}
- /* Oneshot interrupts are not allowed with shared */
- if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
- return -EINVAL;
-
/*
* Check whether the interrupt nests into another interrupt
* thread.
*/
- nested = desc->status & IRQ_NESTED_THREAD;
+ nested = irq_settings_is_nested_thread(desc);
if (nested) {
if (!new->thread_fn)
return -EINVAL;
@@ -699,6 +899,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* dummy function which warns when called.
*/
new->handler = irq_nested_primary_handler;
+ } else {
+ irq_setup_forced_threading(new);
}
/*
@@ -722,6 +924,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->thread = t;
}
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out_thread;
+ }
+
/*
* The following block of code has to be executed atomically
*/
@@ -733,32 +940,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* Can't share interrupts unless both agree to and are
* the same type (level, edge, polarity). So both flag
* fields must have IRQF_SHARED set and the bits which
- * set the trigger type must match.
+ * set the trigger type must match. Also all must
+ * agree on ONESHOT.
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+ ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+ ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
old_name = old->name;
goto mismatch;
}
-#if defined(CONFIG_IRQ_PER_CPU)
/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
(new->flags & IRQF_PERCPU))
goto mismatch;
-#endif
/* add new interrupt at end of irq queue */
do {
+ thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
} while (old);
shared = 1;
}
- if (!shared) {
- irq_chip_set_defaults(desc->irq_data.chip);
+ /*
+ * Setup the thread mask for this irqaction. Unlikely to have
+ * 32 resp 64 irqs sharing one line, but who knows.
+ */
+ if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
+ ret = -EBUSY;
+ goto out_mask;
+ }
+ new->thread_mask = 1 << ffz(thread_mask);
+ if (!shared) {
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -767,42 +983,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->flags & IRQF_TRIGGER_MASK);
if (ret)
- goto out_thread;
- } else
- compat_irq_chip_set_default_handler(desc);
-#if defined(CONFIG_IRQ_PER_CPU)
- if (new->flags & IRQF_PERCPU)
- desc->status |= IRQ_PER_CPU;
-#endif
+ goto out_mask;
+ }
+
+ desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
+ IRQS_ONESHOT | IRQS_WAITING);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
- desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
- IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+ if (new->flags & IRQF_PERCPU) {
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ irq_settings_set_per_cpu(desc);
+ }
if (new->flags & IRQF_ONESHOT)
- desc->status |= IRQ_ONESHOT;
+ desc->istate |= IRQS_ONESHOT;
- if (!(desc->status & IRQ_NOAUTOEN)) {
- desc->depth = 0;
- desc->status &= ~IRQ_DISABLED;
- desc->irq_data.chip->irq_startup(&desc->irq_data);
- } else
+ if (irq_settings_can_autoenable(desc))
+ irq_startup(desc);
+ else
/* Undo nested disables: */
desc->depth = 1;
/* Exclude IRQ from balancing if requested */
- if (new->flags & IRQF_NOBALANCING)
- desc->status |= IRQ_NO_BALANCING;
+ if (new->flags & IRQF_NOBALANCING) {
+ irq_settings_set_no_balancing(desc);
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ }
/* Set default affinity mask once everything is setup */
- setup_affinity(irq, desc);
-
- } else if ((new->flags & IRQF_TRIGGER_MASK)
- && (new->flags & IRQF_TRIGGER_MASK)
- != (desc->status & IRQ_TYPE_SENSE_MASK)) {
- /* hope the handler works with the actual trigger mode... */
- pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
- irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
- (int)(new->flags & IRQF_TRIGGER_MASK));
+ setup_affinity(irq, desc, mask);
+
+ } else if (new->flags & IRQF_TRIGGER_MASK) {
+ unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
+ unsigned int omsk = irq_settings_get_trigger_mask(desc);
+
+ if (nmsk != omsk)
+ /* hope the handler works with current trigger mode */
+ pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
+ irq, nmsk, omsk);
}
new->irq = irq;
@@ -816,8 +1034,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* Check whether we disabled the irq via the spurious handler
* before. Reenable it and give it another chance.
*/
- if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
- desc->status &= ~IRQ_SPURIOUS_DISABLED;
+ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
+ desc->istate &= ~IRQS_SPURIOUS_DISABLED;
__enable_irq(desc, irq, false);
}
@@ -833,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
+ free_cpumask_var(mask);
return 0;
@@ -847,8 +1066,11 @@ mismatch:
#endif
ret = -EBUSY;
-out_thread:
+out_mask:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ free_cpumask_var(mask);
+
+out_thread:
if (new->thread) {
struct task_struct *t = new->thread;
@@ -869,9 +1091,14 @@ out_thread:
*/
int setup_irq(unsigned int irq, struct irqaction *act)
{
+ int retval;
struct irq_desc *desc = irq_to_desc(irq);
- return __setup_irq(irq, desc, act);
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, act);
+ chip_bus_sync_unlock(desc);
+
+ return retval;
}
EXPORT_SYMBOL_GPL(setup_irq);
@@ -922,13 +1149,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
#endif
/* If this was the last handler, shut down the IRQ line: */
- if (!desc->action) {
- desc->status |= IRQ_DISABLED;
- if (desc->irq_data.chip->irq_shutdown)
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
- else
- desc->irq_data.chip->irq_disable(&desc->irq_data);
- }
+ if (!desc->action)
+ irq_shutdown(desc);
#ifdef CONFIG_SMP
/* make sure affinity_hint is cleaned up */
@@ -1002,6 +1224,11 @@ void free_irq(unsigned int irq, void *dev_id)
if (!desc)
return;
+#ifdef CONFIG_SMP
+ if (WARN_ON(desc->affinity_notify))
+ desc->affinity_notify = NULL;
+#endif
+
chip_bus_lock(desc);
kfree(__free_irq(irq, dev_id));
chip_bus_sync_unlock(desc);
@@ -1072,7 +1299,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (!desc)
return -EINVAL;
- if (desc->status & IRQ_NOREQUEST)
+ if (!irq_settings_can_request(desc))
return -EINVAL;
if (!handler) {
@@ -1098,7 +1325,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (retval)
kfree(action);
-#ifdef CONFIG_DEBUG_SHIRQ
+#ifdef CONFIG_DEBUG_SHIRQ_FIXME
if (!retval && (irqflags & IRQF_SHARED)) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
@@ -1147,7 +1374,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
if (!desc)
return -EINVAL;
- if (desc->status & IRQ_NESTED_THREAD) {
+ if (irq_settings_is_nested_thread(desc)) {
ret = request_threaded_irq(irq, NULL, handler,
flags, name, dev_id);
return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d254194048..47420908fba 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
#include "internals.h"
-void move_masked_irq(int irq)
+void irq_move_masked_irq(struct irq_data *idata)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_chip *chip = desc->irq_data.chip;
+ struct irq_desc *desc = irq_data_to_desc(idata);
+ struct irq_chip *chip = idata->chip;
- if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
return;
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (CHECK_IRQ_PER_CPU(desc->status)) {
+ if (!irqd_can_balance(&desc->irq_data)) {
WARN_ON(1);
return;
}
- desc->status &= ~IRQ_MOVE_PENDING;
+ irqd_clr_move_pending(&desc->irq_data);
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
@@ -35,7 +35,7 @@ void move_masked_irq(int irq)
* do the disable, re-program, enable sequence.
* This is *not* particularly important for level triggered
* but in a edge trigger case, we might be setting rte
- * when an active trigger is comming in. This could
+ * when an active trigger is coming in. This could
* cause some ioapics to mal-function.
* Being paranoid i guess!
*
@@ -53,18 +53,25 @@ void move_masked_irq(int irq)
cpumask_clear(desc->pending_mask);
}
-void move_native_irq(int irq)
+void irq_move_irq(struct irq_data *idata)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ bool masked;
- if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ if (likely(!irqd_is_setaffinity_pending(idata)))
return;
- if (unlikely(desc->status & IRQ_DISABLED))
+ if (unlikely(irqd_irq_disabled(idata)))
return;
- desc->irq_data.chip->irq_mask(&desc->irq_data);
- move_masked_irq(irq);
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ /*
+ * Be careful vs. already masked interrupts. If this is a
+ * threaded interrupt with ONESHOT set, we can end up with an
+ * interrupt storm.
+ */
+ masked = irqd_irq_masked(idata);
+ if (!masked)
+ idata->chip->irq_mask(idata);
+ irq_move_masked_irq(idata);
+ if (!masked)
+ idata->chip->irq_unmask(idata);
}
-
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b0..f76fc00c987 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
* During system-wide suspend or hibernation device drivers need to be prevented
* from receiving interrupts and this function is provided for this purpose.
* It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQ_SUSPENDED flag for each of them.
+ * and sets the IRQS_SUSPENDED flag for each of them.
*/
void suspend_device_irqs(void)
{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
}
for_each_irq_desc(irq, desc)
- if (desc->status & IRQ_SUSPENDED)
+ if (desc->istate & IRQS_SUSPENDED)
synchronize_irq(irq);
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
* resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
*
* Enable all interrupt lines previously disabled by suspend_device_irqs() that
- * have the IRQ_SUSPENDED flag set.
+ * have the IRQS_SUSPENDED flag set.
*/
void resume_device_irqs(void)
{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
- if (!(desc->status & IRQ_SUSPENDED))
- continue;
-
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
struct irq_desc *desc;
int irq;
- for_each_irq_desc(irq, desc)
- if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
- return -EBUSY;
+ for_each_irq_desc(irq, desc) {
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ if (desc->istate & IRQS_PENDING)
+ return -EBUSY;
+ continue;
+ }
+ /*
+ * Check the non wakeup interrupts whether they need
+ * to be masked before finally going into suspend
+ * state. That's for hardware which has no wakeup
+ * source configuration facility. The chip
+ * implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (desc->istate & IRQS_SUSPENDED &&
+ irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ }
return 0;
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a8898..834899f2500 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
#include "internals.h"
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
const struct cpumask *mask = desc->irq_data.affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PENDING)
+ if (irqd_is_setaffinity_pending(&desc->irq_data))
mask = desc->pending_mask;
#endif
seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
- irq_balancing_disabled(irq))
+ if (!irq_can_set_affinity(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!cpumask_intersects(new_value, cpu_online_mask)) {
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
- err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+ err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
} else {
irq_set_affinity(irq, new_value);
err = count;
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
static int irq_spurious_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_spurious_proc_show, NULL);
+ return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
}
static const struct file_operations irq_spurious_proc_fops = {
@@ -357,3 +357,83 @@ void init_irq_proc(void)
}
}
+#ifdef CONFIG_GENERIC_IRQ_SHOW
+
+int __weak arch_show_interrupts(struct seq_file *p, int prec)
+{
+ return 0;
+}
+
+#ifndef ACTUAL_NR_IRQS
+# define ACTUAL_NR_IRQS nr_irqs
+#endif
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ static int prec;
+
+ unsigned long flags, any_count = 0;
+ int i = *(loff_t *) v, j;
+ struct irqaction *action;
+ struct irq_desc *desc;
+
+ if (i > ACTUAL_NR_IRQS)
+ return 0;
+
+ if (i == ACTUAL_NR_IRQS)
+ return arch_show_interrupts(p, prec);
+
+ /* print header and calculate the width of the first column */
+ if (i == 0) {
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
+ seq_printf(p, "%*s", prec + 8, "");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d", j);
+ seq_putc(p, '\n');
+ }
+
+ desc = irq_to_desc(i);
+ if (!desc)
+ return 0;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for_each_online_cpu(j)
+ any_count |= kstat_irqs_cpu(i, j);
+ action = desc->action;
+ if (!action && !any_count)
+ goto out;
+
+ seq_printf(p, "%*d: ", prec, i);
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+
+ if (desc->irq_data.chip) {
+ if (desc->irq_data.chip->irq_print_chip)
+ desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
+ else if (desc->irq_data.chip->name)
+ seq_printf(p, " %8s", desc->irq_data.chip->name);
+ else
+ seq_printf(p, " %8s", "-");
+ } else {
+ seq_printf(p, " %8s", "None");
+ }
+#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
+ seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
+#endif
+ if (desc->name)
+ seq_printf(p, "-%-8s", desc->name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+out:
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929a..14dd5761e8c 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
/*
* Run software resends of IRQ's
@@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq)
{
- unsigned int status = desc->status;
-
- /*
- * Make sure the interrupt is enabled, before resending it:
- */
- desc->irq_data.chip->irq_enable(&desc->irq_data);
-
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
* active.
*/
- if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
- desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
+ if (irq_settings_is_level(desc))
+ return;
+ if (desc->istate & IRQS_REPLAY)
+ return;
+ if (desc->istate & IRQS_PENDING) {
+ desc->istate &= ~IRQS_PENDING;
+ desc->istate |= IRQS_REPLAY;
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 00000000000..0d91730b633
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,125 @@
+/*
+ * Internal header to deal with irq_desc->status which will be renamed
+ * to irq_desc->settings.
+ */
+enum {
+ _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
+ _IRQ_PER_CPU = IRQ_PER_CPU,
+ _IRQ_LEVEL = IRQ_LEVEL,
+ _IRQ_NOPROBE = IRQ_NOPROBE,
+ _IRQ_NOREQUEST = IRQ_NOREQUEST,
+ _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
+ _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
+ _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
+ _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
+ _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
+};
+
+#define IRQ_PER_CPU GOT_YOU_MORON
+#define IRQ_NO_BALANCING GOT_YOU_MORON
+#define IRQ_LEVEL GOT_YOU_MORON
+#define IRQ_NOPROBE GOT_YOU_MORON
+#define IRQ_NOREQUEST GOT_YOU_MORON
+#define IRQ_NOAUTOEN GOT_YOU_MORON
+#define IRQ_NESTED_THREAD GOT_YOU_MORON
+#undef IRQF_MODIFY_MASK
+#define IRQF_MODIFY_MASK GOT_YOU_MORON
+
+static inline void
+irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
+{
+ desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
+ desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
+}
+
+static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_BALANCING;
+}
+
+static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_BALANCING;
+}
+
+static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline void
+irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
+{
+ desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
+ desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline bool irq_settings_is_level(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_LEVEL;
+}
+
+static inline void irq_settings_clr_level(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_LEVEL;
+}
+
+static inline void irq_settings_set_level(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_LEVEL;
+}
+
+static inline bool irq_settings_can_request(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOREQUEST);
+}
+
+static inline void irq_settings_clr_norequest(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOREQUEST;
+}
+
+static inline void irq_settings_set_norequest(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOREQUEST;
+}
+
+static inline bool irq_settings_can_probe(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOPROBE);
+}
+
+static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOPROBE;
+}
+
+static inline void irq_settings_set_noprobe(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOPROBE;
+}
+
+static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
+}
+
+static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
+}
+
+static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NESTED_THREAD;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f..dfbd550401b 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,93 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static int irq_poll_cpu;
+static atomic_t irq_poll_active;
+
+/*
+ * We wait here for a poller to finish.
+ *
+ * If the poll runs on this CPU, then we yell loudly and return
+ * false. That will leave the interrupt line disabled in the worst
+ * case, but it should never happen.
+ *
+ * We wait until the poller is done and then recheck disabled and
+ * action (about to be disabled). Only if it's still active, we return
+ * true and let the handler run.
+ */
+bool irq_wait_for_poll(struct irq_desc *desc)
+{
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+
+#ifdef CONFIG_SMP
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
+#else
+ return false;
+#endif
+}
+
/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(int irq, struct irq_desc *desc)
+static int try_one_irq(int irq, struct irq_desc *desc, bool force)
{
+ irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
- int ok = 0, work = 0;
raw_spin_lock(&desc->lock);
- /* Already running on another processor */
- if (desc->status & IRQ_INPROGRESS) {
- /*
- * Already running: If it is shared get the other
- * CPU to go looking for our mystery interrupt too
- */
- if (desc->action && (desc->action->flags & IRQF_SHARED))
- desc->status |= IRQ_PENDING;
- raw_spin_unlock(&desc->lock);
- return ok;
- }
- /* Honour the normal IRQ locking */
- desc->status |= IRQ_INPROGRESS;
- action = desc->action;
- raw_spin_unlock(&desc->lock);
- while (action) {
- /* Only shared IRQ handlers are safe to call */
- if (action->flags & IRQF_SHARED) {
- if (action->handler(irq, action->dev_id) ==
- IRQ_HANDLED)
- ok = 1;
- }
- action = action->next;
- }
- local_irq_disable();
- /* Now clean up the flags */
- raw_spin_lock(&desc->lock);
- action = desc->action;
+ /* PER_CPU and nested thread interrupts are never polled */
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
+ goto out;
/*
- * While we were looking for a fixup someone queued a real
- * IRQ clashing with our walk:
+ * Do not poll disabled interrupts unless the spurious
+ * disabled poller asks explicitely.
*/
- while ((desc->status & IRQ_PENDING) && action) {
+ if (irqd_irq_disabled(&desc->irq_data) && !force)
+ goto out;
+
+ /*
+ * All handlers must agree on IRQF_SHARED, so we test just the
+ * first. Check for action->next as well.
+ */
+ action = desc->action;
+ if (!action || !(action->flags & IRQF_SHARED) ||
+ (action->flags & __IRQF_TIMER) || !action->next)
+ goto out;
+
+ /* Already running on another processor */
+ if (irqd_irq_inprogress(&desc->irq_data)) {
/*
- * Perform real IRQ processing for the IRQ we deferred
+ * Already running: If it is shared get the other
+ * CPU to go looking for our mystery interrupt too
*/
- work = 1;
- raw_spin_unlock(&desc->lock);
- handle_IRQ_event(irq, action);
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_PENDING;
+ desc->istate |= IRQS_PENDING;
+ goto out;
}
- desc->status &= ~IRQ_INPROGRESS;
- /*
- * If we did actual work for the real IRQ line we must let the
- * IRQ controller clean up too
- */
- if (work)
- irq_end(irq, desc);
- raw_spin_unlock(&desc->lock);
- return ok;
+ /* Mark it poll in progress */
+ desc->istate |= IRQS_POLL_INPROGRESS;
+ do {
+ if (handle_irq_event(desc) == IRQ_HANDLED)
+ ret = IRQ_HANDLED;
+ action = desc->action;
+ } while ((desc->istate & IRQS_PENDING) && action);
+ desc->istate &= ~IRQS_POLL_INPROGRESS;
+out:
+ raw_spin_unlock(&desc->lock);
+ return ret == IRQ_HANDLED;
}
static int misrouted_irq(int irq)
@@ -92,6 +115,11 @@ static int misrouted_irq(int irq)
struct irq_desc *desc;
int i, ok = 0;
+ if (atomic_inc_return(&irq_poll_active) == 1)
+ goto out;
+
+ irq_poll_cpu = smp_processor_id();
+
for_each_irq_desc(i, desc) {
if (!i)
continue;
@@ -99,9 +127,11 @@ static int misrouted_irq(int irq)
if (i == irq) /* Already tried */
continue;
- if (try_one_irq(i, desc))
+ if (try_one_irq(i, desc, false))
ok = 1;
}
+out:
+ atomic_dec(&irq_poll_active);
/* So the caller can adjust the irq error counts */
return ok;
}
@@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy)
struct irq_desc *desc;
int i;
+ if (atomic_inc_return(&irq_poll_active) != 1)
+ goto out;
+ irq_poll_cpu = smp_processor_id();
+
for_each_irq_desc(i, desc) {
- unsigned int status;
+ unsigned int state;
if (!i)
continue;
/* Racy but it doesn't matter */
- status = desc->status;
+ state = desc->istate;
barrier();
- if (!(status & IRQ_SPURIOUS_DISABLED))
+ if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
local_irq_disable();
- try_one_irq(i, desc);
+ try_one_irq(i, desc, true);
local_irq_enable();
}
-
+out:
+ atomic_dec(&irq_poll_active);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
@@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy)
*
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
- *
- * Called under desc->lock
*/
-
static void
__report_bad_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
+ unsigned long flags;
if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
dump_stack();
printk(KERN_ERR "handlers:\n");
+ /*
+ * We need to take desc->lock here. note_interrupt() is called
+ * w/o desc->lock held, but IRQ_PROGRESS set. We might race
+ * with something else removing an action. It's ok to take
+ * desc->lock here. See synchronize_irq().
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
printk("\n");
action = action->next;
}
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void
@@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
+ if (desc->istate & IRQS_POLL_INPROGRESS)
+ return;
+
if (unlikely(action_ret != IRQ_HANDLED)) {
/*
* If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
- desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+ desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
- desc->irq_data.chip->irq_disable(&desc->irq_data);
+ irq_disable(desc);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 90f881904bb..c58fa7da8ae 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
*/
static void __irq_work_queue(struct irq_work *entry)
{
- struct irq_work **head, *next;
+ struct irq_work *next;
- head = &get_cpu_var(irq_work_list);
+ preempt_disable();
do {
- next = *head;
+ next = __this_cpu_read(irq_work_list);
/* Can assign non-atomic because we keep the flags set. */
entry->next = next_flags(next, IRQ_WORK_FLAGS);
- } while (cmpxchg(head, next, entry) != next);
+ } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
/* The list was empty, raise self-interrupt to start processing. */
if (!irq_work_next(entry))
arch_irq_work_raise();
- put_cpu_var(irq_work_list);
+ preempt_enable();
}
/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
*/
void irq_work_run(void)
{
- struct irq_work *list, **head;
+ struct irq_work *list;
- head = &__get_cpu_var(irq_work_list);
- if (*head == NULL)
+ if (this_cpu_read(irq_work_list) == NULL)
return;
BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());
- list = xchg(head, NULL);
+ list = this_cpu_xchg(irq_work_list, NULL);
+
while (list != NULL) {
struct irq_work *entry = list;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b575..079f1d39a8b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
- return in_gate_area_no_task(addr);
+ return in_gate_area_no_mm(addr);
}
static inline int is_kernel(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
- return in_gate_area_no_task(addr);
+ return in_gate_area_no_mm(addr);
}
static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
}
/* Look up a kernel symbol and return it in a text buffer. */
-int sprint_symbol(char *buffer, unsigned long address)
+static int __sprint_symbol(char *buffer, unsigned long address,
+ int symbol_offset)
{
char *modname;
const char *name;
unsigned long offset, size;
int len;
+ address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
strcpy(buffer, name);
len = strlen(buffer);
buffer += len;
+ offset -= symbol_offset;
if (modname)
- len += sprintf(buffer, "+%#lx/%#lx [%s]",
- offset, size, modname);
+ len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
else
len += sprintf(buffer, "+%#lx/%#lx", offset, size);
return len;
}
+
+/**
+ * sprint_symbol - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size and module name to @buffer if possible. If no symbol was found,
+ * just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0);
+}
+
EXPORT_SYMBOL_GPL(sprint_symbol);
+/**
+ * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, -1);
+}
+
/* Look up a kernel symbol and print it to the kernel messages. */
void __print_symbol(const char *fmt, unsigned long address)
{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%0*lx %c %s\t[%s]\n",
- (int)(2 * sizeof(void *)),
- iter->value, type, iter->name, iter->module_name);
+ seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ type, iter->name, iter->module_name);
} else
- seq_printf(m, "%0*lx %c %s\n",
- (int)(2 * sizeof(void *)),
- iter->value, iter->type, iter->name);
+ seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ iter->type, iter->name);
return 0;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc756..87b77de03dd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/kmsg_dump.h>
+#include <linux/syscore_ops.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
/* Initialize the list of destination pages */
INIT_LIST_HEAD(&image->dest_pages);
- /* Initialize the list of unuseable pages */
+ /* Initialize the list of unusable pages */
INIT_LIST_HEAD(&image->unuseable_pages);
/* Read in the segments */
@@ -163,7 +164,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
* just verifies it is an address we can use.
*
* Since the kernel does everything in page size chunks ensure
- * the destination addreses are page aligned. Too many
+ * the destination addresses are page aligned. Too many
* special cases crop of when we don't do this. The most
* insidious is getting overlapping destination addresses
* simply because addresses are changed to page size
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
/* Deal with the destination pages I have inadvertently allocated.
*
* Ideally I would convert multi-page allocations into single
- * page allocations, and add everyting to image->dest_pages.
+ * page allocations, and add everything to image->dest_pages.
*
* For now it is simpler to just free the pages.
*/
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
/* Walk through and free any extra destination pages I may have */
kimage_free_page_list(&image->dest_pages);
- /* Walk through and free any unuseable pages I have cached */
+ /* Walk through and free any unusable pages I have cached */
kimage_free_page_list(&image->unuseable_pages);
}
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
return size;
}
-static void free_reserved_phys_range(unsigned long begin, unsigned long end)
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+ unsigned long end)
{
unsigned long addr;
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
start = roundup(start, PAGE_SIZE);
end = roundup(start + new_size, PAGE_SIZE);
- free_reserved_phys_range(end, crashk_res.end);
+ crash_free_reserved_phys_range(end, crashk_res.end);
if ((start == end) && (crashk_res.parent != NULL))
release_resource(&crashk_res);
@@ -1531,6 +1533,11 @@ int kernel_kexec(void)
local_irq_disable();
/* Suspend system devices */
error = sysdev_suspend(PMSG_FREEZE);
+ if (!error) {
+ error = syscore_suspend();
+ if (error)
+ sysdev_resume();
+ }
if (error)
goto Enable_irqs;
} else
@@ -1545,6 +1552,7 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
+ syscore_resume();
sysdev_resume();
Enable_irqs:
local_irq_enable();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106..77981813a1e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
/* We have preemption disabled.. so it is safe to use __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
{
- __get_cpu_var(kprobe_instance) = kp;
+ __this_cpu_write(kprobe_instance, kp);
}
static inline void reset_kprobe_instance(void)
{
- __get_cpu_var(kprobe_instance) = NULL;
+ __this_cpu_write(kprobe_instance, NULL);
}
/*
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
return p->pre_handler == aggr_pre_handler;
}
+/* Return true(!0) if the kprobe is unused */
+static inline int kprobe_unused(struct kprobe *p)
+{
+ return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
+ list_empty(&p->list);
+}
+
/*
* Keep all fields in the kprobe consistent
*/
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
- memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
- memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+ memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}
#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
}
}
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_remove_optimized_kprobe(op);
+ arch_remove_kprobe(p);
+ kfree(op);
+}
+
/* Return true(!0) if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
return 0;
}
+/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
+static inline int kprobe_disarmed(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
+ if (!kprobe_aggrprobe(p))
+ return kprobe_disabled(p);
+
+ op = container_of(p, struct optimized_kprobe, kp);
+
+ return kprobe_disabled(p) && list_empty(&op->list);
+}
+
+/* Return true(!0) if the probe is queued on (un)optimizing lists */
+static int __kprobes kprobe_queued(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ if (kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list))
+ return 1;
+ }
+ return 0;
+}
+
/*
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
+static LIST_HEAD(unoptimizing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
-/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+/*
+ * Optimize (replace a breakpoint with a jump) kprobes listed on
+ * optimizing_list.
+ */
+static __kprobes void do_optimize_kprobes(void)
{
- struct optimized_kprobe *op, *tmp;
-
- /* Lock modules while optimizing kprobes */
- mutex_lock(&module_mutex);
- mutex_lock(&kprobe_mutex);
- if (kprobes_all_disarmed || !kprobes_allow_optimization)
- goto end;
-
- /*
- * Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
- */
- synchronize_sched();
+ /* Optimization never be done when disarmed */
+ if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+ list_empty(&optimizing_list))
+ return;
/*
* The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
*/
get_online_cpus();
mutex_lock(&text_mutex);
- list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
- WARN_ON(kprobe_disabled(&op->kp));
- if (arch_optimize_kprobe(op) < 0)
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- list_del_init(&op->list);
+ arch_optimize_kprobes(&optimizing_list);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+/*
+ * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
+ * if need) kprobes listed on unoptimizing_list.
+ */
+static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ /* Unoptimization must be done anytime */
+ if (list_empty(&unoptimizing_list))
+ return;
+
+ /* Ditto to do_optimize_kprobes */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+ /* Loop free_list for disarming */
+ list_for_each_entry_safe(op, tmp, free_list, list) {
+ /* Disarm probes if marked disabled */
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
+ if (kprobe_unused(&op->kp)) {
+ /*
+ * Remove unused probes from hash list. After waiting
+ * for synchronization, these probes are reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes.)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ } else
+ list_del_init(&op->list);
}
mutex_unlock(&text_mutex);
put_online_cpus();
-end:
+}
+
+/* Reclaim all kprobes on the free_list */
+static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, free_list, list) {
+ BUG_ON(!kprobe_unused(&op->kp));
+ list_del_init(&op->list);
+ free_aggr_kprobe(&op->kp);
+ }
+}
+
+/* Start optimizer after OPTIMIZE_DELAY passed */
+static __kprobes void kick_kprobe_optimizer(void)
+{
+ if (!delayed_work_pending(&optimizing_work))
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+ LIST_HEAD(free_list);
+
+ /* Lock modules while optimizing kprobes */
+ mutex_lock(&module_mutex);
+ mutex_lock(&kprobe_mutex);
+
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiesence period.
+ */
+ do_unoptimize_kprobes(&free_list);
+
+ /*
+ * Step 2: Wait for quiesence period to ensure all running interrupts
+ * are done. Because optprobe may modify multiple instructions
+ * there is a chance that Nth instruction is interrupted. In that
+ * case, running interrupt can return to 2nd-Nth byte of jump
+ * instruction. This wait is for avoiding it.
+ */
+ synchronize_sched();
+
+ /* Step 3: Optimize kprobes after quiesence period */
+ do_optimize_kprobes();
+
+ /* Step 4: Free cleaned kprobes after quiesence period */
+ do_free_cleaned_kprobes(&free_list);
+
mutex_unlock(&kprobe_mutex);
mutex_unlock(&module_mutex);
+
+ /* Step 5: Kick optimizer again if needed */
+ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
+ kick_kprobe_optimizer();
+ else
+ /* Wake up all waiters */
+ complete_all(&optimizer_comp);
+}
+
+/* Wait for completing optimization and unoptimization */
+static __kprobes void wait_for_kprobe_optimizer(void)
+{
+ if (delayed_work_pending(&optimizing_work))
+ wait_for_completion(&optimizer_comp);
}
/* Optimize kprobe if p is ready to be optimized */
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
/* Check if it is already optimized. */
if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
return;
-
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- list_add(&op->list, &optimizing_list);
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+
+ if (!list_empty(&op->list))
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ else {
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
+ }
+}
+
+/* Short cut to direct unoptimizing */
+static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ get_online_cpus();
+ arch_unoptimize_kprobe(op);
+ put_online_cpus();
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
}
/* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p)
+static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
- if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
- op = container_of(p, struct optimized_kprobe, kp);
- if (!list_empty(&op->list))
- /* Dequeue from the optimization queue */
+ if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
+ return; /* This is not an optprobe nor optimized */
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!kprobe_optimized(p)) {
+ /* Unoptimized or unoptimizing case */
+ if (force && !list_empty(&op->list)) {
+ /*
+ * Only if this is unoptimizing kprobe and forced,
+ * forcibly unoptimize it. (No need to unoptimize
+ * unoptimized kprobe again :)
+ */
list_del_init(&op->list);
- else
- /* Replace jump with break */
- arch_unoptimize_kprobe(op);
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ force_unoptimize_kprobe(op);
+ }
+ return;
+ }
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ if (!list_empty(&op->list)) {
+ /* Dequeue from the optimization queue */
+ list_del_init(&op->list);
+ return;
+ }
+ /* Optimized kprobe case */
+ if (force)
+ /* Forcibly update the code: this is a special case */
+ force_unoptimize_kprobe(op);
+ else {
+ list_add(&op->list, &unoptimizing_list);
+ kick_kprobe_optimizer();
}
}
+/* Cancel unoptimizing for reusing */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ struct optimized_kprobe *op;
+
+ BUG_ON(!kprobe_unused(ap));
+ /*
+ * Unused kprobe MUST be on the way of delayed unoptimizing (means
+ * there is still a relative jump) and disabled.
+ */
+ op = container_of(ap, struct optimized_kprobe, kp);
+ if (unlikely(list_empty(&op->list)))
+ printk(KERN_WARNING "Warning: found a stray unused "
+ "aggrprobe@%p\n", ap->addr);
+ /* Enable the probe again */
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ /* Optimize it again (remove from op->list) */
+ BUG_ON(!kprobe_optready(ap));
+ optimize_kprobe(ap);
+}
+
/* Remove optimized instructions */
static void __kprobes kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
+ if (!list_empty(&op->list))
+ /* Dequeue from the (un)optimization queue */
list_del_init(&op->list);
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- }
- /* Don't unoptimize, because the target code will be freed. */
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ /* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
arch_prepare_optimized_kprobe(op);
}
-/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
-{
- struct optimized_kprobe *op;
-
- op = container_of(p, struct optimized_kprobe, kp);
- arch_remove_optimized_kprobe(op);
- kfree(op);
-}
-
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe */
- free_aggr_kprobe(ap);
+ arch_remove_optimized_kprobe(op);
+ kfree(op);
return;
}
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
return;
kprobes_allow_optimization = false;
- printk(KERN_INFO "Kprobes globally unoptimized\n");
- get_online_cpus(); /* For avoiding text_mutex deadlock */
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!kprobe_disabled(p))
- unoptimize_kprobe(p);
+ unoptimize_kprobe(p, false);
}
}
-
- mutex_unlock(&text_mutex);
- put_online_cpus();
- /* Allow all currently running kprobes to complete */
- synchronize_sched();
+ /* Wait for unoptimizing completion */
+ wait_for_kprobe_optimizer();
+ printk(KERN_INFO "Kprobes globally unoptimized\n");
}
int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
}
#endif /* CONFIG_SYSCTL */
+/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __kprobes __arm_kprobe(struct kprobe *p)
{
- struct kprobe *old_p;
+ struct kprobe *_p;
/* Check collision with other optimized kprobes */
- old_p = get_optimized_kprobe((unsigned long)p->addr);
- if (unlikely(old_p))
- unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p))
+ /* Fallback to unoptimized kprobe */
+ unoptimize_kprobe(_p, true);
arch_arm_kprobe(p);
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-static void __kprobes __disarm_kprobe(struct kprobe *p)
+/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
{
- struct kprobe *old_p;
+ struct kprobe *_p;
- unoptimize_kprobe(p); /* Try to unoptimize */
- arch_disarm_kprobe(p);
+ unoptimize_kprobe(p, false); /* Try to unoptimize */
- /* If another kprobe was blocked, optimize it. */
- old_p = get_optimized_kprobe((unsigned long)p->addr);
- if (unlikely(old_p))
- optimize_kprobe(old_p);
+ if (!kprobe_queued(p)) {
+ arch_disarm_kprobe(p);
+ /* If another kprobe was blocked, optimize it. */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p) && reopt)
+ optimize_kprobe(_p);
+ }
+ /* TODO: reoptimize others after unoptimized this probe */
}
#else /* !CONFIG_OPTPROBES */
#define optimize_kprobe(p) do {} while (0)
-#define unoptimize_kprobe(p) do {} while (0)
+#define unoptimize_kprobe(p, f) do {} while (0)
#define kill_optimized_kprobe(p) do {} while (0)
#define prepare_optimized_kprobe(p) do {} while (0)
#define try_to_optimize_kprobe(p) do {} while (0)
#define __arm_kprobe(p) arch_arm_kprobe(p)
-#define __disarm_kprobe(p) arch_disarm_kprobe(p)
+#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
+#define kprobe_disarmed(p) kprobe_disabled(p)
+#define wait_for_kprobe_optimizer() do {} while (0)
+
+/* There should be no unused kprobes can be reused without optimization */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ BUG_ON(kprobe_unused(ap));
+}
static __kprobes void free_aggr_kprobe(struct kprobe *p)
{
+ arch_remove_kprobe(p);
kfree(p);
}
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
/* Disarm a kprobe with text_mutex */
static void __kprobes disarm_kprobe(struct kprobe *kp)
{
- get_online_cpus(); /* For avoiding text_mutex deadlock */
+ /* Ditto */
mutex_lock(&text_mutex);
- __disarm_kprobe(kp);
+ __disarm_kprobe(kp, true);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
/*
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
int trapnr)
{
- struct kprobe *cur = __get_cpu_var(kprobe_instance);
+ struct kprobe *cur = __this_cpu_read(kprobe_instance);
/*
* if we faulted "during" the execution of a user specified
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
- struct kprobe *cur = __get_cpu_var(kprobe_instance);
+ struct kprobe *cur = __this_cpu_read(kprobe_instance);
int ret = 0;
if (cur && cur->break_handler) {
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
if (p->break_handler || p->post_handler)
- unoptimize_kprobe(ap); /* Fall back to normal kprobe */
+ unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
if (p->break_handler) {
if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
-static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
struct kprobe *p)
{
int ret = 0;
- struct kprobe *ap = old_p;
+ struct kprobe *ap = orig_p;
- if (!kprobe_aggrprobe(old_p)) {
- /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
- ap = alloc_aggr_kprobe(old_p);
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ ap = alloc_aggr_kprobe(orig_p);
if (!ap)
return -ENOMEM;
- init_aggr_kprobe(ap, old_p);
- }
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap))
+ /* This probe is going to die. Rescue it */
+ reuse_unused_kprobe(ap);
if (kprobe_gone(ap)) {
/*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
return add_new_kprobe(ap, p);
}
-/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
-static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
-{
- struct kprobe *kp;
-
- list_for_each_entry_rcu(kp, &p->list, list) {
- if (!kprobe_disabled(kp))
- /*
- * There is an active probe on the list.
- * We can't disable aggr_kprobe.
- */
- return 0;
- }
- p->flags |= KPROBE_FLAG_DISABLED;
- return 1;
-}
-
static int __kprobes in_kprobes_functions(unsigned long addr)
{
struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
{
- struct kprobe *old_p, *list_p;
+ struct kprobe *ap, *list_p;
- old_p = get_kprobe(p->addr);
- if (unlikely(!old_p))
+ ap = get_kprobe(p->addr);
+ if (unlikely(!ap))
return NULL;
- if (p != old_p) {
- list_for_each_entry_rcu(list_p, &old_p->list, list)
+ if (p != ap) {
+ list_for_each_entry_rcu(list_p, &ap->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
return NULL;
}
valid:
- return old_p;
+ return ap;
}
/* Return error if the kprobe is being re-registered */
static inline int check_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
- struct kprobe *old_p;
mutex_lock(&kprobe_mutex);
- old_p = __get_valid_kprobe(p);
- if (old_p)
+ if (__get_valid_kprobe(p))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
+
return ret;
}
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
}
EXPORT_SYMBOL_GPL(register_kprobe);
+/* Check if all probes on the aggrprobe are disabled */
+static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &ap->list, list)
+ if (!kprobe_disabled(kp))
+ /*
+ * There is an active probe on the list.
+ * We can't disable this ap.
+ */
+ return 0;
+
+ return 1;
+}
+
+/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
+static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+{
+ struct kprobe *orig_p;
+
+ /* Get an original kprobe for return */
+ orig_p = __get_valid_kprobe(p);
+ if (unlikely(orig_p == NULL))
+ return NULL;
+
+ if (!kprobe_disabled(p)) {
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ disarm_kprobe(orig_p);
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
+ }
+ }
+
+ return orig_p;
+}
+
/*
* Unregister a kprobe without a scheduler synchronization.
*/
static int __kprobes __unregister_kprobe_top(struct kprobe *p)
{
- struct kprobe *old_p, *list_p;
+ struct kprobe *ap, *list_p;
- old_p = __get_valid_kprobe(p);
- if (old_p == NULL)
+ /* Disable kprobe. This will disarm it if needed. */
+ ap = __disable_kprobe(p);
+ if (ap == NULL)
return -EINVAL;
- if (old_p == p ||
- (kprobe_aggrprobe(old_p) &&
- list_is_singular(&old_p->list))) {
+ if (ap == p)
/*
- * Only probe on the hash list. Disarm only if kprobes are
- * enabled and not gone - otherwise, the breakpoint would
- * already have been removed. We save on flushing icache.
+ * This probe is an independent(and non-optimized) kprobe
+ * (not an aggrprobe). Remove from the hash list.
*/
- if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
- disarm_kprobe(old_p);
- hlist_del_rcu(&old_p->hlist);
- } else {
+ goto disarmed;
+
+ /* Following process expects this probe is an aggrprobe */
+ WARN_ON(!kprobe_aggrprobe(ap));
+
+ if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * !disarmed could be happen if the probe is under delayed
+ * unoptimizing.
+ */
+ goto disarmed;
+ else {
+ /* If disabling probe has special handlers, update aggrprobe */
if (p->break_handler && !kprobe_gone(p))
- old_p->break_handler = NULL;
+ ap->break_handler = NULL;
if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry_rcu(list_p, &old_p->list, list) {
+ list_for_each_entry_rcu(list_p, &ap->list, list) {
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- old_p->post_handler = NULL;
+ ap->post_handler = NULL;
}
noclean:
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
list_del_rcu(&p->list);
- if (!kprobe_disabled(old_p)) {
- try_to_disable_aggr_kprobe(old_p);
- if (!kprobes_all_disarmed) {
- if (kprobe_disabled(old_p))
- disarm_kprobe(old_p);
- else
- /* Try to optimize this probe again */
- optimize_kprobe(old_p);
- }
- }
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /*
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
+ */
+ optimize_kprobe(ap);
}
return 0;
+
+disarmed:
+ BUG_ON(!kprobe_disarmed(ap));
+ hlist_del_rcu(&ap->hlist);
+ return 0;
}
static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
{
- struct kprobe *old_p;
+ struct kprobe *ap;
if (list_empty(&p->list))
+ /* This is an independent kprobe */
arch_remove_kprobe(p);
else if (list_is_singular(&p->list)) {
- /* "p" is the last child of an aggr_kprobe */
- old_p = list_entry(p->list.next, struct kprobe, list);
+ /* This is the last child of an aggrprobe */
+ ap = list_entry(p->list.next, struct kprobe, list);
list_del(&p->list);
- arch_remove_kprobe(old_p);
- free_aggr_kprobe(old_p);
+ free_aggr_kprobe(ap);
}
+ /* Otherwise, do nothing. */
}
int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
int __kprobes disable_kprobe(struct kprobe *kp)
{
int ret = 0;
- struct kprobe *p;
mutex_lock(&kprobe_mutex);
- /* Check whether specified probe is valid. */
- p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
+ /* Disable this kprobe */
+ if (__disable_kprobe(kp) == NULL)
ret = -EINVAL;
- goto out;
- }
- /* If the probe is already disabled (or gone), just return */
- if (kprobe_disabled(kp))
- goto out;
-
- kp->flags |= KPROBE_FLAG_DISABLED;
- if (p != kp)
- /* When kp != p, p is always enabled. */
- try_to_disable_aggr_kprobe(p);
-
- if (!kprobes_all_disarmed && kprobe_disabled(p))
- disarm_kprobe(p);
-out:
mutex_unlock(&kprobe_mutex);
return ret;
}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
mutex_lock(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
- if (kprobes_all_disarmed)
- goto already_disabled;
+ if (kprobes_all_disarmed) {
+ mutex_unlock(&kprobe_mutex);
+ return;
+ }
kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
- /*
- * Here we call get_online_cpus() for avoiding text_mutex deadlock,
- * because disarming may also unoptimize kprobes.
- */
- get_online_cpus();
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- __disarm_kprobe(p);
+ __disarm_kprobe(p, false);
}
}
-
mutex_unlock(&text_mutex);
- put_online_cpus();
mutex_unlock(&kprobe_mutex);
- /* Allow all currently running kprobes to complete */
- synchronize_sched();
- return;
-already_disabled:
- mutex_unlock(&kprobe_mutex);
- return;
+ /* Wait for disarming all kprobes by optimizer */
+ wait_for_kprobe_optimizer();
}
/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d..3b34d2732bc 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
+ int node;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
do_exit(ret);
}
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+ if (tsk == kthreadd_task)
+ return tsk->pref_node_fork;
+#endif
+ return numa_node_id();
+}
+
static void create_kthread(struct kthread_create_info *create)
{
int pid;
+#ifdef CONFIG_NUMA
+ current->pref_node_fork = create->node;
+#endif
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
}
/**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
+ * @node: memory node number.
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
* it. See also kthread_run().
*
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which noone will call kthread_stop(), or
+ * standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
-struct task_struct *kthread_create(int (*threadfn)(void *data),
- void *data,
- const char namefmt[],
- ...)
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+ void *data,
+ int node,
+ const char namefmt[],
+ ...)
{
struct kthread_create_info create;
create.threadfn = threadfn;
create.data = data;
+ create.node = node;
init_completion(&create.done);
spin_lock(&kthread_create_lock);
@@ -148,7 +167,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
- struct sched_param param = { .sched_priority = 0 };
+ static const struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
return create.result;
}
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
/**
* kthread_bind - bind a just-created kthread to a cpu.
@@ -265,6 +284,17 @@ int kthreadd(void *unused)
return 0;
}
+void __init_kthread_worker(struct kthread_worker *worker,
+ const char *name,
+ struct lock_class_key *key)
+{
+ spin_lock_init(&worker->lock);
+ lockdep_set_class_and_name(&worker->lock, key, name);
+ INIT_LIST_HEAD(&worker->work_list);
+ worker->task = NULL;
+}
+EXPORT_SYMBOL_GPL(__init_kthread_worker);
+
/**
* kthread_worker_fn - kthread function to process kthread_worker
* @worker_ptr: pointer to initialized kthread_worker
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc..376066e1041 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
}
/**
- * __account_scheduler_latency - record an occured latency
+ * __account_scheduler_latency - record an occurred latency
* @tsk - the task struct of the task hitting the latency
* @usecs - the duration of the latency in microseconds
* @inter - 1 if the sleep was interruptible, 0 if uninterruptible
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < MAXLR; i++) {
- if (latency_record[i].backtrace[0]) {
+ struct latency_record *lr = &latency_record[i];
+
+ if (lr->backtrace[0]) {
int q;
- seq_printf(m, "%i %lu %lu ",
- latency_record[i].count,
- latency_record[i].time,
- latency_record[i].max);
+ seq_printf(m, "%i %lu %lu",
+ lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
- char sym[KSYM_SYMBOL_LEN];
- char *c;
- if (!latency_record[i].backtrace[q])
+ unsigned long bt = lr->backtrace[q];
+ if (!bt)
break;
- if (latency_record[i].backtrace[q] == ULONG_MAX)
+ if (bt == ULONG_MAX)
break;
- sprint_symbol(sym, latency_record[i].backtrace[q]);
- c = strchr(sym, '+');
- if (c)
- *c = 0;
- seq_printf(m, "%s ", sym);
+ seq_printf(m, " %ps", (void *)bt);
}
seq_printf(m, "\n");
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d..53a68956f13 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
}
/*
- * Debugging helper: via this flag we know that we are in
- * 'early bootup code', and will warn about any invalid irqs-on event:
- */
-static int early_boot_irqs_enabled;
-
-void early_boot_irqs_off(void)
-{
- early_boot_irqs_enabled = 0;
-}
-
-void early_boot_irqs_on(void)
-{
- early_boot_irqs_enabled = 1;
-}
-
-/*
* Hardirqs will be enabled:
*/
void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,13 +2303,13 @@ void trace_hardirqs_on_caller(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
- if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
return;
if (unlikely(curr->hardirqs_enabled)) {
/*
* Neither irq nor preemption are disabled here
- * so this is racy by nature but loosing one hit
+ * so this is racy by nature but losing one hit
* in a stat is not a big deal.
*/
__debug_atomic_inc(redundant_hardirqs_on);
@@ -2636,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
if (!graph_lock())
return 0;
/*
- * Make sure we didnt race:
+ * Make sure we didn't race:
*/
if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
graph_unlock();
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d..71edd2f60c0 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
- sum_forward_deps = 0, factor = 0;
+ sum_forward_deps = 0;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_unsafe * nr_hardirq_safe +
nr_list_entries);
- /*
- * Estimated factor between direct and indirect
- * dependencies:
- */
- if (nr_list_entries)
- factor = sum_forward_deps / nr_list_entries;
-
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
nr_lock_chains, MAX_LOCKDEP_CHAINS);
@@ -494,7 +487,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
namelen += 2;
for (i = 0; i < LOCKSTAT_POINTS; i++) {
- char sym[KSYM_SYMBOL_LEN];
char ip[32];
if (class->contention_point[i] == 0)
@@ -503,15 +495,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
if (!i)
seq_line(m, '-', 40-namelen, namelen);
- sprint_symbol(sym, class->contention_point[i]);
snprintf(ip, sizeof(ip), "[<%p>]",
(void *)class->contention_point[i]);
- seq_printf(m, "%40s %14lu %29s %s\n", name,
- stats->contention_point[i],
- ip, sym);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contention_point[i],
+ ip, (void *)class->contention_point[i]);
}
for (i = 0; i < LOCKSTAT_POINTS; i++) {
- char sym[KSYM_SYMBOL_LEN];
char ip[32];
if (class->contending_point[i] == 0)
@@ -520,12 +510,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
if (!i)
seq_line(m, '-', 40-namelen, namelen);
- sprint_symbol(sym, class->contending_point[i]);
snprintf(ip, sizeof(ip), "[<%p>]",
(void *)class->contending_point[i]);
- seq_printf(m, "%40s %14lu %29s %s\n", name,
- stats->contending_point[i],
- ip, sym);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contending_point[i],
+ ip, (void *)class->contending_point[i]);
}
if (i) {
seq_puts(m, "\n");
diff --git a/kernel/module.c b/kernel/module.c
index d190664f25f..d5938a5c19c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
#include <linux/percpu.h>
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
+#include <linux/pfn.h>
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -70,6 +71,26 @@
#define ARCH_SHF_SMALL 0
#endif
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ */
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+# define debug_align(X) (X)
+#endif
+
+/*
+ * Given BASE and SIZE this macro calculates the number of pages the
+ * memory regions occupies
+ */
+#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
+ (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
+ PFN_DOWN((unsigned long)BASE) + 1) \
+ : (0UL))
+
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
@@ -788,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
wait_for_zero_refcount(mod);
mutex_unlock(&module_mutex);
- /* Final destruction now noone is using it. */
+ /* Final destruction now no one is using it. */
if (mod->exit != NULL)
mod->exit();
blocking_notifier_call_chain(&module_notify_list,
@@ -1147,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
{
struct module_sect_attr *sattr =
container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%lx\n", sattr->address);
+ return sprintf(buf, "0x%pK\n", (void *)sattr->address);
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
return 0;
}
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ */
+void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+{
+ unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
+ unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+
+ if (end_pfn > begin_pfn)
+ set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+}
+
+static void set_section_ro_nx(void *base,
+ unsigned long text_size,
+ unsigned long ro_size,
+ unsigned long total_size)
+{
+ /* begin and end PFNs of the current subsection */
+ unsigned long begin_pfn;
+ unsigned long end_pfn;
+
+ /*
+ * Set RO for module text and RO-data:
+ * - Always protect first page.
+ * - Do not protect last partial page.
+ */
+ if (ro_size > 0)
+ set_page_attributes(base, base + ro_size, set_memory_ro);
+
+ /*
+ * Set NX permissions for module data:
+ * - Do not protect first partial page.
+ * - Always protect last page.
+ */
+ if (total_size > text_size) {
+ begin_pfn = PFN_UP((unsigned long)base + text_size);
+ end_pfn = PFN_UP((unsigned long)base + total_size);
+ if (end_pfn > begin_pfn)
+ set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+ }
+}
+
+/* Setting memory back to RW+NX before releasing it */
+void unset_section_ro_nx(struct module *mod, void *module_region)
+{
+ unsigned long total_pages;
+
+ if (mod->module_core == module_region) {
+ /* Set core as NX+RW */
+ total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
+ set_memory_nx((unsigned long)mod->module_core, total_pages);
+ set_memory_rw((unsigned long)mod->module_core, total_pages);
+
+ } else if (mod->module_init == module_region) {
+ /* Set init as NX+RW */
+ total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
+ set_memory_nx((unsigned long)mod->module_init, total_pages);
+ set_memory_rw((unsigned long)mod->module_init, total_pages);
+ }
+}
+
+/* Iterate through all modules and set each module's text as RW */
+void set_all_modules_text_rw()
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if ((mod->module_core) && (mod->core_text_size)) {
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_text_size,
+ set_memory_rw);
+ }
+ if ((mod->module_init) && (mod->init_text_size)) {
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_text_size,
+ set_memory_rw);
+ }
+ }
+ mutex_unlock(&module_mutex);
+}
+
+/* Iterate through all modules and set each module's text as RO */
+void set_all_modules_text_ro()
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if ((mod->module_core) && (mod->core_text_size)) {
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_text_size,
+ set_memory_ro);
+ }
+ if ((mod->module_init) && (mod->init_text_size)) {
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_text_size,
+ set_memory_ro);
+ }
+ }
+ mutex_unlock(&module_mutex);
+}
+#else
+static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
+static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
+#endif
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
destroy_params(mod->kp, mod->num_kp);
/* This may be NULL, but that's OK */
+ unset_section_ro_nx(mod, mod->module_init);
module_free(mod, mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->module_core, mod->core_size);
/* Finally, free the core (containing the module structure) */
+ unset_section_ro_nx(mod, mod->module_core);
module_free(mod, mod->module_core);
#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
DEBUGP("\t%s\n", name);
}
- if (m == 0)
+ switch (m) {
+ case 0: /* executable */
+ mod->core_size = debug_align(mod->core_size);
mod->core_text_size = mod->core_size;
+ break;
+ case 1: /* RO: text and ro-data */
+ mod->core_size = debug_align(mod->core_size);
+ mod->core_ro_size = mod->core_size;
+ break;
+ case 3: /* whole core */
+ mod->core_size = debug_align(mod->core_size);
+ break;
+ }
}
DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
| INIT_OFFSET_MASK);
DEBUGP("\t%s\n", sname);
}
- if (m == 0)
+ switch (m) {
+ case 0: /* executable */
+ mod->init_size = debug_align(mod->init_size);
mod->init_text_size = mod->init_size;
+ break;
+ case 1: /* RO: text and ro-data */
+ mod->init_size = debug_align(mod->init_size);
+ mod->init_ro_size = mod->init_size;
+ break;
+ case 3: /* whole init */
+ mod->init_size = debug_align(mod->init_size);
+ break;
+ }
}
}
@@ -2306,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
#endif
#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints = section_objs(info, "__tracepoints",
- sizeof(*mod->tracepoints),
- &mod->num_tracepoints);
+ mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ sizeof(*mod->tracepoints_ptrs),
+ &mod->num_tracepoints);
#endif
#ifdef HAVE_JUMP_LABEL
mod->jump_entries = section_objs(info, "__jump_table",
@@ -2623,7 +2777,7 @@ static struct module *load_module(void __user *umod,
mod->state = MODULE_STATE_COMING;
/* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. Noone should access us, since
+ * info during argument parsing. No one should access us, since
* strong_try_module_get() will fail.
* lockdep/oops can run asynchronous, so use the RCU list insertion
* function to insert in a way safe to concurrent readers.
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
+ unset_section_ro_nx(mod, mod->module_init);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
@@ -2804,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod,
else
nextval = (unsigned long)mod->module_core+mod->core_text_size;
- /* Scan for closest preceeding symbol, and next symbol. (ELF
+ /* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
for (i = 1; i < mod->num_symtab; i++) {
if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3057,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading":
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%p", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->module_core);
/* Taints info */
if (mod->taints)
@@ -3226,7 +3393,7 @@ void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
- struct tracepoint *tp)
+ struct tracepoint * const *tp)
{
}
EXPORT_SYMBOL(module_layout);
@@ -3240,8 +3407,8 @@ void module_update_tracepoints(void)
mutex_lock(&module_mutex);
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
- tracepoint_update_probe_range(mod->tracepoints,
- mod->tracepoints + mod->num_tracepoints);
+ tracepoint_update_probe_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
mutex_unlock(&module_mutex);
}
@@ -3265,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
else if (iter_mod > iter->module)
iter->tracepoint = NULL;
found = tracepoint_get_iter_range(&iter->tracepoint,
- iter_mod->tracepoints,
- iter_mod->tracepoints
+ iter_mod->tracepoints_ptrs,
+ iter_mod->tracepoints_ptrs
+ iter_mod->num_tracepoints);
if (found) {
iter->module = iter_mod;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502..c4195fa9890 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax();
+ arch_mutex_cpu_relax();
}
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
__set_task_state(task, state);
- /* didnt get the lock, go to sleep: */
+ /* didn't get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
preempt_enable_no_resched();
schedule();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26..a05d191ffdd 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_ns;
}
- new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ new_nsp->uts_ns = copy_utsname(flags, tsk);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
- new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d2..b91941df5e6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
/*
* This cpu has to do the parallel processing of the next
* object. It's waiting in the cpu's parallelization queue,
- * so exit imediately.
+ * so exit immediately.
*/
if (PTR_ERR(padata) == -ENODATA) {
del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
/*
* The next object that needs serialization might have arrived to
* the reorder queues in the meantime, we will be called again
- * from the timer function if noone else cares for it.
+ * from the timer function if no one else cares for it.
*/
if (atomic_read(&pd->reorder_objects)
&& !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
put_online_cpus();
}
-/* Replace the internal control stucture with a new one. */
+/* Replace the internal control structure with a new one. */
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
}
/**
- * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
+ * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
* padata cpumasks.
*
* @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88eb..69231670eb9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
int panic_timeout;
+EXPORT_SYMBOL_GPL(panic_timeout);
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -432,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
diff --git a/kernel/params.c b/kernel/params.c
index 08107d18175..7ab388a48a2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
- /* Noone handled NULL, so do it here. */
+ /* No one handled NULL, so do it here. */
if (!val && params[i].ops->set != param_set_bool)
return -EINVAL;
DEBUGP("They are equal! Calling %p\n",
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static void __init kernel_add_sysfs_param(const char *name,
- struct kernel_param *kparam,
- unsigned int name_skip)
+static struct module_kobject * __init locate_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
kobj = kset_find_obj(module_kset, name);
if (kobj) {
- /* We already have one. Remove params so we can add more. */
mk = to_module_kobject(kobj);
- /* We need to remove it before adding parameters. */
- sysfs_remove_group(&mk->kobj, &mk->mp->grp);
} else {
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
"%s", name);
if (err) {
kobject_put(&mk->kobj);
- printk(KERN_ERR "Module '%s' failed add to sysfs, "
- "error number %d\n", name, err);
- printk(KERN_ERR "The system will be unstable now.\n");
- return;
+ printk(KERN_ERR
+ "Module '%s' failed add to sysfs, error number %d\n",
+ name, err);
+ printk(KERN_ERR
+ "The system will be unstable now.\n");
+ return NULL;
}
- /* So that exit path is even. */
+
+ /* So that we hold reference in both cases. */
kobject_get(&mk->kobj);
}
+ return mk;
+}
+
+static void __init kernel_add_sysfs_param(const char *name,
+ struct kernel_param *kparam,
+ unsigned int name_skip)
+{
+ struct module_kobject *mk;
+ int err;
+
+ mk = locate_module_kobject(name);
+ if (!mk)
+ return;
+
+ /* We need to remove old parameters before adding more. */
+ if (mk->mp)
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+
/* These should not fail at boot. */
err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
}
}
+ssize_t __modver_version_show(struct module_attribute *mattr,
+ struct module *mod, char *buf)
+{
+ struct module_version_attribute *vattr =
+ container_of(mattr, struct module_version_attribute, mattr);
+
+ return sprintf(buf, "%s\n", vattr->version);
+}
+
+extern struct module_version_attribute __start___modver[], __stop___modver[];
+
+static void __init version_sysfs_builtin(void)
+{
+ const struct module_version_attribute *vattr;
+ struct module_kobject *mk;
+ int err;
+
+ for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
+ mk = locate_module_kobject(vattr->module_name);
+ if (mk) {
+ err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+ kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
+ }
+ }
+}
/* module-related sysfs stuff */
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
}
module_sysfs_initialized = 1;
+ version_sysfs_builtin();
param_sysfs_builtin();
return 0;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e336433..8e81a9860a0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
@@ -21,7 +22,9 @@
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
+#include <linux/reboot.h>
#include <linux/vmstat.h>
+#include <linux/device.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
@@ -35,7 +38,96 @@
#include <asm/irq_regs.h>
-atomic_t perf_task_events __read_mostly;
+struct remote_function_call {
+ struct task_struct *p;
+ int (*func)(void *info);
+ void *info;
+ int ret;
+};
+
+static void remote_function(void *data)
+{
+ struct remote_function_call *tfc = data;
+ struct task_struct *p = tfc->p;
+
+ if (p) {
+ tfc->ret = -EAGAIN;
+ if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ return;
+ }
+
+ tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ *
+ * returns: @func return value, or
+ * -ESRCH - when the process isn't running
+ * -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = p,
+ .func = func,
+ .info = info,
+ .ret = -ESRCH, /* No such (running) process */
+ };
+
+ if (task_curr(p))
+ smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+ return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on the cpu
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = NULL,
+ .func = func,
+ .info = info,
+ .ret = -ENXIO, /* No such CPU */
+ };
+
+ smp_call_function_single(cpu, remote_function, &data, 1);
+
+ return data.ret;
+}
+
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+ PERF_FLAG_FD_OUTPUT |\
+ PERF_FLAG_PID_CGROUP)
+
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x1,
+ EVENT_PINNED = 0x2,
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
@@ -53,15 +145,43 @@ static struct srcu_struct pmus_srcu;
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
-int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+/* Minimum for 512 kiB + 1 user control page */
+int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
/*
* max perf event sample rate
*/
-int sysctl_perf_event_sample_rate __read_mostly = 100000;
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int max_samples_per_tick __read_mostly =
+ DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+
+ return 0;
+}
static atomic64_t perf_event_id;
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
+
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -69,6 +189,366 @@ extern __weak const char *perf_pmu_name(void)
return "pmu";
}
+static inline u64 perf_clock(void)
+{
+ return local_clock();
+}
+
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+ return container_of(task_subsys_state(task, perf_subsys_id),
+ struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ return !event->cgrp || event->cgrp == cpuctx->cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+ css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+ css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+ perf_put_cgroup(event);
+ event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ struct perf_cgroup_info *t;
+
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+ struct perf_cgroup_info *info;
+ u64 now;
+
+ now = perf_clock();
+
+ info = this_cpu_ptr(cgrp->info);
+
+ info->time += now - info->timestamp;
+ info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+ struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+ if (cgrp_out)
+ __update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+ struct perf_cgroup *cgrp;
+
+ /*
+ * ensure we access cgroup data only when needed and
+ * when we know the cgroup is pinned (css_get)
+ */
+ if (!is_cgroup_event(event))
+ return;
+
+ cgrp = perf_cgroup_from_task(current);
+ /*
+ * Do not update time when cgroup is not active
+ */
+ if (cgrp == event->cgrp)
+ __update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+ struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
+
+ /*
+ * ctx->lock held by caller
+ * ensure we do not access cgroup data
+ * unless we have the cgroup pinned (css_get)
+ */
+ if (!task || !ctx->nr_cgroups)
+ return;
+
+ cgrp = perf_cgroup_from_task(task);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = ctx->timestamp;
+}
+
+#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /*
+ * disable interrupts to avoid geting nr_cgroup
+ * changes via __perf_event_disable(). Also
+ * avoids preemption.
+ */
+ local_irq_save(flags);
+
+ /*
+ * we reschedule only in the presence of cgroup
+ * constrained events.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ /*
+ * perf_cgroup_events says at least one
+ * context on this CPU has cgroup events.
+ *
+ * ctx->nr_cgroups reports the number of cgroup
+ * events for a context.
+ */
+ if (cpuctx->ctx.nr_cgroups > 0) {
+
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
+
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /* set cgrp before ctxsw in to
+ * allow event_filter_match() to not
+ * have to pass task around
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ }
+ }
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+ perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ struct perf_cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct file *file;
+ int ret = 0, fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+
+ css = cgroup_css_from_dir(file, perf_subsys_id);
+ if (IS_ERR(css)) {
+ ret = PTR_ERR(css);
+ goto out;
+ }
+
+ cgrp = container_of(css, struct perf_cgroup, css);
+ event->cgrp = cgrp;
+
+ /* must be done before we fput() the file */
+ perf_get_cgroup(event);
+
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a task belongs
+ * to only one perf cgroup at a time
+ */
+ if (group_leader && group_leader->cgrp != cgrp) {
+ perf_detach_cgroup(event);
+ ret = -EINVAL;
+ }
+out:
+ fput_light(file, fput_needed);
+ return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+ struct perf_cgroup_info *t;
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+ /*
+ * when the current task's perf cgroup does not match
+ * the event's, we need to remember to call the
+ * perf_mark_enable() function the first time a task with
+ * a matching perf cgroup is scheduled in.
+ */
+ if (is_cgroup_event(event) && !perf_cgroup_match(event))
+ event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ if (!event->cgrp_defer_enabled)
+ return;
+
+ event->cgrp_defer_enabled = 0;
+
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->cgrp_defer_enabled = 0;
+ }
+ }
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+}
+#endif
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -133,6 +613,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
}
}
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_pid_nr_ns(p, event->ns);
+}
+
/*
* If we inherit events we want to return the parent event id
* to userspace.
@@ -212,12 +714,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
raw_spin_lock_irqsave(&ctx->lock, flags);
--ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- put_ctx(ctx);
-}
-
-static inline u64 perf_clock(void)
-{
- return local_clock();
}
/*
@@ -231,6 +727,16 @@ static void update_context_time(struct perf_event_context *ctx)
ctx->timestamp = now;
}
+static u64 perf_event_time(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time(event);
+
+ return ctx ? ctx->time : 0;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
@@ -242,8 +748,19 @@ static void update_event_times(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
-
- if (ctx->is_active)
+ /*
+ * in cgroup mode, time_enabled represents
+ * the time the event was enabled AND active
+ * tasks were in the monitored cgroup. This is
+ * independent of the activity of the context as
+ * there may be a mix of cgroup and non-cgroup events.
+ *
+ * That is why we treat cgroup events differently
+ * here.
+ */
+ if (is_cgroup_event(event))
+ run_end = perf_event_time(event);
+ else if (ctx->is_active)
run_end = ctx->time;
else
run_end = event->tstamp_stopped;
@@ -253,9 +770,10 @@ static void update_event_times(struct perf_event *event)
if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
else
- run_end = ctx->time;
+ run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
+
}
/*
@@ -304,6 +822,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
+ if (is_cgroup_event(event))
+ ctx->nr_cgroups++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
@@ -312,9 +833,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_stat++;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += event->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+ event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ perf_event__read_size(event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ size += sizeof(data->ip);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ size += sizeof(data->addr);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ size += sizeof(data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ size += event->read_size;
+
+ event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu_entry);
+
+ event->id_header_size = size;
+}
+
static void perf_group_attach(struct perf_event *event)
{
- struct perf_event *group_leader = event->group_leader;
+ struct perf_event *group_leader = event->group_leader, *pos;
/*
* We can have double attach due to group movement in perf_event_open.
@@ -333,6 +929,11 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
+
+ perf_event__header_size(group_leader);
+
+ list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ perf_event__header_size(pos);
}
/*
@@ -342,6 +943,7 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_cpu_context *cpuctx;
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -350,6 +952,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
+ if (is_cgroup_event(event)) {
+ ctx->nr_cgroups--;
+ cpuctx = __get_cpu_context(ctx);
+ /*
+ * if there are no more cgroup events
+ * then cler cgrp to avoid stale pointer
+ * in update_cgrp_time_from_cpuctx()
+ */
+ if (!ctx->nr_cgroups)
+ cpuctx->cgrp = NULL;
+ }
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -391,7 +1005,7 @@ static void perf_group_detach(struct perf_event *event)
if (event->group_leader != event) {
list_del_init(&event->group_entry);
event->group_leader->nr_siblings--;
- return;
+ goto out;
}
if (!list_empty(&event->group_entry))
@@ -410,12 +1024,19 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
}
+
+out:
+ perf_event__header_size(event->group_leader);
+
+ list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ perf_event__header_size(tmp);
}
static inline int
event_filter_match(struct perf_event *event)
{
- return event->cpu == -1 || event->cpu == smp_processor_id();
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event);
}
static void
@@ -423,6 +1044,7 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
u64 delta;
/*
* An event which could not be activated because of
@@ -432,9 +1054,9 @@ event_sched_out(struct perf_event *event,
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
&& !event_filter_match(event)) {
- delta = ctx->time - event->tstamp_stopped;
+ delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -445,7 +1067,7 @@ event_sched_out(struct perf_event *event,
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
@@ -476,47 +1098,30 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
/*
* Cross CPU call to remove a performance event
*
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
-static void __perf_event_remove_from_context(void *info)
+static int __perf_remove_from_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- /*
- * If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
- * scheduled out before the smp call arrived.
- */
- if (ctx->task && cpuctx->task_ctx != ctx)
- return;
-
raw_spin_lock(&ctx->lock);
-
event_sched_out(event, cpuctx, ctx);
-
list_del_event(event, ctx);
-
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
* Remove the event from a task's (or a CPU's) list of events.
*
- * Must be called with ctx->mutex held.
- *
* CPU events are removed with a smp call. For task events we only
* call when the task is on a CPU.
*
@@ -527,49 +1132,48 @@ static void __perf_event_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_event_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ lockdep_assert_held(&ctx->mutex);
+
if (!task) {
/*
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- smp_call_function_single(event->cpu,
- __perf_event_remove_from_context,
- event, 1);
+ cpu_function_call(event->cpu, __perf_remove_from_context, event);
return;
}
retry:
- task_oncpu_function_call(task, __perf_event_remove_from_context,
- event);
+ if (!task_function_call(task, __perf_remove_from_context, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
/*
- * If the context is active we need to retry the smp call.
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
*/
- if (ctx->nr_active && !list_empty(&event->group_entry)) {
+ if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
- * The lock prevents that this context is scheduled in so we
- * can remove the event safely, if the call above did not
- * succeed.
+ * Since the task isn't running, its safe to remove the event, us
+ * holding the ctx->lock ensures the task won't get scheduled in.
*/
- if (!list_empty(&event->group_entry))
- list_del_event(event, ctx);
+ list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
/*
* Cross CPU call to disable a performance event
*/
-static void __perf_event_disable(void *info)
+static int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -578,9 +1182,12 @@ static void __perf_event_disable(void *info)
/*
* If this is a per-task event, need to check whether this
* event's task is the current task on this cpu.
+ *
+ * Can trigger due to concurrent perf_event_context_sched_out()
+ * flipping contexts around.
*/
if (ctx->task && cpuctx->task_ctx != ctx)
- return;
+ return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -590,6 +1197,7 @@ static void __perf_event_disable(void *info)
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
@@ -599,6 +1207,8 @@ static void __perf_event_disable(void *info)
}
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
@@ -623,13 +1233,13 @@ void perf_event_disable(struct perf_event *event)
/*
* Disable the event on the cpu that it's on
*/
- smp_call_function_single(event->cpu, __perf_event_disable,
- event, 1);
+ cpu_function_call(event->cpu, __perf_event_disable, event);
return;
}
retry:
- task_oncpu_function_call(task, __perf_event_disable, event);
+ if (!task_function_call(task, __perf_event_disable, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
/*
@@ -637,6 +1247,11 @@ retry:
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -648,20 +1263,71 @@ retry:
update_group_times(event);
event->state = PERF_EVENT_STATE_OFF;
}
-
raw_spin_unlock_irq(&ctx->lock);
}
+static void perf_set_shadow_time(struct perf_event *event,
+ struct perf_event_context *ctx,
+ u64 tstamp)
+{
+ /*
+ * use the correct time source for the time snapshot
+ *
+ * We could get by without this by leveraging the
+ * fact that to get to this function, the caller
+ * has most likely already called update_context_time()
+ * and update_cgrp_time_xx() and thus both timestamp
+ * are identical (or very close). Given that tstamp is,
+ * already adjusted for cgroup, we could say that:
+ * tstamp - ctx->timestamp
+ * is equivalent to
+ * tstamp - cgrp->timestamp.
+ *
+ * Then, in perf_output_read(), the calculation would
+ * work with no changes because:
+ * - event is guaranteed scheduled in
+ * - no scheduled out in between
+ * - thus the timestamp would be the same
+ *
+ * But this is a bit hairy.
+ *
+ * So instead, we have an explicit cgroup call to remain
+ * within the time time source all along. We believe it
+ * is cleaner and simpler to understand.
+ */
+ if (is_cgroup_event(event))
+ perf_cgroup_set_shadow_time(event, tstamp);
+ else
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
static int
event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
event->state = PERF_EVENT_STATE_ACTIVE;
event->oncpu = smp_processor_id();
+
+ /*
+ * Unthrottle events, since we scheduled we might have missed several
+ * ticks already, also for a heavily scheduling task there is little
+ * guarantee it'll get a tick in a timely manner.
+ */
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
+ perf_log_throttle(event, 1);
+ event->hw.interrupts = 0;
+ }
+
/*
* The new state must be visible before we turn it on in the hardware:
*/
@@ -673,9 +1339,9 @@ event_sched_in(struct perf_event *event,
return -EAGAIN;
}
- event->tstamp_running += ctx->time - event->tstamp_stopped;
+ event->tstamp_running += tstamp - event->tstamp_stopped;
- event->shadow_ctx_time = ctx->time - ctx->timestamp;
+ perf_set_shadow_time(event, ctx, tstamp);
if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -787,19 +1453,24 @@ static int group_can_go_on(struct perf_event *event,
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = ctx->time;
- event->tstamp_running = ctx->time;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
}
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *tsk);
+
/*
* Cross CPU call to install and enable a performance event
*
* Must be called with ctx->mutex held
*/
-static void __perf_install_in_context(void *info)
+static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -808,25 +1479,26 @@ static void __perf_install_in_context(void *info)
int err;
/*
- * If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
- * scheduled out before the smp call arrived.
- * Or possibly this is the right context but it isn't
- * on this cpu because it had no events.
+ * In case we're installing a new context to an already running task,
+ * could also happen before perf_event_task_sched_in() on architectures
+ * which do context switches with IRQs enabled.
*/
- if (ctx->task && cpuctx->task_ctx != ctx) {
- if (cpuctx->task_ctx || ctx->task != current)
- return;
- cpuctx->task_ctx = ctx;
- }
+ if (ctx->task && !cpuctx->task_ctx)
+ perf_event_context_sched_in(ctx, ctx->task);
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
update_context_time(ctx);
+ /*
+ * update cgrp time only if current cgrp
+ * matches event->cgrp. Must be done before
+ * calling add_event_to_ctx()
+ */
+ update_cgrp_time_from_event(event);
add_event_to_ctx(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
@@ -863,6 +1535,8 @@ static void __perf_install_in_context(void *info)
unlock:
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
@@ -874,8 +1548,6 @@ unlock:
* If the event is attached to a task which is on a CPU we use a smp
* call to enable it in the task context. The task might have been
* scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
@@ -884,6 +1556,8 @@ perf_install_in_context(struct perf_event_context *ctx,
{
struct task_struct *task = ctx->task;
+ lockdep_assert_held(&ctx->mutex);
+
event->ctx = ctx;
if (!task) {
@@ -891,31 +1565,29 @@ perf_install_in_context(struct perf_event_context *ctx,
* Per cpu events are installed via an smp call and
* the install is always successful.
*/
- smp_call_function_single(cpu, __perf_install_in_context,
- event, 1);
+ cpu_function_call(cpu, __perf_install_in_context, event);
return;
}
retry:
- task_oncpu_function_call(task, __perf_install_in_context,
- event);
+ if (!task_function_call(task, __perf_install_in_context, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
/*
- * we need to retry the smp call.
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
*/
- if (ctx->is_active && list_empty(&event->group_entry)) {
+ if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
- * The lock prevents that this context is scheduled in so we
- * can add the event safely, if it the call above did not
- * succeed.
+ * Since the task isn't running, its safe to add the event, us holding
+ * the ctx->lock ensures the task won't get scheduled in.
*/
- if (list_empty(&event->group_entry))
- add_event_to_ctx(event, ctx);
+ add_event_to_ctx(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -931,21 +1603,20 @@ static void __perf_event_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled =
- ctx->time - sub->total_time_enabled;
- }
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
}
}
/*
* Cross CPU call to enable a performance event
*/
-static void __perf_event_enable(void *info)
+static int __perf_event_enable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -953,26 +1624,27 @@ static void __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- /*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- */
- if (ctx->task && cpuctx->task_ctx != ctx) {
- if (cpuctx->task_ctx || ctx->task != current)
- return;
- cpuctx->task_ctx = ctx;
- }
+ if (WARN_ON_ONCE(!ctx->is_active))
+ return -EINVAL;
raw_spin_lock(&ctx->lock);
- ctx->is_active = 1;
update_context_time(ctx);
if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto unlock;
+
+ /*
+ * set current task's cgroup time reference point
+ */
+ perf_cgroup_set_timestamp(current, ctx);
+
__perf_event_mark_enabled(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event)) {
+ if (is_cgroup_event(event))
+ perf_cgroup_defer_enabled(event);
goto unlock;
+ }
/*
* If the event is in a group and isn't the group leader,
@@ -1005,6 +1677,8 @@ static void __perf_event_enable(void *info)
unlock:
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
@@ -1025,8 +1699,7 @@ void perf_event_enable(struct perf_event *event)
/*
* Enable the event on the cpu that it's on
*/
- smp_call_function_single(event->cpu, __perf_event_enable,
- event, 1);
+ cpu_function_call(event->cpu, __perf_event_enable, event);
return;
}
@@ -1045,8 +1718,15 @@ void perf_event_enable(struct perf_event *event)
event->state = PERF_EVENT_STATE_OFF;
retry:
+ if (!ctx->is_active) {
+ __perf_event_mark_enabled(event, ctx);
+ goto out;
+ }
+
raw_spin_unlock_irq(&ctx->lock);
- task_oncpu_function_call(task, __perf_event_enable, event);
+
+ if (!task_function_call(task, __perf_event_enable, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
@@ -1054,15 +1734,14 @@ retry:
* If the context is active and the event is still off,
* we need to retry the cross-call.
*/
- if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+ if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+ /*
+ * task could have been flipped by a concurrent
+ * perf_event_context_sched_out()
+ */
+ task = ctx->task;
goto retry;
-
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_OFF)
- __perf_event_mark_enabled(event, ctx);
+ }
out:
raw_spin_unlock_irq(&ctx->lock);
@@ -1073,7 +1752,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
/*
* not supported on inherited events
*/
- if (event->attr.inherit)
+ if (event->attr.inherit || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
@@ -1082,12 +1761,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
return 0;
}
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
@@ -1100,6 +1773,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
goto out;
update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
goto out;
@@ -1212,8 +1886,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+ struct task_struct *next)
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
@@ -1289,6 +1963,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
+
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch out PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_out(task);
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1324,9 +2006,13 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -1356,9 +2042,13 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -1369,15 +2059,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
+ u64 now;
+
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
if (likely(!ctx->nr_events))
goto out;
- ctx->timestamp = perf_clock();
-
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -1394,11 +2088,12 @@ out:
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
struct perf_event_context *ctx = &cpuctx->ctx;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task);
}
static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1406,15 +2101,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
{
struct perf_cpu_context *cpuctx;
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = __get_cpu_context(ctx);
if (cpuctx->task_ctx == ctx)
return;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, NULL);
cpuctx->task_ctx = ctx;
}
-void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
@@ -1430,9 +2126,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
cpuctx->task_ctx = ctx;
@@ -1465,14 +2161,17 @@ void __perf_event_task_sched_in(struct task_struct *task)
if (likely(!ctx))
continue;
- perf_event_context_sched_in(ctx);
+ perf_event_context_sched_in(ctx, task);
}
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch in PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_in(task);
}
-#define MAX_INTERRUPTS (~0ULL)
-
-static void perf_log_throttle(struct perf_event *event, int enable);
-
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
u64 frequency = event->attr.sample_freq;
@@ -1500,7 +2199,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
* Reduce accuracy by one bit such that @a and @b converge
* to a similar magnitude.
*/
-#define REDUCE_FLS(a, b) \
+#define REDUCE_FLS(a, b) \
do { \
if (a##_fls > b##_fls) { \
a >>= 1; \
@@ -1583,7 +2282,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
hwc = &event->hw;
@@ -1670,7 +2369,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
if (ctx)
rotate_ctx(ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
if (ctx)
task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
@@ -1725,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
if (!ctx || !ctx->nr_events)
goto out;
+ /*
+ * We must ctxsw out cgroup events to avoid conflict
+ * when invoking perf_task_event_sched_in() later on
+ * in this function. Otherwise we end up trying to
+ * ctxswin cgroup events which are already scheduled
+ * in.
+ */
+ perf_cgroup_sched_out(current);
task_ctx_sched_out(ctx, EVENT_ALL);
raw_spin_lock(&ctx->lock);
@@ -1749,7 +2456,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
raw_spin_unlock(&ctx->lock);
- perf_event_context_sched_in(ctx);
+ /*
+ * Also calls ctxswin for cgroup events, if any:
+ */
+ perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
}
@@ -1774,11 +2484,14 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
+ if (ctx->is_active) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
update_event_times(event);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
raw_spin_unlock(&ctx->lock);
-
- event->pmu->read(event);
}
static inline u64 perf_event_count(struct perf_event *event)
@@ -1805,8 +2518,10 @@ static u64 perf_event_read(struct perf_event *event)
* (e.g., thread is blocked), in that case
* we cannot update context time
*/
- if (ctx->is_active)
+ if (ctx->is_active) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -1872,8 +2587,7 @@ static int alloc_callchain_buffers(void)
* accessed from NMI. Use a temporary manual per cpu allocation
* until that gets sorted out.
*/
- size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
- num_possible_cpus();
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
entries = kzalloc(size, GFP_KERNEL);
if (!entries)
@@ -2074,13 +2788,6 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /*
- * Can't attach events to a dying task.
- */
- err = -ESRCH;
- if (task->flags & PF_EXITING)
- goto errout;
-
/* Reuse ptrace permission checks for now. */
err = -EACCES;
if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2093,6 +2800,9 @@ errout:
}
+/*
+ * Returns a matching context with refcount and pincount.
+ */
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
{
@@ -2101,14 +2811,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
unsigned long flags;
int ctxn, err;
- if (!task && cpu != -1) {
+ if (!task) {
/* Must be root to operate on a CPU event: */
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- if (cpu < 0 || cpu >= nr_cpumask_bits)
- return ERR_PTR(-EINVAL);
-
/*
* We could be clever and allow to attach a event to an
* offline CPU and activate it when the CPU comes up, but
@@ -2120,6 +2827,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ ++ctx->pin_count;
return ctx;
}
@@ -2133,6 +2841,7 @@ retry:
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
unclone_ctx(ctx);
+ ++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -2144,14 +2853,29 @@ retry:
get_ctx(ctx);
- if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
- /*
- * We raced with some other task; use
- * the context they set.
- */
+ err = 0;
+ mutex_lock(&task->perf_event_mutex);
+ /*
+ * If it has already passed perf_event_exit_task().
+ * we must see PF_EXITING, it takes this mutex too.
+ */
+ if (task->flags & PF_EXITING)
+ err = -ESRCH;
+ else if (task->perf_event_ctxp[ctxn])
+ err = -EAGAIN;
+ else {
+ ++ctx->pin_count;
+ rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ }
+ mutex_unlock(&task->perf_event_mutex);
+
+ if (unlikely(err)) {
put_task_struct(task);
kfree(ctx);
- goto retry;
+
+ if (err == -EAGAIN)
+ goto retry;
+ goto errout;
}
}
@@ -2182,7 +2906,7 @@ static void free_event(struct perf_event *event)
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec(&perf_task_events);
+ jump_label_dec(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -2191,6 +2915,10 @@ static void free_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
put_callchain_buffers();
+ if (is_cgroup_event(event)) {
+ atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+ jump_label_dec(&perf_sched_events);
+ }
}
if (event->buffer) {
@@ -2198,6 +2926,9 @@ static void free_event(struct perf_event *event)
event->buffer = NULL;
}
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
if (event->destroy)
event->destroy(event);
@@ -2289,31 +3020,6 @@ static int perf_release(struct inode *inode, struct file *file)
return perf_event_release_kernel(event);
}
-static int perf_event_read_size(struct perf_event *event)
-{
- int entry = sizeof(u64); /* value */
- int size = 0;
- int nr = 1;
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_ID)
- entry += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
- size += sizeof(u64);
- }
-
- size += entry * nr;
-
- return size;
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -2428,7 +3134,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
if (event->state == PERF_EVENT_STATE_ERROR)
return 0;
- if (count < perf_event_read_size(event))
+ if (count < event->read_size)
return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +3220,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
int ret = 0;
u64 value;
- if (!event->attr.sample_period)
+ if (!is_sampling_event(event))
return -EINVAL;
if (copy_from_user(&value, arg, sizeof(value)))
@@ -3305,6 +4011,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
} while (len);
}
+static void __perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ data->type = sample_type;
+ header->size += event->id_header_size;
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_event_pid(event, current);
+ data->tid_entry.tid = perf_event_tid(event, current);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ data->time = perf_clock();
+
+ if (sample_type & PERF_SAMPLE_ID)
+ data->id = primary_event_id(event);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ data->stream_id = event->id;
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+ }
+}
+
+static void perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ if (event->attr.sample_id_all)
+ __perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ u64 sample_type = data->type;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+}
+
+static void perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample)
+{
+ if (event->attr.sample_id_all)
+ __perf_event__output_id_sample(handle, sample);
+}
+
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
int nmi, int sample)
@@ -3312,6 +4085,7 @@ int perf_output_begin(struct perf_output_handle *handle,
struct perf_buffer *buffer;
unsigned long tail, offset, head;
int have_lost;
+ struct perf_sample_data sample_data;
struct {
struct perf_event_header header;
u64 id;
@@ -3338,8 +4112,12 @@ int perf_output_begin(struct perf_output_handle *handle,
goto out;
have_lost = local_read(&buffer->lost);
- if (have_lost)
- size += sizeof(lost_event);
+ if (have_lost) {
+ lost_event.header.size = sizeof(lost_event);
+ perf_event_header__init_id(&lost_event.header, &sample_data,
+ event);
+ size += lost_event.header.size;
+ }
perf_output_get_handle(handle);
@@ -3370,11 +4148,11 @@ int perf_output_begin(struct perf_output_handle *handle,
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
- lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
lost_event.lost = local_xchg(&buffer->lost, 0);
perf_output_put(handle, lost_event);
+ perf_event__output_id_sample(event, handle, &sample_data);
}
return 0;
@@ -3407,28 +4185,6 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
-{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
-
- return task_tgid_nr_ns(p, event->ns);
-}
-
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
-{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
-
- return task_pid_nr_ns(p, event->ns);
-}
-
static void perf_output_read_one(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
@@ -3603,61 +4359,16 @@ void perf_prepare_sample(struct perf_event_header *header,
{
u64 sample_type = event->attr.sample_type;
- data->type = sample_type;
-
header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header);
+ header->size = sizeof(*header) + event->header_size;
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- if (sample_type & PERF_SAMPLE_IP) {
- data->ip = perf_instruction_pointer(regs);
-
- header->size += sizeof(data->ip);
- }
-
- if (sample_type & PERF_SAMPLE_TID) {
- /* namespace issues */
- data->tid_entry.pid = perf_event_pid(event, current);
- data->tid_entry.tid = perf_event_tid(event, current);
-
- header->size += sizeof(data->tid_entry);
- }
-
- if (sample_type & PERF_SAMPLE_TIME) {
- data->time = perf_clock();
-
- header->size += sizeof(data->time);
- }
-
- if (sample_type & PERF_SAMPLE_ADDR)
- header->size += sizeof(data->addr);
-
- if (sample_type & PERF_SAMPLE_ID) {
- data->id = primary_event_id(event);
-
- header->size += sizeof(data->id);
- }
-
- if (sample_type & PERF_SAMPLE_STREAM_ID) {
- data->stream_id = event->id;
-
- header->size += sizeof(data->stream_id);
- }
+ __perf_event_header__init_id(header, data, event);
- if (sample_type & PERF_SAMPLE_CPU) {
- data->cpu_entry.cpu = raw_smp_processor_id();
- data->cpu_entry.reserved = 0;
-
- header->size += sizeof(data->cpu_entry);
- }
-
- if (sample_type & PERF_SAMPLE_PERIOD)
- header->size += sizeof(data->period);
-
- if (sample_type & PERF_SAMPLE_READ)
- header->size += perf_event_read_size(event);
+ if (sample_type & PERF_SAMPLE_IP)
+ data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
@@ -3722,23 +4433,26 @@ perf_event_read_event(struct perf_event *event,
struct task_struct *task)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
struct perf_read_event read_event = {
.header = {
.type = PERF_RECORD_READ,
.misc = 0,
- .size = sizeof(read_event) + perf_event_read_size(event),
+ .size = sizeof(read_event) + event->read_size,
},
.pid = perf_event_pid(event, task),
.tid = perf_event_tid(event, task),
};
int ret;
+ perf_event_header__init_id(&read_event.header, &sample, event);
ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
if (ret)
return;
perf_output_put(&handle, read_event);
perf_output_read(&handle, event);
+ perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
@@ -3768,14 +4482,16 @@ static void perf_event_task_output(struct perf_event *event,
struct perf_task_event *task_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
struct task_struct *task = task_event->task;
- int size, ret;
+ int ret, size = task_event->event_id.header.size;
- size = task_event->event_id.header.size;
- ret = perf_output_begin(&handle, event, size, 0, 0);
+ perf_event_header__init_id(&task_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ task_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3785,7 +4501,11 @@ static void perf_event_task_output(struct perf_event *event,
perf_output_put(&handle, task_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ task_event->event_id.header.size = size;
}
static int perf_event_task_match(struct perf_event *event)
@@ -3793,7 +4513,7 @@ static int perf_event_task_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm || event->attr.mmap ||
@@ -3824,6 +4544,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_task_ctx(&cpuctx->ctx, task_event);
ctx = task_event->task_ctx;
@@ -3898,11 +4620,16 @@ static void perf_event_comm_output(struct perf_event *event,
struct perf_comm_event *comm_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
- int ret = perf_output_begin(&handle, event, size, 0, 0);
+ int ret;
+
+ perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ comm_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3910,7 +4637,12 @@ static void perf_event_comm_output(struct perf_event *event,
perf_output_put(&handle, comm_event->event_id);
perf_output_copy(&handle, comm_event->comm,
comm_event->comm_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ comm_event->event_id.header.size = size;
}
static int perf_event_comm_match(struct perf_event *event)
@@ -3918,7 +4650,7 @@ static int perf_event_comm_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm)
@@ -3955,10 +4687,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_comm_ctx(&cpuctx->ctx, comm_event);
ctxn = pmu->task_ctx_nr;
@@ -4034,11 +4767,15 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_mmap_event *mmap_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
- int ret = perf_output_begin(&handle, event, size, 0, 0);
+ int ret;
+ perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ mmap_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4046,7 +4783,12 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_put(&handle, mmap_event->event_id);
perf_output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ mmap_event->event_id.header.size = size;
}
static int perf_event_mmap_match(struct perf_event *event,
@@ -4056,7 +4798,7 @@ static int perf_event_mmap_match(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if ((!executable && event->attr.mmap_data) ||
@@ -4144,6 +4886,8 @@ got_name:
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
vma->vm_flags & VM_EXEC);
@@ -4199,6 +4943,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
static void perf_log_throttle(struct perf_event *event, int enable)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int ret;
struct {
@@ -4220,11 +4965,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
if (enable)
throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
- ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+ perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ throttle_event.header.size, 1, 0);
if (ret)
return;
perf_output_put(&handle, throttle_event);
+ perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
@@ -4240,26 +4989,21 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
struct hw_perf_event *hwc = &event->hw;
int ret = 0;
- if (!throttle) {
- hwc->interrupts++;
- } else {
- if (hwc->interrupts != MAX_INTERRUPTS) {
- hwc->interrupts++;
- if (HZ * hwc->interrupts >
- (u64)sysctl_perf_event_sample_rate) {
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
- } else {
- /*
- * Keep re-disabling events even though on the previous
- * pass we disabled it - just in case we raced with a
- * sched-in and the event got enabled again:
- */
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
+ if (throttle) {
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
ret = 1;
}
- }
+ } else
+ hwc->interrupts++;
if (event->attr.freq) {
u64 now = perf_clock();
@@ -4385,7 +5129,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
if (!regs)
return;
- if (!hwc->sample_period)
+ if (!is_sampling_event(event))
return;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4401,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
- return 0;
+ return 1;
if (regs) {
if (event->attr.exclude_user && user_mode(regs))
@@ -4512,7 +5256,7 @@ int perf_swevent_get_recursion_context(void)
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-void inline perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
@@ -4548,7 +5292,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
}
@@ -4713,7 +5457,7 @@ static int perf_swevent_init(struct perf_event *event)
break;
}
- if (event_id > PERF_COUNT_SW_MAX)
+ if (event_id >= PERF_COUNT_SW_MAX)
return -ENOENT;
if (!event->parent) {
@@ -4757,6 +5501,8 @@ static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
/*
* All tracepoints are from kernel-space.
*/
@@ -4805,15 +5551,6 @@ static int perf_tp_event_init(struct perf_event *event)
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
- /*
- * Raw tracepoint data is a severe data leak, only allow root to
- * have these.
- */
- if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
- perf_paranoid_tracepoint_raw() &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
err = perf_trace_init(event);
if (err)
return err;
@@ -4836,7 +5573,7 @@ static struct pmu perf_tracepoint = {
static inline void perf_tp_register(void)
{
- perf_pmu_register(&perf_tracepoint);
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4905,6 +5642,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
u64 period;
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
event->pmu->read(event);
perf_sample_data_init(&data, 0);
@@ -4926,31 +5667,30 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
+ s64 period;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- s64 period = local64_read(&hwc->period_left);
+ if (!is_sampling_event(event))
+ return;
- if (period) {
- if (period < 0)
- period = 10000;
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
- local64_set(&hwc->period_left, 0);
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
ns_to_ktime(period), 0,
HRTIMER_MODE_REL_PINNED, 0);
- }
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
@@ -4958,6 +5698,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
}
}
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ event->attr.freq = 0;
+ }
+}
+
/*
* Software event: cpu wall time clock
*/
@@ -5010,6 +5774,8 @@ static int cpu_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
+ perf_swevent_init_hrtimer(event);
+
return 0;
}
@@ -5065,16 +5831,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
static void task_clock_event_read(struct perf_event *event)
{
- u64 time;
-
- if (!in_nmi()) {
- update_context_time(event->ctx);
- time = event->ctx->time;
- } else {
- u64 now = perf_clock();
- u64 delta = now - event->ctx->timestamp;
- time = event->ctx->time + delta;
- }
+ u64 now = perf_clock();
+ u64 delta = now - event->ctx->timestamp;
+ u64 time = event->ctx->time + delta;
task_clock_event_update(event, time);
}
@@ -5087,6 +5846,8 @@ static int task_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
+ perf_swevent_init_hrtimer(event);
+
return 0;
}
@@ -5145,25 +5906,96 @@ static void *find_pmu_context(int ctxn)
return NULL;
}
-static void free_pmu_context(void * __percpu cpu_context)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
{
- struct pmu *pmu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+ if (cpuctx->active_pmu == old_pmu)
+ cpuctx->active_pmu = pmu;
+ }
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+ struct pmu *i;
mutex_lock(&pmus_lock);
/*
* Like a real lame refcount.
*/
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->pmu_cpu_context == cpu_context)
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
goto out;
+ }
}
- free_percpu(cpu_context);
+ free_percpu(pmu->pmu_cpu_context);
out:
mutex_unlock(&pmus_lock);
}
+static struct idr pmu_idr;
-int perf_pmu_register(struct pmu *pmu)
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_NULL,
+};
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_attrs = pmu_dev_attrs,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+ int ret = -ENOMEM;
+
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;
+
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;
+
+out:
+ return ret;
+
+free_dev:
+ put_device(pmu->dev);
+ goto out;
+}
+
+static struct lock_class_key cpuctx_mutex;
+
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
int cpu, ret;
@@ -5173,23 +6005,50 @@ int perf_pmu_register(struct pmu *pmu)
if (!pmu->pmu_disable_count)
goto unlock;
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;
+
+ if (type < 0) {
+ int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+ if (!err)
+ goto free_pdc;
+
+ err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+ if (err) {
+ ret = err;
+ goto free_pdc;
+ }
+ }
+ pmu->type = type;
+
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }
+
+skip_type:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
- goto free_pdc;
+ goto free_dev;
for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
INIT_LIST_HEAD(&cpuctx->rotation_list);
+ cpuctx->active_pmu = pmu;
}
got_cpu_context:
@@ -5222,6 +6081,14 @@ unlock:
return ret;
+free_dev:
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+
+free_idr:
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+
free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
@@ -5241,17 +6108,33 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- free_pmu_context(pmu->pmu_cpu_context);
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ free_pmu_context(pmu);
}
struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
int idx;
+ int ret;
idx = srcu_read_lock(&pmus_srcu);
+
+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu) {
+ ret = pmu->event_init(event);
+ if (ret)
+ pmu = ERR_PTR(ret);
+ goto unlock;
+ }
+
list_for_each_entry_rcu(pmu, &pmus, entry) {
- int ret = pmu->event_init(event);
+ ret = pmu->event_init(event);
if (!ret)
goto unlock;
@@ -5282,6 +6165,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct hw_perf_event *hwc;
long err;
+ if ((unsigned)cpu >= nr_cpu_ids) {
+ if (!task || cpu != -1)
+ return ERR_PTR(-EINVAL);
+ }
+
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -5330,7 +6218,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!overflow_handler && parent_event)
overflow_handler = parent_event->overflow_handler;
-
+
event->overflow_handler = overflow_handler;
if (attr->disabled)
@@ -5372,7 +6260,7 @@ done:
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_task_events);
+ jump_label_inc(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -5547,7 +6435,7 @@ SYSCALL_DEFINE5(perf_event_open,
int err;
/* for future expandability... */
- if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+ if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
err = perf_copy_attr(attr_uptr, &attr);
@@ -5564,6 +6452,15 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /*
+ * In cgroup mode, the pid argument is used to pass the fd
+ * opened to the cgroup directory in cgroupfs. The cpu argument
+ * designates the cpu on which to monitor threads from that
+ * cgroup.
+ */
+ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+ return -EINVAL;
+
event_fd = get_unused_fd_flags(O_RDWR);
if (event_fd < 0)
return event_fd;
@@ -5581,7 +6478,7 @@ SYSCALL_DEFINE5(perf_event_open,
group_leader = NULL;
}
- if (pid != -1) {
+ if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
@@ -5595,6 +6492,19 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
+ if (flags & PERF_FLAG_PID_CGROUP) {
+ err = perf_cgroup_connect(pid, event, &attr, group_leader);
+ if (err)
+ goto err_alloc;
+ /*
+ * one more event:
+ * - that has cgroup constraint on event->cpu
+ * - that may need work on context switch
+ */
+ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+ jump_label_inc(&perf_sched_events);
+ }
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -5633,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
+ if (task) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -5680,10 +6595,10 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_event_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_event_remove_from_context(sibling);
+ perf_remove_from_context(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
@@ -5706,6 +6621,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_install_in_context(ctx, event, cpu);
++ctx->generation;
+ perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
event->owner = current;
@@ -5715,6 +6631,12 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&current->perf_event_mutex);
/*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
+ /*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
* of the group leader will find the pointer to itself in
@@ -5725,6 +6647,7 @@ SYSCALL_DEFINE5(perf_event_open,
return event_fd;
err_context:
+ perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
free_event(event);
@@ -5775,6 +6698,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
++ctx->generation;
+ perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
return event;
@@ -5826,17 +6750,20 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- struct perf_event *parent_event;
+ if (child_event->parent) {
+ raw_spin_lock_irq(&child_ctx->lock);
+ perf_group_detach(child_event);
+ raw_spin_unlock_irq(&child_ctx->lock);
+ }
- perf_event_remove_from_context(child_event);
+ perf_remove_from_context(child_event);
- parent_event = child_event->parent;
/*
- * It can happen that parent exits first, and has events
+ * It can happen that the parent exits first, and has events
* that are still around due to the child reference. These
- * events need to be zapped - but otherwise linger.
+ * events need to be zapped.
*/
- if (parent_event) {
+ if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
}
@@ -5860,7 +6787,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* scheduled, so we are now safe from rescheduling changing
* our context.
*/
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
@@ -6067,6 +6994,12 @@ inherit_event(struct perf_event *parent_event,
child_event->overflow_handler = parent_event->overflow_handler;
/*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(child_event);
+ perf_event__id_header_size(child_event);
+
+ /*
* Link it up in the child's context:
*/
raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6129,7 +7062,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp[ctxn];
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -6167,11 +7100,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
unsigned long flags;
int ret = 0;
- child->perf_event_ctxp[ctxn] = NULL;
-
- mutex_init(&child->perf_event_mutex);
- INIT_LIST_HEAD(&child->perf_event_list);
-
if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
@@ -6223,7 +7151,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
child_ctx = child->perf_event_ctxp[ctxn];
@@ -6231,12 +7158,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.
- * Note that if the parent is a clone, it could get
- * uncloned at any point, but that doesn't matter
- * because the list of events and the generation
- * count can't have changed since we took the mutex.
+ *
+ * Note that if the parent is a clone, the holding of
+ * parent_ctx->lock avoids it from being uncloned.
*/
- cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+ cloned_ctx = parent_ctx->parent_ctx;
if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6247,9 +7173,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
get_ctx(child_ctx->parent_ctx);
}
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
+ put_ctx(parent_ctx);
return ret;
}
@@ -6261,6 +7189,10 @@ int perf_event_init_task(struct task_struct *child)
{
int ctxn, ret;
+ memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ mutex_init(&child->perf_event_mutex);
+ INIT_LIST_HEAD(&child->perf_event_list);
+
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
if (ret)
@@ -6297,7 +7229,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
static void perf_pmu_rotate_stop(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6315,9 +7247,9 @@ static void __perf_event_exit_context(void *__info)
perf_pmu_rotate_stop(ctx->pmu);
list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_event_remove_from_context(event);
+ __perf_remove_from_context(event);
list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
- __perf_event_remove_from_context(event);
+ __perf_remove_from_context(event);
}
static void perf_event_exit_cpu_context(int cpu)
@@ -6351,6 +7283,26 @@ static void perf_event_exit_cpu(int cpu)
static inline void perf_event_exit_cpu(int cpu) { }
#endif
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ perf_event_exit_cpu(cpu);
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+ .notifier_call = perf_reboot,
+ .priority = INT_MIN,
+};
+
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
@@ -6379,14 +7331,125 @@ void __init perf_event_init(void)
{
int ret;
+ idr_init(&pmu_idr);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
- perf_pmu_register(&perf_swevent);
- perf_pmu_register(&perf_cpu_clock);
- perf_pmu_register(&perf_task_clock);
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
+ register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
}
+
+static int __init perf_event_sysfs_init(void)
+{
+ struct pmu *pmu;
+ int ret;
+
+ mutex_lock(&pmus_lock);
+
+ ret = bus_register(&pmu_bus);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (!pmu->name || pmu->type < 0)
+ continue;
+
+ ret = pmu_dev_alloc(pmu);
+ WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+ }
+ pmu_bus_running = 1;
+ ret = 0;
+
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
+}
+device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+ struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+
+ jc = kzalloc(sizeof(*jc), GFP_KERNEL);
+ if (!jc)
+ return ERR_PTR(-ENOMEM);
+
+ jc->info = alloc_percpu(struct perf_cgroup_info);
+ if (!jc->info) {
+ kfree(jc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+ jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+ struct perf_cgroup, css);
+ free_percpu(jc->info);
+ kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+ struct task_struct *task = info;
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task,
+ bool threadgroup)
+{
+ perf_cgroup_move(task);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ perf_cgroup_move(c);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+ .name = "perf_event",
+ .subsys_id = perf_subsys_id,
+ .create = perf_cgroup_create,
+ .destroy = perf_cgroup_destroy,
+ .exit = perf_cgroup_exit,
+ .attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584..57a8346a270 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return -1;
}
-int next_pidmap(struct pid_namespace *pid_ns, int last)
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
int offset;
struct pidmap *map, *end;
+ if (last >= PID_MAX_LIMIT)
+ return -1;
+
offset = (last + 1) & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
rcu_read_unlock();
return pid;
}
+EXPORT_SYMBOL_GPL(get_task_pid);
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
@@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
rcu_read_unlock();
return result;
}
+EXPORT_SYMBOL_GPL(get_pid_task);
struct pid *find_get_pid(pid_t nr)
{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0..e9c9adc84ca 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i;
+ int i, err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+ err = pid_ns_prepare_proc(ns);
+ if (err)
+ goto out_put_parent_pid_ns;
+
return ns;
+out_put_parent_pid_ns:
+ put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
out:
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f84682..0da058bff8e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos);
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos);
static int pm_qos_power_open(struct inode *inode, struct file *filp);
static int pm_qos_power_release(struct inode *inode, struct file *filp);
static const struct file_operations pm_qos_power_fops = {
.write = pm_qos_power_write,
+ .read = pm_qos_power_read,
.open = pm_qos_power_open,
.release = pm_qos_power_release,
.llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
}
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value;
+ unsigned long flags;
+ struct pm_qos_object *o;
+ struct pm_qos_request_list *pm_qos_req = filp->private_data;;
+
+ if (!pm_qos_req)
+ return -EINVAL;
+ if (!pm_qos_request_active(pm_qos_req))
+ return -EINVAL;
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(o);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos)
{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850..0791b13df7b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
return p->utime;
}
-int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
int error = check_clock(which_clock);
if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
return error;
}
-int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+static int
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
{
/*
* You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
}
-int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
{
const pid_t pid = CPUCLOCK_PID(which_clock);
int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
* This is called from sys_timer_create() and do_cpu_nanosleep() with the
* new timer already all-zeros initialized.
*/
-int posix_cpu_timer_create(struct k_itimer *new_timer)
+static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
int ret = 0;
const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
* If we return TIMER_RETRY, it's necessary to release the timer's lock
* and try again. (This happens when the timer is in the middle of firing.)
*/
-int posix_cpu_timer_del(struct k_itimer *timer)
+static int posix_cpu_timer_del(struct k_itimer *timer)
{
struct task_struct *p = timer->it.cpu.task;
int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
* If we return TIMER_RETRY, it's necessary to release the timer's lock
* and try again. (This happens when the timer is in the middle of firing.)
*/
-int posix_cpu_timer_set(struct k_itimer *timer, int flags,
- struct itimerspec *new, struct itimerspec *old)
+static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+ struct itimerspec *new, struct itimerspec *old)
{
struct task_struct *p = timer->it.cpu.task;
union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
return ret;
}
-void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
union cpu_time_count now;
struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
/*
* Now that all the timers on our list have the firing flag,
- * noone will touch their list entries but us. We'll take
+ * no one will touch their list entries but us. We'll take
* each timer's lock before clearing its firing flag, so no
* timer call will interfere.
*/
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
return error;
}
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
+
+static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
{
struct restart_block *restart_block =
- &current_thread_info()->restart_block;
+ &current_thread_info()->restart_block;
struct itimerspec it;
int error;
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (error == -ERESTART_RESTARTBLOCK) {
- if (flags & TIMER_ABSTIME)
+ if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
restart_block->fn = posix_cpu_nsleep_restart;
- restart_block->arg0 = which_clock;
- restart_block->arg1 = (unsigned long) rmtp;
- restart_block->arg2 = rqtp->tv_sec;
- restart_block->arg3 = rqtp->tv_nsec;
+ restart_block->nanosleep.index = which_clock;
+ restart_block->nanosleep.rmtp = rmtp;
+ restart_block->nanosleep.expires = timespec_to_ns(rqtp);
}
return error;
}
-long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->arg0;
- struct timespec __user *rmtp;
+ clockid_t which_clock = restart_block->nanosleep.index;
struct timespec t;
struct itimerspec it;
int error;
- rmtp = (struct timespec __user *) restart_block->arg1;
- t.tv_sec = restart_block->arg2;
- t.tv_nsec = restart_block->arg3;
+ t = ns_to_timespec(restart_block->nanosleep.expires);
- restart_block->fn = do_no_restart_syscall;
error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
if (error == -ERESTART_RESTARTBLOCK) {
+ struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
- restart_block->fn = posix_cpu_nsleep_restart;
- restart_block->arg0 = which_clock;
- restart_block->arg1 = (unsigned long) rmtp;
- restart_block->arg2 = t.tv_sec;
- restart_block->arg3 = t.tv_nsec;
+ restart_block->nanosleep.expires = timespec_to_ns(&t);
}
return error;
}
-
#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
timer->it_clock = THREAD_CLOCK;
return posix_cpu_timer_create(timer);
}
-static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
-{
- return -EINVAL;
-}
-static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
-{
- return -EINVAL;
-}
+
+struct k_clock clock_posix_cpu = {
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .nsleep_restart = posix_cpu_nsleep_restart,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+};
static __init int init_posix_cpu_timers(void)
{
struct k_clock process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
- .nsleep_restart = process_cpu_nsleep_restart,
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
};
struct k_clock thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = thread_cpu_timer_create,
- .nsleep = thread_cpu_nsleep,
- .nsleep_restart = thread_cpu_nsleep_restart,
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
};
struct timespec ts;
- register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
- register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+ posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+ posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
cputime_to_timespec(cputime_one_jiffy, &ts);
onecputick = ts.tv_nsec;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736..e5498d7405c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/idr.h>
+#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
+/*
+ * parisc wants ENOTSUP instead of EOPNOTSUPP
+ */
+#ifndef ENOTSUP
+# define ENANOSLEEP_NOTSUP EOPNOTSUPP
+#else
+# define ENANOSLEEP_NOTSUP ENOTSUP
+#endif
/*
* The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
/*
* CLOCKs: The POSIX standard calls for a couple of clocks and allows us
* to implement others. This structure defines the various
- * clocks and allows the possibility of adding others. We
- * provide an interface to add clocks to the table and expect
- * the "arch" code to add at least one clock that is high
- * resolution. Here we define the standard CLOCK_REALTIME as a
- * 1/HZ resolution clock.
+ * clocks.
*
* RESOLUTION: Clock resolution is used to round up timer and interval
* times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
* necessary code is written. The standard says we should say
* something about this issue in the documentation...
*
- * FUNCTIONS: The CLOCKs structure defines possible functions to handle
- * various clock functions. For clocks that use the standard
- * system timer code these entries should be NULL. This will
- * allow dispatch without the overhead of indirect function
- * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
- * must supply functions here, even if the function just returns
- * ENOSYS. The standard POSIX timer management code assumes the
- * following: 1.) The k_itimer struct (sched.h) is used for the
- * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
- * fields are not modified by timer code.
+ * FUNCTIONS: The CLOCKs structure defines possible functions to
+ * handle various clock functions.
*
- * At this time all functions EXCEPT clock_nanosleep can be
- * redirected by the CLOCKS structure. Clock_nanosleep is in
- * there, but the code ignores it.
+ * The standard POSIX timer management code assumes the
+ * following: 1.) The k_itimer struct (sched.h) is used for
+ * the timer. 2.) The list, it_lock, it_clock, it_id and
+ * it_pid fields are not modified by timer code.
*
* Permissions: It is assumed that the clock_settime() function defined
* for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
*/
static int common_nsleep(const clockid_t, int flags, struct timespec *t,
struct timespec __user *rmtp);
+static int common_timer_create(struct k_itimer *new_timer);
static void common_timer_get(struct k_itimer *, struct itimerspec *);
static int common_timer_set(struct k_itimer *, int,
struct itimerspec *, struct itimerspec *);
@@ -145,83 +144,37 @@ static int common_timer_del(struct k_itimer *timer);
static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+
+#define lock_timer(tid, flags) \
+({ struct k_itimer *__timr; \
+ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
+ __timr; \
+})
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
spin_unlock_irqrestore(&timr->it_lock, flags);
}
-/*
- * Call the k_clock hook function if non-null, or the default function.
- */
-#define CLOCK_DISPATCH(clock, call, arglist) \
- ((clock) < 0 ? posix_cpu_##call arglist : \
- (posix_clocks[clock].call != NULL \
- ? (*posix_clocks[clock].call) arglist : common_##call arglist))
-
-/*
- * Default clock hook functions when the struct k_clock passed
- * to register_posix_clock leaves a function pointer null.
- *
- * The function common_CALL is the default implementation for
- * the function pointer CALL in struct k_clock.
- */
-
-static inline int common_clock_getres(const clockid_t which_clock,
- struct timespec *tp)
-{
- tp->tv_sec = 0;
- tp->tv_nsec = posix_clocks[which_clock].res;
- return 0;
-}
-
-/*
- * Get real time for posix timers
- */
-static int common_clock_get(clockid_t which_clock, struct timespec *tp)
+/* Get clock_realtime */
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
{
ktime_get_real_ts(tp);
return 0;
}
-static inline int common_clock_set(const clockid_t which_clock,
- struct timespec *tp)
+/* Set clock_realtime */
+static int posix_clock_realtime_set(const clockid_t which_clock,
+ const struct timespec *tp)
{
return do_sys_settimeofday(tp, NULL);
}
-static int common_timer_create(struct k_itimer *new_timer)
-{
- hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
- return 0;
-}
-
-static int no_timer_create(struct k_itimer *new_timer)
+static int posix_clock_realtime_adj(const clockid_t which_clock,
+ struct timex *t)
{
- return -EOPNOTSUPP;
-}
-
-static int no_nsleep(const clockid_t which_clock, int flags,
- struct timespec *tsave, struct timespec __user *rmtp)
-{
- return -EOPNOTSUPP;
-}
-
-/*
- * Return nonzero if we know a priori this clockid_t value is bogus.
- */
-static inline int invalid_clockid(const clockid_t which_clock)
-{
- if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
- return 0;
- if ((unsigned) which_clock >= MAX_CLOCKS)
- return 1;
- if (posix_clocks[which_clock].clock_getres != NULL)
- return 0;
- if (posix_clocks[which_clock].res != 0)
- return 0;
- return 1;
+ return do_adjtimex(t);
}
/*
@@ -234,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
}
/*
- * Get monotonic time for posix timers
+ * Get monotonic-raw time for posix timers
*/
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
{
@@ -261,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
*tp = ktime_to_timespec(KTIME_LOW_RES);
return 0;
}
+
+static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+{
+ get_monotonic_boottime(tp);
+ return 0;
+}
+
+
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
struct k_clock clock_realtime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_clock_realtime_get,
+ .clock_set = posix_clock_realtime_set,
+ .clock_adj = posix_clock_realtime_adj,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
};
struct k_clock clock_monotonic = {
- .clock_getres = hrtimer_get_res,
- .clock_get = posix_ktime_get_ts,
- .clock_set = do_posix_clock_nosettime,
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_ktime_get_ts,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
};
struct k_clock clock_monotonic_raw = {
- .clock_getres = hrtimer_get_res,
- .clock_get = posix_get_monotonic_raw,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = no_timer_create,
- .nsleep = no_nsleep,
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_monotonic_raw,
};
struct k_clock clock_realtime_coarse = {
- .clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = no_timer_create,
- .nsleep = no_nsleep,
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
};
struct k_clock clock_monotonic_coarse = {
- .clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = no_timer_create,
- .nsleep = no_nsleep,
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+ };
+ struct k_clock clock_boottime = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_boottime,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
};
- register_posix_clock(CLOCK_REALTIME, &clock_realtime);
- register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
- register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
- register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
- register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+ posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
+ posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+ posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -336,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
* restarted (i.e. we have flagged this in the sys_private entry of the
* info block).
*
- * To protect aginst the timer going away while the interrupt is queued,
+ * To protect against the timer going away while the interrupt is queued,
* we require that the it_requeue_pending flag be set.
*/
void do_schedule_next_timer(struct siginfo *info)
@@ -476,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
return task_pid(rtn);
}
-void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
+void posix_timers_register_clock(const clockid_t clock_id,
+ struct k_clock *new_clock)
{
if ((unsigned) clock_id >= MAX_CLOCKS) {
- printk("POSIX clock register failed for clock_id %d\n",
+ printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
+ clock_id);
+ return;
+ }
+
+ if (!new_clock->clock_get) {
+ printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
+ clock_id);
+ return;
+ }
+ if (!new_clock->clock_getres) {
+ printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
clock_id);
return;
}
posix_clocks[clock_id] = *new_clock;
}
-EXPORT_SYMBOL_GPL(register_posix_clock);
+EXPORT_SYMBOL_GPL(posix_timers_register_clock);
static struct k_itimer * alloc_posix_timer(void)
{
@@ -517,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
kmem_cache_free(posix_timers_cache, tmr);
}
+static struct k_clock *clockid_to_kclock(const clockid_t id)
+{
+ if (id < 0)
+ return (id & CLOCKFD_MASK) == CLOCKFD ?
+ &clock_posix_dynamic : &clock_posix_cpu;
+
+ if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
+ return NULL;
+ return &posix_clocks[id];
+}
+
+static int common_timer_create(struct k_itimer *new_timer)
+{
+ hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ return 0;
+}
+
/* Create a POSIX.1b interval timer. */
SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
struct sigevent __user *, timer_event_spec,
timer_t __user *, created_timer_id)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
sigevent_t event;
int it_id_set = IT_ID_NOT_SET;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
+ if (!kc->timer_create)
+ return -EOPNOTSUPP;
new_timer = alloc_posix_timer();
if (unlikely(!new_timer))
@@ -591,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
goto out;
}
- error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+ error = kc->timer_create(new_timer);
if (error)
goto out;
@@ -601,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
spin_unlock_irq(&current->sighand->siglock);
return 0;
- /*
+ /*
* In the case of the timer belonging to another task, after
* the task is unlocked, the timer is owned by the other task
* and may cease to exist at any time. Don't use or modify
@@ -619,7 +628,7 @@ out:
* the find to the timer lock. To avoid a dead lock, the timer id MUST
* be release with out holding the timer lock.
*/
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
/*
@@ -703,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
- struct k_itimer *timr;
struct itimerspec cur_setting;
+ struct k_itimer *timr;
+ struct k_clock *kc;
unsigned long flags;
+ int ret = 0;
timr = lock_timer(timer_id, &flags);
if (!timr)
return -EINVAL;
- CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
+ kc = clockid_to_kclock(timr->it_clock);
+ if (WARN_ON_ONCE(!kc || !kc->timer_get))
+ ret = -EINVAL;
+ else
+ kc->timer_get(timr, &cur_setting);
unlock_timer(timr, flags);
- if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
return -EFAULT;
- return 0;
+ return ret;
}
/*
@@ -807,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
int error = 0;
unsigned long flag;
struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+ struct k_clock *kc;
if (!new_setting)
return -EINVAL;
@@ -822,8 +838,11 @@ retry:
if (!timr)
return -EINVAL;
- error = CLOCK_DISPATCH(timr->it_clock, timer_set,
- (timr, flags, &new_spec, rtn));
+ kc = clockid_to_kclock(timr->it_clock);
+ if (WARN_ON_ONCE(!kc || !kc->timer_set))
+ error = -EINVAL;
+ else
+ error = kc->timer_set(timr, flags, &new_spec, rtn);
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
@@ -838,7 +857,7 @@ retry:
return error;
}
-static inline int common_timer_del(struct k_itimer *timer)
+static int common_timer_del(struct k_itimer *timer)
{
timer->it.real.interval.tv64 = 0;
@@ -849,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
static inline int timer_delete_hook(struct k_itimer *timer)
{
- return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
+ struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+
+ if (WARN_ON_ONCE(!kc || !kc->timer_del))
+ return -EINVAL;
+ return kc->timer_del(timer);
}
/* Delete a POSIX.1b interval timer. */
@@ -921,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
}
}
-/* Not available / possible... functions */
-int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
-{
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
-
-int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
- struct timespec *t, struct timespec __user *r)
-{
-#ifndef ENOTSUP
- return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
-#else /* parisc does define it separately. */
- return -ENOTSUP;
-#endif
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
-
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec new_tp;
- if (invalid_clockid(which_clock))
+ if (!kc || !kc->clock_set)
return -EINVAL;
+
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
return -EFAULT;
- return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
+ return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec kernel_tp;
int error;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
- error = CLOCK_DISPATCH(which_clock, clock_get,
- (which_clock, &kernel_tp));
+
+ error = kc->clock_get(which_clock, &kernel_tp);
+
if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
error = -EFAULT;
return error;
+}
+
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+ struct timex __user *, utx)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timex ktx;
+ int err;
+
+ if (!kc)
+ return -EINVAL;
+ if (!kc->clock_adj)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&ktx, utx, sizeof(ktx)))
+ return -EFAULT;
+
+ err = kc->clock_adj(which_clock, &ktx);
+
+ if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+ return -EFAULT;
+ return err;
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec rtn_tp;
int error;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
- error = CLOCK_DISPATCH(which_clock, clock_getres,
- (which_clock, &rtn_tp));
+ error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+ if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
error = -EFAULT;
- }
return error;
}
@@ -1003,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec t;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
+ if (!kc->nsleep)
+ return -ENANOSLEEP_NOTSUP;
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
return -EFAULT;
@@ -1014,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
if (!timespec_valid(&t))
return -EINVAL;
- return CLOCK_DISPATCH(which_clock, nsleep,
- (which_clock, flags, &t, rmtp));
-}
-
-/*
- * nanosleep_restart for monotonic and realtime clocks
- */
-static int common_nsleep_restart(struct restart_block *restart_block)
-{
- return hrtimer_nanosleep_restart(restart_block);
+ return kc->nsleep(which_clock, flags, &t, rmtp);
}
/*
* This will restart clock_nanosleep. This is required only by
* compat_clock_nanosleep_restart for now.
*/
-long
-clock_nanosleep_restart(struct restart_block *restart_block)
+long clock_nanosleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->arg0;
+ clockid_t which_clock = restart_block->nanosleep.index;
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+
+ if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
+ return -EINVAL;
- return CLOCK_DISPATCH(which_clock, nsleep_restart,
- (restart_block));
+ return kc->nsleep_restart(restart_block);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad3..6de9a8fc341 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,129 +1,12 @@
-config PM
- bool "Power Management support"
- depends on !IA64_HP_SIM
- ---help---
- "Power Management" means that parts of your computer are shut
- off or put into a power conserving "sleep" mode if they are not
- being used. There are two competing standards for doing this: APM
- and ACPI. If you want to use either one, say Y here and then also
- to the requisite support below.
-
- Power Management is most important for battery powered laptop
- computers; if you have a laptop, check out the Linux Laptop home
- page on the WWW at <http://www.linux-on-laptops.com/> or
- Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
- and the Battery Powered Linux mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>.
-
- Note that, even if you say N here, Linux on the x86 architecture
- will issue the hlt instruction if nothing is to be done, thereby
- sending the processor to sleep and saving power.
-
-config PM_DEBUG
- bool "Power Management Debug Support"
- depends on PM
- ---help---
- This option enables various debugging support in the Power Management
- code. This is helpful when debugging and reporting PM bugs, like
- suspend support.
-
-config PM_ADVANCED_DEBUG
- bool "Extra PM attributes in sysfs for low-level debugging/testing"
- depends on PM_DEBUG
- default n
- ---help---
- Add extra sysfs attributes allowing one to access some Power Management
- fields of device objects from user space. If you are not a kernel
- developer interested in debugging/testing Power Management, say "no".
-
-config PM_VERBOSE
- bool "Verbose Power Management debugging"
- depends on PM_DEBUG
- default n
- ---help---
- This option enables verbose messages from the Power Management code.
-
-config CAN_PM_TRACE
- def_bool y
- depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
-
-config PM_TRACE
- bool
- help
- This enables code to save the last PM event point across
- reboot. The architecture needs to support this, x86 for
- example does by saving things in the RTC, see below.
-
- The architecture specific code must provide the extern
- functions from <linux/resume-trace.h> as well as the
- <asm/resume-trace.h> header with a TRACE_RESUME() macro.
-
- The way the information is presented is architecture-
- dependent, x86 will print the information during a
- late_initcall.
-
-config PM_TRACE_RTC
- bool "Suspend/resume event tracing"
- depends on CAN_PM_TRACE
- depends on X86
- select PM_TRACE
- default n
- ---help---
- This enables some cheesy code to save the last PM event point in the
- RTC across reboots, so that you can debug a machine that just hangs
- during suspend (or more commonly, during resume).
-
- To use this debugging feature you should attempt to suspend the
- machine, reboot it and then run
-
- dmesg -s 1000000 | grep 'hash matches'
-
- CAUTION: this option will cause your machine's real-time clock to be
- set to an invalid time after a resume.
-
-config PM_SLEEP_SMP
- bool
- depends on SMP
- depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
- depends on PM_SLEEP
- select HOTPLUG
- select HOTPLUG_CPU
- default y
-
-config PM_SLEEP
- bool
- depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
- default y
-
-config PM_SLEEP_ADVANCED_DEBUG
- bool
- depends on PM_ADVANCED_DEBUG
- default n
-
-config SUSPEND_NVS
- bool
-
config SUSPEND
bool "Suspend to RAM and standby"
- depends on PM && ARCH_SUSPEND_POSSIBLE
- select SUSPEND_NVS if HAS_IOMEM
+ depends on ARCH_SUSPEND_POSSIBLE
default y
---help---
Allow the system to enter sleep states in which main memory is
powered and thus its contents are preserved, such as the
suspend-to-RAM state (e.g. the ACPI S3 state).
-config PM_TEST_SUSPEND
- bool "Test suspend/resume and wakealarm during bootup"
- depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
- ---help---
- This option will let you suspend your machine during bootup, and
- make it wake up a few seconds later using an RTC wakeup alarm.
- Enable this with a kernel parameter like "test_suspend=mem".
-
- You probably want to have your system's RTC driver statically
- linked, ensuring that it's available when this test runs.
-
config SUSPEND_FREEZER
bool "Enable freezer for suspend to RAM/standby" \
if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -135,12 +18,15 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config HIBERNATE_CALLBACKS
+ bool
+
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
- depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+ depends on SWAP && ARCH_HIBERNATION_POSSIBLE
+ select HIBERNATE_CALLBACKS
select LZO_COMPRESS
select LZO_DECOMPRESS
- select SUSPEND_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -201,6 +87,106 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
+config PM_SLEEP
+ def_bool y
+ depends on SUSPEND || HIBERNATE_CALLBACKS
+
+config PM_SLEEP_SMP
+ def_bool y
+ depends on SMP
+ depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
+ depends on PM_SLEEP
+ select HOTPLUG
+ select HOTPLUG_CPU
+
+config PM_RUNTIME
+ bool "Run-time PM core functionality"
+ depends on !IA64_HP_SIM
+ ---help---
+ Enable functionality allowing I/O devices to be put into energy-saving
+ (low power) states at run time (or autosuspended) after a specified
+ period of inactivity and woken up in response to a hardware-generated
+ wake-up event or a driver's request.
+
+ Hardware support is generally required for this functionality to work
+ and the bus type drivers of the buses the devices are on are
+ responsible for the actual handling of the autosuspend requests and
+ wake-up events.
+
+config PM
+ def_bool y
+ depends on PM_SLEEP || PM_RUNTIME
+
+config PM_DEBUG
+ bool "Power Management Debug Support"
+ depends on PM
+ ---help---
+ This option enables various debugging support in the Power Management
+ code. This is helpful when debugging and reporting PM bugs, like
+ suspend support.
+
+config PM_VERBOSE
+ bool "Verbose Power Management debugging"
+ depends on PM_DEBUG
+ ---help---
+ This option enables verbose messages from the Power Management code.
+
+config PM_ADVANCED_DEBUG
+ bool "Extra PM attributes in sysfs for low-level debugging/testing"
+ depends on PM_DEBUG
+ ---help---
+ Add extra sysfs attributes allowing one to access some Power Management
+ fields of device objects from user space. If you are not a kernel
+ developer interested in debugging/testing Power Management, say "no".
+
+config PM_TEST_SUSPEND
+ bool "Test suspend/resume and wakealarm during bootup"
+ depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
+ ---help---
+ This option will let you suspend your machine during bootup, and
+ make it wake up a few seconds later using an RTC wakeup alarm.
+ Enable this with a kernel parameter like "test_suspend=mem".
+
+ You probably want to have your system's RTC driver statically
+ linked, ensuring that it's available when this test runs.
+
+config CAN_PM_TRACE
+ def_bool y
+ depends on PM_DEBUG && PM_SLEEP
+
+config PM_TRACE
+ bool
+ help
+ This enables code to save the last PM event point across
+ reboot. The architecture needs to support this, x86 for
+ example does by saving things in the RTC, see below.
+
+ The architecture specific code must provide the extern
+ functions from <linux/resume-trace.h> as well as the
+ <asm/resume-trace.h> header with a TRACE_RESUME() macro.
+
+ The way the information is presented is architecture-
+ dependent, x86 will print the information during a
+ late_initcall.
+
+config PM_TRACE_RTC
+ bool "Suspend/resume event tracing"
+ depends on CAN_PM_TRACE
+ depends on X86
+ select PM_TRACE
+ ---help---
+ This enables some cheesy code to save the last PM event point in the
+ RTC across reboots, so that you can debug a machine that just hangs
+ during suspend (or more commonly, during resume).
+
+ To use this debugging feature you should attempt to suspend the
+ machine, reboot it and then run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: this option will cause your machine's real-time clock to be
+ set to an invalid time after a resume.
+
config APM_EMULATION
tristate "Advanced Power Management Emulation"
depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -227,31 +213,11 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config PM_RUNTIME
- bool "Run-time PM core functionality"
- depends on PM
- ---help---
- Enable functionality allowing I/O devices to be put into energy-saving
- (low power) states at run time (or autosuspended) after a specified
- period of inactivity and woken up in response to a hardware-generated
- wake-up event or a driver's request.
-
- Hardware support is generally required for this functionality to work
- and the bus type drivers of the buses the devices are on are
- responsible for the actual handling of the autosuspend requests and
- wake-up events.
-
-config PM_OPS
- bool
- depends on PM_SLEEP || PM_RUNTIME
- default y
-
config ARCH_HAS_OPP
bool
config PM_OPP
bool "Operating Performance Point (OPP) Layer library"
- depends on PM
depends on ARCH_HAS_OPP
---help---
SOCs have a standard set of tuples consisting of frequency and
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185..c5ebc6a9064 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,5 @@
-ifeq ($(CONFIG_PM_DEBUG),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +8,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
-obj-$(CONFIG_SUSPEND_NVS) += nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df..d09dd10c5a5 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
static int submit(int rw, struct block_device *bdev, sector_t sector,
struct page *page, struct bio **bio_chain)
{
- const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
+ const int bio_rw = rw | REQ_SYNC;
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b51483..50aae660174 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/gfp.h>
+#include <linux/syscore_ops.h>
#include <scsi/scsi_scan.h>
#include <asm/suspend.h>
@@ -51,18 +52,18 @@ enum {
static int hibernation_mode = HIBERNATION_SHUTDOWN;
-static struct platform_hibernation_ops *hibernation_ops;
+static const struct platform_hibernation_ops *hibernation_ops;
/**
* hibernation_set_ops - set the global hibernate operations
* @ops: the hibernation operations to use in subsequent hibernation transitions
*/
-void hibernation_set_ops(struct platform_hibernation_ops *ops)
+void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
- && ops->restore_cleanup)) {
+ && ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
@@ -272,13 +273,18 @@ static int create_image(int platform_mode)
local_irq_disable();
error = sysdev_suspend(PMSG_FREEZE);
+ if (!error) {
+ error = syscore_suspend();
+ if (error)
+ sysdev_resume();
+ }
if (error) {
printk(KERN_ERR "PM: Some system devices failed to power down, "
"aborting hibernation\n");
goto Enable_irqs;
}
- if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
+ if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
goto Power_up;
in_suspend = 1;
@@ -295,6 +301,7 @@ static int create_image(int platform_mode)
}
Power_up:
+ syscore_resume();
sysdev_resume();
/* NOTE: dpm_resume_noirq() is just a resume() for devices
* that suspended with irqs off ... no overall powerup.
@@ -403,6 +410,11 @@ static int resume_target_kernel(bool platform_mode)
local_irq_disable();
error = sysdev_suspend(PMSG_QUIESCE);
+ if (!error) {
+ error = syscore_suspend();
+ if (error)
+ sysdev_resume();
+ }
if (error)
goto Enable_irqs;
@@ -429,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
restore_processor_state();
touch_softlockup_watchdog();
+ syscore_resume();
sysdev_resume();
Enable_irqs:
@@ -516,7 +529,8 @@ int hibernation_platform_enter(void)
local_irq_disable();
sysdev_suspend(PMSG_HIBERNATE);
- if (!pm_check_wakeup_events()) {
+ syscore_suspend();
+ if (pm_wakeup_pending()) {
error = -EAGAIN;
goto Power_up;
}
@@ -526,6 +540,7 @@ int hibernation_platform_enter(void)
while (1);
Power_up:
+ syscore_resume();
sysdev_resume();
local_irq_enable();
enable_nonboot_cpus();
@@ -647,6 +662,7 @@ int hibernate(void)
swsusp_free();
if (!error)
power_down();
+ in_suspend = 0;
pm_restore_gfp_mask();
} else {
pr_debug("PM: Image restored successfully.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561..de9aef8742f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
DEFINE_MUTEX(pm_mutex);
-unsigned int pm_flags;
-EXPORT_SYMBOL(pm_flags);
-
#ifdef CONFIG_PM_SLEEP
/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
* writing to 'state'. It first should read from 'wakeup_count' and store
* the read value. Then, after carrying out its own preparations for the system
* transition to a sleep state, it should write the stored value to
- * 'wakeup_count'. If that fails, at least one wakeup event has occured since
+ * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
* 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
* is allowed to write to 'state', but the transition will be aborted if there
* are any wakeup events detected after 'wakeup_count' was written to.
@@ -326,7 +323,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
static int __init pm_start_workqueue(void)
{
- pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
+ pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
return pm_wq ? 0 : -ENOMEM;
}
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb..00000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
- *
- * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/io.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/suspend.h>
-
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * suspend and to restore the contents of this memory during the subsequent
- * resume. The code below implements a mechanism allowing us to do that.
- */
-
-struct nvs_page {
- unsigned long phys_start;
- unsigned int size;
- void *kaddr;
- void *data;
- struct list_head node;
-};
-
-static LIST_HEAD(nvs_list);
-
-/**
- * suspend_nvs_register - register platform NVS memory region to save
- * @start - physical address of the region
- * @size - size of the region
- *
- * The NVS region need not be page-aligned (both ends) and we arrange
- * things so that the data from page-aligned addresses in this region will
- * be copied into separate RAM pages.
- */
-int suspend_nvs_register(unsigned long start, unsigned long size)
-{
- struct nvs_page *entry, *next;
-
- while (size > 0) {
- unsigned int nr_bytes;
-
- entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
- if (!entry)
- goto Error;
-
- list_add_tail(&entry->node, &nvs_list);
- entry->phys_start = start;
- nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
- entry->size = (size < nr_bytes) ? size : nr_bytes;
-
- start += entry->size;
- size -= entry->size;
- }
- return 0;
-
- Error:
- list_for_each_entry_safe(entry, next, &nvs_list, node) {
- list_del(&entry->node);
- kfree(entry);
- }
- return -ENOMEM;
-}
-
-/**
- * suspend_nvs_free - free data pages allocated for saving NVS regions
- */
-void suspend_nvs_free(void)
-{
- struct nvs_page *entry;
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data) {
- free_page((unsigned long)entry->data);
- entry->data = NULL;
- if (entry->kaddr) {
- iounmap(entry->kaddr);
- entry->kaddr = NULL;
- }
- }
-}
-
-/**
- * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int suspend_nvs_alloc(void)
-{
- struct nvs_page *entry;
-
- list_for_each_entry(entry, &nvs_list, node) {
- entry->data = (void *)__get_free_page(GFP_KERNEL);
- if (!entry->data) {
- suspend_nvs_free();
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-/**
- * suspend_nvs_save - save NVS memory regions
- */
-void suspend_nvs_save(void)
-{
- struct nvs_page *entry;
-
- printk(KERN_INFO "PM: Saving platform NVS memory\n");
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data) {
- entry->kaddr = ioremap(entry->phys_start, entry->size);
- memcpy(entry->data, entry->kaddr, entry->size);
- }
-}
-
-/**
- * suspend_nvs_restore - restore NVS memory regions
- *
- * This function is going to be called with interrupts disabled, so it
- * cannot iounmap the virtual addresses used to access the NVS region.
- */
-void suspend_nvs_restore(void)
-{
- struct nvs_page *entry;
-
- printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data)
- memcpy(entry->kaddr, entry->data, entry->size);
-}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0..0cf3a27a6c9 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
*/
#define TIMEOUT (20 * HZ)
-static inline int freezeable(struct task_struct * p)
+static inline int freezable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- if (frozen(p) || !freezeable(p))
+ if (frozen(p) || !freezable(p))
continue;
if (!freeze_task(p, sig_only))
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
* perturb a task in TASK_STOPPED or TASK_TRACED.
* It is "frozen enough". If the task does wake
* up, it will immediately call try_to_freeze.
+ *
+ * Because freeze_task() goes through p's
+ * scheduler lock after setting TIF_FREEZE, it's
+ * guaranteed that either we see TASK_RUNNING or
+ * try_to_stop() after schedule() in ptrace/signal
+ * stop sees TIF_FREEZE.
*/
if (!task_is_stopped_or_traced(p) &&
!freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
if (!todo || time_after(jiffies, end_time))
break;
- if (!pm_check_wakeup_events()) {
+ if (pm_wakeup_pending()) {
wakeup = true;
break;
}
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- if (!freezeable(p))
+ if (!freezable(p))
continue;
if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea445..ca0aacc2487 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *);
/*
* Preferred image size in bytes (tunable via /sys/power/image_size).
- * When it is set to N, swsusp will do its best to ensure the image
- * size will not exceed N bytes, but if that is impossible, it will
- * try to create the smallest image possible.
+ * When it is set to N, the image creating code will do its best to
+ * ensure the image size will not exceed N bytes, but if that is
+ * impossible, it will try to create the smallest image possible.
*/
unsigned long image_size;
void __init hibernate_image_size_init(void)
{
- image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+ image_size = (totalram_pages / 3) * PAGE_SIZE;
}
/* List of PBEs needed for restoring the pages that were allocated before
@@ -1519,11 +1519,8 @@ static int
swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
unsigned int nr_pages, unsigned int nr_highmem)
{
- int error = 0;
-
if (nr_highmem > 0) {
- error = get_highmem_buffer(PG_ANY);
- if (error)
+ if (get_highmem_buffer(PG_ANY))
goto err_out;
if (nr_highmem > alloc_highmem) {
nr_highmem -= alloc_highmem;
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
err_out:
swsusp_free();
- return error;
+ return -ENOMEM;
}
asmlinkage int swsusp_save(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ecf770509d0..8935369d503 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/suspend.h>
+#include <linux/syscore_ops.h>
+#include <trace/events/power.h>
#include "power.h"
@@ -30,13 +32,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_MEM] = "mem",
};
-static struct platform_suspend_ops *suspend_ops;
+static const struct platform_suspend_ops *suspend_ops;
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Pointer to ops structure.
*/
-void suspend_set_ops(struct platform_suspend_ops *ops)
+void suspend_set_ops(const struct platform_suspend_ops *ops)
{
mutex_lock(&pm_mutex);
suspend_ops = ops;
@@ -163,10 +165,16 @@ static int suspend_enter(suspend_state_t state)
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
- if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
+ error = syscore_suspend();
+ if (error)
+ sysdev_resume();
+ }
+ if (!error) {
+ if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
error = suspend_ops->enter(state);
events_check_enabled = false;
}
+ syscore_resume();
sysdev_resume();
}
@@ -201,6 +209,7 @@ int suspend_devices_and_enter(suspend_state_t state)
if (!suspend_ops)
return -ENOSYS;
+ trace_machine_suspend(state);
if (suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
@@ -229,6 +238,7 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
if (suspend_ops->end)
suspend_ops->end();
+ trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index baf667bb279..7c97c3a0eee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,7 +30,7 @@
#include "power.h"
-#define HIBERNATE_SIG "LINHIB0001"
+#define HIBERNATE_SIG "S1SUSPEND"
/*
* The swap map is a data structure used for keeping track of each page
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
+ res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
if (res)
return res;
@@ -888,7 +888,7 @@ out_finish:
/**
* swsusp_read - read the hibernation image.
* @flags_p: flags passed by the "frozen" kernel in the image header should
- * be written into this memeory location
+ * be written into this memory location
*/
int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
{
int error;
- hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+ FMODE_READ, NULL);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 1b2ea31e6bd..c36c3b9e8a8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
free_all_swap_pages(data->swap);
if (data->frozen)
thaw_processes();
- pm_notifier_call_chain(data->mode == O_WRONLY ?
+ pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk.c b/kernel/printk.c
index 9a2264fc42c..da8ca817eae 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/rculist.h>
#include <asm/uaccess.h>
/*
- * for_each_console() allows you to iterate on each console
- */
-#define for_each_console(con) \
- for (con = console_drivers; con != NULL; con = con->next)
-
-/*
* Architectures can override it:
*/
void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -58,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
/* We show everything that is MORE important than this.. */
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -102,7 +97,7 @@ static int console_locked, console_suspended;
/*
* logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
* It is also used in interesting ways to provide interlocking in
- * release_console_sem().
+ * console_unlock();.
*/
static DEFINE_SPINLOCK(logbuf_lock);
@@ -118,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
* Array of consoles built from command line options (console=)
*/
struct console_cmdline
@@ -267,25 +267,47 @@ int dmesg_restrict = 1;
int dmesg_restrict;
#endif
+static int syslog_action_restricted(int type)
+{
+ if (dmesg_restrict)
+ return 1;
+ /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
+ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+ /*
+ * If this is from /proc/kmsg and we've already opened it, then we've
+ * already done the capabilities checks at open time.
+ */
+ if (from_file && type != SYSLOG_ACTION_OPEN)
+ return 0;
+
+ if (syslog_action_restricted(type)) {
+ if (capable(CAP_SYSLOG))
+ return 0;
+ /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
+ if (capable(CAP_SYS_ADMIN)) {
+ WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
+ "but no CAP_SYSLOG (deprecated).\n");
+ return 0;
+ }
+ return -EPERM;
+ }
+ return 0;
+}
+
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
unsigned i, j, limit, count;
int do_clear = 0;
char c;
- int error = 0;
+ int error;
- /*
- * If this is from /proc/kmsg we only do the capabilities checks
- * at open time.
- */
- if (type == SYSLOG_ACTION_OPEN || !from_file) {
- if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if ((type != SYSLOG_ACTION_READ_ALL &&
- type != SYSLOG_ACTION_SIZE_BUFFER) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- }
+ error = check_syslog_permissions(type, from_file);
+ if (error)
+ goto out;
error = security_syslog(type);
if (error)
@@ -459,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
struct console *con;
for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
if ((con->flags & CON_ENABLED) && con->write &&
(cpu_online(smp_processor_id()) ||
(con->flags & CON_ANYTIME)))
@@ -498,9 +522,74 @@ static void _call_console_drivers(unsigned start,
}
/*
+ * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
+ * lower 3 bit are the log level, the rest are the log facility. In case
+ * userspace passes usual userspace syslog messages to /dev/kmsg or
+ * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
+ * to extract the correct log level for in-kernel processing, and not mangle
+ * the original value.
+ *
+ * If a prefix is found, the length of the prefix is returned. If 'level' is
+ * passed, it will be filled in with the log level without a possible facility
+ * value. If 'special' is passed, the special printk prefix chars are accepted
+ * and returned. If no valid header is found, 0 is returned and the passed
+ * variables are not touched.
+ */
+static size_t log_prefix(const char *p, unsigned int *level, char *special)
+{
+ unsigned int lev = 0;
+ char sp = '\0';
+ size_t len;
+
+ if (p[0] != '<' || !p[1])
+ return 0;
+ if (p[2] == '>') {
+ /* usual single digit level number or special char */
+ switch (p[1]) {
+ case '0' ... '7':
+ lev = p[1] - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ case 'd': /* KERN_DEFAULT */
+ sp = p[1];
+ break;
+ default:
+ return 0;
+ }
+ len = 3;
+ } else {
+ /* multi digit including the level and facility number */
+ char *endp = NULL;
+
+ if (p[1] < '0' && p[1] > '9')
+ return 0;
+
+ lev = (simple_strtoul(&p[1], &endp, 10) & 7);
+ if (endp == NULL || endp[0] != '>')
+ return 0;
+ len = (endp + 1) - p;
+ }
+
+ /* do not accept special char if not asked for */
+ if (sp && !special)
+ return 0;
+
+ if (special) {
+ *special = sp;
+ /* return special char, do not touch level */
+ if (sp)
+ return len;
+ }
+
+ if (level)
+ *level = lev;
+ return len;
+}
+
+/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
- * The console_sem must be held.
+ * The console_lock must be held.
*/
static void call_console_drivers(unsigned start, unsigned end)
{
@@ -512,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end)
cur_index = start;
start_print = start;
while (cur_index != end) {
- if (msg_level < 0 && ((end - cur_index) > 2) &&
- LOG_BUF(cur_index + 0) == '<' &&
- LOG_BUF(cur_index + 1) >= '0' &&
- LOG_BUF(cur_index + 1) <= '7' &&
- LOG_BUF(cur_index + 2) == '>') {
- msg_level = LOG_BUF(cur_index + 1) - '0';
- cur_index += 3;
+ if (msg_level < 0 && ((end - cur_index) > 2)) {
+ /* strip log prefix */
+ cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
start_print = cur_index;
}
while (cur_index != end) {
@@ -603,11 +688,11 @@ static int have_callable_console(void)
*
* This is printk(). It can be called from any context. We want it to work.
*
- * We try to grab the console_sem. If we succeed, it's easy - we log the output and
+ * We try to grab the console_lock. If we succeed, it's easy - we log the output and
* call the console drivers. If we fail to get the semaphore we place the output
* into the log buffer and return. The current holder of the console_sem will
- * notice the new output in release_console_sem() and will send it to the
- * consoles before releasing the semaphore.
+ * notice the new output in console_unlock(); and will send it to the
+ * consoles before releasing the lock.
*
* One effect of this deferred printing is that code which calls printk() and
* then changes console_loglevel may break. This is because console_loglevel
@@ -658,19 +743,19 @@ static inline int can_use_console(unsigned int cpu)
/*
* Try to get console ownership to actually show the kernel
* messages from a 'printk'. Return true (and with the
- * console_semaphore held, and 'console_locked' set) if it
+ * console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*
* This gets called with the 'logbuf_lock' spinlock held and
* interrupts disabled. It should return with 'lockbuf_lock'
* released but interrupts still disabled.
*/
-static int acquire_console_semaphore_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(unsigned int cpu)
__releases(&logbuf_lock)
{
int retval = 0;
- if (!try_acquire_console_sem()) {
+ if (console_trylock()) {
retval = 1;
/*
@@ -716,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
unsigned long flags;
int this_cpu;
char *p;
+ size_t plen;
+ char special;
boot_delay_msec();
printk_delay();
@@ -756,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf) - printed_len, fmt, args);
-
p = printk_buf;
- /* Do we have a loglevel in the string? */
- if (p[0] == '<') {
- unsigned char c = p[1];
- if (c && p[2] == '>') {
- switch (c) {
- case '0' ... '7': /* loglevel */
- current_log_level = c - '0';
- /* Fallthrough - make sure we're on a new line */
- case 'd': /* KERN_DEFAULT */
- if (!new_text_line) {
- emit_log_char('\n');
- new_text_line = 1;
- }
- /* Fallthrough - skip the loglevel */
- case 'c': /* KERN_CONT */
- p += 3;
- break;
+ /* Read log level and handle special printk prefix */
+ plen = log_prefix(p, &current_log_level, &special);
+ if (plen) {
+ p += plen;
+
+ switch (special) {
+ case 'c': /* Strip <c> KERN_CONT, continue line */
+ plen = 0;
+ break;
+ case 'd': /* Strip <d> KERN_DEFAULT, start new line */
+ plen = 0;
+ default:
+ if (!new_text_line) {
+ emit_log_char('\n');
+ new_text_line = 1;
}
}
}
/*
- * Copy the output into log_buf. If the caller didn't provide
- * appropriate log level tags, we insert them here
+ * Copy the output into log_buf. If the caller didn't provide
+ * the appropriate log prefix, we insert them here
*/
- for ( ; *p; p++) {
+ for (; *p; p++) {
if (new_text_line) {
- /* Always output the token */
- emit_log_char('<');
- emit_log_char(current_log_level + '0');
- emit_log_char('>');
- printed_len += 3;
new_text_line = 0;
+ if (plen) {
+ /* Copy original log prefix */
+ int i;
+
+ for (i = 0; i < plen; i++)
+ emit_log_char(printk_buf[i]);
+ printed_len += plen;
+ } else {
+ /* Add log prefix */
+ emit_log_char('<');
+ emit_log_char(current_log_level + '0');
+ emit_log_char('>');
+ printed_len += 3;
+ }
+
if (printk_time) {
- /* Follow the token with the time */
+ /* Add the current time stamp */
char tbuf[50], *tp;
unsigned tlen;
unsigned long long t;
@@ -826,12 +920,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* actual magic (print out buffers, wake up klogd,
* etc).
*
- * The acquire_console_semaphore_for_printk() function
+ * The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
* actually gets the semaphore or not.
*/
- if (acquire_console_semaphore_for_printk(this_cpu))
- release_console_sem();
+ if (console_trylock_for_printk(this_cpu))
+ console_unlock();
lockdep_on();
out_restore_irqs:
@@ -992,7 +1086,7 @@ void suspend_console(void)
if (!console_suspend_enabled)
return;
printk("Suspending console(s) (use no_console_suspend to debug)\n");
- acquire_console_sem();
+ console_lock();
console_suspended = 1;
up(&console_sem);
}
@@ -1003,7 +1097,7 @@ void resume_console(void)
return;
down(&console_sem);
console_suspended = 0;
- release_console_sem();
+ console_unlock();
}
/**
@@ -1026,21 +1120,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
- acquire_console_sem();
- release_console_sem();
+ console_lock();
+ console_unlock();
}
return NOTIFY_OK;
}
/**
- * acquire_console_sem - lock the console system for exclusive use.
+ * console_lock - lock the console system for exclusive use.
*
- * Acquires a semaphore which guarantees that the caller has
+ * Acquires a lock which guarantees that the caller has
* exclusive access to the console system and the console_drivers list.
*
* Can sleep, returns nothing.
*/
-void acquire_console_sem(void)
+void console_lock(void)
{
BUG_ON(in_interrupt());
down(&console_sem);
@@ -1049,21 +1143,29 @@ void acquire_console_sem(void)
console_locked = 1;
console_may_schedule = 1;
}
-EXPORT_SYMBOL(acquire_console_sem);
+EXPORT_SYMBOL(console_lock);
-int try_acquire_console_sem(void)
+/**
+ * console_trylock - try to lock the console system for exclusive use.
+ *
+ * Tried to acquire a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * returns 1 on success, and 0 on failure to acquire the lock.
+ */
+int console_trylock(void)
{
if (down_trylock(&console_sem))
- return -1;
+ return 0;
if (console_suspended) {
up(&console_sem);
- return -1;
+ return 0;
}
console_locked = 1;
console_may_schedule = 0;
- return 0;
+ return 1;
}
-EXPORT_SYMBOL(try_acquire_console_sem);
+EXPORT_SYMBOL(console_trylock);
int is_console_locked(void)
{
@@ -1074,38 +1176,40 @@ static DEFINE_PER_CPU(int, printk_pending);
void printk_tick(void)
{
- if (__get_cpu_var(printk_pending)) {
- __get_cpu_var(printk_pending) = 0;
+ if (__this_cpu_read(printk_pending)) {
+ __this_cpu_write(printk_pending, 0);
wake_up_interruptible(&log_wait);
}
}
int printk_needs_cpu(int cpu)
{
- return per_cpu(printk_pending, cpu);
+ if (cpu_is_offline(cpu))
+ printk_tick();
+ return __this_cpu_read(printk_pending);
}
void wake_up_klogd(void)
{
if (waitqueue_active(&log_wait))
- __raw_get_cpu_var(printk_pending) = 1;
+ this_cpu_write(printk_pending, 1);
}
/**
- * release_console_sem - unlock the console system
+ * console_unlock - unlock the console system
*
- * Releases the semaphore which the caller holds on the console system
+ * Releases the console_lock which the caller holds on the console system
* and the console driver list.
*
- * While the semaphore was held, console output may have been buffered
- * by printk(). If this is the case, release_console_sem() emits
- * the output prior to releasing the semaphore.
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
*
* If there is output waiting for klogd, we wake it up.
*
- * release_console_sem() may be called from any context.
+ * console_unlock(); may be called from any context.
*/
-void release_console_sem(void)
+void console_unlock(void)
{
unsigned long flags;
unsigned _con_start, _log_end;
@@ -1133,12 +1237,17 @@ void release_console_sem(void)
local_irq_restore(flags);
}
console_locked = 0;
+
+ /* Release the exclusive_console once it is used */
+ if (unlikely(exclusive_console))
+ exclusive_console = NULL;
+
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
wake_up_klogd();
}
-EXPORT_SYMBOL(release_console_sem);
+EXPORT_SYMBOL(console_unlock);
/**
* console_conditional_schedule - yield the CPU if required
@@ -1147,7 +1256,7 @@ EXPORT_SYMBOL(release_console_sem);
* if this CPU should yield the CPU to another task, do
* so here.
*
- * Must be called within acquire_console_sem().
+ * Must be called within console_lock();.
*/
void __sched console_conditional_schedule(void)
{
@@ -1168,14 +1277,14 @@ void console_unblank(void)
if (down_trylock(&console_sem) != 0)
return;
} else
- acquire_console_sem();
+ console_lock();
console_locked = 1;
console_may_schedule = 0;
for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
- release_console_sem();
+ console_unlock();
}
/*
@@ -1186,7 +1295,7 @@ struct tty_driver *console_device(int *index)
struct console *c;
struct tty_driver *driver = NULL;
- acquire_console_sem();
+ console_lock();
for_each_console(c) {
if (!c->device)
continue;
@@ -1194,7 +1303,7 @@ struct tty_driver *console_device(int *index)
if (driver)
break;
}
- release_console_sem();
+ console_unlock();
return driver;
}
@@ -1205,20 +1314,32 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
- acquire_console_sem();
+ console_lock();
console->flags &= ~CON_ENABLED;
- release_console_sem();
+ console_unlock();
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- acquire_console_sem();
+ console_lock();
console->flags |= CON_ENABLED;
- release_console_sem();
+ console_unlock();
}
EXPORT_SYMBOL(console_start);
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+ keep_bootcon = 1;
+ printk(KERN_INFO "debug: skip boot console de-registration.\n");
+
+ return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -1337,7 +1458,7 @@ void register_console(struct console *newcon)
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
- acquire_console_sem();
+ console_lock();
if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
newcon->next = console_drivers;
console_drivers = newcon;
@@ -1349,14 +1470,21 @@ void register_console(struct console *newcon)
}
if (newcon->flags & CON_PRINTBUFFER) {
/*
- * release_console_sem() will print out the buffered messages
+ * console_unlock(); will print out the buffered messages
* for us.
*/
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
+ /*
+ * We're about to replay the log buffer. Only do this to the
+ * just-registered console to avoid excessive message spam to
+ * the already-registered consoles.
+ */
+ exclusive_console = newcon;
}
- release_console_sem();
+ console_unlock();
+ console_sysfs_notify();
/*
* By unregistering the bootconsoles after we enable the real console
@@ -1365,7 +1493,9 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+ if (bcon &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+ !keep_bootcon) {
/* we need to iterate through twice, to make sure we print
* everything out, before we unregister the console(s)
*/
@@ -1392,7 +1522,7 @@ int unregister_console(struct console *console)
return braille_unregister_console(console);
#endif
- acquire_console_sem();
+ console_lock();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
@@ -1414,7 +1544,8 @@ int unregister_console(struct console *console)
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
- release_console_sem();
+ console_unlock();
+ console_sysfs_notify();
return res;
}
EXPORT_SYMBOL(unregister_console);
@@ -1498,7 +1629,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
/* Don't allow registering multiple times */
if (!dumper->registered) {
dumper->registered = 1;
- list_add_tail(&dumper->list, &dump_list);
+ list_add_tail_rcu(&dumper->list, &dump_list);
err = 0;
}
spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1522,29 +1653,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
spin_lock_irqsave(&dump_list_lock, flags);
if (dumper->registered) {
dumper->registered = 0;
- list_del(&dumper->list);
+ list_del_rcu(&dumper->list);
err = 0;
}
spin_unlock_irqrestore(&dump_list_lock, flags);
+ synchronize_rcu();
return err;
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
-static const char * const kmsg_reasons[] = {
- [KMSG_DUMP_OOPS] = "oops",
- [KMSG_DUMP_PANIC] = "panic",
- [KMSG_DUMP_KEXEC] = "kexec",
-};
-
-static const char *kmsg_to_str(enum kmsg_dump_reason reason)
-{
- if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
- return "unknown";
-
- return kmsg_reasons[reason];
-}
-
/**
* kmsg_dump - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
@@ -1583,13 +1701,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
l2 = chars;
}
- if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
- printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
- kmsg_to_str(reason));
- return;
- }
- list_for_each_entry(dumper, &dump_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(dumper, &dump_list, list)
dumper->dump(dumper, reason, s1, l1, s2, l2);
- spin_unlock_irqrestore(&dump_list_lock, flags);
+ rcu_read_unlock();
}
#endif
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0..dc7ab65f3b3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/regset.h>
+#include <linux/hw_breakpoint.h>
/*
@@ -134,21 +135,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return 0;
rcu_read_lock();
tcred = __task_cred(task);
- if ((cred->uid != tcred->euid ||
- cred->uid != tcred->suid ||
- cred->uid != tcred->uid ||
- cred->gid != tcred->egid ||
- cred->gid != tcred->sgid ||
- cred->gid != tcred->gid) &&
- !capable(CAP_SYS_PTRACE)) {
- rcu_read_unlock();
- return -EPERM;
- }
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ goto ok;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ goto ok;
+ rcu_read_unlock();
+ return -EPERM;
+ok:
rcu_read_unlock();
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !capable(CAP_SYS_PTRACE))
+ if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
return -EPERM;
return security_ptrace_access_check(task, mode);
@@ -163,7 +167,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
-int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task)
{
int retval;
@@ -198,7 +202,7 @@ int ptrace_attach(struct task_struct *task)
goto unlock_tasklist;
task->ptrace = PT_PTRACED;
- if (capable(CAP_SYS_PTRACE))
+ if (task_ns_capable(task, CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
@@ -219,7 +223,7 @@ out:
* Performs checks and sets PT_PTRACED.
* Should be used by all ptrace implementations for PTRACE_TRACEME.
*/
-int ptrace_traceme(void)
+static int ptrace_traceme(void)
{
int ret = -EPERM;
@@ -293,7 +297,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
return false;
}
-int ptrace_detach(struct task_struct *child, unsigned int data)
+static int ptrace_detach(struct task_struct *child, unsigned int data)
{
bool dead = false;
@@ -313,7 +317,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
child->exit_code = data;
dead = __ptrace_detach(current, child);
if (!child->exit_state)
- wake_up_process(child);
+ wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}
write_unlock_irq(&tasklist_lock);
@@ -876,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
return ret;
}
#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+int ptrace_get_breakpoints(struct task_struct *tsk)
+{
+ if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
+ return 0;
+
+ return -1;
+}
+
+void ptrace_put_breakpoints(struct task_struct *tsk)
+{
+ if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
+ flush_ptrace_hw_breakpoint(tsk);
+}
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d..f3240e98792 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
+ * Note that the machinery to reliably determine whether
+ * or not we are in an RCU read-side critical section
+ * exists only in the preemptible RCU implementations
+ * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
+ * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
*/
-#ifndef CONFIG_PREEMPT
- WARN_ON(1);
- return 0;
-#else
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
rcu_barrier_bh();
debug_object_free(head, &rcuhead_debug_descr);
return 1;
-#endif
default:
return 0;
}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342a..0c343b9a46d 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
#include <linux/time.h>
#include <linux/cpu.h>
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
/* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
void rcu_bh_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
}
/*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
+ * whose grace period has elapsed.
*/
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
struct rcu_head *next, *list;
unsigned long flags;
+ RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,59 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ local_bh_disable();
list->func(list);
+ local_bh_enable();
list = next;
+ RCU_TRACE(cb_count++);
}
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
}
/*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed. It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static int rcu_kthread(void *arg)
{
- __rcu_process_callbacks(&rcu_sched_ctrlblk);
- __rcu_process_callbacks(&rcu_bh_ctrlblk);
- rcu_preempt_process_callbacks();
+ unsigned long work;
+ unsigned long morework;
+ unsigned long flags;
+
+ for (;;) {
+ wait_event_interruptible(rcu_kthread_wq,
+ have_rcu_kthread_work != 0);
+ morework = rcu_boost();
+ local_irq_save(flags);
+ work = have_rcu_kthread_work;
+ have_rcu_kthread_work = morework;
+ local_irq_restore(flags);
+ if (work) {
+ rcu_process_callbacks(&rcu_sched_ctrlblk);
+ rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
+ }
+ schedule_timeout_interruptible(1); /* Leave CPU for others. */
+ }
+
+ return 0; /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+ local_irq_restore(flags);
}
/*
@@ -230,6 +256,7 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
+ RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
}
@@ -282,7 +309,16 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
{
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ struct sched_param sp;
+
+ rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+ return 0;
}
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745f..3cb8e362e88 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
struct list_head *gp_tasks;
/* Pointer to the first task blocking the */
/* current grace period, or NULL if there */
- /* is not such task. */
+ /* is no such task. */
struct list_head *exp_tasks;
/* Pointer to first task blocking the */
/* current expedited grace period, or NULL */
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority-boosted, or NULL if no priority */
+ /* boosting is needed. If there is no */
+ /* current or expedited grace period, there */
+ /* can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
u8 gpnum; /* Current grace period. */
u8 gpcpu; /* Last grace period blocked by the CPU. */
u8 completed; /* Last grace period completed. */
/* If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+ s8 boosted_this_gp; /* Has boosting already happened? */
+ unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+ unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+ unsigned long n_tasks_boosted;
+ unsigned long n_exp_boosts;
+ unsigned long n_normal_boosts;
+ unsigned long n_normal_balk_blkd_tasks;
+ unsigned long n_normal_balk_gp_tasks;
+ unsigned long n_normal_balk_boost_tasks;
+ unsigned long n_normal_balk_boosted;
+ unsigned long n_normal_balk_notyet;
+ unsigned long n_normal_balk_nos;
+ unsigned long n_exp_balk_blkd_tasks;
+ unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
};
static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
}
/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistice for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+ seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+ rcu_preempt_ctrlblk.rcb.qlen,
+ rcu_preempt_ctrlblk.n_grace_periods,
+ rcu_preempt_ctrlblk.gpnum,
+ rcu_preempt_ctrlblk.gpcpu,
+ rcu_preempt_ctrlblk.completed,
+ "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+ "N."[!rcu_preempt_ctrlblk.gp_tasks],
+ "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " ttb=%c btg=",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+ switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+ case -1:
+ seq_puts(m, "exp");
+ break;
+ case 0:
+ seq_puts(m, "no");
+ break;
+ case 1:
+ seq_puts(m, "begun");
+ break;
+ case 2:
+ seq_puts(m, "done");
+ break;
+ default:
+ seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+ }
+ seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ rcu_preempt_ctrlblk.n_tasks_boosted,
+ rcu_preempt_ctrlblk.n_exp_boosts,
+ rcu_preempt_ctrlblk.n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+ seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+ "normal balk",
+ rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boosted,
+ rcu_preempt_ctrlblk.n_normal_balk_notyet,
+ rcu_preempt_ctrlblk.n_normal_balk_nos);
+ seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct list_head *np;
+ struct task_struct *t;
+
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ return 0; /* Nothing to boost. */
+ raw_local_irq_save(flags);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+ rcu_node_entry);
+ np = rcu_next_node_entry(t);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_local_irq_restore(flags);
+ rt_mutex_lock(&mtx);
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ rt_mutex_unlock(&mtx);
+ return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them. If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1. Note that this
+ * return value is independent of whether or not boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+ if (!rcu_preempt_blocked_readers_cgp()) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ return 0;
+ }
+ if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+ rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_boost_trace());
+ return 1;
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+ rcu_preempt_ctrlblk.boosted_this_gp = -1;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_exp_boost_trace());
+ raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+ rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+ if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+ rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+ return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
+/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ /* If there is no GP then there is nothing more to do. */
+ if (!rcu_preempt_gp_in_progress())
+ return;
/*
- * If there is no GP, or if blocked readers are still blocking GP,
- * then there is nothing more to do.
+ * Check up on boosting. If there are no readers blocking the
+ * current grace period, leave.
*/
- if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+ if (rcu_initiate_boost())
return;
/* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
if (!rcu_preempt_blocked_readers_any())
rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
- /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+ /* If there are done callbacks, cause them to be invoked. */
if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
/* Official start of GP. */
rcu_preempt_ctrlblk.gpnum++;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
/* Any blocked RCU readers block new GP. */
if (rcu_preempt_blocked_readers_any())
rcu_preempt_ctrlblk.gp_tasks =
rcu_preempt_ctrlblk.blkd_tasks.next;
+ /* Set up for RCU priority boosting. */
+ rcu_preempt_boost_start_gp();
+
/* If there is no running reader, CPU is done with GP. */
if (!rcu_preempt_running_reader())
rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
empty = !rcu_preempt_blocked_readers_cgp();
empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
- np = t->rcu_node_entry.next;
- if (np == &rcu_preempt_ctrlblk.blkd_tasks)
- np = NULL;
+ np = rcu_next_node_entry(t);
list_del(&t->rcu_node_entry);
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
rcu_preempt_ctrlblk.gp_tasks = np;
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+ rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&t->rcu_node_entry);
/*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
rcu_report_exp_done();
}
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost self if was boosted. */
+ if (special & RCU_READ_UNLOCK_BOOSTED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+ rt_mutex_unlock(t->rcu_boost_mutex);
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
local_irq_restore(flags);
}
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
rcu_preempt_cpu_qs();
if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
rcu_preempt_ctrlblk.rcb.donetail)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
if (rcu_preempt_gp_in_progress() &&
rcu_cpu_blocking_cur_gp() &&
rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
/*
* TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from __rcu_process_callbacks() to
+ * update, so this is invoked from rcu_process_callbacks() to
* handle that case. Of course, it is invoked for all flavors of
* RCU, but RCU callbacks can appear only on one of the lists, and
* neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
*/
static void rcu_preempt_process_callbacks(void)
{
- __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+ rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
}
/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
local_irq_save(flags);
*rcu_preempt_ctrlblk.nexttail = head;
rcu_preempt_ctrlblk.nexttail = &head->next;
+ RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
rcu_preempt_start_gp(); /* checks to see if GP needed. */
local_irq_restore(flags);
}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
/* Wait for tail of ->blkd_tasks list to drain. */
if (rcu_preempted_readers_exp())
+ rcu_initiate_expedited_boost();
wait_event(sync_rcu_preempt_exp_wq,
!rcu_preempted_readers_exp());
@@ -567,11 +852,32 @@ void exit_rcu(void)
if (t->rcu_read_lock_nesting == 0)
return;
t->rcu_read_lock_nesting = 1;
- rcu_read_unlock();
+ __rcu_read_unlock();
}
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
#include <linux/kernel_stat.h>
/*
* During boot, we forgive RCU lockdep issues. After this function is
* invoked, we start taking RCU lockdep issues seriously.
*/
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+ if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+ else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+ else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+ rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+ else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+ rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+ else
+ rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+static void rcu_initiate_exp_boost_trace(void)
+{
+ if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+ else
+ rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ rcp->qlen -= n;
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+ show_tiny_preempt_stats(m);
+ seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+ seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+ return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = show_tiny_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
+ NULL, &show_tiny_stats_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515..c224da41890 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
static int fqs_holdoff = 0; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
static char *torture_type = "rcu"; /* What RCU implementation to torture. */
module_param(nreaders, int, 0444);
@@ -88,6 +91,12 @@ module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
@@ -109,6 +118,7 @@ static struct task_struct *stats_task;
static struct task_struct *shuffler_task;
static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
#define RCU_TORTURE_PIPE_LEN 10
@@ -134,6 +144,12 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +163,16 @@ static int stutter_pause_test;
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+
/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +303,7 @@ struct rcu_torture_ops {
void (*fqs)(void);
int (*stats)(char *page);
int irq_capable;
+ int can_boost;
char *name;
};
@@ -366,6 +393,7 @@ static struct rcu_torture_ops rcu_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu"
};
@@ -408,6 +436,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_sync"
};
@@ -424,6 +453,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_expedited"
};
@@ -684,6 +714,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
};
/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (jiffies - oldstarttime > ULONG_MAX / 2) {
+ schedule_timeout_uninterruptible(1);
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (jiffies - endtime > ULONG_MAX / 2) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched();
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: rcu_stutter_wait("rcu_torture_boost");
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+ /* Clean up and exit. */
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_boost");
+ while (!kthread_should_stop() || rbi.inflight)
+ schedule_timeout_uninterruptible(1);
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ return 0;
+}
+
+/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
* of occurrence of some important types of race conditions.
@@ -933,7 +1067,8 @@ rcu_torture_printk(char *page)
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
"rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d nt: %ld",
+ "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+ "rtbf: %ld rtb: %ld nt: %ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
@@ -941,8 +1076,19 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free),
atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror,
+ n_rcu_torture_boost_allocerror,
+ n_rcu_torture_boost_afferror,
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
n_rcu_torture_timers);
- if (atomic_read(&n_rcu_torture_mberror) != 0)
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_allocerror != 0 ||
+ n_rcu_torture_boost_afferror != 0 ||
+ n_rcu_torture_boost_failure != 0)
cnt += sprintf(&page[cnt], " !!!");
cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (i > 1) {
@@ -1094,22 +1240,91 @@ rcu_torture_stutter(void *arg)
}
static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
{
printk(KERN_ALERT "%s" TORTURE_FLAG
"--- %s: nreaders=%d nfakewriters=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval=%d stutter=%d irqreader=%d "
- "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration);
}
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
.notifier_call = rcutorture_shutdown_notify,
};
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ kthread_stop(t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
static void
rcu_torture_cleanup(void)
{
@@ -1127,7 +1342,7 @@ rcu_torture_cleanup(void)
}
fullstop = FULLSTOP_RMMOD;
mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_nb);
+ unregister_reboot_notifier(&rcutorture_shutdown_nb);
if (stutter_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
kthread_stop(stutter_task);
@@ -1184,6 +1399,12 @@ rcu_torture_cleanup(void)
kthread_stop(fqs_task);
}
fqs_task = NULL;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
/* Wait for all RCU callbacks to fire. */
@@ -1195,9 +1416,9 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
- rcu_torture_print_module_parms("End of test: FAILURE");
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else
- rcu_torture_print_module_parms("End of test: SUCCESS");
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
}
static int __init
@@ -1242,7 +1463,7 @@ rcu_torture_init(void)
nrealreaders = nreaders;
else
nrealreaders = 2 * num_online_cpus();
- rcu_torture_print_module_parms("Start of test");
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
fullstop = FULLSTOP_DONTSTOP;
/* Set up the freelist. */
@@ -1263,6 +1484,12 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_allocerror = 0;
+ n_rcu_torture_boost_afferror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
atomic_set(&rcu_torture_wcount[i], 0);
for_each_possible_cpu(cpu) {
@@ -1376,7 +1603,27 @@ rcu_torture_init(void)
goto unwind;
}
}
- register_reboot_notifier(&rcutorture_nb);
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ int retval;
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ retval = rcutorture_booster_init(i);
+ if (retval < 0) {
+ firsterr = retval;
+ goto unwind;
+ }
+ }
+ }
+ register_reboot_notifier(&rcutorture_shutdown_nb);
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c4798..dd4aea806f8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
- .orphan_cbs_list = NULL, \
- .orphan_cbs_tail = &structname.orphan_cbs_list, \
- .orphan_qlen = 0, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -367,8 +364,8 @@ void rcu_irq_exit(void)
WARN_ON_ONCE(rdtp->dynticks & 0x1);
/* If the interrupt queued a callback, get out of dyntick mode. */
- if (__get_cpu_var(rcu_sched_data).nxtlist ||
- __get_cpu_var(rcu_bh_data).nxtlist)
+ if (__this_cpu_read(rcu_sched_data.nxtlist) ||
+ __this_cpu_read(rcu_bh_data.nxtlist))
set_need_resched();
}
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
if (rdp->gpnum != rnp->gpnum) {
- rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
+ /*
+ * If the current grace period is waiting for this CPU,
+ * set up to detect a quiescent state, otherwise don't
+ * go looking for one.
+ */
rdp->gpnum = rnp->gpnum;
+ if (rnp->qsmask & rdp->grpmask) {
+ rdp->qs_pending = 1;
+ rdp->passed_quiesc = 0;
+ } else
+ rdp->qs_pending = 0;
}
}
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
+
+ /*
+ * If we were in an extended quiescent state, we may have
+ * missed some grace periods that others CPUs handled on
+ * our behalf. Catch up with this state to avoid noting
+ * spurious new grace periods. If another grace period
+ * has started, then rnp->gpnum will have advanced, so
+ * we will detect this later on.
+ */
+ if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
+ rdp->gpnum = rdp->completed;
+
+ /*
+ * If RCU does not need a quiescent state from this CPU,
+ * then make sure that this CPU doesn't go looking for one.
+ */
+ if ((rnp->qsmask & rdp->grpmask) == 0)
+ rdp->qs_pending = 0;
}
}
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU. The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first. Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
*/
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
int i;
+ /* current DYING CPU is cleared in the cpu_online_mask */
+ int receive_cpu = cpumask_any(cpu_online_mask);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
if (rdp->nxtlist == NULL)
return; /* irqs disabled, so comparison is stable. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
- *rsp->orphan_cbs_tail = rdp->nxtlist;
- rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+ *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+ receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ receive_rdp->qlen += rdp->qlen;
+ receive_rdp->n_cbs_adopted += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- rsp->orphan_qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen = 0;
- raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rdp = this_cpu_ptr(rsp->rda);
- if (rsp->orphan_cbs_list == NULL) {
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- return;
- }
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
- rdp->qlen += rsp->orphan_qlen;
- rdp->n_cbs_adopted += rsp->orphan_qlen;
- rsp->orphan_cbs_list = NULL;
- rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
- rsp->orphan_qlen = 0;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
-
- rcu_adopt_orphan_cbs(rsp);
}
/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
}
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
*/
local_irq_save(flags);
rdp = this_cpu_ptr(rsp->rda);
- rcu_process_gp_end(rsp, rdp);
- check_for_new_grace_period(rsp, rdp);
/* Add the callback to our list. */
*rdp->nxttail[RCU_NEXT_TAIL] = head;
rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
- /* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
- }
-
/*
* Force the grace period if too many callbacks or too long waiting.
* Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* is the only one waiting for a grace period to complete.
*/
if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
+
+ /* Are we ignoring a completed grace period? */
+ rcu_process_gp_end(rsp, rdp);
+ check_for_new_grace_period(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ unsigned long nestflag;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+ rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp, 0);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
force_quiescent_state(rsp, 1);
local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
* decrement rcu_barrier_cpu_count -- otherwise the first CPU
* might complete its grace period before all of the other CPUs
* did their increment, causing this function to return too
- * early.
+ * early. Note that on_each_cpu() disables irqs, which prevents
+ * any CPUs from coming online or going offline until each online
+ * CPU has queued its RCU-barrier callback.
*/
atomic_set(&rcu_barrier_cpu_count, 1);
- preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
- rcu_adopt_orphan_cbs(rsp);
on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
- preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
- * preempt_disable() in _rcu_barrier() prevents stop_machine(),
- * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
- * returns, all online cpus have queued rcu_barrier_func().
- * The dying CPU clears its cpu_online_mask bit and
- * moves all of its RCU callbacks to ->orphan_cbs_list
- * in the context of stop_machine(), so subsequent calls
- * to _rcu_barrier() will adopt these callbacks and only
- * then queue rcu_barrier_func() on all remaining CPUs.
+ * The whole machine is "stopped" except this CPU, so we can
+ * touch any data without introducing corruption. We send the
+ * dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
- rcu_send_cbs_to_orphanage(&rcu_bh_state);
- rcu_send_cbs_to_orphanage(&rcu_sched_state);
- rcu_preempt_send_cbs_to_orphanage();
+ rcu_send_cbs_to_online(&rcu_bh_state);
+ rcu_send_cbs_to_online(&rcu_sched_state);
+ rcu_preempt_send_cbs_to_online();
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+ for (i = NUM_RCU_LVLS - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ rsp->levelspread[0] = RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c1..e8f057e44e3 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
/*
* Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
* In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#define RCU_FANOUT (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
-#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
+#if CONFIG_RCU_FANOUT > 16
+#define RCU_FANOUT_LEAF 16
+#else /* #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
+#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
# define NUM_RCU_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_SQ
+#elif NR_CPUS <= RCU_FANOUT_2
# define NUM_RCU_LVLS 2
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
+#elif NR_CPUS <= RCU_FANOUT_3
# define NUM_RCU_LVLS 3
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-# define NUM_RCU_LVL_3 NR_CPUS
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#elif NR_CPUS <= RCU_FANOUT_4
# define NUM_RCU_LVLS 4
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-# define NUM_RCU_LVL_4 NR_CPUS
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_4 (NR_CPUS)
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
@@ -203,8 +208,8 @@ struct rcu_data {
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
+ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
raw_spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. Also */
- /* protects the following */
- /* orphan_cbs fields. */
- struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
- /* orphaned by all CPUs in */
- /* a given leaf rcu_node */
- /* going offline. */
- struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
- long orphan_qlen; /* Number of orphaned cbs. */
+ /* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f..a3638710dc6 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/
#include <linux/delay.h>
+#include <linux/stop_machine.h>
/*
* Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable RCU's callbacks from dying CPU to other online CPU.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
- rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+ rcu_send_cbs_to_online(&rcu_preempt_state);
}
/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
/*
* Because there is no preemptable RCU, there are no callbacks to move.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
}
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+ int firstsnap, s, snap, trycount = 0;
+
+ /* Note that atomic_inc_return() implies full memory barrier. */
+ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+ get_online_cpus();
+
+ /*
+ * Each pass through the following loop attempts to force a
+ * context switch on each CPU.
+ */
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
+ put_online_cpus();
+
+ /* No joy, try again later. Or just synchronize_sched(). */
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+
+ /* Check to see if someone else did our work for us. */
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+
+ /*
+ * Refetching sync_sched_expedited_started allows later
+ * callers to piggyback on our grace period. We subtract
+ * 1 to get the same token that the last incrementer got.
+ * We retry after they started, so our grace period works
+ * for them, and they started after our first try, so their
+ * grace period works for us.
+ */
+ get_online_cpus();
+ snap = atomic_read(&sync_sched_expedited_started) - 1;
+ smp_mb(); /* ensure read is before try_stop_cpus(). */
+ }
+
+ /*
+ * Everyone up to our most recent fetch is covered by our grace
+ * period. Update the counter, but only if our work is still
+ * relevant -- which it won't be if someone who started later
+ * than we did beat us to the punch.
+ */
+ do {
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ break;
+ }
+ } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d12..c8e97853b97 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
rsp->completed, gpnum, rsp->signaled,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->orphan_qlen);
+ rsp->n_force_qs_lh);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
static struct dentry *rcudir;
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
{
struct dentry *retval;
@@ -337,14 +337,14 @@ free_out:
return 1;
}
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
{
debugfs_remove_recursive(rcudir);
}
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768..34683efa2cc 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
pos, buf, s - buf);
}
+#if BITS_PER_LONG == 32
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ ret = *res_counter_member(counter, member);
+ spin_unlock_irqrestore(&counter->lock, flags);
+
+ return ret;
+}
+#else
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
return *res_counter_member(counter, member);
}
+#endif
int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0d..798e2fae2a0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
static DEFINE_RWLOCK(resource_lock);
-/*
- * By default, we allocate free space bottom-up. The architecture can request
- * top-down by clearing this flag. The user can override the architecture's
- * choice with the "resource_alloc_from_bottom" kernel boot option, but that
- * should only be a debugging tool.
- */
-int resource_alloc_from_bottom = 1;
-
-static __init int setup_alloc_from_bottom(char *s)
-{
- printk(KERN_INFO
- "resource: allocating from bottom-up; please report a bug\n");
- resource_alloc_from_bottom = 1;
- return 0;
-}
-early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
-
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
}
+void __weak arch_remove_reservations(struct resource *avail)
+{
+}
+
static resource_size_t simple_align_resource(void *data,
const struct resource *avail,
resource_size_t size,
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
}
/*
- * Find the resource before "child" in the sibling list of "root" children.
- */
-static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
-{
- struct resource *this;
-
- for (this = root->child; this; this = this->sibling)
- if (this->sibling == child)
- return this;
-
- return NULL;
-}
-
-/*
* Find empty slot in the resource tree given range and alignment.
- * This version allocates from the end of the root resource first.
- */
-static int find_resource_from_top(struct resource *root, struct resource *new,
- resource_size_t size, resource_size_t min,
- resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
- const struct resource *,
- resource_size_t,
- resource_size_t),
- void *alignf_data)
-{
- struct resource *this;
- struct resource tmp, avail, alloc;
-
- tmp.start = root->end;
- tmp.end = root->end;
-
- this = find_sibling_prev(root, NULL);
- for (;;) {
- if (this) {
- if (this->end < root->end)
- tmp.start = this->end + 1;
- } else
- tmp.start = root->start;
-
- resource_clip(&tmp, min, max);
-
- /* Check for overflow after ALIGN() */
- avail = *new;
- avail.start = ALIGN(tmp.start, align);
- avail.end = tmp.end;
- if (avail.start >= tmp.start) {
- alloc.start = alignf(alignf_data, &avail, size, align);
- alloc.end = alloc.start + size - 1;
- if (resource_contains(&avail, &alloc)) {
- new->start = alloc.start;
- new->end = alloc.end;
- return 0;
- }
- }
-
- if (!this || this->start == root->start)
- break;
-
- tmp.end = this->start - 1;
- this = find_sibling_prev(root, this);
- }
- return -EBUSY;
-}
-
-/*
- * Find empty slot in the resource tree given range and alignment.
- * This version allocates from the beginning of the root resource first.
*/
static int find_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new,
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
+ tmp.flags = new->flags;
tmp.start = root->start;
/*
- * Skip past an allocated resource that starts at 0, since the
- * assignment of this->start - 1 to tmp->end below would cause an
- * underflow.
+ * Skip past an allocated resource that starts at 0, since the assignment
+ * of this->start - 1 to tmp->end below would cause an underflow.
*/
if (this && this->start == 0) {
tmp.start = this->end + 1;
this = this->sibling;
}
- for (;;) {
+ for(;;) {
if (this)
tmp.end = this->start - 1;
else
tmp.end = root->end;
resource_clip(&tmp, min, max);
+ arch_remove_reservations(&tmp);
/* Check for overflow after ALIGN() */
avail = *new;
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new,
return 0;
}
}
-
if (!this)
break;
-
tmp.start = this->end + 1;
this = this->sibling;
}
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new,
alignf = simple_align_resource;
write_lock(&resource_lock);
- if (resource_alloc_from_bottom)
- err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
- else
- err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
+ err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c..3c7cbc2c33b 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
put_pid(waiter->deadlock_task_pid);
TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
- TRACE_WARN_ON(waiter->task);
memset(waiter, 0x22, sizeof(*waiter));
}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef..5c9ccd38096 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/sysdev.h>
#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
int opcode;
int opdata;
int mutexes[MAX_RT_TEST_MUTEXES];
- int bkl;
int event;
struct sys_device sysdev;
};
@@ -46,9 +44,8 @@ enum test_opcodes {
RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- RTTEST_LOCKBKL, /* 9 Lock BKL */
- RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
- RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
+ /* 9, 10 - reserved for BKL commemoration */
+ RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
RTTEST_RESET = 99, /* 99 Reset all pending operations */
};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
td->mutexes[i] = 0;
}
}
-
- if (!lockwakeup && td->bkl == 4) {
-#ifdef CONFIG_LOCK_KERNEL
- unlock_kernel();
-#endif
- td->bkl = 0;
- }
return 0;
case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
td->mutexes[id] = 0;
return 0;
- case RTTEST_LOCKBKL:
- if (td->bkl)
- return 0;
- td->bkl = 1;
-#ifdef CONFIG_LOCK_KERNEL
- lock_kernel();
-#endif
- td->bkl = 4;
- return 0;
-
- case RTTEST_UNLOCKBKL:
- if (td->bkl != 4)
- break;
-#ifdef CONFIG_LOCK_KERNEL
- unlock_kernel();
-#endif
- td->bkl = 0;
- return 0;
-
default:
break;
}
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
td->event = atomic_add_return(1, &rttest_event);
break;
- case RTTEST_LOCKBKL:
default:
break;
}
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
td->event = atomic_add_return(1, &rttest_event);
return;
- case RTTEST_LOCKBKL:
- return;
default:
return;
}
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
spin_lock(&rttest_lock);
curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+ "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
td->opcode, td->event, tsk->state,
(MAX_RT_PRIO - 1) - tsk->prio,
(MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on, td->bkl);
+ tsk->pi_blocked_on);
for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786..ab449117aaf 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
/*
* lock->owner state tracking:
*
- * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
- * are used to keep track of the "owner is pending" and "lock has
- * waiters" state.
+ * lock->owner holds the task_struct pointer of the owner. Bit 0
+ * is used to keep track of the "lock has waiters" state.
*
- * owner bit1 bit0
- * NULL 0 0 lock is free (fast acquire possible)
- * NULL 0 1 invalid state
- * NULL 1 0 Transitional State*
- * NULL 1 1 invalid state
- * taskpointer 0 0 lock is held (fast release possible)
- * taskpointer 0 1 task is pending owner
- * taskpointer 1 0 lock is held and has waiters
- * taskpointer 1 1 task is pending owner and lock has more waiters
- *
- * Pending ownership is assigned to the top (highest priority)
- * waiter of the lock, when the lock is released. The thread is woken
- * up and can now take the lock. Until the lock is taken (bit 0
- * cleared) a competing higher priority thread can steal the lock
- * which puts the woken up thread back on the waiters list.
+ * owner bit0
+ * NULL 0 lock is free (fast acquire possible)
+ * NULL 1 lock is free and has waiters and the top waiter
+ * is going to take the lock*
+ * taskpointer 0 lock is held (fast release possible)
+ * taskpointer 1 lock is held and has waiters**
*
* The fast atomic compare exchange based acquire and release is only
- * possible when bit 0 and 1 of lock->owner are 0.
+ * possible when bit 0 of lock->owner is 0.
+ *
+ * (*) It also can be a transitional state when grabbing the lock
+ * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ * we need to set the bit0 before looking at the lock, and the owner may be
+ * NULL in this small time, hence this can be a transitional state.
*
- * (*) There's a small time where the owner can be NULL and the
- * "lock has waiters" bit is set. This can happen when grabbing the lock.
- * To prevent a cmpxchg of the owner releasing the lock, we need to set this
- * bit before looking at the lock, hence the reason this is a transitional
- * state.
+ * (**) There is a small time when bit 0 is set but there are no
+ * waiters. This can happen when grabbing the lock in the slow path.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to
+ * set this bit before looking at the lock.
*/
static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
- unsigned long mask)
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
{
- unsigned long val = (unsigned long)owner | mask;
+ unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* reached or the state of the chain has changed while we
* dropped the locks.
*/
- if (!waiter || !waiter->task)
+ if (!waiter)
goto out_unlock_pi;
/*
* Check the orig_waiter state. After we dropped the locks,
- * the previous owner of the lock might have released the lock
- * and made us the pending owner:
+ * the previous owner of the lock might have released the lock.
*/
- if (orig_waiter && !orig_waiter->task)
+ if (orig_waiter && !rt_mutex_owner(orig_lock))
goto out_unlock_pi;
/*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ if (!rt_mutex_owner(lock)) {
+ /*
+ * If the requeue above changed the top waiter, then we need
+ * to wake the new top waiter up to try to get the lock.
+ */
+
+ if (top_waiter != rt_mutex_top_waiter(lock))
+ wake_up_process(rt_mutex_top_waiter(lock)->task);
+ raw_spin_unlock(&lock->wait_lock);
+ goto out_put_task;
+ }
put_task_struct(task);
/* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * Optimization: check if we can steal the lock from the
- * assigned pending owner [which might not have taken the
- * lock yet]:
- */
-static inline int try_to_steal_lock(struct rt_mutex *lock,
- struct task_struct *task)
-{
- struct task_struct *pendowner = rt_mutex_owner(lock);
- struct rt_mutex_waiter *next;
- unsigned long flags;
-
- if (!rt_mutex_owner_pending(lock))
- return 0;
-
- if (pendowner == task)
- return 1;
-
- raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
- if (task->prio >= pendowner->prio) {
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
- return 0;
- }
-
- /*
- * Check if a waiter is enqueued on the pending owners
- * pi_waiters list. Remove it and readjust pending owners
- * priority.
- */
- if (likely(!rt_mutex_has_waiters(lock))) {
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
- return 1;
- }
-
- /* No chain handling, pending owner is not blocked on anything: */
- next = rt_mutex_top_waiter(lock);
- plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
- __rt_mutex_adjust_prio(pendowner);
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-
- /*
- * We are going to steal the lock and a waiter was
- * enqueued on the pending owners pi_waiters queue. So
- * we have to enqueue this waiter into
- * task->pi_waiters list. This covers the case,
- * where task is boosted because it holds another
- * lock and gets unboosted because the booster is
- * interrupted, so we would delay a waiter with higher
- * priority as task->normal_prio.
- *
- * Note: in the rare case of a SCHED_OTHER task changing
- * its priority and thus stealing the lock, next->task
- * might be task:
- */
- if (likely(next->task != task)) {
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- plist_add(&next->pi_list_entry, &task->pi_waiters);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- }
- return 1;
-}
-
-/*
* Try to take an rt-mutex
*
- * This fails
- * - when the lock has a real owner
- * - when a different pending owner exists and has higher priority than current
- *
* Must be called with lock->wait_lock held.
+ *
+ * @lock: the lock to be acquired.
+ * @task: the task which wants to acquire the lock
+ * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock)
+static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
{
/*
* We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
mark_rt_mutex_waiters(lock);
- if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
+ if (rt_mutex_owner(lock))
return 0;
+ /*
+ * It will get the lock because of one of these conditions:
+ * 1) there is no waiter
+ * 2) higher priority than waiters
+ * 3) it is top waiter
+ */
+ if (rt_mutex_has_waiters(lock)) {
+ if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
+ if (!waiter || waiter != rt_mutex_top_waiter(lock))
+ return 0;
+ }
+ }
+
+ if (waiter || rt_mutex_has_waiters(lock)) {
+ unsigned long flags;
+ struct rt_mutex_waiter *top;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /* remove the queued waiter. */
+ if (waiter) {
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ task->pi_blocked_on = NULL;
+ }
+
+ /*
+ * We have to enqueue the top waiter(if it exists) into
+ * task->pi_waiters list.
+ */
+ if (rt_mutex_has_waiters(lock)) {
+ top = rt_mutex_top_waiter(lock);
+ top->pi_list_entry.prio = top->list_entry.prio;
+ plist_add(&top->pi_list_entry, &task->pi_waiters);
+ }
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ }
+
/* We got the lock. */
debug_rt_mutex_lock(lock);
- rt_mutex_set_owner(lock, current, 0);
+ rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, current);
+ rt_mutex_deadlock_account_lock(lock, task);
return 1;
}
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ if (!owner)
+ return 0;
+
if (waiter == rt_mutex_top_waiter(lock)) {
raw_spin_lock_irqsave(&owner->pi_lock, flags);
plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
- * Remove the top waiter from the current tasks waiter list and from
- * the lock waiter list. Set it as pending owner. Then wake it up.
+ * Remove the top waiter from the current tasks waiter list and wake it up.
*
* Called with lock->wait_lock held.
*/
static void wakeup_next_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- struct task_struct *pendowner;
unsigned long flags;
raw_spin_lock_irqsave(&current->pi_lock, flags);
waiter = rt_mutex_top_waiter(lock);
- plist_del(&waiter->list_entry, &lock->wait_list);
/*
* Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* lock->wait_lock.
*/
plist_del(&waiter->pi_list_entry, &current->pi_waiters);
- pendowner = waiter->task;
- waiter->task = NULL;
- rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+ rt_mutex_set_owner(lock, NULL);
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- /*
- * Clear the pi_blocked_on variable and enqueue a possible
- * waiter into the pi_waiters list of the pending owner. This
- * prevents that in case the pending owner gets unboosted a
- * waiter with higher priority than pending-owner->normal_prio
- * is blocked on the unboosted (pending) owner.
- */
- raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
-
- WARN_ON(!pendowner->pi_blocked_on);
- WARN_ON(pendowner->pi_blocked_on != waiter);
- WARN_ON(pendowner->pi_blocked_on->lock != lock);
-
- pendowner->pi_blocked_on = NULL;
-
- if (rt_mutex_has_waiters(lock)) {
- struct rt_mutex_waiter *next;
-
- next = rt_mutex_top_waiter(lock);
- plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
- }
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-
- wake_up_process(pendowner);
+ wake_up_process(waiter->task);
}
/*
- * Remove a waiter from a lock
+ * Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held
+ * Must be called with lock->wait_lock held and
+ * have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_lock_irqsave(&current->pi_lock, flags);
plist_del(&waiter->list_entry, &lock->wait_list);
- waiter->task = NULL;
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- if (first && owner != current) {
+ if (!owner)
+ return;
+
+ if (first) {
raw_spin_lock_irqsave(&owner->pi_lock, flags);
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
- * @detect_deadlock: passed to task_blocks_on_rt_mutex
*
* lock->wait_lock must be held by the caller.
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock)
+ struct rt_mutex_waiter *waiter)
{
int ret = 0;
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock))
+ if (try_to_take_rt_mutex(lock, current, waiter))
break;
/*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- /*
- * waiter->task is NULL the first time we come here and
- * when we have been woken up by the previous owner
- * but the lock got stolen by a higher prio task.
- */
- if (!waiter->task) {
- ret = task_blocks_on_rt_mutex(lock, waiter, current,
- detect_deadlock);
- /*
- * If we got woken up by the owner then start loop
- * all over without going into schedule to try
- * to get the lock now:
- */
- if (unlikely(!waiter->task)) {
- /*
- * Reset the return value. We might
- * have returned with -EDEADLK and the
- * owner released the lock while we
- * were walking the pi chain.
- */
- ret = 0;
- continue;
- }
- if (unlikely(ret))
- break;
- }
-
raw_spin_unlock(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
- if (waiter->task)
- schedule_rt_mutex(lock);
+ schedule_rt_mutex(lock);
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
- waiter.task = NULL;
raw_spin_lock(&lock->wait_lock);
/* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock)) {
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
raw_spin_unlock(&lock->wait_lock);
return 0;
}
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
timeout->task = NULL;
}
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
- detect_deadlock);
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+
+ if (likely(!ret))
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
set_current_state(TASK_RUNNING);
- if (unlikely(waiter.task))
+ if (unlikely(ret))
remove_waiter(lock, &waiter);
/*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(timeout))
hrtimer_cancel(&timeout->timer);
- /*
- * Readjust priority, when we did not get the lock. We might
- * have been the pending owner and boosted. Since we did not
- * take the lock, the PI boost has to go.
- */
- if (unlikely(ret))
- rt_mutex_adjust_prio(current);
-
debug_rt_mutex_free_waiter(&waiter);
return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
if (likely(rt_mutex_owner(lock) != current)) {
- ret = try_to_take_rt_mutex(lock);
+ ret = try_to_take_rt_mutex(lock, current, NULL);
/*
* try_to_take_rt_mutex() sets the lock waiters
* bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
{
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
- rt_mutex_set_owner(lock, proxy_owner, 0);
+ rt_mutex_set_owner(lock, proxy_owner);
rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL, 0);
+ rt_mutex_set_owner(lock, NULL);
rt_mutex_deadlock_account_unlock(proxy_owner);
}
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
raw_spin_lock(&lock->wait_lock);
- mark_rt_mutex_waiters(lock);
-
- if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
- /* We got the lock for task. */
- debug_rt_mutex_lock(lock);
- rt_mutex_set_owner(lock, task, 0);
+ if (try_to_take_rt_mutex(lock, task, NULL)) {
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_deadlock_account_lock(lock, task);
return 1;
}
ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
- if (ret && !waiter->task) {
+ if (ret && !rt_mutex_owner(lock)) {
/*
* Reset the return value. We might have
* returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
*/
ret = 0;
}
+
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+
raw_spin_unlock(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
- detect_deadlock);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
set_current_state(TASK_RUNNING);
- if (unlikely(waiter->task))
+ if (unlikely(ret))
remove_waiter(lock, waiter);
/*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- /*
- * Readjust priority, when we did not get the lock. We might have been
- * the pending owner and boosted. Since we did not take the lock, the
- * PI boost has to go.
- */
- if (unlikely(ret))
- rt_mutex_adjust_prio(current);
-
return ret;
}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866a..53a66c85261 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
/*
* lock->owner state tracking:
*/
-#define RT_MUTEX_OWNER_PENDING 1UL
-#define RT_MUTEX_HAS_WAITERS 2UL
-#define RT_MUTEX_OWNER_MASKALL 3UL
+#define RT_MUTEX_HAS_WAITERS 1UL
+#define RT_MUTEX_OWNER_MASKALL 1UL
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
}
-static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
-{
- return (struct task_struct *)
- ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
-}
-
-static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
-{
- return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
-}
-
/*
* PI-futex support (proxy locking functions, etc.):
*/
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac..312f8b95c2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
@@ -75,9 +74,11 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -253,6 +254,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+
+ atomic_t load_weight;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +271,18 @@ struct task_group {
struct task_group *parent;
struct list_head siblings;
struct list_head children;
-};
-#define root_task_group init_task_group
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+};
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
- return list_empty(&root_task_group.children);
-}
-#endif
-
-# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
/*
* A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +295,13 @@ static int root_task_group_empty(void)
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
/* Default task group.
* Every task in system belong to this group at bootup.
*/
-struct task_group init_task_group;
+struct task_group root_task_group;
#endif /* CONFIG_CGROUP_SCHED */
@@ -327,7 +323,7 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last;
+ struct sched_entity *curr, *next, *last, *skip;
unsigned int nr_spread_over;
@@ -342,6 +338,7 @@ struct cfs_rq {
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
+ int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
@@ -360,14 +357,17 @@ struct cfs_rq {
unsigned long h_load;
/*
- * this cpu's part of tg->shares
+ * Maintaining per-cpu shares distribution for group scheduling
+ *
+ * load_stamp is the last time we updated the load average
+ * load_last is the last time we updated the load average and saw load
+ * load_unacc_exec_time is currently unaccounted execution time
*/
- unsigned long shares;
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp, load_last, load_unacc_exec_time;
- /*
- * load.weight at the time we set shares
- */
- unsigned long rq_weight;
+ unsigned long load_contribution;
#endif
#endif
};
@@ -552,9 +552,6 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
-
- /* BKL stats */
- unsigned int bkl_count;
#endif
};
@@ -605,11 +602,14 @@ static inline int cpu_of(struct rq *rq)
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+
+ return autogroup_task_group(p, tg);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
#endif /* CONFIG_CGROUP_SCHED */
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update) {
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;
- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
+ if (rq->skip_clock_update)
+ return;
- sched_irq_time_avg_update(rq, irq_time);
- }
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}
/*
@@ -664,10 +660,9 @@ inline void update_rq_clock(struct rq *rq)
#endif
/**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
* @cpu: the processor in question.
*
- * Returns true if the current cpu runqueue is locked.
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
@@ -741,7 +736,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
cmp = strstrip(buf);
- if (strncmp(buf, "NO_", 3) == 0) {
+ if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
cmp += 3;
}
@@ -797,20 +792,6 @@ late_initcall(sched_init_debug);
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
* period over which we average the RT time consumption, measured
* in ms.
*
@@ -1359,6 +1340,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1547,101 +1534,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares,
- unsigned long sd_rq_weight,
- unsigned long *usd_rq_weight)
-{
- unsigned long shares, rq_weight;
- int boost = 0;
-
- rq_weight = usd_rq_weight[cpu];
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum_j shares_j * rq_weight_i
- * shares_i = -----------------------------
- * \Sum_j rq_weight_j
- */
- shares = (sd_shares * rq_weight) / sd_rq_weight;
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
- if (abs(shares - tg->se[cpu]->load.weight) >
- sysctl_sched_shares_thresh) {
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- __set_se_shares(tg->se[cpu], shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- unsigned long *usd_rq_weight;
- struct sched_domain *sd = data;
- unsigned long flags;
- int i;
-
- if (!tg->se[0])
- return 0;
-
- local_irq_save(flags);
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
- for_each_cpu(i, sched_domain_span(sd)) {
- weight = tg->cfs_rq[i]->load.weight;
- usd_rq_weight[i] = weight;
-
- rq_weight += weight;
- /*
- * If there are currently no tasks on the cpu pretend there
- * is one of average load so that when a new task gets to
- * run here it will not get delayed by group starvation.
- */
- if (!weight)
- weight = NICE_0_LOAD;
-
- sum_weight += weight;
- shares += tg->cfs_rq[i]->shares;
- }
-
- if (!rq_weight)
- rq_weight = sum_weight;
-
- if ((!shares && rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- shares = tg->shares;
-
- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
- local_irq_restore(flags);
-
- return 0;
-}
-
/*
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
@@ -1656,7 +1548,7 @@ static int tg_load_down(struct task_group *tg, void *data)
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->cfs_rq[cpu]->shares;
+ load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}
@@ -1665,34 +1557,11 @@ static int tg_load_down(struct task_group *tg, void *data)
return 0;
}
-static void update_shares(struct sched_domain *sd)
-{
- s64 elapsed;
- u64 now;
-
- if (root_task_group_empty())
- return;
-
- now = local_clock();
- elapsed = now - sd->last_update;
-
- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, sd);
- }
-}
-
static void update_h_load(long cpu)
{
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
#endif
#ifdef CONFIG_PREEMPT
@@ -1812,15 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
-#endif
+#else /* CONFIG_SMP */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
{
-#ifdef CONFIG_SMP
- cfs_rq->shares = shares;
-#endif
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
}
+
#endif
static void calc_load_account_idle(struct rq *this_rq);
@@ -1924,10 +1817,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
* race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1945,19 +1837,58 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
{
- if (!sched_clock_irqtime)
- return 0;
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;
if (!sched_clock_irqtime)
return;
@@ -1965,9 +1896,10 @@ void account_system_vtime(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1975,37 +1907,92 @@ void account_system_vtime(struct task_struct *curr)
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
- else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ s64 irq_delta;
+
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}
-#else
+static int irqtime_account_hi_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
-static u64 irq_time_cpu(int cpu)
+static int irqtime_account_si_update(void)
{
- return 0;
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
}
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#endif
+#define sched_clock_irqtime (0)
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+ rq->clock_task += delta;
+}
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
@@ -2098,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
- int oldprio, int running)
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
- prev_class->switched_from(rq, p, running);
- p->sched_class->switched_to(rq, p, running);
- } else
- p->sched_class->prio_changed(rq, p, oldprio, running);
+ prev_class->switched_from(rq, p);
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio)
+ p->sched_class->prio_changed(rq, p, oldprio);
}
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2129,7 +2116,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (test_tsk_need_resched(rq->curr))
+ if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2198,10 +2185,8 @@ static int migration_cpu_stop(void *data);
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
{
- struct rq *rq = task_rq(p);
-
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
@@ -2299,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(on_rq)) {
- schedule_timeout_uninterruptible(1);
+ ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
continue;
}
@@ -2321,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* Cause a process which is running on another CPU to enter
* kernel-mode, without any delay. (to get signals handled.)
*
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
* because all it wants to ensure is that the remote task enters
* the kernel. If the IPI races and the task has been migrated
* to another CPU then no harm is done and the purpose has been
@@ -2340,27 +2328,6 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
- void (*func) (void *info), void *info)
-{
- int cpu;
-
- preempt_disable();
- cpu = task_cpu(p);
- if (task_curr(p))
- smp_call_function_single(cpu, func, info, 1);
- preempt_enable();
-}
-
#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2381,18 +2348,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (unlikely(dest_cpu >= nr_cpu_ids)) {
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
- }
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
}
return dest_cpu;
@@ -2583,7 +2547,7 @@ out:
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task. this_rq() stays locked over invocation.
*/
@@ -2644,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2728,7 +2693,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
put_cpu();
}
@@ -2852,9 +2819,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ sched_info_switch(prev, next);
+ perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+ trace_sched_switch(prev, next);
}
/**
@@ -2987,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(prev, next);
+
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -3119,6 +3089,15 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
#ifdef CONFIG_NO_HZ
/*
* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3127,128 @@ static long calc_load_fold_idle(void)
return delta;
}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+ long delta, active, n;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ /*
+ * If we were idle for multiple load cycles, apply them.
+ */
+ if (ticks >= LOAD_FREQ) {
+ n = ticks / LOAD_FREQ;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Its possible the remainder of the above division also crosses
+ * a LOAD_FREQ period, the regular check in calc_global_load()
+ * which comes after this will take care of that.
+ *
+ * Consider us being 11 ticks before a cycle completion, and us
+ * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ * age us 4 cycles, and the test in calc_global_load() will
+ * pick up the final one.
+ */
+}
#else
static void calc_load_account_idle(struct rq *this_rq)
{
@@ -3157,6 +3258,10 @@ static inline long calc_load_fold_idle(void)
{
return 0;
}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
#endif
/**
@@ -3174,24 +3279,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
{
- unsigned long upd = calc_load_update + 10;
long active;
- if (time_before(jiffies, upd))
+ calc_global_nohz(ticks);
+
+ if (time_before(jiffies, calc_load_update + 10))
return;
active = atomic_long_read(&calc_load_tasks);
@@ -3364,7 +3462,7 @@ void sched_exec(void)
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags);
@@ -3516,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
}
/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+ cputime64_t tmp = cputime_to_cputime64(cputime);
+
+ /* Add system time to process. */
+ p->stime = cputime_add(p->stime, cputime);
+ p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3526,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ cputime64_t *target_cputime64;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
return;
}
- /* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ target_cputime64 = &cpustat->irq;
else if (in_serving_softirq())
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ target_cputime64 = &cpustat->softirq;
else
- cpustat->system = cputime64_add(cpustat->system, tmp);
-
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ target_cputime64 = &cpustat->system;
- /* Account for system time used */
- acct_update_integrals(p);
+ __account_system_time(p, cputime, cputime_scaled, target_cputime64);
}
/*
* Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
*/
void account_steal_time(cputime_t cputime)
{
@@ -3583,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ if (irqtime_account_hi_update()) {
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ } else if (irqtime_account_si_update()) {
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->softirq);
+ } else if (user_tick) {
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime_one_jiffy);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else {
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->system);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ int i;
+ struct rq *rq = this_rq();
+
+ for (i = 0; i < ticks; i++)
+ irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -3593,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3618,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
account_idle_time(jiffies_to_cputime(ticks));
}
@@ -3835,7 +4027,7 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), bkl_count);
+ schedstat_inc(this_rq(), rq_sched_info.bkl_count);
schedstat_inc(prev, sched_info.bkl_count);
}
#endif
@@ -3845,7 +4037,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
if (prev->se.on_rq)
update_rq_clock(rq);
- rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
@@ -3894,16 +4085,12 @@ need_resched:
rcu_note_context_switch(cpu);
prev = rq->curr;
- release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
schedule_debug(prev);
if (sched_feat(HRTICK))
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- clear_tsk_need_resched(prev);
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3924,6 +4111,16 @@ need_resched_nonpreemptible:
try_to_wake_up_local(to_wakeup);
}
deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+ /*
+ * If we are going to sleep and we have plugged IO queued, make
+ * sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(prev)) {
+ raw_spin_unlock(&rq->lock);
+ blk_schedule_flush_plug(prev);
+ raw_spin_lock(&rq->lock);
+ }
}
switch_count = &prev->nvcsw;
}
@@ -3935,11 +4132,10 @@ need_resched_nonpreemptible:
put_prev_task(rq, prev);
next = pick_next_task(rq);
+ clear_tsk_need_resched(prev);
+ rq->skip_clock_update = 0;
if (likely(prev != next)) {
- sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next);
-
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3958,9 +4154,6 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(prev)))
- goto need_resched_nonpreemptible;
-
preempt_enable_no_resched();
if (need_resched())
goto need_resched;
@@ -4029,7 +4222,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
if (task_thread_info(rq->curr) != owner || need_resched())
return 0;
- cpu_relax();
+ arch_mutex_cpu_relax();
}
return 1;
@@ -4161,6 +4354,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
__wake_up_common(q, mode, 1, 0, key);
}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4341,7 +4535,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
@@ -4374,7 +4568,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
@@ -4518,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
+ check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, &flags);
}
@@ -4709,14 +4902,17 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
+ if (cred->user->user_ns == pcred->user->user_ns)
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ else
+ match = false;
rcu_read_unlock();
return match;
}
static int __sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param, bool user)
+ const struct sched_param *param, bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
@@ -4770,12 +4966,15 @@ recheck:
param->sched_priority > rlim_rtprio)
return -EPERM;
}
+
/*
- * Like positive nice levels, dont allow tasks to
- * move out of SCHED_IDLE either:
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
- return -EPERM;
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (!can_nice(p, TASK_NICE(p)))
+ return -EPERM;
+ }
/* can't change other user's priorities */
if (!check_same_owner(p))
@@ -4798,7 +4997,7 @@ recheck:
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
/*
- * To be able to change p->policy safely, the apropriate
+ * To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
rq = __task_rq_lock(p);
@@ -4812,6 +5011,17 @@ recheck:
return -EINVAL;
}
+ /*
+ * If not changing anything there's no need to proceed further:
+ */
+ if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+ param->sched_priority == p->rt_priority))) {
+
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return 0;
+ }
+
#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
/*
@@ -4819,7 +5029,8 @@ recheck:
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0) {
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return -EPERM;
@@ -4849,11 +5060,10 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
activate_task(rq, p, 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
+ check_class_changed(rq, p, prev_class, oldprio);
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4871,7 +5081,7 @@ recheck:
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, true);
}
@@ -4889,7 +5099,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* but our caller might not have that capability.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, false);
}
@@ -5035,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
@@ -5270,6 +5480,67 @@ void __sched yield(void)
}
EXPORT_SYMBOL(yield);
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ bool yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ double_rq_lock(rq, p_rq);
+ while (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out;
+
+ if (curr->sched_class != p->sched_class)
+ goto out;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded) {
+ schedstat_inc(rq, yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity takes care of
+ * fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_task(p_rq->curr);
+ }
+
+out:
+ double_rq_unlock(rq, p_rq);
+ local_irq_restore(flags);
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
@@ -5280,6 +5551,7 @@ void __sched io_schedule(void)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
@@ -5295,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;
@@ -5405,7 +5678,7 @@ void sched_show_task(struct task_struct *p)
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
- printk(KERN_INFO "%-13.13s %c", p->comm,
+ printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
@@ -5443,7 +5716,7 @@ void show_state_filter(unsigned long state_filter)
do_each_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
- * console might take alot of time:
+ * console might take a lot of time:
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
@@ -5518,7 +5791,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
@@ -5569,7 +5842,6 @@ static void update_sysctl(void)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
- SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}
@@ -5645,7 +5917,7 @@ again:
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, dest_cpu)) {
+ if (migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
@@ -5727,29 +5999,20 @@ static int migration_cpu_stop(void *data)
}
#ifdef CONFIG_HOTPLUG_CPU
+
/*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
*/
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
{
- struct rq *rq = cpu_rq(dead_cpu);
- int needs_cpu, uninitialized_var(dest_cpu);
- unsigned long flags;
+ struct mm_struct *mm = current->active_mm;
- local_irq_save(flags);
+ BUG_ON(cpu_online(smp_processor_id()));
- raw_spin_lock(&rq->lock);
- needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
- if (needs_cpu)
- dest_cpu = select_fallback_rq(dead_cpu, p);
- raw_spin_unlock(&rq->lock);
- /*
- * It can only fail if we race with set_cpus_allowed(),
- * in the racer should migrate the task anyway.
- */
- if (needs_cpu)
- __migrate_task(p, dead_cpu, dest_cpu);
- local_irq_restore(flags);
+ if (mm != &init_mm)
+ switch_mm(mm, &init_mm, current);
+ mmdrop(mm);
}
/*
@@ -5762,128 +6025,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
- unsigned long flags;
- local_irq_save(flags);
- double_rq_lock(rq_src, rq_dest);
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
rq_src->nr_uninterruptible = 0;
- double_rq_unlock(rq_src, rq_dest);
- local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
-
- do_each_thread(t, p) {
- if (p == current)
- continue;
-
- if (task_cpu(p) == src_cpu)
- move_task_off_dead_cpu(src_cpu, p);
- } while_each_thread(t, p);
-
- read_unlock(&tasklist_lock);
}
/*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
*/
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
{
- int this_cpu = smp_processor_id();
- struct rq *rq = cpu_rq(this_cpu);
- struct task_struct *p = rq->idle;
- unsigned long flags;
-
- /* cpu has to be offline */
- BUG_ON(cpu_online(this_cpu));
-
- /*
- * Strictly not necessary since rest of the CPUs are stopped by now
- * and interrupts disabled on the current cpu.
- */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
- activate_task(rq, p, 0);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
}
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
*/
-void idle_task_exit(void)
-{
- struct mm_struct *mm = current->active_mm;
-
- BUG_ON(cpu_online(smp_processor_id()));
-
- if (mm != &init_mm)
- switch_mm(mm, &init_mm, current);
- mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
-
- /* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(!p->exit_state);
-
- /* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(p->state == TASK_DEAD);
-
- get_task_struct(p);
+ struct task_struct *next, *stop = rq->stop;
+ int dest_cpu;
/*
- * Drop lock around migration; if someone else moves it,
- * that's OK. No task can be added to this CPU, so iteration is
- * fine.
+ * Fudge the rq selection such that the below task selection loop
+ * doesn't get stuck on the currently eligible stop task.
+ *
+ * We're currently inside stop_machine() and the rq is either stuck
+ * in the stop_machine_cpu_stop() loop, or we're executing this code,
+ * either way we should never end up calling schedule() until we're
+ * done here.
*/
- raw_spin_unlock_irq(&rq->lock);
- move_task_off_dead_cpu(dead_cpu, p);
- raw_spin_lock_irq(&rq->lock);
-
- put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
- struct rq *rq = cpu_rq(dead_cpu);
- struct task_struct *next;
+ rq->stop = NULL;
for ( ; ; ) {
- if (!rq->nr_running)
+ /*
+ * There's this thread running, bail when that's the only
+ * remaining thread.
+ */
+ if (rq->nr_running == 1)
break;
+
next = pick_next_task(rq);
- if (!next)
- break;
+ BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
- migrate_dead(dead_cpu, next);
+ /* Find suitable destination for @next, with force if needed. */
+ dest_cpu = select_fallback_rq(dead_cpu, next);
+ raw_spin_unlock(&rq->lock);
+
+ __migrate_task(next, dead_cpu, dest_cpu);
+
+ raw_spin_lock(&rq->lock);
}
-}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
+ rq->stop = stop;
}
+
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6093,15 +6297,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned long flags;
struct rq *rq = cpu_rq(cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
@@ -6113,33 +6315,25 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_live_tasks(cpu);
- /* Idle task back to normal (off runqueue, low prio) */
- raw_spin_lock_irq(&rq->lock);
- deactivate_task(rq, rq->idle, 0);
- __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
- rq->idle->sched_class = &idle_sched_class;
- migrate_dead_tasks(cpu);
- raw_spin_unlock_irq(&rq->lock);
- migrate_nr_uninterruptible(rq);
- BUG_ON(rq->nr_running != 0);
- calc_global_load_remove(rq);
- break;
-
case CPU_DYING:
- case CPU_DYING_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
+ migrate_tasks(cpu);
+ BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ migrate_nr_uninterruptible(rq);
+ calc_global_load_remove(rq);
break;
#endif
}
+
+ update_max_interval();
+
return NOTIFY_OK;
}
@@ -7825,6 +8019,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
+ /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+ cfs_rq->load_stamp = 1;
+#endif
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
@@ -7867,18 +8065,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- struct sched_entity *se, int cpu, int add,
+ struct sched_entity *se, int cpu,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
- if (add)
- list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
tg->se[cpu] = se;
- /* se could be NULL for init_task_group */
+ /* se could be NULL for root_task_group */
if (!se)
return;
@@ -7888,15 +8084,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- se->load.weight = tg->shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, 0);
se->parent = parent;
}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu, int add,
+ struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
@@ -7905,8 +8100,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
init_rt_rq(rt_rq, rq);
rt_rq->tg = tg;
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (add)
- list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
tg->rt_se[cpu] = rt_se;
if (!rt_se)
@@ -7941,18 +8134,18 @@ void __init sched_init(void)
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
- init_task_group.se = (struct sched_entity **)ptr;
+ root_task_group.se = (struct sched_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
- init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
- init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
- init_task_group.rt_rq = (struct rt_rq **)ptr;
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7972,20 +8165,16 @@ void __init sched_init(void)
global_rt_period(), global_rt_runtime());
#ifdef CONFIG_RT_GROUP_SCHED
- init_rt_bandwidth(&init_task_group.rt_bandwidth,
+ init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
- list_add(&init_task_group.list, &task_groups);
- INIT_LIST_HEAD(&init_task_group.children);
-
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- __alignof__(unsigned long));
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -7997,38 +8186,34 @@ void __init sched_init(void)
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- init_task_group.shares = init_task_group_load;
+ root_task_group.shares = root_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
/*
- * How much cpu bandwidth does init_task_group get?
+ * How much cpu bandwidth does root_task_group get?
*
* In case of task-groups formed thr' the cgroup filesystem, it
* gets 100% of the cpu resources in the system. This overall
* system cpu resource is divided among the tasks of
- * init_task_group and its child task-groups in a fair manner,
+ * root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
- * In other words, if init_task_group has 10 tasks of weight
+ * In other words, if root_task_group has 10 tasks of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
- * We achieve this by letting init_task_group's tasks sit
- * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+ * We achieve this by letting root_task_group's tasks sit
+ * directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
- init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+ init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8108,8 +8293,6 @@ void __init sched_init(void)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
- perf_event_init();
-
scheduler_running = 1;
}
@@ -8118,7 +8301,7 @@ static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
- return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+ return (nested == preempt_offset);
}
void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8153,6 +8336,8 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
+ const struct sched_class *prev_class = p->sched_class;
+ int old_prio = p->prio;
int on_rq;
on_rq = p->se.on_rq;
@@ -8163,6 +8348,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
activate_task(rq, p, 0);
resched_task(rq->curr);
}
+
+ check_class_changed(rq, p, prev_class, old_prio);
}
void normalize_rt_tasks(void)
@@ -8278,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8291,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -8303,7 +8487,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
- init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
return 1;
@@ -8314,15 +8498,21 @@ err:
return 0;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
- &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
- list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#else /* !CONFG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
@@ -8335,10 +8525,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
@@ -8393,7 +8579,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
return 1;
@@ -8403,17 +8589,6 @@ err_free_rq:
err:
return 0;
}
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
- &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
- list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
#else /* !CONFIG_RT_GROUP_SCHED */
static inline void free_rt_sched_group(struct task_group *tg)
{
@@ -8424,14 +8599,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
@@ -8439,6 +8606,7 @@ static void free_sched_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
+ autogroup_free(tg);
kfree(tg);
}
@@ -8447,7 +8615,6 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
unsigned long flags;
- int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -8460,10 +8627,6 @@ struct task_group *sched_create_group(struct task_group *parent)
goto err;
spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i) {
- register_fair_sched_group(tg, i);
- register_rt_sched_group(tg, i);
- }
list_add_rcu(&tg->list, &task_groups);
WARN_ON(!parent); /* root should already exist */
@@ -8493,11 +8656,11 @@ void sched_destroy_group(struct task_group *tg)
unsigned long flags;
int i;
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i) {
+ /* end participation in shares distribution */
+ for_each_possible_cpu(i)
unregister_fair_sched_group(tg, i);
- unregister_rt_sched_group(tg, i);
- }
+
+ spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8544,33 +8707,6 @@ void sched_move_task(struct task_struct *tsk)
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- int on_rq;
-
- on_rq = se->on_rq;
- if (on_rq)
- dequeue_entity(cfs_rq, se, 0);
-
- se->load.weight = shares;
- se->load.inv_weight = 0;
-
- if (on_rq)
- enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8593,37 +8729,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (tg->shares == shares)
goto done;
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i)
- unregister_fair_sched_group(tg, i);
- list_del_rcu(&tg->siblings);
- spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for any ongoing reference to this group to finish */
- synchronize_sched();
-
- /*
- * Now we are free to modify the group's share on each cpu
- * w/o tripping rebalance_share or load_balance_fair.
- */
tg->shares = shares;
for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
- set_se_shares(tg->se[i], shares);
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
- /*
- * Enable load balance activity on this group, by inserting it back on
- * each cpu's rq->leaf_cfs_rq_list.
- */
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i)
- register_fair_sched_group(tg, i);
- list_add_rcu(&tg->siblings, &tg->parent->children);
- spin_unlock_irqrestore(&task_group_lock, flags);
done:
mutex_unlock(&shares_mutex);
return 0;
@@ -8922,7 +9040,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
- return &init_task_group.css;
+ return &root_task_group.css;
}
parent = cgroup_tg(cgrp->parent);
@@ -8993,6 +9111,21 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
}
}
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
@@ -9065,6 +9198,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.destroy = cpu_cgroup_destroy,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
@@ -9349,72 +9483,3 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 00000000000..429242f3c48
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,275 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void __init autogroup_init(struct task_struct *init_task)
+{
+ autogroup_default.tg = &root_task_group;
+ kref_init(&autogroup_default.kref);
+ init_rwsem(&autogroup_default.lock);
+ init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+ kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+ struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* We've redirected RT tasks to the root task group... */
+ ag->tg->rt_se = NULL;
+ ag->tg->rt_rq = NULL;
+#endif
+ sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+ kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+ kref_get(&ag->kref);
+ return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+ struct autogroup *ag;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return autogroup_kref_get(&autogroup_default);
+
+ ag = autogroup_kref_get(p->signal->autogroup);
+ unlock_task_sighand(p, &flags);
+
+ return ag;
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg);
+#endif
+
+static inline struct autogroup *autogroup_create(void)
+{
+ struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+ struct task_group *tg;
+
+ if (!ag)
+ goto out_fail;
+
+ tg = sched_create_group(&root_task_group);
+
+ if (IS_ERR(tg))
+ goto out_free;
+
+ kref_init(&ag->kref);
+ init_rwsem(&ag->lock);
+ ag->id = atomic_inc_return(&autogroup_seq_nr);
+ ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Autogroup RT tasks are redirected to the root task group
+ * so we don't have to move tasks around upon policy change,
+ * or flail around trying to allocate bandwidth on the fly.
+ * A bandwidth exception in __sched_setscheduler() allows
+ * the policy change to proceed. Thereafter, task_group()
+ * returns &root_task_group, so zero bandwidth is required.
+ */
+ free_rt_sched_group(tg);
+ tg->rt_se = root_task_group.rt_se;
+ tg->rt_rq = root_task_group.rt_rq;
+#endif
+ tg->autogroup = ag;
+
+ return ag;
+
+out_free:
+ kfree(ag);
+out_fail:
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "autogroup_create: %s failure.\n",
+ ag ? "sched_create_group()" : "kmalloc()");
+ }
+
+ return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+ if (tg != &root_task_group)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ /*
+ * We can only assume the task group can't go away on us if
+ * autogroup_move_group() can see us on ->thread_group list.
+ */
+ if (p->flags & PF_EXITING)
+ return false;
+
+ return true;
+}
+
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return !!tg->autogroup;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (enabled && task_wants_autogroup(p, tg))
+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+ struct autogroup *prev;
+ struct task_struct *t;
+ unsigned long flags;
+
+ BUG_ON(!lock_task_sighand(p, &flags));
+
+ prev = p->signal->autogroup;
+ if (prev == ag) {
+ unlock_task_sighand(p, &flags);
+ return;
+ }
+
+ p->signal->autogroup = autogroup_kref_get(ag);
+
+ if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ goto out;
+
+ t = p;
+ do {
+ sched_move_task(t);
+ } while_each_thread(p, t);
+
+out:
+ unlock_task_sighand(p, &flags);
+ autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+ struct autogroup *ag = autogroup_create();
+
+ autogroup_move_group(p, ag);
+ /* drop extra reference added by autogroup_create() */
+ autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock. Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+ autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+ sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+ autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+ static unsigned long next = INITIAL_JIFFIES;
+ struct autogroup *ag;
+ int err;
+
+ if (*nice < -20 || *nice > 19)
+ return -EINVAL;
+
+ err = security_task_setnice(current, *nice);
+ if (err)
+ return err;
+
+ if (*nice < 0 && !can_nice(current, *nice))
+ return -EPERM;
+
+ /* this is a heavy operation taking global locks.. */
+ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+ return -EAGAIN;
+
+ next = HZ / 10 + jiffies;
+ ag = autogroup_task_get(p);
+
+ down_write(&ag->lock);
+ err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+ if (!err)
+ ag->nice = *nice;
+ up_write(&ag->lock);
+
+ autogroup_kref_put(ag);
+
+ return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+ struct autogroup *ag = autogroup_task_get(p);
+
+ if (!task_group_is_autogroup(ag->tg))
+ goto out;
+
+ down_read(&ag->lock);
+ seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+ up_read(&ag->lock);
+
+out:
+ autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ if (!task_group_is_autogroup(tg))
+ return 0;
+
+ return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 00000000000..05577055cfc
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,41 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+ /*
+ * reference doesn't mean how many thread attach to this
+ * autogroup now. It just stands for the number of task
+ * could use this autogroup.
+ */
+ struct kref kref;
+ struct task_group *tg;
+ struct rw_semaphore lock;
+ unsigned long id;
+ int nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return 0;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb..9d8af0b3fb6 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
}
EXPORT_SYMBOL_GPL(sched_clock);
-static __read_mostly int sched_clock_running;
+__read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9..7bacd83a415 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+static DEFINE_SPINLOCK(sched_debug_lock);
+
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
- struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
if (!se)
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
}
#endif
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+ if (autogroup_path(tg, group_path, PATH_MAX))
+ return group_path;
+
+ /*
+ * May be NULL if the underlying cgroup isn't fully-created yet
+ */
+ if (!tg->css.cgroup) {
+ group_path[0] = '\0';
+ return group_path;
+ }
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
+}
+#endif
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
-
#ifdef CONFIG_CGROUP_SCHED
- {
- char path[64];
-
- rcu_read_lock();
- cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
- rcu_read_unlock();
- SEQ_printf(m, " %s", path);
- }
+ SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif
+
SEQ_printf(m, "\n");
}
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_unlock_irqrestore(&tasklist_lock, flags);
}
-#if defined(CONFIG_CGROUP_SCHED) && \
- (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
- /* may be NULL if the underlying cgroup isn't fully-created yet */
- if (!tg->css.cgroup) {
- buf[0] = '\0';
- return;
- }
- cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct sched_entity *last;
unsigned long flags;
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
- char path[128];
- struct task_group *tg = cfs_rq->tg;
-
- task_group_path(tg, path, sizeof(path));
-
- SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#endif
@@ -183,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
raw_spin_lock_irqsave(&rq->lock, flags);
if (cfs_rq->rb_leftmost)
- MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
spread0 = min_vruntime - rq0_min_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
SPLIT_NS(spread0));
- SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
- SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
+ SPLIT_NS(cfs_rq->load_avg));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
+ SPLIT_NS(cfs_rq->load_period));
+ SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
+ cfs_rq->load_contribution);
+ SEQ_printf(m, " .%-30s: %d\n", "load_tg",
+ atomic_read(&cfs_rq->tg->load_weight));
#endif
+
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
- char path[128];
- struct task_group *tg = rt_rq->tg;
-
- task_group_path(tg, path, sizeof(path));
-
- SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
+#ifdef CONFIG_RT_GROUP_SCHED
+ SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
#else
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
#endif
-
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
#define PN(x) \
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
+extern __read_mostly int sched_clock_running;
+
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
#ifdef CONFIG_X86
{
@@ -296,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
P(ttwu_count);
P(ttwu_local);
- P(bkl_count);
+ SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
+ rq->rq_sched_info.bkl_count);
#undef P
+#undef P64
#endif
+ spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ rcu_read_lock();
print_rq(m, rq, cpu);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&sched_debug_lock, flags);
}
static const char *sched_tunable_scaling_names[] = {
@@ -314,21 +320,42 @@ static const char *sched_tunable_scaling_names[] = {
static int sched_debug_show(struct seq_file *m, void *v)
{
- u64 now = ktime_to_ns(ktime_get());
+ u64 ktime, sched_clk, cpu_clk;
+ unsigned long flags;
int cpu;
- SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+ local_irq_save(flags);
+ ktime = ktime_to_ns(ktime_get());
+ sched_clk = sched_clock();
+ cpu_clk = local_clock();
+ local_irq_restore(flags);
+
+ SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+#define P(x) \
+ SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(ktime);
+ PN(sched_clk);
+ PN(cpu_clk);
+ P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "sysctl_sched\n");
#define P(x) \
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- P(jiffies);
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd768667..6fa833ab2cb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
-/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
@@ -89,6 +82,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
static const struct sched_class fair_sched_class;
/**************************************************************
@@ -143,6 +143,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
return cfs_rq->tg->cfs_rq[this_cpu];
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_rq->on_list) {
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases.
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ } else {
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ }
+
+ cfs_rq->on_list = 1;
+ }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->on_list) {
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ cfs_rq->on_list = 0;
+ }
+}
+
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +276,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
return &cpu_rq(this_cpu)->cfs;
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
@@ -374,7 +412,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
@@ -384,6 +422,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
return rb_entry(left, struct sched_entity, run_node);
}
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+ struct rb_node *next = rb_next(&se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -398,7 +447,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SCHED_DEBUG
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -417,7 +465,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
- WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL
return 0;
@@ -495,6 +542,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -514,6 +564,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +687,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
- se->on_rq = 1;
}
static void
@@ -647,9 +700,164 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
- se->on_rq = 0;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+ int global_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long load_avg;
+
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution;
+
+ if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;
+ }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+ u64 period = sysctl_sched_shares_window;
+ u64 now, delta;
+ unsigned long load = cfs_rq->load.weight;
+
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = rq_of(cfs_rq)->clock_task;
+ delta = now - cfs_rq->load_stamp;
+
+ /* truncate load history at 4 idle periods */
+ if (cfs_rq->load_stamp > cfs_rq->load_last &&
+ now - cfs_rq->load_last > 4 * period) {
+ cfs_rq->load_period = 0;
+ cfs_rq->load_avg = 0;
+ delta = period - 1;
+ }
+
+ cfs_rq->load_stamp = now;
+ cfs_rq->load_unacc_exec_time = 0;
+ cfs_rq->load_period += delta;
+ if (load) {
+ cfs_rq->load_last = now;
+ cfs_rq->load_avg += delta * load;
+ }
+
+ /* consider updating load contribution on each fold or truncate */
+ if (global_update || cfs_rq->load_period > period
+ || !cfs_rq->load_period)
+ update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+ while (cfs_rq->load_period > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (cfs_rq->load_period));
+ cfs_rq->load_period /= 2;
+ cfs_rq->load_avg /= 2;
+ }
+
+ if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+ list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ long load_weight, load, shares;
+
+ load = cfs_rq->load.weight;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight += load;
+ load_weight -= cfs_rq->load_contribution;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
+ }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ }
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ long shares;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se)
+ return;
+#ifndef CONFIG_SMP
+ if (likely(se->load.weight == tg->shares))
+ return;
+#endif
+ shares = calc_cfs_shares(cfs_rq, tg);
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -771,7 +979,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -782,21 +992,55 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
+
+ if (cfs_rq->nr_running == 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
{
- if (!se || cfs_rq->last == se)
- cfs_rq->last = NULL;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+ else
+ break;
+ }
+}
- if (!se || cfs_rq->next == se)
- cfs_rq->next = NULL;
+static void __clear_buddies_next(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ else
+ break;
+ }
+}
+
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->skip == se)
+ cfs_rq->skip = NULL;
+ else
+ break;
+ }
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- for_each_sched_entity(se)
- __clear_buddies(cfs_rq_of(se), se);
+ if (cfs_rq->last == se)
+ __clear_buddies_last(se);
+
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+
+ if (cfs_rq->skip == se)
+ __clear_buddies_skip(se);
}
static void
@@ -825,8 +1069,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -869,9 +1116,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (cfs_rq->nr_running > 1) {
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
+ if (delta < 0)
+ return;
+
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
@@ -910,13 +1160,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (cfs_rq->skip == se) {
+ struct sched_entity *second = __pick_next_entity(se);
+ if (second && wakeup_preempt_entity(second, left) < 1)
+ se = second;
+ }
/*
* Prefer last buddy, try to return the CPU to a preempted task.
@@ -924,6 +1188,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
+
clear_buddies(cfs_rq, se);
return se;
@@ -955,6 +1225,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
update_curr(cfs_rq);
+ /*
+ * Update share accounting for long-running entities.
+ */
+ update_entity_shares_tick(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1330,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
flags = ENQUEUE_WAKEUP;
}
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
+ }
+
hrtick_update(rq);
}
@@ -1071,59 +1353,21 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}
- hrtick_update(rq);
-}
-
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
- struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- struct sched_entity *rightmost, *se = &curr->se;
-
- /*
- * Are we the only task in the tree?
- */
- if (unlikely(cfs_rq->nr_running == 1))
- return;
-
- clear_buddies(cfs_rq, se);
-
- if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
- return;
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
- /*
- * Find the rightmost entry in the rbtree:
- */
- rightmost = __pick_last_entity(cfs_rq);
- /*
- * Already in the rightmost position?
- */
- if (unlikely(!rightmost || entity_before(rightmost, se)))
- return;
- /*
- * Minimally necessary key value to be last in the tree:
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- se->vruntime = rightmost->vruntime + 1;
+ hrtick_update(rq);
}
#ifdef CONFIG_SMP
@@ -1143,67 +1387,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
*/
-static long effective_load(struct task_group *tg, int cpu,
- long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent)
return wl;
- /*
- * By not taking the decrease of shares on the other cpu into
- * account our error leans towards reducing the affine wakeups.
- */
- if (!wl && sched_feat(ASYM_EFF_LOAD))
- return wl;
-
for_each_sched_entity(se) {
- long S, rw, s, a, b;
- long more_w;
-
- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
+ long lw, w;
- S = se->my_q->tg->shares;
- s = se->my_q->shares;
- rw = se->my_q->rq_weight;
+ tg = se->my_q->tg;
+ w = se->my_q->load.weight;
- a = S*(rw + wl);
- b = S*rw + s*wg;
+ /* use this cpu's instantaneous contribution */
+ lw = atomic_read(&tg->load_weight);
+ lw -= se->my_q->load_contribution;
+ lw += w + wg;
- wl = s*(a-b);
+ wl += w;
- if (likely(b))
- wl /= b;
+ if (lw > 0 && wl < lw)
+ wl = (wl * tg->shares) / lw;
+ else
+ wl = tg->shares;
- /*
- * Assume the group is already running and will
- * thus already be accounted for in the weight.
- *
- * That is, moving shares between CPUs, does not
- * alter the group weight.
- */
+ /* zero point is MIN_SHARES */
+ if (wl < MIN_SHARES)
+ wl = MIN_SHARES;
+ wl -= se->load.weight;
wg = 0;
}
@@ -1222,7 +1435,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- unsigned long this_load, load;
+ s64 this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
struct task_group *tg;
@@ -1261,8 +1474,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load) {
- unsigned long this_eff_load, prev_eff_load;
+ if (this_load > 0) {
+ s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
@@ -1508,23 +1721,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
sd = tmp;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (sched_feat(LB_SHARES_UPDATE)) {
- /*
- * Pick the largest domain to update shares over
- */
- tmp = sd;
- if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
- tmp = affine_sd;
-
- if (tmp) {
- raw_spin_unlock(&rq->lock);
- update_shares(tmp);
- raw_spin_lock(&rq->lock);
- }
- }
-#endif
-
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
@@ -1644,6 +1840,14 @@ static void set_next_buddy(struct sched_entity *se)
}
}
+static void set_skip_buddy(struct sched_entity *se)
+{
+ if (likely(task_of(se)->policy != SCHED_IDLE)) {
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->skip = se;
+ }
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -1667,16 +1871,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (test_tsk_need_resched(curr))
return;
+ /* Idle tasks are by definition preempted by non-idle tasks. */
+ if (unlikely(curr->policy == SCHED_IDLE) &&
+ likely(p->policy != SCHED_IDLE))
+ goto preempt;
+
/*
- * Batch and idle tasks do not preempt (their preemption is driven by
- * the tick):
+ * Batch and idle tasks do not preempt non-idle tasks (their preemption
+ * is driven by the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL))
return;
- /* Idle tasks are by definition preempted by everybody. */
- if (unlikely(curr->policy == SCHED_IDLE))
- goto preempt;
if (!sched_feat(WAKEUP_PREEMPT))
return;
@@ -1742,6 +1948,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
}
}
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ }
+
+ set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+ struct sched_entity *se = &p->se;
+
+ if (!se->on_rq)
+ return false;
+
+ /* Tell the scheduler that we'd really like pse to run next. */
+ set_next_buddy(se);
+
+ yield_task_fair(rq);
+
+ return true;
+}
+
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods:
@@ -1853,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
enum cpu_idle_type idle, int *all_pinned,
int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
- int loops = 0, pulled = 0, pinned = 0;
+ int loops = 0, pulled = 0;
long rem_load_move = max_load_move;
struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
- pinned = 1;
-
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
if (loops++ > sysctl_sched_nr_migrate)
break;
if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+ !can_migrate_task(p, busiest, this_cpu, sd, idle,
+ all_pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
@@ -1902,13 +2152,52 @@ out:
*/
schedstat_add(sd, lb_gained[idle], pulled);
- if (all_pinned)
- *all_pinned = pinned;
-
return max_load_move - rem_load_move;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (!tg->se[cpu])
+ return 0;
+
+ rq = cpu_rq(cpu);
+ cfs_rq = tg->cfs_rq[cpu];
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ update_rq_clock(rq);
+ update_cfs_load(cfs_rq, 1);
+
+ /*
+ * We need to update shares after updating tg->load_weight in
+ * order to adjust the weight of groups with long running tasks.
+ */
+ update_cfs_shares(cfs_rq);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return 0;
+}
+
+static void update_shares(int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ struct rq *rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(rq, cfs_rq)
+ update_shares_cpu(cfs_rq->tg, cpu);
+ rcu_read_unlock();
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -1956,6 +2245,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
return max_load_move - rem_load_move;
}
#else
+static inline void update_shares(int cpu)
+{
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -2374,7 +2667,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
@@ -2382,7 +2674,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
*/
static inline void update_sg_lb_stats(struct sched_domain *sd,
struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ enum cpu_idle_type idle, int load_idx,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
@@ -2402,9 +2694,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
-
/* Bias balancing toward cpus of our domain */
if (local_group) {
if (idle_cpu(i) && !first_idle_cpu) {
@@ -2449,7 +2738,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/*
* Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
+ * than the average weight of a task.
*
* APZ: with cgroup the avg task weight can vary wildly and
* might not be a suitable number - should we keep a
@@ -2459,7 +2748,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+ if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2519,15 +2808,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, int *sd_idle,
- const struct cpumask *cpus, int *balance,
- struct sd_lb_stats *sds)
+ enum cpu_idle_type idle, const struct cpumask *cpus,
+ int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
struct sched_group *sg = sd->groups;
@@ -2545,7 +2832,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
@@ -2771,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
/*
* if *imbalance is less than the average load per runnable task
- * there is no gaurantee that any tasks will be moved so we'll have
+ * there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
@@ -2797,7 +3084,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* @imbalance: Variable which stores amount of weighted load which should
* be moved to restore balance/put a group to idle.
* @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
* @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
@@ -2810,7 +3096,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
+ const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
@@ -2820,22 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
- balance, &sds);
-
- /* Cases where imbalance does not exist from POV of this_cpu */
- /* 1) this_cpu is not the appropriate cpu to perform load balancing
- * at this level.
- * 2) There is no busy sibling group to pull from.
- * 3) This group is the busiest group.
- * 4) This group is more busy than the avg busieness at this
- * sched_domain.
- * 5) The imbalance is within the specified limit.
- *
- * Note: when doing newidle balance, if the local group has excess
- * capacity (i.e. nr_running < group_capacity) and the busiest group
- * does not have any capacity, we force a load balance to pull tasks
- * to the local group. In this case, we skip past checks 3, 4 and 5.
+ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+
+ /*
+ * this_cpu is not the appropriate cpu to perform load balancing at
+ * this level.
*/
if (!(*balance))
goto ret;
@@ -2844,41 +3119,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
check_asym_packing(sd, &sds, this_cpu, imbalance))
return sds.busiest;
+ /* There is no busy sibling group to pull tasks from */
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+ /*
+ * If the busiest group is imbalanced the below checks don't
+ * work because they assumes all things are equal, which typically
+ * isn't true due to cpus_allowed constraints and the like.
+ */
+ if (sds.group_imb)
+ goto force_balance;
+
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
+ /*
+ * If the local group is more busy than the selected busiest group
+ * don't try and pull any tasks.
+ */
if (sds.this_load >= sds.max_load)
goto out_balanced;
- sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
+ /*
+ * Don't pull any tasks if this group is already above the domain
+ * average load.
+ */
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- /*
- * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
- * And to check for busy balance use !idle_cpu instead of
- * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
- * even when they are idle.
- */
- if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- goto out_balanced;
- } else {
+ if (idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
- if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+ if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
sds.busiest_nr_running <= sds.busiest_group_weight)
goto out_balanced;
+ } else {
+ /*
+ * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+ * imbalance_pct to be conservative.
+ */
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
}
force_balance:
@@ -2957,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
int busiest_cpu, int this_cpu)
{
if (idle == CPU_NEWLY_IDLE) {
@@ -2989,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
* move_tasks() will succeed. ld_moved will be true and this
* active balance code will not be triggered.
*/
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return 0;
-
if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
return 0;
}
@@ -3010,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+ int ld_moved, all_pinned = 0, active_balance = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
@@ -3019,21 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask);
- /*
- * When power savings policy is enabled for the parent domain, idle
- * sibling can pick up load irrespective of busy siblings. In this case,
- * let the state of idle sibling percolate up as CPU_IDLE, instead of
- * portraying it as CPU_NOT_IDLE.
- */
- if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- sd_idle = 1;
-
schedstat_inc(sd, lb_count[idle]);
redo:
- update_shares(sd);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle,
cpus, balance);
if (*balance == 0)
@@ -3062,6 +3337,7 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ all_pinned = 1;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3095,8 +3371,7 @@ redo:
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
- if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
- this_cpu)) {
+ if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -3151,10 +3426,6 @@ redo:
sd->balance_interval *= 2;
}
- if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
-
goto out;
out_balanced:
@@ -3168,14 +3439,8 @@ out_one_pinned:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
- else
- ld_moved = 0;
+ ld_moved = 0;
out:
- if (ld_moved)
- update_shares(sd);
return ld_moved;
}
@@ -3199,6 +3464,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
*/
raw_spin_unlock(&this_rq->lock);
+ update_shares(this_cpu);
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
@@ -3552,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick)
static DEFINE_SPINLOCK(balancing);
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -3569,6 +3846,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
+ update_shares(cpu);
+
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -3579,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
- if (unlikely(!interval))
- interval = 1;
- if (interval > HZ*NR_CPUS/10)
- interval = HZ*NR_CPUS/10;
+ interval = clamp(interval, 1UL, max_load_balance_interval);
need_serialize = sd->flags & SD_SERIALIZE;
@@ -3595,8 +3871,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
- * longer idle, or one of our SMT siblings is
- * not idle.
+ * longer idle.
*/
idle = CPU_NOT_IDLE;
}
@@ -3843,33 +4118,62 @@ static void task_fork_fair(struct task_struct *p)
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (running) {
+ if (rq->curr == p) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p, 0);
}
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Ensure the task's vruntime is normalized, so that when its
+ * switched back to the fair class the enqueue_entity(.flags=0) will
+ * do the right thing.
+ *
+ * If it was on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it was !on_rq, then only when
+ * the task is sleeping will it still have non-normalized vruntime.
+ */
+ if (!se->on_rq && p->state != TASK_RUNNING) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+}
+
/*
* We switched to the sched_fair class.
*/
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (running)
+ if (rq->curr == p)
resched_task(rq->curr);
else
check_preempt_curr(rq, p, 0);
@@ -3935,6 +4239,7 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
+ .yield_to_task = yield_to_task_fair,
.check_preempt_curr = check_preempt_wakeup,
@@ -3955,6 +4260,7 @@ static const struct sched_class fair_sched_class = {
.task_fork = task_fork_fair,
.prio_changed = prio_changed_fair,
+ .switched_from = switched_from_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a..68e69acc29b 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87..a776a639642 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
- /* Can this actually happen?? */
- if (running)
- resched_task(rq->curr);
- else
- check_preempt_curr(rq, p, 0);
+ BUG();
}
-static void prio_changed_idle(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
- /* This can happen for hot plug CPUS */
-
- /*
- * Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
- */
- if (running) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else
- check_preempt_curr(rq, p, 0);
+ BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
-
- /* no .task_new for idle tasks */
};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9..e7cebdc65f8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+ list_add_rcu(&rt_rq->leaf_rt_rq_list,
+ &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+ list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
@@ -199,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- int this_cpu = smp_processor_id();
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
struct sched_rt_entity *rt_se;
- rt_se = rt_rq->tg->rt_se[this_cpu];
+ int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+
+ rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
@@ -215,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
- int this_cpu = smp_processor_id();
struct sched_rt_entity *rt_se;
+ int cpu = cpu_of(rq_of_rt_rq(rt_rq));
- rt_se = rt_rq->tg->rt_se[this_cpu];
+ rt_se = rt_rq->tg->rt_se[cpu];
if (rt_se && on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
@@ -276,6 +288,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(def_rt_bandwidth.rt_period);
}
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
@@ -546,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
- } else if (rt_rq->rt_nr_running)
+ } else if (rt_rq->rt_nr_running) {
idle = 0;
+ if (!rt_rq_throttled(rt_rq))
+ enqueue = 1;
+ }
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
@@ -606,7 +629,7 @@ static void update_curr_rt(struct rq *rq)
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
- if (!task_has_rt_policy(curr))
+ if (curr->sched_class != &rt_sched_class)
return;
delta_exec = rq->clock_task - curr->se.exec_start;
@@ -825,6 +848,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
+ if (!rt_rq->rt_nr_running)
+ list_add_leaf_rt_rq(rt_rq);
+
if (head)
list_add(&rt_se->run_list, queue);
else
@@ -844,6 +870,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
+ if (!rt_rq->rt_nr_running)
+ list_del_leaf_rt_rq(rt_rq);
}
/*
@@ -1350,7 +1378,7 @@ retry:
task = pick_next_pushable_task(rq);
if (task_cpu(next_task) == rq->cpu && task == next_task) {
/*
- * If we get here, the task hasnt moved at all, but
+ * If we get here, the task hasn't moved at all, but
* it has failed to push. We will not try again,
* since the other cpus will pull from us when they
* are ready.
@@ -1460,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq)
/*
* We continue with the search, just in
* case there's an even higher prio task
- * in another runqueue. (low likelyhood
+ * in another runqueue. (low likelihood
* but possible)
*/
}
@@ -1571,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
* When switch from the rt queue, we bring ourselves to a position
* that we might want to pull RT tasks from other runqueues.
*/
-static void switched_from_rt(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
/*
* If there are other RT tasks then we will reschedule
@@ -1581,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!rq->rt.rt_nr_running)
+ if (p->se.on_rq && !rq->rt.rt_nr_running)
pull_rt_task(rq);
}
@@ -1600,8 +1627,7 @@ static inline void init_sched_rt_class(void)
* with RT tasks. In this case we try to push them off to
* other runqueues.
*/
-static void switched_to_rt(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
int check_resched = 1;
@@ -1612,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (!running) {
+ if (p->se.on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->rt.overloaded && push_rt_task(rq) &&
/* Don't resched if we changed runqueues */
@@ -1628,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
* Priority of the task has changed. This may cause
* us to initiate a push or pull.
*/
-static void prio_changed_rt(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (running) {
+ if (!p->se.on_rq)
+ return;
+
+ if (rq->curr == p) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c..1ba2bd40fda 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
-static void prio_changed_stop(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG(); /* how!?, what priority? */
}
@@ -103,6 +102,4 @@ static const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
-
- /* no .task_new for stop tasks */
};
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdc..7165af5f1b1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig)
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
- * appopriate lock must be held to stop the target task from exiting
+ * appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tracehook_consider_fatal_signal(tsk, sig);
}
-
-/* Notify the system that a driver wants to block all signals for this
+/*
+ * Notify the system that a driver wants to block all signals for this
* process, and wants to be notified if any signals at all were to be
* sent/acted upon. If the notifier routine returns non-zero, then the
* signal will be acted upon after all. If the notifier routine returns 0,
* then then signal will be blocked. Only one block per process is
* allowed. priv is a pointer to private data that the notifier routine
- * can use to determine if the signal should be blocked or not. */
-
+ * can use to determine if the signal should be blocked or not.
+ */
void
block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
{
@@ -434,9 +434,10 @@ still_pending:
copy_siginfo(info, &first->info);
__sigqueue_free(first);
} else {
- /* Ok, it wasn't in the queue. This must be
- a fast-pathed signal or we must have been
- out of queue space. So zero out the info.
+ /*
+ * Ok, it wasn't in the queue. This must be
+ * a fast-pathed signal or we must have been
+ * out of queue space. So zero out the info.
*/
info->si_signo = sig;
info->si_errno = 0;
@@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
}
/*
- * Dequeue a signal and return the element to the caller, which is
+ * Dequeue a signal and return the element to the caller, which is
* expected to free it.
*
* All callers have to hold the siglock.
@@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* itimers are process shared and we restart periodic
* itimers in the signal delivery path to prevent DoS
* attacks in the high resolution timer case. This is
- * compliant with the old way of self restarting
+ * compliant with the old way of self-restarting
* itimers, as the SIGALRM is a legacy signal and only
* queued once. Changing the restart behaviour to
* restart the timer in the signal dequeue path is
@@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info)
}
/*
+ * called with RCU read lock from check_kill_permission()
+ */
+static int kill_ok_by_cred(struct task_struct *t)
+{
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = __task_cred(t);
+
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->euid == tcred->suid ||
+ cred->euid == tcred->uid ||
+ cred->uid == tcred->suid ||
+ cred->uid == tcred->uid))
+ return 1;
+
+ if (ns_capable(tcred->user->user_ns, CAP_KILL))
+ return 1;
+
+ return 0;
+}
+
+/*
* Bad permissions for sending the signal
* - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
{
- const struct cred *cred, *tcred;
struct pid *sid;
int error;
@@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
if (error)
return error;
- cred = current_cred();
- tcred = __task_cred(t);
if (!same_thread_group(current, t) &&
- (cred->euid ^ tcred->suid) &&
- (cred->euid ^ tcred->uid) &&
- (cred->uid ^ tcred->suid) &&
- (cred->uid ^ tcred->uid) &&
- !capable(CAP_KILL)) {
+ !kill_ok_by_cred(t)) {
switch (sig) {
case SIGCONT:
sid = task_session(t);
@@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
if (info == SEND_SIG_FORCED)
goto out_set;
- /* Real-time signals must be queued if sent by sigqueue, or
- some other real-time mechanism. It is implementation
- defined whether kill() does so. We attempt to do so, on
- the principle of least surprise, but since kill is not
- allowed to fail with EAGAIN when low on memory we just
- make sure at least one signal gets delivered and don't
- pass on the info struct. */
-
+ /*
+ * Real-time signals must be queued if sent by sigqueue, or
+ * some other real-time mechanism. It is implementation
+ * defined whether kill() does so. We attempt to do so, on
+ * the principle of least surprise, but since kill is not
+ * allowed to fail with EAGAIN when low on memory we just
+ * make sure at least one signal gets delivered and don't
+ * pass on the info struct.
+ */
if (sig < SIGRTMIN)
override_rlimit = (is_si_special(info) || info->si_code >= 0);
else
@@ -1187,8 +1203,7 @@ retry:
return error;
}
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
rcu_read_lock();
@@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
* These are for backward compatibility with the rest of the kernel source.
*/
-int
-send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
/*
* Make sure legacy kernel users don't send in bad values
@@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid);
* These functions support sending signals using preallocated sigqueue
* structures. This is needed "because realtime applications cannot
* afford to lose notifications of asynchronous events, like timer
- * expirations or I/O completions". In the case of Posix Timers
+ * expirations or I/O completions". In the case of POSIX Timers
* we allocate the sigqueue structure from the timer_create. If this
* allocation fails we are able to report the failure to the application
* with an EAGAIN error.
@@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
info.si_signo = SIGCHLD;
info.si_errno = 0;
/*
- * see comment in do_notify_parent() abot the following 3 lines
+ * see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void)
}
/*
- * Return nonzero if there is a SIGKILL that should be waking us up.
+ * Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
static int sigkill_pending(struct task_struct *tsk)
@@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code)
/*
* This performs the stopping for SIGSTOP and other stop signals.
* We have to stop all threads in the thread group.
- * Returns nonzero if we've actually stopped and released the siglock.
+ * Returns non-zero if we've actually stopped and released the siglock.
* Returns zero if we didn't stop and still hold the siglock.
*/
static int do_signal_stop(int signr)
@@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
current->exit_code = 0;
- /* Update the siginfo structure if the signal has
- changed. If the debugger wanted something
- specific in the siginfo structure then it should
- have updated *info via PTRACE_SETSIGINFO. */
+ /*
+ * Update the siginfo structure if the signal has
+ * changed. If the debugger wanted something
+ * specific in the siginfo structure then it should
+ * have updated *info via PTRACE_SETSIGINFO.
+ */
if (signr != info->si_signo) {
info->si_signo = signr;
info->si_errno = 0;
@@ -1871,7 +1887,7 @@ relock:
for (;;) {
struct k_sigaction *ka;
/*
- * Tracing can induce an artifical signal and choose sigaction.
+ * Tracing can induce an artificial signal and choose sigaction.
* The return value in @signr determines the default action,
* but @info->si_signo is the signal number we will report.
*/
@@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk)
if (!signal_pending(tsk))
goto out;
- /* It could be that __group_complete_signal() choose us to
+ /*
+ * It could be that __group_complete_signal() choose us to
* notify about group-wide signal. Another thread should be
* woken now to take the signal since we will not.
*/
@@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals);
* System call entry points.
*/
+/**
+ * sys_restart_syscall - restart a system call
+ */
SYSCALL_DEFINE0(restart_syscall)
{
struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
return error;
}
+/**
+ * sys_rt_sigprocmask - change the list of currently blocked signals
+ * @how: whether to add, remove, or set signals
+ * @set: stores pending signals
+ * @oset: previous value of signal mask if non-null
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
sigset_t __user *, oset, size_t, sigsetsize)
{
@@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
out:
return error;
-}
+}
+/**
+ * sys_rt_sigpending - examine a pending signal that has been raised
+ * while blocked
+ * @set: stores pending signals
+ * @sigsetsize: size of sigset_t type or larger
+ */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
{
return do_sigpending(set, sigsetsize);
@@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
#ifdef BUS_MCEERR_AO
- /*
+ /*
* Other callers might not initialize the si_lsb field,
- * so check explicitely for the right codes here.
+ * so check explicitly for the right codes here.
*/
if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
#endif
+/**
+ * sys_rt_sigtimedwait - synchronously wait for queued signals specified
+ * in @uthese
+ * @uthese: queued signals to wait for
+ * @uinfo: if non-null, the signal's siginfo is returned here
+ * @uts: upper bound on process time suspension
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo, const struct timespec __user *, uts,
size_t, sigsetsize)
@@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
-
+
/*
* Invert the set of allowed signals to get those we
* want to block.
@@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
+ (ts.tv_sec || ts.tv_nsec));
if (timeout) {
- /* None ready -- temporarily unblock those we're
+ /*
+ * None ready -- temporarily unblock those we're
* interested while we are sleeping in so that we'll
- * be awakened when they arrive. */
+ * be awakened when they arrive.
+ */
current->real_blocked = current->blocked;
sigandsets(&current->blocked, &current->blocked, &these);
recalc_sigpending();
@@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
return ret;
}
+/**
+ * sys_kill - send a signal to a process
+ * @pid: the PID of the process
+ * @sig: signal to be sent
+ */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
@@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
return do_tkill(tgid, pid, sig);
}
-/*
+/**
+ * sys_tkill - send signal to one specific task
+ * @pid: the PID of the task
+ * @sig: signal to be sent
+ *
* Send a signal to only one task, even if it's a CLONE_THREAD task.
*/
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
return do_tkill(0, pid, sig);
}
+/**
+ * sys_rt_sigqueueinfo - send signal information to a signal
+ * @pid: the PID of the thread
+ * @sig: signal to be sent
+ * @uinfo: signal info to be sent
+ */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
@@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return -EFAULT;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info.si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info.si_code >= 0 || info.si_code == SI_TKILL) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info.si_code < 0);
return -EPERM;
+ }
info.si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info->si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info->si_code >= 0 || info->si_code == SI_TKILL) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
+ }
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
@@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
error = -EINVAL;
/*
- *
- * Note - this code used to test ss_flags incorrectly
+ * Note - this code used to test ss_flags incorrectly:
* old code may have been written using ss_flags==0
* to mean ss_flags==SS_ONSTACK (as this was the only
* way that worked) - this fix preserves that older
- * mechanism
+ * mechanism.
*/
if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
goto out;
@@ -2570,6 +2635,10 @@ out:
#ifdef __ARCH_WANT_SYS_SIGPENDING
+/**
+ * sys_sigpending - examine pending signals
+ * @set: where mask of pending signal is returned
+ */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
{
return do_sigpending(set, sizeof(*set));
@@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-/* Some platforms have their own version with special arguments others
- support only sys_rt_sigprocmask. */
+/**
+ * sys_sigprocmask - examine and change blocked signals
+ * @how: whether to add, remove, or set signals
+ * @set: signals to add or remove (if non-null)
+ * @oset: previous value of signal mask if non-null
+ *
+ * Some platforms have their own version with special arguments;
+ * others support only sys_rt_sigprocmask.
+ */
SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
old_sigset_t __user *, oset)
@@ -2632,6 +2708,13 @@ out:
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+/**
+ * sys_rt_sigaction - alter an action taken by a process
+ * @sig: signal to be sent
+ * @act: new sigaction
+ * @oact: used to save the previous sigaction
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE4(rt_sigaction, int, sig,
const struct sigaction __user *, act,
struct sigaction __user *, oact,
@@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause)
#endif
#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
+/**
+ * sys_rt_sigsuspend - replace the signal mask for a value with the
+ * @unewset value until a signal is received
+ * @unewset: new signal mask value
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2..73a19519355 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
static struct {
struct list_head queue;
raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
*/
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
+ smp_call_func_t func;
- if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
+ /*
+ * Since we walk the list without any locks, we might
+ * see an entry that was completed, removed from the
+ * list and is in the process of being reused.
+ *
+ * We must check that the cpu is in the cpumask before
+ * checking the refs, and both must be set before
+ * executing the callback on this cpu.
+ */
+
+ if (!cpumask_test_cpu(cpu, data->cpumask))
+ continue;
+
+ smp_rmb();
+
+ if (atomic_read(&data->refs) == 0)
continue;
- data->csd.func(data->csd.info);
+ func = data->csd.func; /* save for later warn */
+ func(data->csd.info);
+
+ /*
+ * If the cpu mask is not still set then func enabled
+ * interrupts (BUG), and this cpu took another smp call
+ * function interrupt and executed func(info) twice
+ * on this cpu. That nested execution decremented refs.
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
+ WARN(1, "%pf enabled interrupts and double executed\n", func);
+ continue;
+ }
refs = atomic_dec_return(&data->refs);
WARN_ON(refs < 0);
- if (!refs) {
- raw_spin_lock(&call_function.lock);
- list_del_rcu(&data->csd.list);
- raw_spin_unlock(&call_function.lock);
- }
if (refs)
continue;
+ WARN_ON(!cpumask_empty(data->cpumask));
+
+ raw_spin_lock(&call_function.lock);
+ list_del_rcu(&data->csd.list);
+ raw_spin_unlock(&call_function.lock);
+
csd_unlock(&data->csd);
}
@@ -420,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
{
struct call_function_data *data;
unsigned long flags;
- int cpu, next_cpu, this_cpu = smp_processor_id();
+ int refs, cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -429,9 +459,9 @@ void smp_call_function_many(const struct cpumask *mask,
* can't happen.
*/
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress);
+ && !oops_in_progress && !early_boot_irqs_disabled);
- /* So, what's a CPU they want? Ignoring this one. */
+ /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -454,11 +484,48 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
+ /* This BUG_ON verifies our reuse assertions and can be removed */
+ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
+
+ /*
+ * The global call function queue list add and delete are protected
+ * by a lock, but the list is traversed without any lock, relying
+ * on the rcu list add and delete to allow safe concurrent traversal.
+ * We reuse the call function data without waiting for any grace
+ * period after some other cpu removes it from the global queue.
+ * This means a cpu might find our data block as it is being
+ * filled out.
+ *
+ * We hold off the interrupt handler on the other cpu by
+ * ordering our writes to the cpu mask vs our setting of the
+ * refs counter. We assert only the cpu owning the data block
+ * will set a bit in cpumask, and each bit will only be cleared
+ * by the subject cpu. Each cpu must first find its bit is
+ * set and then check that refs is set indicating the element is
+ * ready to be processed, otherwise it must skip the entry.
+ *
+ * On the previous iteration refs was set to 0 by another cpu.
+ * To avoid the use of transitivity, set the counter to 0 here
+ * so the wmb will pair with the rmb in the interrupt handler.
+ */
+ atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
+
data->csd.func = func;
data->csd.info = info;
+
+ /* Ensure 0 refs is visible before mask. Also orders func and info */
+ smp_wmb();
+
+ /* We rely on the "and" being processed before the store */
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
- atomic_set(&data->refs, cpumask_weight(data->cpumask));
+ refs = cpumask_weight(data->cpumask);
+
+ /* Some callers race with other cpus changing the passed mask */
+ if (unlikely(!refs)) {
+ csd_unlock(&data->csd);
+ return;
+ }
raw_spin_lock_irqsave(&call_function.lock, flags);
/*
@@ -467,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
+ /*
+ * We rely on the wmb() in list_add_rcu to complete our writes
+ * to the cpumask before this write to refs, which indicates
+ * data is on the list and is ready to be processed.
+ */
+ atomic_set(&data->refs, refs);
raw_spin_unlock_irqrestore(&call_function.lock, flags);
/*
@@ -529,3 +602,105 @@ void ipi_call_unlock_irq(void)
{
raw_spin_unlock_irq(&call_function.lock);
}
+#endif /* USE_GENERIC_SMP_HELPERS */
+
+/* Setup configured maximum number of CPUs to activate */
+unsigned int setup_max_cpus = NR_CPUS;
+EXPORT_SYMBOL(setup_max_cpus);
+
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+
+void __weak arch_disable_smp_support(void) { }
+
+static int __init nosmp(char *str)
+{
+ setup_max_cpus = 0;
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("nosmp", nosmp);
+
+/* this is hard limit */
+static int __init nrcpus(char *str)
+{
+ int nr_cpus;
+
+ get_option(&str, &nr_cpus);
+ if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ nr_cpu_ids = nr_cpus;
+
+ return 0;
+}
+
+early_param("nr_cpus", nrcpus);
+
+static int __init maxcpus(char *str)
+{
+ get_option(&str, &setup_max_cpus);
+ if (setup_max_cpus == 0)
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("maxcpus", maxcpus);
+
+/* Setup number of possible processor ids */
+int nr_cpu_ids __read_mostly = NR_CPUS;
+EXPORT_SYMBOL(nr_cpu_ids);
+
+/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
+void __init setup_nr_cpu_ids(void)
+{
+ nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+}
+
+/* Called by boot processor to activate the rest. */
+void __init smp_init(void)
+{
+ unsigned int cpu;
+
+ /* FIXME: This should be done in userspace --RR */
+ for_each_present_cpu(cpu) {
+ if (num_online_cpus() >= setup_max_cpus)
+ break;
+ if (!cpu_online(cpu))
+ cpu_up(cpu);
+ }
+
+ /* Any cleanup work */
+ printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+ smp_cpus_done(setup_max_cpus);
+}
+
+/*
+ * Call a function on all processors. May be used during early boot while
+ * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
+ * of local_irq_disable/enable().
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ preempt_disable();
+ ret = smp_call_function(func, info, wait);
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe..174f976c287 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
static void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
- struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+ struct task_struct *tsk = __this_cpu_read(ksoftirqd);
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
@@ -311,9 +311,21 @@ void irq_enter(void)
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
-# define invoke_softirq() __do_softirq()
+static inline void invoke_softirq(void)
+{
+ if (!force_irqthreads)
+ __do_softirq();
+ else
+ wakeup_softirqd();
+}
#else
-# define invoke_softirq() do_softirq()
+static inline void invoke_softirq(void)
+{
+ if (!force_irqthreads)
+ do_softirq();
+ else
+ wakeup_softirqd();
+}
#endif
/*
@@ -388,8 +400,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
local_irq_save(flags);
t->next = NULL;
- *__get_cpu_var(tasklet_vec).tail = t;
- __get_cpu_var(tasklet_vec).tail = &(t->next);
+ *__this_cpu_read(tasklet_vec.tail) = t;
+ __this_cpu_write(tasklet_vec.tail, &(t->next));
raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
@@ -402,8 +414,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
local_irq_save(flags);
t->next = NULL;
- *__get_cpu_var(tasklet_hi_vec).tail = t;
- __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
+ *__this_cpu_read(tasklet_hi_vec.tail) = t;
+ __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_restore(flags);
}
@@ -414,8 +426,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
BUG_ON(!irqs_disabled());
- t->next = __get_cpu_var(tasklet_hi_vec).head;
- __get_cpu_var(tasklet_hi_vec).head = t;
+ t->next = __this_cpu_read(tasklet_hi_vec.head);
+ __this_cpu_write(tasklet_hi_vec.head, t);
__raise_softirq_irqoff(HI_SOFTIRQ);
}
@@ -426,9 +438,9 @@ static void tasklet_action(struct softirq_action *a)
struct tasklet_struct *list;
local_irq_disable();
- list = __get_cpu_var(tasklet_vec).head;
- __get_cpu_var(tasklet_vec).head = NULL;
- __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
+ list = __this_cpu_read(tasklet_vec.head);
+ __this_cpu_write(tasklet_vec.head, NULL);
+ __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
local_irq_enable();
while (list) {
@@ -449,8 +461,8 @@ static void tasklet_action(struct softirq_action *a)
local_irq_disable();
t->next = NULL;
- *__get_cpu_var(tasklet_vec).tail = t;
- __get_cpu_var(tasklet_vec).tail = &(t->next);
+ *__this_cpu_read(tasklet_vec.tail) = t;
+ __this_cpu_write(tasklet_vec.tail, &(t->next));
__raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_enable();
}
@@ -461,9 +473,9 @@ static void tasklet_hi_action(struct softirq_action *a)
struct tasklet_struct *list;
local_irq_disable();
- list = __get_cpu_var(tasklet_hi_vec).head;
- __get_cpu_var(tasklet_hi_vec).head = NULL;
- __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
+ list = __this_cpu_read(tasklet_hi_vec.head);
+ __this_cpu_write(tasklet_hi_vec.head, NULL);
+ __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
local_irq_enable();
while (list) {
@@ -484,8 +496,8 @@ static void tasklet_hi_action(struct softirq_action *a)
local_irq_disable();
t->next = NULL;
- *__get_cpu_var(tasklet_hi_vec).tail = t;
- __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
+ *__this_cpu_read(tasklet_hi_vec.tail) = t;
+ __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
__raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
}
@@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
/**
* tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
* @ttimer: tasklet_hrtimer which is initialized
- * @function: hrtimer callback funtion which gets called from softirq context
+ * @function: hrtimer callback function which gets called from softirq context
* @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
* @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
*/
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
{
set_current_state(TASK_INTERRUPTIBLE);
- current->flags |= PF_KSOFTIRQD;
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
don't process */
if (cpu_is_offline((long)__bind_cpu))
goto wait_to_die;
- do_softirq();
+ local_irq_disable();
+ if (local_softirq_pending())
+ __do_softirq();
+ local_irq_enable();
preempt_enable_no_resched();
cond_resched();
preempt_disable();
@@ -802,16 +816,16 @@ static void takeover_tasklets(unsigned int cpu)
/* Find end, append list for that CPU. */
if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
- *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
- __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
+ *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
+ this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
per_cpu(tasklet_vec, cpu).head = NULL;
per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
}
raise_softirq_irqoff(TASKLET_SOFTIRQ);
if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
- *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
- __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
+ *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
+ __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
per_cpu(tasklet_hi_vec, cpu).head = NULL;
per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
}
@@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+ p = kthread_create_on_node(run_ksoftirqd,
+ hcpu,
+ cpu_to_node(hotcpu),
+ "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
return notifier_from_errno(PTR_ERR(p));
@@ -853,7 +870,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
cpumask_any(cpu_online_mask));
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ static const struct sched_param param = {
+ .sched_priority = MAX_RT_PRIO-1
+ };
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -883,25 +902,6 @@ static __init int spawn_ksoftirqd(void)
}
early_initcall(spawn_ksoftirqd);
-#ifdef CONFIG_SMP
-/*
- * Call a function on all processors
- */
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
-{
- int ret = 0;
-
- preempt_disable();
- ret = smp_call_function(func, info, wait);
- local_irq_disable();
- func(info);
- local_irq_enable();
- preempt_enable();
- return ret;
-}
-EXPORT_SYMBOL(on_each_cpu);
-#endif
-
/*
* [ These __weak aliases are kept in a separate compilation unit, so that
* GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e0750053..73ce23feaea 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
+#include <linux/delay.h>
#include <linux/srcu.h>
static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -155,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections. If there are still some readers after 10 microseconds,
+ * we repeatedly block for 1-millisecond time periods. This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_SRCU_READER_DELAY 10
+
+/*
* Helper function for synchronize_srcu() and synchronize_srcu_expedited().
*/
static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -203,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
* all srcu_read_lock() calls using the old counters have completed.
* Their corresponding critical sections might well be still
* executing, but the srcu_read_lock() primitives themselves
- * will have finished executing.
+ * will have finished executing. We initially give readers
+ * an arbitrarily chosen 10 microseconds to get out of their
+ * SRCU read-side critical sections, then loop waiting 1/HZ
+ * seconds per iteration. The 10-microsecond value has done
+ * very well in testing.
*/
+ if (srcu_readers_active_idx(sp, idx))
+ udelay(SYNCHRONIZE_SRCU_READER_DELAY);
while (srcu_readers_active_idx(sp, idx))
schedule_timeout_interruptible(1);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03be..e3516b29076 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
BUG_ON(stopper->thread || stopper->enabled ||
!list_empty(&stopper->works));
- p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
- cpu);
+ p = kthread_create_on_node(cpu_stopper_thread,
+ stopper,
+ cpu_to_node(cpu),
+ "migration/%d", cpu);
if (IS_ERR(p))
return notifier_from_errno(PTR_ERR(p));
get_task_struct(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a..af468edf096 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,12 +37,15 @@
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
#include <linux/gfp.h>
+#include <linux/syscore_ops.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/kmsg_dump.h>
+
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>
@@ -117,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
void (*pm_power_off_prepare)(void);
/*
+ * Returns true if current's euid is same as p's uid or euid,
+ * or has CAP_SYS_NICE to p's user_ns.
+ *
+ * Called with rcu_read_lock, creds are safe
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred = __task_cred(p);
+
+ if (pcred->user->user_ns == cred->user->user_ns &&
+ (pcred->uid == cred->euid ||
+ pcred->euid == cred->euid))
+ return true;
+ if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+ return true;
+ return false;
+}
+
+/*
* set the priority of a task
* - the caller must hold the RCU read lock
*/
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
- const struct cred *cred = current_cred(), *pcred = __task_cred(p);
int no_nice;
- if (pcred->uid != cred->euid &&
- pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+ if (!set_one_prio_perm(p)) {
error = -EPERM;
goto out;
}
@@ -285,6 +305,7 @@ out_unlock:
*/
void emergency_restart(void)
{
+ kmsg_dump(KMSG_DUMP_EMERG);
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
@@ -295,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
system_state = SYSTEM_RESTART;
device_shutdown();
sysdev_shutdown();
+ syscore_shutdown();
}
/**
@@ -312,6 +334,7 @@ void kernel_restart(char *cmd)
printk(KERN_EMERG "Restarting system.\n");
else
printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+ kmsg_dump(KMSG_DUMP_RESTART);
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
@@ -332,7 +355,9 @@ void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
sysdev_shutdown();
+ syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
+ kmsg_dump(KMSG_DUMP_HALT);
machine_halt();
}
@@ -350,7 +375,9 @@ void kernel_power_off(void)
pm_power_off_prepare();
disable_nonboot_cpus();
sysdev_shutdown();
+ syscore_shutdown();
printk(KERN_EMERG "Power down.\n");
+ kmsg_dump(KMSG_DUMP_POWEROFF);
machine_power_off();
}
EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -496,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
if (rgid != (gid_t) -1) {
if (old->gid == rgid ||
old->egid == rgid ||
- capable(CAP_SETGID))
+ nsown_capable(CAP_SETGID))
new->gid = rgid;
else
goto error;
@@ -505,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
if (old->gid == egid ||
old->egid == egid ||
old->sgid == egid ||
- capable(CAP_SETGID))
+ nsown_capable(CAP_SETGID))
new->egid = egid;
else
goto error;
@@ -540,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
old = current_cred();
retval = -EPERM;
- if (capable(CAP_SETGID))
+ if (nsown_capable(CAP_SETGID))
new->gid = new->egid = new->sgid = new->fsgid = gid;
else if (gid == old->gid || gid == old->sgid)
new->egid = new->fsgid = gid;
@@ -607,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
new->uid = ruid;
if (old->uid != ruid &&
old->euid != ruid &&
- !capable(CAP_SETUID))
+ !nsown_capable(CAP_SETUID))
goto error;
}
@@ -616,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
if (old->uid != euid &&
old->euid != euid &&
old->suid != euid &&
- !capable(CAP_SETUID))
+ !nsown_capable(CAP_SETUID))
goto error;
}
@@ -664,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
old = current_cred();
retval = -EPERM;
- if (capable(CAP_SETUID)) {
+ if (nsown_capable(CAP_SETUID)) {
new->suid = new->uid = uid;
if (uid != old->uid) {
retval = set_user(new);
@@ -706,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
old = current_cred();
retval = -EPERM;
- if (!capable(CAP_SETUID)) {
+ if (!nsown_capable(CAP_SETUID)) {
if (ruid != (uid_t) -1 && ruid != old->uid &&
ruid != old->euid && ruid != old->suid)
goto error;
@@ -770,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
old = current_cred();
retval = -EPERM;
- if (!capable(CAP_SETGID)) {
+ if (!nsown_capable(CAP_SETGID)) {
if (rgid != (gid_t) -1 && rgid != old->gid &&
rgid != old->egid && rgid != old->sgid)
goto error;
@@ -830,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
if (uid == old->uid || uid == old->euid ||
uid == old->suid || uid == old->fsuid ||
- capable(CAP_SETUID)) {
+ nsown_capable(CAP_SETUID)) {
if (uid != old_fsuid) {
new->fsuid = uid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -863,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
if (gid == old->gid || gid == old->egid ||
gid == old->sgid || gid == old->fsgid ||
- capable(CAP_SETGID)) {
+ nsown_capable(CAP_SETGID)) {
if (gid != old_fsgid) {
new->fsgid = gid;
goto change_okay;
@@ -1080,8 +1107,10 @@ SYSCALL_DEFINE0(setsid)
err = session;
out:
write_unlock_irq(&tasklist_lock);
- if (err > 0)
+ if (err > 0) {
proc_sid_connector(group_leader);
+ sched_autogroup_create_attach(group_leader);
+ }
return err;
}
@@ -1169,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
+
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
down_write(&uts_sem);
@@ -1218,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
@@ -1333,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader);
if (new_rlim) {
+ /* Keep the capable check against init_user_ns until
+ cgroups can contain all limits */
if (new_rlim->rlim_max > rlim->rlim_max &&
!capable(CAP_SYS_RESOURCE))
retval = -EPERM;
@@ -1376,18 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
{
const struct cred *cred = current_cred(), *tcred;
- tcred = __task_cred(task);
- if ((cred->uid != tcred->euid ||
- cred->uid != tcred->suid ||
- cred->uid != tcred->uid ||
- cred->gid != tcred->egid ||
- cred->gid != tcred->sgid ||
- cred->gid != tcred->gid) &&
- !capable(CAP_SYS_RESOURCE)) {
- return -EPERM;
- }
+ if (current == task)
+ return 0;
- return 0;
+ tcred = __task_cred(task);
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ return 0;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+ return 0;
+
+ return -EPERM;
}
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c..25cc41cd8f3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
/* fanotify! */
cond_syscall(sys_fanotify_init);
cond_syscall(sys_fanotify_mark);
+
+/* open by handle */
+cond_syscall(sys_name_to_handle_at);
+cond_syscall(sys_open_by_handle_at);
+cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa151855..c0bb32414b1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/signal.h>
+#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
@@ -116,6 +117,7 @@ static int neg_one = -1;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
@@ -168,8 +170,14 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
-static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */
+/* Note: sysrq code uses it's own private copy */
+static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -192,9 +200,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
static struct ctl_table root_table[];
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
- .count = 1,
+ {{.count = 1,
.ctl_table = root_table,
- .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
+ .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
.root = &sysctl_table_root,
.set = &sysctl_table_root.default_set,
};
@@ -245,10 +253,6 @@ static struct ctl_table root_table[] = {
.mode = 0555,
.child = dev_table,
},
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -259,8 +263,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
#endif
#ifdef CONFIG_COMPACTION
@@ -305,15 +307,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_wakeup_granularity_ns,
},
{
- .procname = "sched_shares_ratelimit",
- .data = &sysctl_sched_shares_ratelimit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_shares_ratelimit,
- .extra2 = &max_sched_shares_ratelimit,
- },
- {
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +316,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_tunable_scaling,
},
{
- .procname = "sched_shares_thresh",
- .data = &sysctl_sched_shares_thresh,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
@@ -352,6 +337,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "sched_shares_window",
+ .data = &sysctl_sched_shares_window,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "timer_migration",
.data = &sysctl_timer_migration,
.maxlen = sizeof(unsigned int),
@@ -375,13 +367,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
+#ifdef CONFIG_SCHED_AUTOGROUP
{
- .procname = "sched_compat_yield",
- .data = &sysctl_sched_compat_yield,
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
},
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -711,6 +707,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dmesg_restrict,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
#endif
{
.procname = "ngroups_max",
@@ -745,21 +750,21 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
{
- .procname = "unknown_nmi_panic",
- .data = &unknown_nmi_panic,
+ .procname = "nmi_watchdog",
+ .data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dowatchdog_enabled,
},
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
- .procname = "nmi_watchdog",
- .data = &nmi_watchdog_enabled,
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_nmi_enabled,
+ .proc_handler = proc_dointvec,
},
#endif
#if defined(CONFIG_X86)
@@ -942,7 +947,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_perf_event_sample_rate,
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = perf_proc_update_handler,
},
#endif
#ifdef CONFIG_KMEMCHECK
@@ -963,10 +968,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -976,14 +977,18 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "panic_on_oom",
.data = &sysctl_panic_on_oom,
.maxlen = sizeof(sysctl_panic_on_oom),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "oom_kill_allocating_task",
@@ -1011,7 +1016,8 @@ static struct ctl_table vm_table[] = {
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "dirty_background_ratio",
@@ -1059,7 +1065,8 @@ static struct ctl_table vm_table[] = {
.data = &dirty_expire_interval,
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "nr_pdflush_threads",
@@ -1135,6 +1142,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &three,
},
#ifdef CONFIG_COMPACTION
{
@@ -1327,11 +1336,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
-
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -1487,10 +1491,6 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -1574,11 +1574,16 @@ void sysctl_head_get(struct ctl_table_header *head)
spin_unlock(&sysctl_lock);
}
+static void free_head(struct rcu_head *rcu)
+{
+ kfree(container_of(rcu, struct ctl_table_header, rcu));
+}
+
void sysctl_head_put(struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
if (!--head->count)
- kfree(head);
+ call_rcu(&head->rcu, free_head);
spin_unlock(&sysctl_lock);
}
@@ -1692,13 +1697,8 @@ static int test_perm(int mode, int op)
int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
{
- int error;
int mode;
- error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
- if (error)
- return error;
-
if (root->permissions)
mode = root->permissions(root, current->nsproxy, table);
else
@@ -1955,10 +1955,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
start_unregistering(header);
if (!--header->parent->count) {
WARN_ON(1);
- kfree(header->parent);
+ call_rcu(&header->parent->rcu, free_head);
}
if (!--header->count)
- kfree(header);
+ call_rcu(&header->rcu, free_head);
spin_unlock(&sysctl_lock);
}
@@ -2399,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
struct do_proc_dointvec_minmax_conv_param {
int *min;
int *max;
@@ -2900,7 +2911,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
}
-#else /* CONFIG_PROC_FS */
+#else /* CONFIG_PROC_SYSCTL */
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2952,7 +2963,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
}
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_PROC_SYSCTL */
/*
* No sense putting this after each symbol definition, twice,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c578606..3b8e028b960 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
- { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
{}
};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
buf[result] = '\0';
- /* Convert the decnet addresss to binary */
+ /* Convert the decnet address to binary */
result = -EIO;
nodep = strchr(buf, '.') + 1;
if (!nodep)
@@ -1322,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
const struct bin_table *table = NULL;
- struct nameidata nd;
struct vfsmount *mnt;
struct file *file;
ssize_t result;
char *pathname;
int flags;
- int acc_mode;
pathname = sysctl_getname(name, nlen, &table);
result = PTR_ERR(pathname);
@@ -1338,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
/* How should the sysctl be accessed? */
if (oldval && oldlen && newval && newlen) {
flags = O_RDWR;
- acc_mode = MAY_READ | MAY_WRITE;
} else if (newval && newlen) {
flags = O_WRONLY;
- acc_mode = MAY_WRITE;
} else if (oldval && oldlen) {
flags = O_RDONLY;
- acc_mode = MAY_READ;
} else {
result = 0;
goto out_putname;
}
mnt = current->nsproxy->pid_ns->proc_mnt;
- result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
- if (result)
- goto out_putname;
-
- result = may_open(&nd.path, acc_mode, flags);
- if (result)
- goto out_putpath;
-
- file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+ file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
result = PTR_ERR(file);
if (IS_ERR(file))
goto out_putname;
@@ -1371,10 +1357,6 @@ out_putname:
putname(pathname);
out:
return result;
-
-out_putpath:
- path_put(&nd.path);
- goto out_putname;
}
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c..4e4932a7b36 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
const char *fail = NULL;
if (table->parent) {
- if (table->procname && !table->parent->procname)
+ if (!table->parent->procname)
set_fail(&fail, table, "Parent without procname");
}
- if (!table->procname)
- set_fail(&fail, table, "No procname");
if (table->child) {
if (table->data)
set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
set_fail(&fail, table, "No maxlen");
}
#ifdef CONFIG_PROC_SYSCTL
- if (table->procname && !table->proc_handler)
+ if (!table->proc_handler)
set_fail(&fail, table, "No proc_handler");
#endif
-#if 0
- if (!table->procname && table->proc_handler)
- set_fail(&fail, table, "proc_handler without procname");
-#endif
sysctl_check_leaf(namespaces, table, &fail);
}
if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb1570..9ffea360a77 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
return -ENOMEM;
if (!info) {
- int seq = get_cpu_var(taskstats_seqnum)++;
- put_cpu_var(taskstats_seqnum);
+ int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
} else
@@ -349,25 +348,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
return ret;
}
+#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define TASKSTATS_NEEDS_PADDING 1
+#endif
+
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
struct nlattr *na, *ret;
int aggr;
- /* If we don't pad, we end up with alignment on a 4 byte boundary.
- * This causes lots of runtime warnings on systems requiring 8 byte
- * alignment */
- u32 pids[2] = { pid, 0 };
- int pid_size = ALIGN(sizeof(pid), sizeof(long));
-
aggr = (type == TASKSTATS_TYPE_PID)
? TASKSTATS_TYPE_AGGR_PID
: TASKSTATS_TYPE_AGGR_TGID;
+ /*
+ * The taskstats structure is internally aligned on 8 byte
+ * boundaries but the layout of the aggregrate reply, with
+ * two NLA headers and the pid (each 4 bytes), actually
+ * force the entire structure to be unaligned. This causes
+ * the kernel to issue unaligned access warnings on some
+ * architectures like ia64. Unfortunately, some software out there
+ * doesn't properly unroll the NLA packet and assumes that the start
+ * of the taskstats structure will always be 20 bytes from the start
+ * of the netlink payload. Aligning the start of the taskstats
+ * structure breaks this software, which we don't want. So, for now
+ * the alignment only happens on architectures that require it
+ * and those users will have to update to fixed versions of those
+ * packages. Space is reserved in the packet only when needed.
+ * This ifdef should be removed in several years e.g. 2012 once
+ * we can be confident that fixed versions are installed on most
+ * systems. We add the padding before the aggregate since the
+ * aggregate is already a defined type.
+ */
+#ifdef TASKSTATS_NEEDS_PADDING
+ if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
+ goto err;
+#endif
na = nla_nest_start(skb, aggr);
if (!na)
goto err;
- if (nla_put(skb, type, pid_size, pids) < 0)
+
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0)
goto err;
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
if (!ret)
@@ -456,6 +477,18 @@ out:
return rc;
}
+static size_t taskstats_packet_size(void)
+{
+ size_t size;
+
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+#ifdef TASKSTATS_NEEDS_PADDING
+ size += nla_total_size(0); /* Padding for alignment */
+#endif
+ return size;
+}
+
static int cmd_attr_pid(struct genl_info *info)
{
struct taskstats *stats;
@@ -464,8 +497,7 @@ static int cmd_attr_pid(struct genl_info *info)
u32 pid;
int rc;
- size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+ size = taskstats_packet_size();
rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
if (rc < 0)
@@ -494,8 +526,7 @@ static int cmd_attr_tgid(struct genl_info *info)
u32 tgid;
int rc;
- size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+ size = taskstats_packet_size();
rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
if (rc < 0)
@@ -570,8 +601,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
/*
* Size includes space for nested attributes
*/
- size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+ size = taskstats_packet_size();
is_thread_group = !!taskstats_tgid_alloc(tsk);
if (is_thread_group) {
@@ -581,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
fill_tgid_exit(tsk);
}
- listeners = &__raw_get_cpu_var(listener_array);
+ listeners = __this_cpu_ptr(&listener_array);
if (list_empty(&listeners->list))
return;
@@ -655,7 +685,7 @@ static int __init taskstats_init(void)
goto err_cgroup_ops;
family_registered = 1;
- printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
+ pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
return 0;
err_cgroup_ops:
genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d183..8e8dc6d705c 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
* various programs will get confused when the clock gets warped.
*/
-int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
+int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
{
static int firsttime = 1;
int error = 0;
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
* Avoid unnecessary multiplications/divisions in the
* two most common HZ cases:
*/
-unsigned int inline jiffies_to_msecs(const unsigned long j)
+inline unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
-unsigned int inline jiffies_to_usecs(const unsigned long j)
+inline unsigned int jiffies_to_usecs(const unsigned long j)
{
#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
}
/**
- * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
* @n: nsecs in u64
*
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
*/
-unsigned long nsecs_to_jiffies(u64 n)
+u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
#endif
}
-#if (BITS_PER_LONG < 64)
-u64 get_jiffies_64(void)
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
{
- unsigned long seq;
- u64 ret;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- ret = jiffies_64;
- } while (read_seqretry(&xtime_lock, seq));
- return ret;
+ return (unsigned long)nsecs_to_jiffies64(n);
}
-EXPORT_SYMBOL(get_jiffies_64);
-#endif
-
-EXPORT_SYMBOL(jiffies);
/*
* Add two timespec values and do a safety check for overflow.
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06..b0425991e9a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timeconv.o posix-clock.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f..0d74b9ba90c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
-#include <linux/tick.h>
#include "tick-internal.h"
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4..6519cf62d9c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
* @shift: pointer to shift variable
* @from: frequency to convert from
* @to: frequency to convert to
- * @minsec: guaranteed runtime conversion range in seconds
+ * @maxsec: guaranteed runtime conversion range in seconds
*
* The function evaluates the shift/mult pair for the scaled math
* operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
* NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
* event @to is the counter frequency and @from is NSEC_PER_SEC.
*
- * The @minsec conversion range argument controls the time frame in
+ * The @maxsec conversion range argument controls the time frame in
* seconds which must be covered by the runtime conversion with the
* calculated mult and shift factors. This guarantees that no 64bit
* overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
* factors.
*/
void
-clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
{
u64 tmp;
u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
* Calculate the shift factor which is limiting the conversion
* range:
*/
- tmp = ((u64)minsec * from) >> 32;
+ tmp = ((u64)maxsec * from) >> 32;
while (tmp) {
tmp >>=1;
sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
*/
for (sft = 32; sft > 0; sft--) {
tmp = (u64) to << sft;
+ tmp += from / 2;
do_div(tmp, from);
if ((tmp >> sftacc) == 0)
break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
- /* Intialize mult/shift and max_idle_ns */
+ /* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq);
/* Add clocksource to the clcoksource list */
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a845690..a470154e040 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
************************************************************************/
#include <linux/clocksource.h>
#include <linux/jiffies.h>
+#include <linux/module.h>
#include <linux/init.h>
+#include "tick-internal.h"
+
/* The Jiffies based clocksource is the lowest common
* denominator clock source which should function on
* all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
* inaccuracies caused by missed or lost timer
* interrupts and the inability for the timer
* interrupt hardware to accuratly tick at the
- * requested HZ value. It is also not reccomended
+ * requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
.shift = JIFFIES_SHIFT,
};
+#if (BITS_PER_LONG < 64)
+u64 get_jiffies_64(void)
+{
+ unsigned long seq;
+ u64 ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ ret = jiffies_64;
+ } while (read_seqretry(&xtime_lock, seq));
+ return ret;
+}
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
+
static int __init init_jiffies_clocksource(void)
{
return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538..f6117a4c7cb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,9 @@
#include <linux/timex.h>
#include <linux/time.h>
#include <linux/mm.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
/*
* NTP timekeeping variables:
@@ -74,6 +77,162 @@ static long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;
+#ifdef CONFIG_NTP_PPS
+
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available. They establish the engineering parameters of the clock
+ * discipline loop when controlled by the PPS signal.
+ */
+#define PPS_VALID 10 /* PPS signal watchdog max (s) */
+#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
+#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
+#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
+#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
+ increase pps_shift or consecutive bad
+ intervals to decrease it */
+#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
+
+static int pps_valid; /* signal watchdog counter */
+static long pps_tf[3]; /* phase median filter */
+static long pps_jitter; /* current jitter (ns) */
+static struct timespec pps_fbase; /* beginning of the last freq interval */
+static int pps_shift; /* current interval duration (s) (shift) */
+static int pps_intcnt; /* interval counter */
+static s64 pps_freq; /* frequency offset (scaled ns/s) */
+static long pps_stabil; /* current stability (scaled ns/s) */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+
+
+/* PPS kernel consumer compensates the whole phase error immediately.
+ * Otherwise, reduce the offset by a fixed factor times the time constant.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ return offset;
+ else
+ return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+static inline void pps_reset_freq_interval(void)
+{
+ /* the PPS calibration interval may end
+ surprisingly early */
+ pps_shift = PPS_INTMIN;
+ pps_intcnt = 0;
+}
+
+/**
+ * pps_clear - Clears the PPS state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_clear(void)
+{
+ pps_reset_freq_interval();
+ pps_tf[0] = 0;
+ pps_tf[1] = 0;
+ pps_tf[2] = 0;
+ pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
+ pps_freq = 0;
+}
+
+/* Decrease pps_valid to indicate that another second has passed since
+ * the last PPS signal. When it reaches 0, indicate that PPS signal is
+ * missing.
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_dec_valid(void)
+{
+ if (pps_valid > 0)
+ pps_valid--;
+ else {
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ pps_clear();
+ }
+}
+
+static inline void pps_set_freq(s64 freq)
+{
+ pps_freq = freq;
+}
+
+static inline int is_error_status(int status)
+{
+ return (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ /* PPS signal lost when either PPS time or
+ * PPS frequency synchronization requested
+ */
+ || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
+ && !(time_status & STA_PPSSIGNAL))
+ /* PPS jitter exceeded when
+ * PPS time synchronization requested */
+ || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+ == (STA_PPSTIME|STA_PPSJITTER))
+ /* PPS wander exceeded or calibration error when
+ * PPS frequency synchronization requested
+ */
+ || ((time_status & STA_PPSFREQ)
+ && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
+}
+
+static inline void pps_fill_timex(struct timex *txc)
+{
+ txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+ PPM_SCALE_INV, NTP_SCALE_SHIFT);
+ txc->jitter = pps_jitter;
+ if (!(time_status & STA_NANO))
+ txc->jitter /= NSEC_PER_USEC;
+ txc->shift = pps_shift;
+ txc->stabil = pps_stabil;
+ txc->jitcnt = pps_jitcnt;
+ txc->calcnt = pps_calcnt;
+ txc->errcnt = pps_errcnt;
+ txc->stbcnt = pps_stbcnt;
+}
+
+#else /* !CONFIG_NTP_PPS */
+
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+ return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+static inline void pps_reset_freq_interval(void) {}
+static inline void pps_clear(void) {}
+static inline void pps_dec_valid(void) {}
+static inline void pps_set_freq(s64 freq) {}
+
+static inline int is_error_status(int status)
+{
+ return status & (STA_UNSYNC|STA_CLOCKERR);
+}
+
+static inline void pps_fill_timex(struct timex *txc)
+{
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
+}
+
+#endif /* CONFIG_NTP_PPS */
+
/*
* NTP methods:
*/
@@ -185,6 +344,9 @@ void ntp_clear(void)
tick_length = tick_length_base;
time_offset = 0;
+
+ /* Clear PPS state variables */
+ pps_clear();
}
/*
@@ -250,16 +412,16 @@ void second_overflow(void)
time_status |= STA_UNSYNC;
}
- /*
- * Compute the phase adjustment for the next second. The offset is
- * reduced by a fixed factor times the time constant.
- */
+ /* Compute the phase adjustment for the next second */
tick_length = tick_length_base;
- delta = shift_right(time_offset, SHIFT_PLL + time_constant);
+ delta = ntp_offset_chunk(time_offset);
time_offset -= delta;
tick_length += delta;
+ /* Check PPS signal */
+ pps_dec_valid();
+
if (!time_adjust)
return;
@@ -369,6 +531,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
+ /* restart PPS frequency calibration */
+ pps_reset_freq_interval();
}
/*
@@ -418,6 +582,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
time_freq = txc->freq * PPM_SCALE;
time_freq = min(time_freq, MAXFREQ_SCALED);
time_freq = max(time_freq, -MAXFREQ_SCALED);
+ /* update pps_freq */
+ pps_set_freq(time_freq);
}
if (txc->modes & ADJ_MAXERROR)
@@ -482,6 +648,19 @@ int do_adjtimex(struct timex *txc)
hrtimer_cancel(&leap_timer);
}
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec delta;
+ delta.tv_sec = txc->time.tv_sec;
+ delta.tv_nsec = txc->time.tv_usec;
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+ if (!(txc->modes & ADJ_NANO))
+ delta.tv_nsec *= 1000;
+ result = timekeeping_inject_offset(&delta);
+ if (result)
+ return result;
+ }
+
getnstimeofday(&ts);
write_seqlock_irq(&xtime_lock);
@@ -508,7 +687,8 @@ int do_adjtimex(struct timex *txc)
}
result = time_state; /* mostly `TIME_OK' */
- if (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ /* check for errors */
+ if (is_error_status(time_status))
result = TIME_ERROR;
txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +702,8 @@ int do_adjtimex(struct timex *txc)
txc->tick = tick_usec;
txc->tai = time_tai;
- /* PPS is not implemented, so these are zero */
- txc->ppsfreq = 0;
- txc->jitter = 0;
- txc->shift = 0;
- txc->stabil = 0;
- txc->jitcnt = 0;
- txc->calcnt = 0;
- txc->errcnt = 0;
- txc->stbcnt = 0;
+ /* fill PPS status fields */
+ pps_fill_timex(txc);
write_sequnlock_irq(&xtime_lock);
@@ -544,6 +717,243 @@ int do_adjtimex(struct timex *txc)
return result;
}
+#ifdef CONFIG_NTP_PPS
+
+/* actually struct pps_normtime is good old struct timespec, but it is
+ * semantically different (and it is the reason why it was invented):
+ * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
+struct pps_normtime {
+ __kernel_time_t sec; /* seconds */
+ long nsec; /* nanoseconds */
+};
+
+/* normalize the timestamp so that nsec is in the
+ ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
+static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+{
+ struct pps_normtime norm = {
+ .sec = ts.tv_sec,
+ .nsec = ts.tv_nsec
+ };
+
+ if (norm.nsec > (NSEC_PER_SEC >> 1)) {
+ norm.nsec -= NSEC_PER_SEC;
+ norm.sec++;
+ }
+
+ return norm;
+}
+
+/* get current phase correction and jitter */
+static inline long pps_phase_filter_get(long *jitter)
+{
+ *jitter = pps_tf[0] - pps_tf[1];
+ if (*jitter < 0)
+ *jitter = -*jitter;
+
+ /* TODO: test various filters */
+ return pps_tf[0];
+}
+
+/* add the sample to the phase filter */
+static inline void pps_phase_filter_add(long err)
+{
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = err;
+}
+
+/* decrease frequency calibration interval length.
+ * It is halved after four consecutive unstable intervals.
+ */
+static inline void pps_dec_freq_interval(void)
+{
+ if (--pps_intcnt <= -PPS_INTCOUNT) {
+ pps_intcnt = -PPS_INTCOUNT;
+ if (pps_shift > PPS_INTMIN) {
+ pps_shift--;
+ pps_intcnt = 0;
+ }
+ }
+}
+
+/* increase frequency calibration interval length.
+ * It is doubled after four consecutive stable intervals.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+ if (++pps_intcnt >= PPS_INTCOUNT) {
+ pps_intcnt = PPS_INTCOUNT;
+ if (pps_shift < PPS_INTMAX) {
+ pps_shift++;
+ pps_intcnt = 0;
+ }
+ }
+}
+
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ * Returns the difference between old and new frequency values.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+ long delta, delta_mod;
+ s64 ftemp;
+
+ /* check if the frequency interval was too long */
+ if (freq_norm.sec > (2 << pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ pps_dec_freq_interval();
+ pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
+ return 0;
+ }
+
+ /* here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold
+ * the interval is increased; otherwise it is decreased.
+ */
+ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+ freq_norm.sec);
+ delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+ pps_freq = ftemp;
+ if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+ pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ pps_dec_freq_interval();
+ } else { /* good sample */
+ pps_inc_freq_interval();
+ }
+
+ /* the stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring
+ */
+ delta_mod = delta;
+ if (delta_mod < 0)
+ delta_mod = -delta_mod;
+ pps_stabil += (div_s64(((s64)delta_mod) <<
+ (NTP_SCALE_SHIFT - SHIFT_USEC),
+ NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+
+ /* if enabled, the system clock frequency is updated */
+ if ((time_status & STA_PPSFREQ) != 0 &&
+ (time_status & STA_FREQHOLD) == 0) {
+ time_freq = pps_freq;
+ ntp_update_frequency();
+ }
+
+ return delta;
+}
+
+/* correct REALTIME clock phase error against PPS signal */
+static void hardpps_update_phase(long error)
+{
+ long correction = -error;
+ long jitter;
+
+ /* add the sample to the median filter */
+ pps_phase_filter_add(correction);
+ correction = pps_phase_filter_get(&jitter);
+
+ /* Nominal jitter is due to PPS signal noise. If it exceeds the
+ * threshold, the sample is discarded; otherwise, if so enabled,
+ * the time offset is updated.
+ */
+ if (jitter > (pps_jitter << PPS_POPCORN)) {
+ pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (pps_jitter << PPS_POPCORN));
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ /* correct the time using the phase offset */
+ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+ NTP_INTERVAL_FREQ);
+ /* cancel running adjtime() */
+ time_adjust = 0;
+ }
+ /* update jitter */
+ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ struct pps_normtime pts_norm, freq_norm;
+ unsigned long flags;
+
+ pts_norm = pps_normalize_ts(*phase_ts);
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ /* clear the error bits, they will be set again if needed */
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+
+ /* indicate signal presence */
+ time_status |= STA_PPSSIGNAL;
+ pps_valid = PPS_VALID;
+
+ /* when called for the first time,
+ * just start the frequency interval */
+ if (unlikely(pps_fbase.tv_sec == 0)) {
+ pps_fbase = *raw_ts;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ return;
+ }
+
+ /* ok, now we have a base for frequency calculation */
+ freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+
+ /* check that the signal is in the range
+ * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+ if ((freq_norm.sec == 0) ||
+ (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+ (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+ time_status |= STA_PPSJITTER;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ pr_err("hardpps: PPSJITTER: bad pulse\n");
+ return;
+ }
+
+ /* signal is ok */
+
+ /* check if the current frequency interval is finished */
+ if (freq_norm.sec >= (1 << pps_shift)) {
+ pps_calcnt++;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ hardpps_update_freq(freq_norm);
+ }
+
+ hardpps_update_phase(pts_norm.nsec);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+
+#endif /* CONFIG_NTP_PPS */
+
static int __init ntp_tick_adj_setup(char *str)
{
ntp_tick_adj = simple_strtol(str, NULL, 0);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 00000000000..c340ca658f3
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
+/*
+ * posix-clock.c - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/posix-clock.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+static void delete_clock(struct kref *kref);
+
+/*
+ * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
+ */
+static struct posix_clock *get_posix_clock(struct file *fp)
+{
+ struct posix_clock *clk = fp->private_data;
+
+ down_read(&clk->rwsem);
+
+ if (!clk->zombie)
+ return clk;
+
+ up_read(&clk->rwsem);
+
+ return NULL;
+}
+
+static void put_posix_clock(struct posix_clock *clk)
+{
+ up_read(&clk->rwsem);
+}
+
+static ssize_t posix_clock_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -EINVAL;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.read)
+ err = clk->ops.read(clk, fp->f_flags, buf, count);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int result = 0;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.poll)
+ result = clk->ops.poll(clk, fp, wait);
+
+ put_posix_clock(clk);
+
+ return result;
+}
+
+static int posix_clock_fasync(int fd, struct file *fp, int on)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = 0;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.fasync)
+ err = clk->ops.fasync(clk, fd, fp, on);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENODEV;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.mmap)
+ err = clk->ops.mmap(clk, vma);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static long posix_clock_ioctl(struct file *fp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENOTTY;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.ioctl)
+ err = clk->ops.ioctl(clk, cmd, arg);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long posix_clock_compat_ioctl(struct file *fp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENOTTY;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.ioctl)
+ err = clk->ops.ioctl(clk, cmd, arg);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+#endif
+
+static int posix_clock_open(struct inode *inode, struct file *fp)
+{
+ int err;
+ struct posix_clock *clk =
+ container_of(inode->i_cdev, struct posix_clock, cdev);
+
+ down_read(&clk->rwsem);
+
+ if (clk->zombie) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (clk->ops.open)
+ err = clk->ops.open(clk, fp->f_mode);
+ else
+ err = 0;
+
+ if (!err) {
+ kref_get(&clk->kref);
+ fp->private_data = clk;
+ }
+out:
+ up_read(&clk->rwsem);
+ return err;
+}
+
+static int posix_clock_release(struct inode *inode, struct file *fp)
+{
+ struct posix_clock *clk = fp->private_data;
+ int err = 0;
+
+ if (clk->ops.release)
+ err = clk->ops.release(clk);
+
+ kref_put(&clk->kref, delete_clock);
+
+ fp->private_data = NULL;
+
+ return err;
+}
+
+static const struct file_operations posix_clock_file_operations = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = posix_clock_read,
+ .poll = posix_clock_poll,
+ .unlocked_ioctl = posix_clock_ioctl,
+ .open = posix_clock_open,
+ .release = posix_clock_release,
+ .fasync = posix_clock_fasync,
+ .mmap = posix_clock_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = posix_clock_compat_ioctl,
+#endif
+};
+
+int posix_clock_register(struct posix_clock *clk, dev_t devid)
+{
+ int err;
+
+ kref_init(&clk->kref);
+ init_rwsem(&clk->rwsem);
+
+ cdev_init(&clk->cdev, &posix_clock_file_operations);
+ clk->cdev.owner = clk->ops.owner;
+ err = cdev_add(&clk->cdev, devid, 1);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(posix_clock_register);
+
+static void delete_clock(struct kref *kref)
+{
+ struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
+
+ if (clk->release)
+ clk->release(clk);
+}
+
+void posix_clock_unregister(struct posix_clock *clk)
+{
+ cdev_del(&clk->cdev);
+
+ down_write(&clk->rwsem);
+ clk->zombie = true;
+ up_write(&clk->rwsem);
+
+ kref_put(&clk->kref, delete_clock);
+}
+EXPORT_SYMBOL_GPL(posix_clock_unregister);
+
+struct posix_clock_desc {
+ struct file *fp;
+ struct posix_clock *clk;
+};
+
+static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
+{
+ struct file *fp = fget(CLOCKID_TO_FD(id));
+ int err = -EINVAL;
+
+ if (!fp)
+ return err;
+
+ if (fp->f_op->open != posix_clock_open || !fp->private_data)
+ goto out;
+
+ cd->fp = fp;
+ cd->clk = get_posix_clock(fp);
+
+ err = cd->clk ? 0 : -ENODEV;
+out:
+ if (err)
+ fput(fp);
+ return err;
+}
+
+static void put_clock_desc(struct posix_clock_desc *cd)
+{
+ put_posix_clock(cd->clk);
+ fput(cd->fp);
+}
+
+static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_adjtime)
+ err = cd.clk->ops.clock_adjtime(cd.clk, tx);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.clock_gettime)
+ err = cd.clk->ops.clock_gettime(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_getres(clockid_t id, struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.clock_getres)
+ err = cd.clk->ops.clock_getres(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_settime)
+ err = cd.clk->ops.clock_settime(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_timer_create(struct k_itimer *kit)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_create)
+ err = cd.clk->ops.timer_create(cd.clk, kit);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_timer_delete(struct k_itimer *kit)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_delete)
+ err = cd.clk->ops.timer_delete(cd.clk, kit);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+
+ if (get_clock_desc(id, &cd))
+ return;
+
+ if (cd.clk->ops.timer_gettime)
+ cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
+ put_clock_desc(&cd);
+}
+
+static int pc_timer_settime(struct k_itimer *kit, int flags,
+ struct itimerspec *ts, struct itimerspec *old)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_settime)
+ err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+struct k_clock clock_posix_dynamic = {
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
+ .timer_create = pc_timer_create,
+ .timer_set = pc_timer_settime,
+ .timer_del = pc_timer_delete,
+ .timer_get = pc_timer_gettime,
+};
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b566..da800ffa810 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include "tick-internal.h"
@@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void)
return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
}
+/*
+ * Check whether the broadcast device supports oneshot.
+ */
+bool tick_broadcast_oneshot_available(void)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
+}
+
#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eee..119528de823 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include <asm/irq_regs.h>
@@ -49,9 +48,13 @@ struct tick_device *tick_get_device(int cpu)
*/
int tick_is_oneshot_available(void)
{
- struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return 0;
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 1;
+ return tick_broadcast_oneshot_available();
}
/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f6..1009b06d6f8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
/*
* tick internal variable and functions used by low/high res code
*/
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
#define TICK_DO_TIMER_NONE -1
#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast(int cpu);
+bool tick_broadcast_oneshot_available(void);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline bool tick_broadcast_oneshot_available(void) { return true; }
# endif /* !BROADCAST */
#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
return 0;
}
static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline bool tick_broadcast_oneshot_available(void) { return false; }
#endif /* !TICK_ONESHOT */
/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
{
return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}
+
+#endif
+
+extern void do_timer(unsigned long ticks);
+extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680..2d04411a5f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include "tick-internal.h"
@@ -95,7 +94,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
*/
int tick_program_event(ktime_t expires, int force)
{
- struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
return tick_dev_program_event(dev, expires, force);
}
@@ -167,7 +166,7 @@ int tick_oneshot_mode_active(void)
int ret;
local_irq_save(flags);
- ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
+ ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
local_irq_restore(flags);
return ret;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd..d5097c44b40 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include <linux/module.h>
#include <asm/irq_regs.h>
@@ -642,8 +641,7 @@ static void tick_nohz_switch_to_nohz(void)
}
local_irq_enable();
- printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
- smp_processor_id());
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
}
/*
@@ -795,8 +793,10 @@ void tick_setup_sched_timer(void)
}
#ifdef CONFIG_NO_HZ
- if (tick_nohz_enabled)
+ if (tick_nohz_enabled) {
ts->nohz_mode = NOHZ_MODE_HIGHRES;
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
+ }
#endif
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176c..a9ae369925c 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/math64.h>
+#include <linux/kernel.h>
/*
* fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
int index;
int num_samples = sync->num_samples;
- if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+ if (num_samples > ARRAY_SIZE(buffer)) {
samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
if (!samples) {
samples = buffer;
- num_samples = sizeof(buffer)/sizeof(buffer[0]);
+ num_samples = ARRAY_SIZE(buffer);
}
} else {
samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f7..8ad5d576755 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
@@ -32,6 +32,8 @@ struct timekeeper {
cycle_t cycle_interval;
/* Number of clock shifted nano seconds in one NTP interval. */
u64 xtime_interval;
+ /* shifted nano seconds left over when rounding cycle_interval */
+ s64 xtime_remainder;
/* Raw nano seconds accumulated per NTP interval. */
u32 raw_interval;
@@ -47,7 +49,7 @@ struct timekeeper {
u32 mult;
};
-struct timekeeper timekeeper;
+static struct timekeeper timekeeper;
/**
* timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
static void timekeeper_setup_internals(struct clocksource *clock)
{
cycle_t interval;
- u64 tmp;
+ u64 tmp, ntpinterval;
timekeeper.clock = clock;
clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
tmp <<= clock->shift;
+ ntpinterval = tmp;
tmp += clock->mult/2;
do_div(tmp, clock->mult);
if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
/* Go back from cycles -> shifted ns */
timekeeper.xtime_interval = (u64) interval * clock->mult;
+ timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
timekeeper.raw_interval =
((u64) interval * clock->mult) >> clock->shift;
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time;
/*
* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
*/
-struct timespec raw_time;
+static struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
+#ifdef CONFIG_NTP_PPS
+
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw: pointer to the timespec to be set to raw monotonic time
+ * @ts_real: pointer to the timespec to be set to the time of day
+ *
+ * This function reads both the time of day and raw monotonic time at the
+ * same time atomically and stores the resulting timestamps in timespec
+ * format.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+ unsigned long seq;
+ s64 nsecs_raw, nsecs_real;
+
+ WARN_ON_ONCE(timekeeping_suspended);
+
+ do {
+ u32 arch_offset;
+
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts_raw = raw_time;
+ *ts_real = xtime;
+
+ nsecs_raw = timekeeping_get_ns_raw();
+ nsecs_real = timekeeping_get_ns();
+
+ /* If arch requires, add in gettimeoffset() */
+ arch_offset = arch_gettimeoffset();
+ nsecs_raw += arch_offset;
+ nsecs_real += arch_offset;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts_raw, nsecs_raw);
+ timespec_add_ns(ts_real, nsecs_real);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+
+#endif /* CONFIG_NTP_PPS */
+
/**
* do_gettimeofday - Returns the time of day in a timeval
* @tv: pointer to the timeval to be set
@@ -306,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
-int do_settimeofday(struct timespec *tv)
+int do_settimeofday(const struct timespec *tv)
{
struct timespec ts_delta;
unsigned long flags;
@@ -340,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tv: pointer to the timespec variable containing the offset
+ *
+ * Adds or subtracts an offset value from the current time.
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+ unsigned long flags;
+
+ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ timekeeping_forward_now();
+
+ xtime = timespec_add(xtime, *ts);
+ wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
+
+ timekeeper.ntp_error = 0;
+ ntp_clear();
+
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
/**
* change_clocksource - Swaps clocksources if a new one is available
*
@@ -514,13 +597,12 @@ static struct timespec timekeeping_suspend_time;
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
- * @dev: unused
*
* This is for the generic clocksource timekeeping.
* xtime/wall_to_monotonic/jiffies/etc are
* still managed by arch specific suspend/resume code.
*/
-static int timekeeping_resume(struct sys_device *dev)
+static void timekeeping_resume(void)
{
unsigned long flags;
struct timespec ts;
@@ -549,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev)
/* Resume hrtimers */
hres_timers_resume();
-
- return 0;
}
-static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+static int timekeeping_suspend(void)
{
unsigned long flags;
@@ -571,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
}
/* sysfs resume/suspend bits for timekeeping */
-static struct sysdev_class timekeeping_sysclass = {
- .name = "timekeeping",
+static struct syscore_ops timekeeping_syscore_ops = {
.resume = timekeeping_resume,
.suspend = timekeeping_suspend,
};
-static struct sys_device device_timer = {
- .id = 0,
- .cls = &timekeeping_sysclass,
-};
-
-static int __init timekeeping_init_device(void)
+static int __init timekeeping_init_ops(void)
{
- int error = sysdev_class_register(&timekeeping_sysclass);
- if (!error)
- error = sysdev_register(&device_timer);
- return error;
+ register_syscore_ops(&timekeeping_syscore_ops);
+ return 0;
}
-device_initcall(timekeeping_init_device);
+device_initcall(timekeeping_init_ops);
/*
* If the error is already larger, we look ahead even further
@@ -719,7 +791,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
/* Accumulate error between NTP and clock interval */
timekeeper.ntp_error += tick_length << shift;
- timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ timekeeper.ntp_error -=
+ (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
(timekeeper.ntp_error_shift + shift);
return offset;
@@ -731,7 +804,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
*
* Called from the timer interrupt, must hold a write on xtime_lock.
*/
-void update_wall_time(void)
+static void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
@@ -823,7 +896,7 @@ void update_wall_time(void)
* getboottime - Return the real time of system boot.
* @ts: pointer to the timespec to be set
*
- * Returns the time of day in a timespec.
+ * Returns the wall-time of boot in a timespec.
*
* This is based on the wall_to_monotonic offset and the total suspend
* time. Calls to settimeofday will affect the value returned (which
@@ -841,6 +914,55 @@ void getboottime(struct timespec *ts)
}
EXPORT_SYMBOL_GPL(getboottime);
+
+/**
+ * get_monotonic_boottime - Returns monotonic time since boot
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the monotonic time since boot in a timespec.
+ *
+ * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
+ * includes the time spent in suspend.
+ */
+void get_monotonic_boottime(struct timespec *ts)
+{
+ struct timespec tomono, sleep;
+ unsigned int seq;
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ *ts = xtime;
+ tomono = wall_to_monotonic;
+ sleep = total_sleep_time;
+ nsecs = timekeeping_get_ns();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(get_monotonic_boottime);
+
+/**
+ * ktime_get_boottime - Returns monotonic time since boot in a ktime
+ *
+ * Returns the monotonic time since boot in a ktime
+ *
+ * This is similar to CLOCK_MONTONIC/ktime_get, but also
+ * includes the time spent in suspend.
+ */
+ktime_t ktime_get_boottime(void)
+{
+ struct timespec ts;
+
+ get_monotonic_boottime(&ts);
+ return timespec_to_ktime(ts);
+}
+EXPORT_SYMBOL_GPL(ktime_get_boottime);
+
/**
* monotonic_to_bootbased - Convert the monotonic time to boot based.
* @ts: pointer to the timespec to be converted
@@ -862,11 +984,6 @@ struct timespec __current_kernel_time(void)
return xtime;
}
-struct timespec __get_wall_to_monotonic(void)
-{
- return wall_to_monotonic;
-}
-
struct timespec current_kernel_time(void)
{
struct timespec now;
@@ -898,3 +1015,48 @@ struct timespec get_monotonic_coarse(void)
now.tv_nsec + mono.tv_nsec);
return now;
}
+
+/*
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+void do_timer(unsigned long ticks)
+{
+ jiffies_64 += ticks;
+ update_wall_time();
+ calc_global_load(ticks);
+}
+
+/**
+ * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
+ * and sleep offsets.
+ * @xtim: pointer to timespec to be set with xtime
+ * @wtom: pointer to timespec to be set with wall_to_monotonic
+ * @sleep: pointer to timespec to be set with time in suspend
+ */
+void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
+ struct timespec *wtom, struct timespec *sleep)
+{
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ *xtim = xtime;
+ *wtom = wall_to_monotonic;
+ *sleep = total_sleep_time;
+ } while (read_seqretry(&xtime_lock, seq));
+}
+
+/**
+ * xtime_update() - advances the timekeeping infrastructure
+ * @ticks: number of ticks, that have elapsed since the last call.
+ *
+ * Must be called with interrupts disabled.
+ */
+void xtime_update(unsigned long ticks)
+{
+ write_seqlock(&xtime_lock);
+ do_timer(ticks);
+ write_sequnlock(&xtime_lock);
+}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa9..3258455549f 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
char symname[KSYM_NAME_LEN];
if (lookup_symbol_name((unsigned long)sym, symname) < 0)
- SEQ_printf(m, "<%p>", sym);
+ SEQ_printf(m, "<%pK>", sym);
else
SEQ_printf(m, "%s", symname);
}
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
{
struct hrtimer *timer, tmp;
unsigned long next = 0, i;
- struct rb_node *curr;
+ struct timerqueue_node *curr;
unsigned long flags;
next_one:
i = 0;
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
- curr = base->first;
+ curr = timerqueue_getnext(&base->active);
/*
* Crude but we have to do this O(N*N) thing, because
* we have to unlock the base when printing:
*/
while (curr && i < next) {
- curr = rb_next(curr);
+ curr = timerqueue_iterate_next(curr);
i++;
}
if (curr) {
- timer = rb_entry(curr, struct hrtimer, node);
+ timer = container_of(curr, struct hrtimer, node);
tmp = *timer;
raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
@@ -112,7 +112,7 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %p\n", base);
+ SEQ_printf(m, " .base: %pK\n", base);
SEQ_printf(m, " .index: %d\n",
base->index);
SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7..a5d0a3a85dd 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
unsigned int timer_flag)
{
/*
- * It doesnt matter which lock we take:
+ * It doesn't matter which lock we take:
*/
raw_spinlock_t *lock;
struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b..fd6198692b5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
- *
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
- */
-#define TBASE_DEFERRABLE_FLAG (0x1)
-
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
static inline void timer_set_deferrable(struct timer_list *timer)
{
- timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
- TBASE_DEFERRABLE_FLAG));
+ timer->base = TBASE_MAKE_DEFERRED(timer->base);
}
static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-
-static inline void set_running_timer(struct tvec_base *base,
- struct timer_list *timer)
-{
-#ifdef CONFIG_SMP
- base->running_timer = timer;
-#endif
-}
-
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
@@ -426,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
static struct debug_obj_descr timer_debug_descr;
+static void *timer_debug_hint(void *addr)
+{
+ return ((struct timer_list *) addr)->function;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
@@ -499,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
static struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
+ .debug_hint = timer_debug_hint,
.fixup_init = timer_fixup_init,
.fixup_activate = timer_fixup_activate,
.fixup_free = timer_fixup_free,
@@ -936,15 +920,12 @@ int del_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(del_timer);
-#ifdef CONFIG_SMP
/**
* try_to_del_timer_sync - Try to deactivate a timer
* @timer: timer do del
*
* This function tries to deactivate a timer. Upon successful (ret >= 0)
* exit the timer is not queued and the handler is not running on any CPU.
- *
- * It must not be called from interrupt contexts.
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
@@ -973,6 +954,7 @@ out:
}
EXPORT_SYMBOL(try_to_del_timer_sync);
+#ifdef CONFIG_SMP
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -988,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
* add_timer_on(). Upon exit the timer is not queued and the handler is
* not running on any CPU.
*
+ * Note: You must not hold locks that are held in interrupt context
+ * while calling this function. Even if the lock has nothing to do
+ * with the timer in question. Here's why:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
+ * <IRQ>
+ * spin_lock(somelock);
+ * del_timer_sync(mytimer);
+ * while (base->running_timer == mytimer);
+ *
+ * Now del_timer_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but
+ * it has interrupted the softirq that CPU0 is waiting to finish.
+ *
* The function returns whether it has deactivated a pending timer or not.
*/
int del_timer_sync(struct timer_list *timer)
@@ -995,12 +996,20 @@ int del_timer_sync(struct timer_list *timer)
#ifdef CONFIG_LOCKDEP
unsigned long flags;
+ /*
+ * If lockdep gives a backtrace here, please reference
+ * the synchronization rules above.
+ */
local_irq_save(flags);
lock_map_acquire(&timer->lockdep_map);
lock_map_release(&timer->lockdep_map);
local_irq_restore(flags);
#endif
-
+ /*
+ * don't use it in hardirq context, because it
+ * could lead to deadlock.
+ */
+ WARN_ON(in_irq());
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -1111,7 +1120,7 @@ static inline void __run_timers(struct tvec_base *base)
timer_stats_account_timer(timer);
- set_running_timer(base, timer);
+ base->running_timer = timer;
detach_timer(timer, 1);
spin_unlock_irq(&base->lock);
@@ -1119,7 +1128,7 @@ static inline void __run_timers(struct tvec_base *base)
spin_lock_irq(&base->lock);
}
}
- set_running_timer(base, NULL);
+ base->running_timer = NULL;
spin_unlock_irq(&base->lock);
}
@@ -1249,9 +1258,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
*/
unsigned long get_next_timer_interrupt(unsigned long now)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
unsigned long expires;
+ /*
+ * Pretend that there is no timer pending if the cpu is offline.
+ * Possible pending timers will be migrated later to an active cpu.
+ */
+ if (cpu_is_offline(smp_processor_id()))
+ return now + NEXT_TIMER_MAX_DELTA;
spin_lock(&base->lock);
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
@@ -1292,7 +1307,7 @@ void update_process_times(int user_tick)
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
hrtimer_run_pending();
@@ -1309,19 +1324,6 @@ void run_local_timers(void)
raise_softirq(TIMER_SOFTIRQ);
}
-/*
- * The 64-bit jiffies value is not atomic - you MUST NOT read it
- * without sampling the sequence number in xtime_lock.
- * jiffies is defined in the linker script...
- */
-
-void do_timer(unsigned long ticks)
-{
- jiffies_64 += ticks;
- update_wall_time();
- calc_global_load();
-}
-
#ifdef __ARCH_WANT_SYS_ALARM
/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ea37e2ff416..2ad39e556cb 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool "Deprecated power event trace API, to be removed"
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
bool
@@ -126,7 +141,7 @@ if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- select FRAME_POINTER if !ARM_UNWIND && !S390
+ select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
@@ -260,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
This tracer profiles all the the likely and unlikely macros
in the kernel. It will display the results in:
- /sys/kernel/debug/tracing/profile_annotated_branch
+ /sys/kernel/debug/tracing/trace_stat/branch_annotated
Note: this will add a significant overhead; only turn this
on if you need to profile the system's use of these macros.
@@ -273,7 +288,7 @@ config PROFILE_ALL_BRANCHES
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /sys/kernel/debug/tracing/profile_branch
+ /sys/kernel/debug/tracing/trace_stat/branch_all
This option also enables the likely/unlikely profiler.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b2..761c510a06c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec028154..6957aa298df 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
!blk_tracer_enabled))
return;
+ /*
+ * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
+ * message to the trace.
+ */
+ if (!(bt->act_mask & BLK_TC_NOTIFY))
+ return;
+
local_irq_save(flags);
buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
va_start(args, fmt);
@@ -696,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
*
**/
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
+ u32 what)
{
struct blk_trace *bt = q->blk_trace;
- int rw = rq->cmd_flags & 0x03;
if (likely(!bt))
return;
- if (rq->cmd_flags & REQ_DISCARD)
- rw |= REQ_DISCARD;
-
- if (rq->cmd_flags & REQ_SECURE)
- rw |= REQ_SECURE;
-
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
- what, rq->errors, 0, NULL);
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -758,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
* @q: queue the io is for
* @bio: the source bio
* @what: the action
+ * @error: error, if any
*
* Description:
* Records an action against a bio. Will log the bio offset + size.
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what)
+ u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
if (likely(!bt))
return;
+ if (!error && !bio_flagged(bio, BIO_UPTODATE))
+ error = EIO;
+
__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
- !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+ error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
- struct request_queue *q, struct bio *bio)
+ struct request_queue *q, struct bio *bio,
+ int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
static void blk_add_trace_bio_backmerge(void *ignore,
struct request_queue *q,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore,
struct request_queue *q,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -827,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -845,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
-static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
+static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
+ unsigned int depth, bool explicit)
{
struct blk_trace *bt = q->blk_trace;
if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
- __be64 rpdu = cpu_to_be64(pdu);
-
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
- sizeof(rpdu), &rpdu);
- }
-}
-
-static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
-{
- struct blk_trace *bt = q->blk_trace;
+ __be64 rpdu = cpu_to_be64(depth);
+ u32 what;
- if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
- __be64 rpdu = cpu_to_be64(pdu);
+ if (explicit)
+ what = BLK_TA_UNPLUG_IO;
+ else
+ what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
- sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
}
@@ -887,7 +884,7 @@ static void blk_add_trace_split(void *ignore,
}
/**
- * blk_add_trace_remap - Add a trace for a remap operation
+ * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @bio: the source bio
@@ -899,9 +896,9 @@ static void blk_add_trace_split(void *ignore,
* it spans a stripe (or similar). Add a trace for that action.
*
**/
-static void blk_add_trace_remap(void *ignore,
- struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -1010,13 +1007,11 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
- WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+ ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
WARN_ON(ret);
ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap, NULL);
+ ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
@@ -1025,10 +1020,9 @@ static void blk_register_tracepoints(void)
static void blk_unregister_tracepoints(void)
{
unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
- unregister_trace_block_remap(blk_add_trace_remap, NULL);
+ unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
unregister_trace_block_split(blk_add_trace_split, NULL);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+ unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1815,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i] = '\0';
}
-void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
-{
- int rw = rq->cmd_flags & 0x03;
- int bytes;
-
- if (rq->cmd_flags & REQ_DISCARD)
- rw |= REQ_DISCARD;
-
- if (rq->cmd_flags & REQ_SECURE)
- rw |= REQ_SECURE;
-
- bytes = blk_rq_bytes(rq);
-
- blk_fill_rwbs(rwbs, rw, bytes);
-}
-
#endif /* CONFIG_EVENT_TRACING */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae8388..ee24fa1935a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod)
p->flags = 0L;
/*
- * Do the initial record convertion from mcount jump
+ * Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
if (!ftrace_code_disable(mod, p)) {
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
return t_hash_next(m, pos);
(*pos)++;
- iter->pos = *pos;
+ iter->pos = iter->func_pos = *pos;
if (iter->flags & FTRACE_ITER_PRINTALL)
return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
if (!rec)
return t_hash_start(m, pos);
- iter->func_pos = *pos;
iter->func = rec;
return iter;
@@ -3328,7 +3327,7 @@ static int start_graph_tracing(void)
/* The cpu_boot init_task->ret_stack will never be freed */
for_each_online_cpu(cpu) {
if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_task(idle_task(cpu));
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
}
do {
@@ -3418,6 +3417,49 @@ void unregister_ftrace_graph(void)
mutex_unlock(&ftrace_lock);
}
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visible before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+ t->curr_ret_stack = -1;
+ /*
+ * The idle task has no parent, it either has its own
+ * stack or no stack at all.
+ */
+ if (t->ret_stack)
+ WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = per_cpu(idle_ret_stack, cpu);
+ if (!ret_stack) {
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ per_cpu(idle_ret_stack, cpu) = ret_stack;
+ }
+ graph_init_task(t, ret_stack);
+ }
+}
+
/* Allocate a return stack for newly created task */
void ftrace_graph_init_task(struct task_struct *t)
{
@@ -3433,12 +3475,7 @@ void ftrace_graph_init_task(struct task_struct *t)
GFP_KERNEL);
if (!ret_stack)
return;
- atomic_set(&t->tracing_graph_pause, 0);
- atomic_set(&t->trace_overrun, 0);
- t->ftrace_timestamp = 0;
- /* make curr_ret_stack visable before we add the ret_stack */
- smp_wmb();
- t->ret_stack = ret_stack;
+ graph_init_task(t, ret_stack);
}
}
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a0616..f55fcf61b22 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+#ifdef EVENT_POWER_TRACING_DEPRECATED
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d..0ef7b4b2a1f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
*/
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
-#include <linux/ftrace_irq.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
@@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
* the reader page). But if the next page is a header page,
* its flags will be non zero.
*/
-static int inline
+static inline int
rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *page, struct list_head *list)
{
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
+void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
+{
+ mutex_lock(&buffer->mutex);
+ if (val)
+ buffer->flags |= RB_FL_OVERWRITE;
+ else
+ buffer->flags &= ~RB_FL_OVERWRITE;
+ mutex_unlock(&buffer->mutex);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
+
static inline void *
__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
{
@@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
return local_read(&bpage->entries) & RB_WRITE_MASK;
}
-/* Size is determined by what has been commited */
+/* Size is determined by what has been committed */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
return rb_page_commit(bpage);
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
+ int local_clock_stable = 1;
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ local_clock_stable = sched_clock_stable;
+#endif
WARN_ONCE(delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
(unsigned long long)delta,
(unsigned long long)ts,
- (unsigned long long)cpu_buffer->write_stamp);
+ (unsigned long long)cpu_buffer->write_stamp,
+ local_clock_stable ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n");
add_timestamp = 1;
}
}
@@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
/*
* cpu_buffer->pages just needs to point to the buffer, it
* has no specific buffer page to point to. Lets move it out
- * of our way so we don't accidently swap it.
+ * of our way so we don't accidentally swap it.
*/
cpu_buffer->pages = reader->list.prev;
@@ -3853,6 +3871,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
/* Need to copy one event at a time */
do {
+ /* We need the size of one event, because
+ * rb_advance_reader only advances by one event,
+ * whereas rb_event_ts_length may include the size of
+ * one or two events.
+ * We have already ensured there's enough space if this
+ * is a time extend. */
+ size = rb_event_length(event);
memcpy(bpage->data + pos, rpage->data + rpos, size);
len -= size;
@@ -3867,7 +3892,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
event = rb_reader_event(cpu_buffer);
/* Always keep the time extend and data together */
size = rb_event_ts_length(event);
- } while (len > size);
+ } while (len >= size);
/* update bpage */
local_set(&bpage->commit, pos);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c380612273b..1cb49be7c7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
#include "trace.h"
#include "trace_output.h"
-#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
-
/*
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
+ TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
static int trace_stop_count;
static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
"sleep-time",
"graph-time",
"record-cmd",
+ "overwrite",
NULL
};
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
tracing_reset_online_cpus(tr);
current_trace = type;
+
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded && type->use_max_tr)
+ ring_buffer_resize(max_tr.buffer, trace_buf_size);
+
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
/* Only reset on passing, to avoid touching corrupted buffers */
tracing_reset_online_cpus(tr);
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded && type->use_max_tr)
+ ring_buffer_resize(max_tr.buffer, 1);
+
printk(KERN_CONT "PASSED\n");
}
#endif
@@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
+ entry->padding = 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1313,12 +1321,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
-
-
event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
sizeof(*entry), flags, pc);
if (!event)
- return;
+ goto out_drop_count;
entry = ring_buffer_event_data(event);
entry->tgid = current->tgid;
@@ -1333,8 +1339,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
if (!filter_check_discard(call, entry, buffer, event))
ring_buffer_unlock_commit(buffer, event);
+ out_drop_count:
__this_cpu_dec(user_stack_count);
-
out:
preempt_enable();
}
@@ -1751,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
seq_puts(m, "# | / _----=> need-resched \n");
seq_puts(m, "# || / _---=> hardirq/softirq \n");
seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| /_--=> lock-depth \n");
- seq_puts(m, "# |||||/ delay \n");
- seq_puts(m, "# cmd pid |||||| time | caller \n");
- seq_puts(m, "# \\ / |||||| \\ | / \n");
+ seq_puts(m, "# |||| / delay \n");
+ seq_puts(m, "# cmd pid ||||| time | caller \n");
+ seq_puts(m, "# \\ / ||||| \\ | / \n");
}
static void print_func_help_header(struct seq_file *m)
@@ -2338,11 +2343,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
return count;
}
+static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
+{
+ if (file->f_mode & FMODE_READ)
+ return seq_lseek(file, offset, origin);
+ else
+ return 0;
+}
+
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.write = tracing_write_stub,
- .llseek = seq_lseek,
+ .llseek = tracing_seek,
.release = tracing_release,
};
@@ -2523,6 +2536,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
+
+ if (mask == TRACE_ITER_OVERWRITE)
+ ring_buffer_change_overwrite(global_trace.buffer, enabled);
}
static ssize_t
@@ -2704,6 +2720,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
mutex_lock(&trace_types_lock);
if (tracer_enabled ^ val) {
+
+ /* Only need to warn if this is used to change the state */
+ WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
+
if (val) {
tracer_enabled = 1;
if (current_trace->start)
@@ -3220,7 +3240,7 @@ waitagain:
trace_seq_init(&iter->seq);
/*
- * If there was nothing to send to user, inspite of consuming trace
+ * If there was nothing to send to user, in spite of consuming trace
* entries, go back to wait for more entries.
*/
if (sret == -EBUSY)
@@ -4545,9 +4565,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
+ enum ring_buffer_flags rb_flags;
int i;
int ret = -ENOMEM;
+
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
@@ -4560,12 +4582,13 @@ __init static int tracer_alloc_buffers(void)
else
ring_buf_size = 1;
+ rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
cpumask_copy(tracing_cpumask, cpu_all_mask);
/* TODO: make the number of buffers hot pluggable with CPUS */
- global_trace.buffer = ring_buffer_alloc(ring_buf_size,
- TRACE_BUFFER_FLAGS);
+ global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
if (!global_trace.buffer) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
@@ -4575,7 +4598,7 @@ __init static int tracer_alloc_buffers(void)
#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
+ max_tr.buffer = ring_buffer_alloc(1, rb_flags);
if (!max_tr.buffer) {
printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c..5e9dfc6286d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
struct tracer *next;
- int print_max;
struct tracer_flags *flags;
+ int print_max;
int use_max_tr;
};
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
TRACE_ITER_SLEEP_TIME = 0x40000,
TRACE_ITER_GRAPH_TIME = 0x80000,
TRACE_ITER_RECORD_CMD = 0x100000,
+ TRACE_ITER_OVERWRITE = 0x200000,
};
/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
};
struct event_filter {
- int n_preds;
- struct filter_pred **preds;
+ int n_preds; /* Number assigned */
+ int a_preds; /* allocated */
+ struct filter_pred *preds;
+ struct filter_pred *root;
char *filter_string;
};
@@ -674,11 +677,23 @@ struct event_subsystem {
int nr_events;
};
+#define FILTER_PRED_INVALID ((unsigned short)-1)
+#define FILTER_PRED_IS_RIGHT (1 << 15)
+#define FILTER_PRED_FOLD (1 << 15)
+
+/*
+ * The max preds is the size of unsigned short with
+ * two flags at the MSBs. One bit is used for both the IS_RIGHT
+ * and FOLD flags. The other is reserved.
+ *
+ * 2^14 preds is way more than enough.
+ */
+#define MAX_FILTER_PRED 16384
+
struct filter_pred;
struct regex;
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
- int val1, int val2);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
@@ -700,11 +715,23 @@ struct filter_pred {
filter_pred_fn_t fn;
u64 val;
struct regex regex;
- char *field_name;
+ /*
+ * Leaf nodes use field_name, ops is used by AND and OR
+ * nodes. The field_name is always freed when freeing a pred.
+ * We can overload field_name for ops and have it freed
+ * as well.
+ */
+ union {
+ char *field_name;
+ unsigned short *ops;
+ };
int offset;
int not;
int op;
- int pop_n;
+ unsigned short index;
+ unsigned short parent;
+ unsigned short left;
+ unsigned short right;
};
extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db..6302747a139 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
}
/*
- * trace_clock(): 'inbetween' trace clock. Not completely serialized,
+ * trace_clock(): 'between' trace clock. Not completely serialized,
* but not completely incorrect when crossing CPUs either.
*
* This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e..e32744c84d9 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
* in the structure.
*
* * for structures within structures, the format of the internal
- * structure is layed out. This allows the internal structure
+ * structure is laid out. This allows the internal structure
* to be deciphered for the format file. Although these macros
* may become out of sync with the internal structure, they
* will create a compile error if it happens. Since the
@@ -53,7 +53,7 @@
*/
/*
- * Function trace entry - function address and parent function addres:
+ * Function trace entry - function address and parent function address:
*/
FTRACE_ENTRY(function, ftrace_entry,
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
*/
#define FTRACE_CTX_FIELDS \
__field( unsigned int, prev_pid ) \
+ __field( unsigned int, next_pid ) \
+ __field( unsigned int, next_cpu ) \
__field( unsigned char, prev_prio ) \
__field( unsigned char, prev_state ) \
- __field( unsigned int, next_pid ) \
__field( unsigned char, next_prio ) \
- __field( unsigned char, next_state ) \
- __field( unsigned int, next_cpu )
+ __field( unsigned char, next_state )
FTRACE_ENTRY(context_switch, ctx_switch_entry,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670..19a359d5e6d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ /* No tracing, just counting, so no obvious leak */
+ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+ return 0;
+
+ /* Some events are ok to be traced by non-root users... */
+ if (p_event->attach_state == PERF_ATTACH_TASK) {
+ if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+ return 0;
+ }
+
+ /*
+ * ...otherwise raw tracepoint data can be a severe data leak,
+ * only allow root to have these.
+ */
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return 0;
+}
+
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
struct hlist_head __percpu *list;
- int ret = -ENOMEM;
+ int ret;
int cpu;
+ ret = perf_trace_event_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+
p_event->tp_event = tp_event;
if (tp_event->perf_refcount++ > 0)
return 0;
+ ret = -ENOMEM;
+
list = alloc_percpu(struct hlist_head);
if (!list)
goto fail;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab193..2fe11034135 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_storage_mutex);
+EXPORT_SYMBOL_GPL(event_storage_mutex);
+
+char event_storage[EVENT_STORAGE_SIZE];
+EXPORT_SYMBOL_GPL(event_storage);
+
LIST_HEAD(ftrace_events);
LIST_HEAD(ftrace_common_fields);
@@ -110,7 +116,7 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
- __common_field(int, lock_depth);
+ __common_field(int, padding);
return ret;
}
@@ -320,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
{
return __ftrace_set_clr_event(NULL, system, event, set);
}
+EXPORT_SYMBOL_GPL(trace_set_clr_event);
/* 128 should be much more than enough */
#define EVENT_BUF_SIZE 127
@@ -1278,7 +1285,7 @@ trace_create_file_ops(struct module *mod)
static void trace_module_add_events(struct module *mod)
{
struct ftrace_module_file_ops *file_ops = NULL;
- struct ftrace_event_call *call, *start, *end;
+ struct ftrace_event_call **call, **start, **end;
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
@@ -1291,7 +1298,7 @@ static void trace_module_add_events(struct module *mod)
return;
for_each_event(call, start, end) {
- __trace_add_event_call(call, mod,
+ __trace_add_event_call(*call, mod,
&file_ops->id, &file_ops->enable,
&file_ops->filter, &file_ops->format);
}
@@ -1361,8 +1368,8 @@ static struct notifier_block trace_module_nb = {
.priority = 0,
};
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
+extern struct ftrace_event_call *__start_ftrace_events[];
+extern struct ftrace_event_call *__stop_ftrace_events[];
static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
@@ -1378,7 +1385,7 @@ __setup("trace_event=", setup_trace_event);
static __init int event_trace_init(void)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_call **call;
struct dentry *d_tracer;
struct dentry *entry;
struct dentry *d_events;
@@ -1424,7 +1431,7 @@ static __init int event_trace_init(void)
pr_warning("tracing: Failed to allocate common fields");
for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
- __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+ __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
&ftrace_enable_fops,
&ftrace_event_filter_fops,
&ftrace_event_format_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17..8008ddcfbf2 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
} operand;
};
+struct pred_stack {
+ struct filter_pred **preds;
+ int index;
+};
+
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_##type(struct filter_pred *pred, void *event, \
- int val1, int val2) \
+static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
}
#define DEFINE_EQUALITY_PRED(size) \
-static int filter_pred_##size(struct filter_pred *pred, void *event, \
- int val1, int val2) \
+static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
u##size *addr = (u##size *)(event + pred->offset); \
u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
DEFINE_EQUALITY_PRED(8);
-static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
- void *event __attribute((unused)),
- int val1, int val2)
-{
- return val1 && val2;
-}
-
-static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
- void *event __attribute((unused)),
- int val1, int val2)
-{
- return val1 || val2;
-}
-
/* Filter predicate for fixed sized arrays of characters */
-static int filter_pred_string(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_string(struct filter_pred *pred, void *event)
{
char *addr = (char *)(event + pred->offset);
int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
}
/* Filter predicate for char * pointers */
-static int filter_pred_pchar(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
{
char **addr = (char **)(event + pred->offset);
int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
* and add it to the address of the entry, and at last we have
* the address of the string.
*/
-static int filter_pred_strloc(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_strloc(struct filter_pred *pred, void *event)
{
u32 str_item = *(u32 *)(event + pred->offset);
int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
return match;
}
-static int filter_pred_none(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_none(struct filter_pred *pred, void *event)
{
return 0;
}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
pred->not ^= not;
}
+enum move_type {
+ MOVE_DOWN,
+ MOVE_UP_FROM_LEFT,
+ MOVE_UP_FROM_RIGHT
+};
+
+static struct filter_pred *
+get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
+ int index, enum move_type *move)
+{
+ if (pred->parent & FILTER_PRED_IS_RIGHT)
+ *move = MOVE_UP_FROM_RIGHT;
+ else
+ *move = MOVE_UP_FROM_LEFT;
+ pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
+
+ return pred;
+}
+
+/*
+ * A series of AND or ORs where found together. Instead of
+ * climbing up and down the tree branches, an array of the
+ * ops were made in order of checks. We can just move across
+ * the array and short circuit if needed.
+ */
+static int process_ops(struct filter_pred *preds,
+ struct filter_pred *op, void *rec)
+{
+ struct filter_pred *pred;
+ int match = 0;
+ int type;
+ int i;
+
+ /*
+ * Micro-optimization: We set type to true if op
+ * is an OR and false otherwise (AND). Then we
+ * just need to test if the match is equal to
+ * the type, and if it is, we can short circuit the
+ * rest of the checks:
+ *
+ * if ((match && op->op == OP_OR) ||
+ * (!match && op->op == OP_AND))
+ * return match;
+ */
+ type = op->op == OP_OR;
+
+ for (i = 0; i < op->val; i++) {
+ pred = &preds[op->ops[i]];
+ match = pred->fn(pred, rec);
+ if (!!match == type)
+ return match;
+ }
+ return match;
+}
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
- int match, top = 0, val1 = 0, val2 = 0;
- int stack[MAX_FILTER_PRED];
+ int match = -1;
+ enum move_type move = MOVE_DOWN;
+ struct filter_pred *preds;
struct filter_pred *pred;
- int i;
+ struct filter_pred *root;
+ int n_preds;
+ int done = 0;
+
+ /* no filter is considered a match */
+ if (!filter)
+ return 1;
+
+ n_preds = filter->n_preds;
+
+ if (!n_preds)
+ return 1;
+
+ /*
+ * n_preds, root and filter->preds are protect with preemption disabled.
+ */
+ preds = rcu_dereference_sched(filter->preds);
+ root = rcu_dereference_sched(filter->root);
+ if (!root)
+ return 1;
+
+ pred = root;
- for (i = 0; i < filter->n_preds; i++) {
- pred = filter->preds[i];
- if (!pred->pop_n) {
- match = pred->fn(pred, rec, val1, val2);
- stack[top++] = match;
+ /* match is currently meaningless */
+ match = -1;
+
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ /* only AND and OR have children */
+ if (pred->left != FILTER_PRED_INVALID) {
+ /* If ops is set, then it was folded. */
+ if (!pred->ops) {
+ /* keep going to down the left side */
+ pred = &preds[pred->left];
+ continue;
+ }
+ /* We can treat folded ops as a leaf node */
+ match = process_ops(preds, pred, rec);
+ } else
+ match = pred->fn(pred, rec);
+ /* If this pred is the only pred */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ /*
+ * Check for short circuits.
+ *
+ * Optimization: !!match == (pred->op == OP_OR)
+ * is the same as:
+ * if ((match && pred->op == OP_OR) ||
+ * (!match && pred->op == OP_AND))
+ */
+ if (!!match == (pred->op == OP_OR)) {
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ /* now go down the right side of the tree. */
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ /* We finished this equation. */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
continue;
}
- if (pred->pop_n > top) {
- WARN_ON_ONCE(1);
- return 0;
- }
- val1 = stack[--top];
- val2 = stack[--top];
- match = pred->fn(pred, rec, val1, val2);
- stack[top++] = match;
- }
+ done = 1;
+ } while (!done);
- return stack[--top];
+ return match;
}
EXPORT_SYMBOL_GPL(filter_match_preds);
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
static void remove_filter_string(struct event_filter *filter)
{
+ if (!filter)
+ return;
+
kfree(filter->filter_string);
filter->filter_string = NULL;
}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter = call->filter;
+ struct event_filter *filter;
mutex_lock(&event_mutex);
+ filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s)
{
- struct event_filter *filter = system->filter;
+ struct event_filter *filter;
mutex_lock(&event_mutex);
+ filter = system->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
pred->regex.len = 0;
}
-static int filter_set_pred(struct filter_pred *dest,
+static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
+{
+ stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
+ if (!stack->preds)
+ return -ENOMEM;
+ stack->index = n_preds;
+ return 0;
+}
+
+static void __free_pred_stack(struct pred_stack *stack)
+{
+ kfree(stack->preds);
+ stack->index = 0;
+}
+
+static int __push_pred_stack(struct pred_stack *stack,
+ struct filter_pred *pred)
+{
+ int index = stack->index;
+
+ if (WARN_ON(index == 0))
+ return -ENOSPC;
+
+ stack->preds[--index] = pred;
+ stack->index = index;
+ return 0;
+}
+
+static struct filter_pred *
+__pop_pred_stack(struct pred_stack *stack)
+{
+ struct filter_pred *pred;
+ int index = stack->index;
+
+ pred = stack->preds[index++];
+ if (!pred)
+ return NULL;
+
+ stack->index = index;
+ return pred;
+}
+
+static int filter_set_pred(struct event_filter *filter,
+ int idx,
+ struct pred_stack *stack,
struct filter_pred *src,
filter_pred_fn_t fn)
{
+ struct filter_pred *dest = &filter->preds[idx];
+ struct filter_pred *left;
+ struct filter_pred *right;
+
*dest = *src;
if (src->field_name) {
dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
return -ENOMEM;
}
dest->fn = fn;
+ dest->index = idx;
- return 0;
+ if (dest->op == OP_OR || dest->op == OP_AND) {
+ right = __pop_pred_stack(stack);
+ left = __pop_pred_stack(stack);
+ if (!left || !right)
+ return -EINVAL;
+ /*
+ * If both children can be folded
+ * and they are the same op as this op or a leaf,
+ * then this op can be folded.
+ */
+ if (left->index & FILTER_PRED_FOLD &&
+ (left->op == dest->op ||
+ left->left == FILTER_PRED_INVALID) &&
+ right->index & FILTER_PRED_FOLD &&
+ (right->op == dest->op ||
+ right->left == FILTER_PRED_INVALID))
+ dest->index |= FILTER_PRED_FOLD;
+
+ dest->left = left->index & ~FILTER_PRED_FOLD;
+ dest->right = right->index & ~FILTER_PRED_FOLD;
+ left->parent = dest->index & ~FILTER_PRED_FOLD;
+ right->parent = dest->index | FILTER_PRED_IS_RIGHT;
+ } else {
+ /*
+ * Make dest->left invalid to be used as a quick
+ * way to know this is a leaf node.
+ */
+ dest->left = FILTER_PRED_INVALID;
+
+ /* All leafs allow folding the parent ops. */
+ dest->index |= FILTER_PRED_FOLD;
+ }
+
+ return __push_pred_stack(stack, dest);
}
-static void filter_disable_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
{
- struct event_filter *filter = call->filter;
int i;
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
+ if (filter->preds) {
+ for (i = 0; i < filter->a_preds; i++)
+ kfree(filter->preds[i].field_name);
+ kfree(filter->preds);
+ filter->preds = NULL;
+ }
+ filter->a_preds = 0;
filter->n_preds = 0;
-
- for (i = 0; i < MAX_FILTER_PRED; i++)
- filter->preds[i]->fn = filter_pred_none;
}
-static void __free_preds(struct event_filter *filter)
+static void filter_disable(struct ftrace_event_call *call)
{
- int i;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
+}
+static void __free_filter(struct event_filter *filter)
+{
if (!filter)
return;
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- if (filter->preds[i])
- filter_free_pred(filter->preds[i]);
- }
- kfree(filter->preds);
+ __free_preds(filter);
kfree(filter->filter_string);
kfree(filter);
}
+/*
+ * Called when destroying the ftrace_event_call.
+ * The call is being freed, so we do not need to worry about
+ * the call being currently used. This is for module code removing
+ * the tracepoints from within it.
+ */
void destroy_preds(struct ftrace_event_call *call)
{
- __free_preds(call->filter);
+ __free_filter(call->filter);
call->filter = NULL;
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
-static struct event_filter *__alloc_preds(void)
+static struct event_filter *__alloc_filter(void)
{
struct event_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ return filter;
+}
+
+static int __alloc_preds(struct event_filter *filter, int n_preds)
+{
struct filter_pred *pred;
int i;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- if (!filter)
- return ERR_PTR(-ENOMEM);
+ if (filter->preds)
+ __free_preds(filter);
- filter->n_preds = 0;
+ filter->preds =
+ kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
- filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
if (!filter->preds)
- goto oom;
+ return -ENOMEM;
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
- if (!pred)
- goto oom;
+ filter->a_preds = n_preds;
+ filter->n_preds = 0;
+
+ for (i = 0; i < n_preds; i++) {
+ pred = &filter->preds[i];
pred->fn = filter_pred_none;
- filter->preds[i] = pred;
}
- return filter;
-
-oom:
- __free_preds(filter);
- return ERR_PTR(-ENOMEM);
-}
-
-static int init_preds(struct ftrace_event_call *call)
-{
- if (call->filter)
- return 0;
-
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
- call->filter = __alloc_preds();
- if (IS_ERR(call->filter))
- return PTR_ERR(call->filter);
-
return 0;
}
-static int init_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
{
struct ftrace_event_call *call;
- int err;
list_for_each_entry(call, &ftrace_events, list) {
if (strcmp(call->class->system, system->name) != 0)
continue;
- err = init_preds(call);
- if (err)
- return err;
+ filter_disable(call);
+ remove_filter_string(call->filter);
}
-
- return 0;
}
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_filters(struct event_subsystem *system)
{
struct ftrace_event_call *call;
list_for_each_entry(call, &ftrace_events, list) {
if (strcmp(call->class->system, system->name) != 0)
continue;
-
- filter_disable_preds(call);
- remove_filter_string(call->filter);
+ __free_filter(call->filter);
+ call->filter = NULL;
}
}
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_pred *pred,
+ struct pred_stack *stack,
filter_pred_fn_t fn)
{
int idx, err;
- if (filter->n_preds == MAX_FILTER_PRED) {
+ if (WARN_ON(filter->n_preds == filter->a_preds)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
return -ENOSPC;
}
idx = filter->n_preds;
- filter_clear_pred(filter->preds[idx]);
- err = filter_set_pred(filter->preds[idx], pred, fn);
+ filter_clear_pred(&filter->preds[idx]);
+ err = filter_set_pred(filter, idx, stack, pred, fn);
if (err)
return err;
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_pred *pred,
+ struct pred_stack *stack,
bool dry_run)
{
struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
unsigned long long val;
int ret;
- pred->fn = filter_pred_none;
+ fn = pred->fn = filter_pred_none;
- if (pred->op == OP_AND) {
- pred->pop_n = 2;
- fn = filter_pred_and;
+ if (pred->op == OP_AND)
goto add_pred_fn;
- } else if (pred->op == OP_OR) {
- pred->pop_n = 2;
- fn = filter_pred_or;
+ else if (pred->op == OP_OR)
goto add_pred_fn;
- }
field = find_event_field(call, pred->field_name);
if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
add_pred_fn:
if (!dry_run)
- return filter_add_pred_fn(ps, call, filter, pred, fn);
+ return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
return 0;
}
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
return 0;
}
+static int count_preds(struct filter_parse_state *ps)
+{
+ struct postfix_elt *elt;
+ int n_preds = 0;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE)
+ continue;
+ n_preds++;
+ }
+
+ return n_preds;
+}
+
+/*
+ * The tree is walked at filtering of an event. If the tree is not correctly
+ * built, it may cause an infinite loop. Check here that the tree does
+ * indeed terminate.
+ */
+static int check_pred_tree(struct event_filter *filter,
+ struct filter_pred *root)
+{
+ struct filter_pred *preds;
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int count = 0;
+ int done = 0;
+ int max;
+
+ /*
+ * The max that we can hit a node is three times.
+ * Once going down, once coming up from left, and
+ * once coming up from right. This is more than enough
+ * since leafs are only hit a single time.
+ */
+ max = 3 * filter->n_preds;
+
+ preds = filter->preds;
+ if (!preds)
+ return -EINVAL;
+ pred = root;
+
+ do {
+ if (WARN_ON(count++ > max))
+ return -EINVAL;
+
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ /* A leaf at the root is just a leaf in the tree */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ /* We are fine. */
+ return 0;
+}
+
+static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
+{
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int count = 0;
+ int done = 0;
+
+ pred = root;
+
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ /* A leaf at the root is just a leaf in the tree */
+ if (pred == root)
+ return 1;
+ count++;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ return count;
+}
+
+static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
+{
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int count = 0;
+ int children;
+ int done = 0;
+
+ /* No need to keep the fold flag */
+ root->index &= ~FILTER_PRED_FOLD;
+
+ /* If the root is a leaf then do nothing */
+ if (root->left == FILTER_PRED_INVALID)
+ return 0;
+
+ /* count the children */
+ children = count_leafs(preds, &preds[root->left]);
+ children += count_leafs(preds, &preds[root->right]);
+
+ root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
+ if (!root->ops)
+ return -ENOMEM;
+
+ root->val = children;
+
+ pred = root;
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ if (WARN_ON(count == children))
+ return -EINVAL;
+ pred->index &= ~FILTER_PRED_FOLD;
+ root->ops[count++] = pred->index;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ return 0;
+}
+
+/*
+ * To optimize the processing of the ops, if we have several "ors" or
+ * "ands" together, we can put them in an array and process them all
+ * together speeding up the filter logic.
+ */
+static int fold_pred_tree(struct event_filter *filter,
+ struct filter_pred *root)
+{
+ struct filter_pred *preds;
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int done = 0;
+ int err;
+
+ preds = filter->preds;
+ if (!preds)
+ return -EINVAL;
+ pred = root;
+
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->index & FILTER_PRED_FOLD) {
+ err = fold_pred(preds, pred);
+ if (err)
+ return err;
+ /* Folded nodes are like leafs */
+ } else if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+
+ /* A leaf at the root is just a leaf in the tree */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ return 0;
+}
+
static int replace_preds(struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
{
char *operand1 = NULL, *operand2 = NULL;
struct filter_pred *pred;
+ struct filter_pred *root;
struct postfix_elt *elt;
+ struct pred_stack stack = { }; /* init to NULL */
int err;
int n_preds = 0;
+ n_preds = count_preds(ps);
+ if (n_preds >= MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
err = check_preds(ps);
if (err)
return err;
+ if (!dry_run) {
+ err = __alloc_pred_stack(&stack, n_preds);
+ if (err)
+ return err;
+ err = __alloc_preds(filter, n_preds);
+ if (err)
+ goto fail;
+ }
+
+ n_preds = 0;
list_for_each_entry(elt, &ps->postfix, list) {
if (elt->op == OP_NONE) {
if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
operand2 = elt->operand;
else {
parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
- return -EINVAL;
+ err = -EINVAL;
+ goto fail;
}
continue;
}
- if (n_preds++ == MAX_FILTER_PRED) {
+ if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
+ err = -ENOSPC;
+ goto fail;
}
if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
if (!operand1 || !operand2) {
parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
- return -EINVAL;
+ err = -EINVAL;
+ goto fail;
}
pred = create_pred(elt->op, operand1, operand2);
add_pred:
- if (!pred)
- return -ENOMEM;
- err = filter_add_pred(ps, call, filter, pred, dry_run);
+ if (!pred) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
filter_free_pred(pred);
if (err)
- return err;
+ goto fail;
operand1 = operand2 = NULL;
}
- return 0;
+ if (!dry_run) {
+ /* We should have one item left on the stack */
+ pred = __pop_pred_stack(&stack);
+ if (!pred)
+ return -EINVAL;
+ /* This item is where we start from in matching */
+ root = pred;
+ /* Make sure the stack is empty */
+ pred = __pop_pred_stack(&stack);
+ if (WARN_ON(pred)) {
+ err = -EINVAL;
+ filter->root = NULL;
+ goto fail;
+ }
+ err = check_pred_tree(filter, root);
+ if (err)
+ goto fail;
+
+ /* Optimize the tree */
+ err = fold_pred_tree(filter, root);
+ if (err)
+ goto fail;
+
+ /* We don't set root until we know it works */
+ barrier();
+ filter->root = root;
+ }
+
+ err = 0;
+fail:
+ __free_pred_stack(&stack);
+ return err;
}
+struct filter_list {
+ struct list_head list;
+ struct event_filter *filter;
+};
+
static int replace_system_preds(struct event_subsystem *system,
struct filter_parse_state *ps,
char *filter_string)
{
struct ftrace_event_call *call;
+ struct filter_list *filter_item;
+ struct filter_list *tmp;
+ LIST_HEAD(filter_list);
bool fail = true;
int err;
list_for_each_entry(call, &ftrace_events, list) {
- struct event_filter *filter = call->filter;
if (strcmp(call->class->system, system->name) != 0)
continue;
- /* try to see if the filter can be applied */
- err = replace_preds(call, filter, ps, filter_string, true);
+ /*
+ * Try to see if the filter can be applied
+ * (filter arg is ignored on dry_run)
+ */
+ err = replace_preds(call, NULL, ps, filter_string, true);
if (err)
+ goto fail;
+ }
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ struct event_filter *filter;
+
+ if (strcmp(call->class->system, system->name) != 0)
continue;
- /* really apply the filter */
- filter_disable_preds(call);
- err = replace_preds(call, filter, ps, filter_string, false);
+ filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+ if (!filter_item)
+ goto fail_mem;
+
+ list_add_tail(&filter_item->list, &filter_list);
+
+ filter_item->filter = __alloc_filter();
+ if (!filter_item->filter)
+ goto fail_mem;
+ filter = filter_item->filter;
+
+ /* Can only fail on no memory */
+ err = replace_filter_string(filter, filter_string);
if (err)
- filter_disable_preds(call);
- else {
+ goto fail_mem;
+
+ err = replace_preds(call, filter, ps, filter_string, false);
+ if (err) {
+ filter_disable(call);
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ append_filter_err(ps, filter);
+ } else
call->flags |= TRACE_EVENT_FL_FILTERED;
- replace_filter_string(filter, filter_string);
- }
+ /*
+ * Regardless of if this returned an error, we still
+ * replace the filter for the call.
+ */
+ filter = call->filter;
+ call->filter = filter_item->filter;
+ filter_item->filter = filter;
+
fail = false;
}
- if (fail) {
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- return -EINVAL;
+ if (fail)
+ goto fail;
+
+ /*
+ * The calls can still be using the old filters.
+ * Do a synchronize_sched() to ensure all calls are
+ * done with them before we free them.
+ */
+ synchronize_sched();
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
}
return 0;
+ fail:
+ /* No call succeeded */
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return -EINVAL;
+ fail_mem:
+ /* If any call succeeded, we still need to sync */
+ if (!fail)
+ synchronize_sched();
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ return -ENOMEM;
}
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
- int err;
struct filter_parse_state *ps;
+ struct event_filter *filter;
+ struct event_filter *tmp;
+ int err = 0;
mutex_lock(&event_mutex);
- err = init_preds(call);
- if (err)
- goto out_unlock;
-
if (!strcmp(strstrip(filter_string), "0")) {
- filter_disable_preds(call);
- remove_filter_string(call->filter);
+ filter_disable(call);
+ filter = call->filter;
+ if (!filter)
+ goto out_unlock;
+ call->filter = NULL;
+ /* Make sure the filter is not being used */
+ synchronize_sched();
+ __free_filter(filter);
goto out_unlock;
}
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
if (!ps)
goto out_unlock;
- filter_disable_preds(call);
- replace_filter_string(call->filter, filter_string);
+ filter = __alloc_filter();
+ if (!filter) {
+ kfree(ps);
+ goto out_unlock;
+ }
+
+ replace_filter_string(filter, filter_string);
parse_init(ps, filter_ops, filter_string);
err = filter_parse(ps);
if (err) {
- append_filter_err(ps, call->filter);
+ append_filter_err(ps, filter);
goto out;
}
- err = replace_preds(call, call->filter, ps, filter_string, false);
- if (err)
- append_filter_err(ps, call->filter);
- else
+ err = replace_preds(call, filter, ps, filter_string, false);
+ if (err) {
+ filter_disable(call);
+ append_filter_err(ps, filter);
+ } else
call->flags |= TRACE_EVENT_FL_FILTERED;
out:
+ /*
+ * Always swap the call filter with the new filter
+ * even if there was an error. If there was an error
+ * in the filter, we disable the filter and show the error
+ * string
+ */
+ tmp = call->filter;
+ call->filter = filter;
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ __free_filter(tmp);
+ }
filter_opstack_clear(ps);
postfix_clear(ps);
kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
- int err;
struct filter_parse_state *ps;
+ struct event_filter *filter;
+ int err = 0;
mutex_lock(&event_mutex);
- err = init_subsystem_preds(system);
- if (err)
- goto out_unlock;
-
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
+ filter = system->filter;
+ system->filter = NULL;
+ /* Ensure all filters are no longer used */
+ synchronize_sched();
+ filter_free_subsystem_filters(system);
+ __free_filter(filter);
goto out_unlock;
}
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
if (!ps)
goto out_unlock;
- replace_filter_string(system->filter, filter_string);
+ filter = __alloc_filter();
+ if (!filter)
+ goto out;
+
+ replace_filter_string(filter, filter_string);
+ /*
+ * No event actually uses the system filter
+ * we can free it without synchronize_sched().
+ */
+ __free_filter(system->filter);
+ system->filter = filter;
parse_init(ps, filter_ops, filter_string);
err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
struct event_filter *filter = event->filter;
event->filter = NULL;
- __free_preds(filter);
+ __free_filter(filter);
}
int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
if (event->filter)
goto out_unlock;
- filter = __alloc_preds();
- if (IS_ERR(filter)) {
+ filter = __alloc_filter();
+ if (!filter) {
err = PTR_ERR(filter);
goto out_unlock;
}
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
err = -ENOMEM;
ps = kzalloc(sizeof(*ps), GFP_KERNEL);
if (!ps)
- goto free_preds;
+ goto free_filter;
parse_init(ps, filter_ops, filter_str);
err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
postfix_clear(ps);
kfree(ps);
-free_preds:
+free_filter:
if (err)
- __free_preds(filter);
+ __free_filter(filter);
out_unlock:
mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac2..bbeec31e0ae 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __array
#define __array(type, item, len) \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ do { \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ mutex_lock(&event_storage_mutex); \
+ snprintf(event_storage, sizeof(event_storage), \
+ "%s[%d]", #type, len); \
+ ret = trace_define_field(event_call, event_storage, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), FILTER_OTHER); \
- if (ret) \
- return ret;
+ mutex_unlock(&event_storage_mutex); \
+ if (ret) \
+ return ret; \
+ } while (0);
#undef __array_desc
#define __array_desc(type, container, item, len) \
@@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
}; \
\
-struct ftrace_event_call __used \
-__attribute__((__aligned__(4))) \
-__attribute__((section("_ftrace_events"))) event_##call = { \
+struct ftrace_event_call __used event_##call = { \
.name = #call, \
.event.type = etype, \
.class = &event_class_ftrace_##call, \
.print_fmt = print, \
}; \
+struct ftrace_event_call __used \
+__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225..962cdb24ed8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
*
* returns 1 if
* - we are inside irq code
- * - we just extered irq code
+ * - we just entered irq code
*
* retunns 0 if
* - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b88..a4969b47afc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
* skip the latency if the sequence has changed - some other section
* did a maximum and could disturb our measurement with serial console
* printouts, etc. Truly coinciding maximum latencies should be rare
- * and what happens together happens separately as well, so this doesnt
+ * and what happens together happens separately as well, so this doesn't
* decrease the validity of the maximum found:
*/
static __cacheline_aligned_in_smp unsigned long max_sequence;
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
* Stubs:
*/
-void early_boot_irqs_off(void)
-{
-}
-
-void early_boot_irqs_on(void)
-{
-}
-
void trace_softirqs_on(unsigned long ip)
{
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b..35d55a38614 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
kfree(data);
}
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+ struct fetch_param orig;
+ unsigned char hi_shift;
+ unsigned char low_shift;
+};
+
+#define DEFINE_FETCH_bitfield(type) \
+static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct bitfield_fetch_param *bprm = data; \
+ type buf = 0; \
+ call_fetch(&bprm->orig, regs, &buf); \
+ if (buf) { \
+ buf <<= bprm->hi_shift; \
+ buf >>= bprm->low_shift; \
+ } \
+ *(type *)dest = buf; \
+}
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static __kprobes void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
/* Default (unsigned long) fetch type */
#define __DEFAULT_FETCH_TYPE(t) u##t
#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
FETCH_MTD_memory,
FETCH_MTD_symbol,
FETCH_MTD_deref,
+ FETCH_MTD_bitfield,
FETCH_MTD_END,
};
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
ASSIGN_FETCH_FUNC(memory, ftype), \
ASSIGN_FETCH_FUNC(symbol, ftype), \
ASSIGN_FETCH_FUNC(deref, ftype), \
+ASSIGN_FETCH_FUNC(bitfield, ftype), \
} \
}
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
if (!type)
type = DEFAULT_FETCH_TYPE_STR;
+ /* Special case: bitfield */
+ if (*type == 'b') {
+ unsigned long bs;
+ type = strchr(type, '/');
+ if (!type)
+ goto fail;
+ type++;
+ if (strict_strtoul(type, 0, &bs))
+ goto fail;
+ switch (bs) {
+ case 8:
+ return find_fetch_type("u8");
+ case 16:
+ return find_fetch_type("u16");
+ case 32:
+ return find_fetch_type("u32");
+ case 64:
+ return find_fetch_type("u64");
+ default:
+ goto fail;
+ }
+ }
+
for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
if (strcmp(type, fetch_type_table[i].name) == 0)
return &fetch_type_table[i];
+fail:
return NULL;
}
@@ -586,7 +649,9 @@ error:
static void free_probe_arg(struct probe_arg *arg)
{
- if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ free_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
free_deref_fetch_param(arg->fetch.data);
else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
}
break;
case '+': /* deref memory */
+ arg++; /* Skip '+', because strict_strtol() rejects it. */
case '-':
tmp = strchr(arg, '(');
if (!tmp)
break;
*tmp = '\0';
- ret = strict_strtol(arg + 1, 0, &offset);
+ ret = strict_strtol(arg, 0, &offset);
if (ret)
break;
- if (arg[0] == '-')
- offset = -offset;
arg = tmp + 1;
tmp = strrchr(arg, ')');
if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
return ret;
}
+#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/* Bitfield type needs to be parsed into a fetch function */
+static int __parse_bitfield_probe_arg(const char *bf,
+ const struct fetch_type *t,
+ struct fetch_param *f)
+{
+ struct bitfield_fetch_param *bprm;
+ unsigned long bw, bo;
+ char *tail;
+
+ if (*bf != 'b')
+ return 0;
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm)
+ return -ENOMEM;
+ bprm->orig = *f;
+ f->fn = t->fetch[FETCH_MTD_bitfield];
+ f->data = (void *)bprm;
+
+ bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
+ if (bw == 0 || *tail != '@')
+ return -EINVAL;
+
+ bf = tail + 1;
+ bo = simple_strtoul(bf, &tail, 0);
+ if (tail == bf || *tail != '/')
+ return -EINVAL;
+
+ bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+ bprm->low_shift = bprm->hi_shift + bo;
+ return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
+}
+
/* String length checking wrapper */
static int parse_probe_arg(char *arg, struct trace_probe *tp,
struct probe_arg *parg, int is_return)
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
parg->offset = tp->size;
tp->size += parg->type->size;
ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+ if (ret >= 0 && t != NULL)
+ ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
if (ret >= 0) {
parg->fetch_size.fn = get_fetch_size_function(parg->type,
parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
return ret;
}
-#define WRITE_BUFSIZE 128
+#define WRITE_BUFSIZE 4096
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
@@ -1738,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp)
kfree(tp->call.print_fmt);
}
-/* Make a debugfs interface for controling probe points */
+/* Make a debugfs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa220..456be9063c2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
* @entry: The trace entry field from the ring buffer
*
* Prints the generic fields of irqs off, in hard or softirq, preempt
- * count and lock depth.
+ * count.
*/
int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
- int hardirq, softirq;
+ char hardsoft_irq;
+ char need_resched;
+ char irqs_off;
+ int hardirq;
+ int softirq;
int ret;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ irqs_off =
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
+ '.';
+ need_resched =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+ hardsoft_irq =
+ (hardirq && softirq) ? 'H' :
+ hardirq ? 'h' :
+ softirq ? 's' :
+ '.';
+
if (!trace_seq_printf(s, "%c%c%c",
- (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
- 'X' : '.',
- (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
- 'N' : '.',
- (hardirq && softirq) ? 'H' :
- hardirq ? 'h' : softirq ? 's' : '.'))
+ irqs_off, need_resched, hardsoft_irq))
return 0;
if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
else
ret = trace_seq_putc(s, '.');
- if (!ret)
- return 0;
-
- if (entry->lock_depth < 0)
- return trace_seq_putc(s, '.');
-
- return trace_seq_printf(s, "%d", entry->lock_depth);
+ return ret;
}
static int
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c4..7e62c0a1845 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
ctx_trace = tr;
}
-static void stop_sched_trace(struct trace_array *tr)
-{
- tracing_stop_sched_switch_record();
-}
-
-static int sched_switch_trace_init(struct trace_array *tr)
-{
- ctx_trace = tr;
- tracing_reset_online_cpus(tr);
- tracing_start_sched_switch_record();
- return 0;
-}
-
-static void sched_switch_trace_reset(struct trace_array *tr)
-{
- if (sched_ref)
- stop_sched_trace(tr);
-}
-
-static void sched_switch_trace_start(struct trace_array *tr)
-{
- sched_stopped = 0;
-}
-
-static void sched_switch_trace_stop(struct trace_array *tr)
-{
- sched_stopped = 1;
-}
-
-static struct tracer sched_switch_trace __read_mostly =
-{
- .name = "sched_switch",
- .init = sched_switch_trace_init,
- .reset = sched_switch_trace_reset,
- .start = sched_switch_trace_start,
- .stop = sched_switch_trace_stop,
- .wait_pipe = poll_wait_pipe,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_sched_switch,
-#endif
-};
-
-__init static int init_sched_switch_trace(void)
-{
- return register_tracer(&sched_switch_trace);
-}
-device_initcall(init_sched_switch_trace);
-
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b320..659732eba07 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
static int trace_wakeup_test_thread(void *data)
{
/* Make this a RT thread, doesn't need to be too high */
- struct sched_param param = { .sched_priority = 5 };
+ static const struct sched_param param = { .sched_priority = 5 };
struct completion *x = data;
sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb..ee7b5a0bb9f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);
-/* All syscall exit events have the same fields */
-static LIST_HEAD(syscall_exit_fields);
-
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
@@ -34,61 +31,66 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
return &entry->enter_fields;
}
-static struct list_head *
-syscall_get_exit_fields(struct ftrace_event_call *call)
-{
- return &syscall_exit_fields;
-}
-
struct trace_event_functions enter_syscall_print_funcs = {
- .trace = print_syscall_enter,
+ .trace = print_syscall_enter,
};
struct trace_event_functions exit_syscall_print_funcs = {
- .trace = print_syscall_exit,
+ .trace = print_syscall_exit,
};
struct ftrace_event_class event_class_syscall_enter = {
- .system = "syscalls",
- .reg = syscall_enter_register,
- .define_fields = syscall_enter_define_fields,
- .get_fields = syscall_get_enter_fields,
- .raw_init = init_syscall_trace,
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
};
struct ftrace_event_class event_class_syscall_exit = {
- .system = "syscalls",
- .reg = syscall_exit_register,
- .define_fields = syscall_exit_define_fields,
- .get_fields = syscall_get_exit_fields,
- .raw_init = init_syscall_trace,
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
+ .raw_init = init_syscall_trace,
};
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
+extern struct syscall_metadata *__start_syscalls_metadata[];
+extern struct syscall_metadata *__stop_syscalls_metadata[];
static struct syscall_metadata **syscalls_metadata;
-static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Only compare after the "sys" prefix. Archs that use
+ * syscall wrappers may have syscalls symbols aliases prefixed
+ * with "SyS" instead of "sys", leading to an unwanted
+ * mismatch.
+ */
+ return !strcmp(sym + 3, name + 3);
+}
+#endif
+
+static __init struct syscall_metadata *
+find_syscall_meta(unsigned long syscall)
{
- struct syscall_metadata *start;
- struct syscall_metadata *stop;
+ struct syscall_metadata **start;
+ struct syscall_metadata **stop;
char str[KSYM_SYMBOL_LEN];
- start = (struct syscall_metadata *)__start_syscalls_metadata;
- stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+ start = __start_syscalls_metadata;
+ stop = __stop_syscalls_metadata;
kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+ if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
+ return NULL;
+
for ( ; start < stop; start++) {
- /*
- * Only compare after the "sys" prefix. Archs that use
- * syscall wrappers may have syscalls symbols aliases prefixed
- * with "SyS" instead of "sys", leading to an unwanted
- * mismatch.
- */
- if (start->name && !strcmp(start->name + 3, str + 3))
- return start;
+ if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
+ return *start;
}
return NULL;
}
@@ -367,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
@@ -385,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_enter--;
@@ -401,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
@@ -419,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_exit--;
@@ -432,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
int init_syscall_trace(struct ftrace_event_call *call)
{
int id;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls) {
+ pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+ ((struct syscall_metadata *)call->data)->name);
+ return -ENOSYS;
+ }
if (set_syscall_print_fmt(call) < 0)
return -ENOMEM;
@@ -446,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
return id;
}
-unsigned long __init arch_syscall_addr(int nr)
+unsigned long __init __weak arch_syscall_addr(int nr)
{
return (unsigned long)sys_call_table[nr];
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d4..68187af4889 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
#include <linux/sched.h>
#include <linux/jump_label.h>
-extern struct tracepoint __start___tracepoints[];
-extern struct tracepoint __stop___tracepoints[];
+extern struct tracepoint * const __start___tracepoints_ptrs[];
+extern struct tracepoint * const __stop___tracepoints_ptrs[];
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
*
* Updates the probe callback corresponding to a range of tracepoints.
*/
-void
-tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
{
- struct tracepoint *iter;
+ struct tracepoint * const *iter;
struct tracepoint_entry *mark_entry;
if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
mutex_lock(&tracepoints_mutex);
for (iter = begin; iter < end; iter++) {
- mark_entry = get_tracepoint(iter->name);
+ mark_entry = get_tracepoint((*iter)->name);
if (mark_entry) {
- set_tracepoint(&mark_entry, iter,
+ set_tracepoint(&mark_entry, *iter,
!!mark_entry->refcount);
} else {
- disable_tracepoint(iter);
+ disable_tracepoint(*iter);
}
}
mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
static void tracepoint_update_probes(void)
{
/* Core kernel tracepoints */
- tracepoint_update_probe_range(__start___tracepoints,
- __stop___tracepoints);
+ tracepoint_update_probe_range(__start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs);
/* tracepoints in modules. */
module_update_tracepoints();
}
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
* Will return the first tracepoint in the range if the input tracepoint is
* NULL.
*/
-int tracepoint_get_iter_range(struct tracepoint **tracepoint,
- struct tracepoint *begin, struct tracepoint *end)
+int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+ struct tracepoint * const *begin, struct tracepoint * const *end)
{
if (!*tracepoint && begin != end) {
*tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
/* Core kernel tracepoints */
if (!iter->module) {
found = tracepoint_get_iter_range(&iter->tracepoint,
- __start___tracepoints, __stop___tracepoints);
+ __start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs);
if (found)
goto end;
}
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
case MODULE_STATE_GOING:
- tracepoint_update_probe_range(mod->tracepoints,
- mod->tracepoints + mod->num_tracepoints);
+ tracepoint_update_probe_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
break;
}
return 0;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d8..51c6e89e861 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!nsown_capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a..92cb706c7fc 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
/*
* Removes a registered user return notifier. Must be called from atomic
- * context, and from the same cpu registration occured in.
+ * context, and from the same cpu registration occurred in.
*/
void user_return_notifier_unregister(struct user_return_notifier *urn)
{
diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b..9e03e9c1df8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
#include <linux/module.h>
#include <linux/user_namespace.h>
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
struct user_namespace init_user_ns = {
.kref = {
- .refcount = ATOMIC_INIT(2),
+ .refcount = ATOMIC_INIT(3),
},
.creator = &root_user,
};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
*/
static DEFINE_SPINLOCK(uidhash_lock);
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
struct user_struct root_user = {
.__count = ATOMIC_INIT(2),
.processes = ATOMIC_INIT(1),
@@ -158,6 +162,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ put_user_ns(ns);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291..9da289c34f2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
#include <linux/highuid.h>
#include <linux/cred.h>
+static struct kmem_cache *user_ns_cachep __read_mostly;
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
struct user_struct *root_user;
int n;
- ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+ ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
return -ENOMEM;
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
/* Alloc new root user. */
root_user = alloc_uid(ns, 0);
if (!root_user) {
- kfree(ns);
+ kmem_cache_free(user_ns_cachep, ns);
return -ENOMEM;
}
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
struct user_namespace *ns =
container_of(work, struct user_namespace, destroyer);
free_uid(ns->creator);
- kfree(ns);
+ kmem_cache_free(user_ns_cachep, ns);
}
void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
/* No useful relationship so no mapping */
return overflowgid;
}
+
+static __init int user_namespaces_init(void)
+{
+ user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+ return 0;
+}
+module_init(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea5..44646179eab 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
#include <linux/utsname.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/user_namespace.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
* @old_ns: namespace to clone
* Return NULL on error (failure to kmalloc), new ns otherwise
*/
-static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+ struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
up_read(&uts_sem);
return ns;
}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
+struct uts_namespace *copy_utsname(unsigned long flags,
+ struct task_struct *tsk)
{
+ struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
if (!(flags & CLONE_NEWUTS))
return old_ns;
- new_ns = clone_uts_ns(old_ns);
+ new_ns = clone_uts_ns(tsk, old_ns);
put_uts_ns(old_ns);
return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
struct uts_namespace *ns;
ns = container_of(kref, struct uts_namespace, kref);
+ put_user_ns(ns->user_ns);
kfree(ns);
}
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1..f45ea8d2a1c 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
* woken up through the queue.
*
* This prevents waiter starvation where an exclusive waiter
- * aborts and is woken up concurrently and noone wakes up
+ * aborts and is woken up concurrently and no one wakes up
* the next waiter.
*/
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024..14733d4d156 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
-int watchdog_enabled;
+int watchdog_enabled = 1;
int __read_mostly softlockup_thresh = 60;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,20 +43,22 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
-static int no_watchdog;
-
-
/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic;
+static int hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static int __init hardlockup_panic_setup(char *str)
{
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled = 0;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -75,7 +77,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- no_watchdog = 1;
+ watchdog_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
@@ -83,7 +85,7 @@ __setup("nowatchdog", nowatchdog_setup);
/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
- no_watchdog = 1;
+ watchdog_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
@@ -116,12 +118,12 @@ static void __touch_watchdog(void)
{
int this_cpu = smp_processor_id();
- __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+ __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
}
void touch_softlockup_watchdog(void)
{
- __raw_get_cpu_var(watchdog_touch_ts) = 0;
+ __this_cpu_write(watchdog_touch_ts, 0);
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -165,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)
/* watchdog detector functions */
static int is_hardlockup(void)
{
- unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
+ unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
- if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
+ if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
return 1;
- __get_cpu_var(hrtimer_interrupts_saved) = hrint;
+ __this_cpu_write(hrtimer_interrupts_saved, hrint);
return 0;
}
#endif
@@ -203,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
- if (__get_cpu_var(watchdog_nmi_touch) == true) {
- __get_cpu_var(watchdog_nmi_touch) = false;
+ if (__this_cpu_read(watchdog_nmi_touch) == true) {
+ __this_cpu_write(watchdog_nmi_touch, false);
return;
}
@@ -218,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
int this_cpu = smp_processor_id();
/* only print hardlockups once */
- if (__get_cpu_var(hard_watchdog_warn) == true)
+ if (__this_cpu_read(hard_watchdog_warn) == true)
return;
if (hardlockup_panic)
@@ -226,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
else
WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- __get_cpu_var(hard_watchdog_warn) = true;
+ __this_cpu_write(hard_watchdog_warn, true);
return;
}
- __get_cpu_var(hard_watchdog_warn) = false;
+ __this_cpu_write(hard_watchdog_warn, false);
return;
}
static void watchdog_interrupt_count(void)
{
- __get_cpu_var(hrtimer_interrupts)++;
+ __this_cpu_inc(hrtimer_interrupts);
}
#else
static inline void watchdog_interrupt_count(void) { return; }
@@ -244,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
- unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
+ unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
@@ -252,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
watchdog_interrupt_count();
/* kick the softlockup detector */
- wake_up_process(__get_cpu_var(softlockup_watchdog));
+ wake_up_process(__this_cpu_read(softlockup_watchdog));
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
if (touch_ts == 0) {
- if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
+ if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
* make sure the scheduler tick is up to date.
*/
- __get_cpu_var(softlockup_touch_sync) = false;
+ __this_cpu_write(softlockup_touch_sync, false);
sched_clock_tick();
}
__touch_watchdog();
@@ -279,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
duration = is_softlockup(touch_ts);
if (unlikely(duration)) {
/* only warn once */
- if (__get_cpu_var(soft_watchdog_warn) == true)
+ if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -294,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (softlockup_panic)
panic("softlockup: hung tasks");
- __get_cpu_var(soft_watchdog_warn) = true;
+ __this_cpu_write(soft_watchdog_warn, true);
} else
- __get_cpu_var(soft_watchdog_warn) = false;
+ __this_cpu_write(soft_watchdog_warn, false);
return HRTIMER_RESTART;
}
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
static int watchdog(void *unused)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -364,7 +366,14 @@ static int watchdog_nmi_enable(int cpu)
goto out_save;
}
- printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+ else
+ printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
return PTR_ERR(event);
/* success path */
@@ -409,19 +418,25 @@ static int watchdog_prepare_cpu(int cpu)
static int watchdog_enable(int cpu)
{
struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err;
+ int err = 0;
/* enable the perf event */
err = watchdog_nmi_enable(cpu);
- if (err)
- return err;
+
+ /* Regardless of err above, fall through and start softlockup */
/* create the watchdog thread */
if (!p) {
p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
if (IS_ERR(p)) {
printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
- return PTR_ERR(p);
+ if (!err) {
+ /* if hardlockup hasn't already set this */
+ err = PTR_ERR(p);
+ /* and disable the perf event */
+ watchdog_nmi_disable(cpu);
+ }
+ goto out;
}
kthread_bind(p, cpu);
per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -429,10 +444,8 @@ static int watchdog_enable(int cpu)
wake_up_process(p);
}
- /* if any cpu succeeds, watchdog is considered enabled for the system */
- watchdog_enabled = 1;
-
- return 0;
+out:
+ return err;
}
static void watchdog_disable(int cpu)
@@ -459,12 +472,16 @@ static void watchdog_disable(int cpu)
static void watchdog_enable_all_cpus(void)
{
int cpu;
- int result = 0;
+
+ watchdog_enabled = 0;
for_each_online_cpu(cpu)
- result += watchdog_enable(cpu);
+ if (!watchdog_enable(cpu))
+ /* if any cpu succeeds, watchdog is considered
+ enabled for the system */
+ watchdog_enabled = 1;
- if (result)
+ if (!watchdog_enabled)
printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
}
@@ -473,9 +490,6 @@ static void watchdog_disable_all_cpus(void)
{
int cpu;
- if (no_watchdog)
- return;
-
for_each_online_cpu(cpu)
watchdog_disable(cpu);
@@ -495,10 +509,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
{
proc_dointvec(table, write, buffer, length, ppos);
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
+ if (write) {
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+ }
return 0;
}
@@ -527,7 +543,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- err = watchdog_enable(hotcpu);
+ if (watchdog_enabled)
+ err = watchdog_enable(hotcpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
@@ -540,27 +557,29 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
- return notifier_from_errno(err);
+
+ /*
+ * hardlockup and softlockup are not important enough
+ * to block cpu bring up. Just always succeed and
+ * rely on printk output to flag problems.
+ */
+ return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
-static int __init spawn_watchdog_task(void)
+void __init lockup_detector_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
- if (no_watchdog)
- return 0;
-
err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
WARN_ON(notifier_to_errno(err));
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
- return 0;
+ return;
}
-early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 90db1bd1a97..e3378e8d3a5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
- MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
+ MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
+ /* call for help after 10ms
+ (min two ticks) */
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
@@ -249,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
struct workqueue_struct *system_long_wq __read_mostly;
struct workqueue_struct *system_nrt_wq __read_mostly;
struct workqueue_struct *system_unbound_wq __read_mostly;
+struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_wq);
EXPORT_SYMBOL_GPL(system_long_wq);
EXPORT_SYMBOL_GPL(system_nrt_wq);
EXPORT_SYMBOL_GPL(system_unbound_wq);
+EXPORT_SYMBOL_GPL(system_freezable_wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -314,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
static struct debug_obj_descr work_debug_descr;
+static void *work_debug_hint(void *addr)
+{
+ return ((struct work_struct *) addr)->func;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
@@ -385,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
static struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
+ .debug_hint = work_debug_hint,
.fixup_init = work_fixup_init,
.fixup_activate = work_fixup_activate,
.fixup_free = work_fixup_free,
@@ -661,7 +671,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
{
struct worker *worker = kthread_data(task);
- if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
+ if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(get_gcwq_nr_running(cpu));
}
@@ -687,7 +697,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
struct global_cwq *gcwq = get_gcwq(cpu);
atomic_t *nr_running = get_gcwq_nr_running(cpu);
- if (unlikely(worker->flags & WORKER_NOT_RUNNING))
+ if (worker->flags & WORKER_NOT_RUNNING)
return NULL;
/* this can only happen on the local cpu */
@@ -768,7 +778,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
worker->flags &= ~flags;
- /* if transitioning out of NOT_RUNNING, increment nr_running */
+ /*
+ * If transitioning out of NOT_RUNNING, increment nr_running. Note
+ * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
+ * of multiple flags, not a single flag.
+ */
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -932,6 +946,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
wake_up_worker(gcwq);
}
+/*
+ * Test whether @work is being queued from another work executing on the
+ * same workqueue. This is rather expensive and should only be used from
+ * cold paths.
+ */
+static bool is_chained_work(struct workqueue_struct *wq)
+{
+ unsigned long flags;
+ unsigned int cpu;
+
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct worker *worker;
+ struct hlist_node *pos;
+ int i;
+
+ spin_lock_irqsave(&gcwq->lock, flags);
+ for_each_busy_worker(worker, i, pos, gcwq) {
+ if (worker->task != current)
+ continue;
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+ /*
+ * I'm @worker, no locking necessary. See if @work
+ * is headed to the same workqueue.
+ */
+ return worker->current_cwq->wq == wq;
+ }
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+ }
+ return false;
+}
+
static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
@@ -943,7 +989,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
debug_work_activate(work);
- if (WARN_ON_ONCE(wq->flags & WQ_DYING))
+ /* if dying, only works from the same workqueue are allowed */
+ if (unlikely(wq->flags & WQ_DYING) &&
+ WARN_ON_ONCE(!is_chained_work(wq)))
return;
/* determine gcwq to use */
@@ -1243,8 +1291,14 @@ __acquires(&gcwq->lock)
return true;
spin_unlock_irq(&gcwq->lock);
- /* CPU has come up inbetween, retry migration */
+ /*
+ * We've raced with CPU hot[un]plug. Give it a breather
+ * and retry migration. cond_resched() is required here;
+ * otherwise, we might deadlock against cpu_stop trying to
+ * bring down the CPU on non-preemptive kernel.
+ */
cpu_relax();
+ cond_resched();
}
}
@@ -1318,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
worker->id = id;
if (!on_unbound_cpu)
- worker->task = kthread_create(worker_thread, worker,
- "kworker/%u:%d", gcwq->cpu, id);
+ worker->task = kthread_create_on_node(worker_thread,
+ worker,
+ cpu_to_node(gcwq->cpu),
+ "kworker/%u:%d", gcwq->cpu, id);
else
worker->task = kthread_create(worker_thread, worker,
"kworker/u:%d", id);
@@ -1806,7 +1862,7 @@ __acquires(&gcwq->lock)
spin_unlock_irq(&gcwq->lock);
work_clear_pending(work);
- lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
f(work);
@@ -2009,6 +2065,15 @@ repeat:
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
+
+ /*
+ * Leave this gcwq. If keep_working() is %true, notify a
+ * regular worker; otherwise, we end up with 0 concurrency
+ * and stalling the execution.
+ */
+ if (keep_working(gcwq))
+ wake_up_worker(gcwq);
+
spin_unlock_irq(&gcwq->lock);
}
@@ -2350,8 +2415,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
insert_wq_barrier(cwq, barr, work, worker);
spin_unlock_irq(&gcwq->lock);
- lock_map_acquire(&cwq->wq->lockdep_map);
+ /*
+ * If @max_active is 1 or rescuer is in use, flushing another work
+ * item on the same workqueue may lead to deadlock. Make sure the
+ * flusher is not running on the same workqueue by verifying write
+ * access.
+ */
+ if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ else
+ lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_release(&cwq->wq->lockdep_map);
+
return true;
already_gone:
spin_unlock_irq(&gcwq->lock);
@@ -2908,7 +2983,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
*/
spin_lock(&workqueue_lock);
- if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+ if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
for_each_cwq_cpu(cpu, wq)
get_cwq(cpu, wq)->max_active = 0;
@@ -2936,11 +3011,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
+ unsigned int flush_cnt = 0;
unsigned int cpu;
+ /*
+ * Mark @wq dying and drain all pending works. Once WQ_DYING is
+ * set, only chain queueing is allowed. IOW, only currently
+ * pending or running work items on @wq can queue further work
+ * items on it. @wq is flushed repeatedly until it becomes empty.
+ * The number of flushing is detemined by the depth of chaining and
+ * should be relatively short. Whine if it takes too long.
+ */
wq->flags |= WQ_DYING;
+reflush:
flush_workqueue(wq);
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq->nr_active && list_empty(&cwq->delayed_works))
+ continue;
+
+ if (++flush_cnt == 10 ||
+ (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+ printk(KERN_WARNING "workqueue %s: flush on "
+ "destruction isn't complete after %u tries\n",
+ wq->name, flush_cnt);
+ goto reflush;
+ }
+
/*
* wq list is used to freeze wq, remove from list after
* flushing is complete in case freeze races us.
@@ -2996,7 +3095,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
spin_lock_irq(&gcwq->lock);
- if (!(wq->flags & WQ_FREEZEABLE) ||
+ if (!(wq->flags & WQ_FREEZABLE) ||
!(gcwq->flags & GCWQ_FREEZING))
get_cwq(gcwq->cpu, wq)->max_active = max_active;
@@ -3246,7 +3345,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
* want to get it over with ASAP - spam rescuers, wake up as
* many idlers as necessary and create new ones till the
* worklist is empty. Note that if the gcwq is frozen, there
- * may be frozen works in freezeable cwqs. Don't declare
+ * may be frozen works in freezable cwqs. Don't declare
* completion while frozen.
*/
while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3504,9 +3603,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
/**
* freeze_workqueues_begin - begin freezing workqueues
*
- * Start freezing workqueues. After this function returns, all
- * freezeable workqueues will queue new works to their frozen_works
- * list instead of gcwq->worklist.
+ * Start freezing workqueues. After this function returns, all freezable
+ * workqueues will queue new works to their frozen_works list instead of
+ * gcwq->worklist.
*
* CONTEXT:
* Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3532,7 +3631,7 @@ void freeze_workqueues_begin(void)
list_for_each_entry(wq, &workqueues, list) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- if (cwq && wq->flags & WQ_FREEZEABLE)
+ if (cwq && wq->flags & WQ_FREEZABLE)
cwq->max_active = 0;
}
@@ -3543,7 +3642,7 @@ void freeze_workqueues_begin(void)
}
/**
- * freeze_workqueues_busy - are freezeable workqueues still busy?
+ * freeze_workqueues_busy - are freezable workqueues still busy?
*
* Check whether freezing is complete. This function must be called
* between freeze_workqueues_begin() and thaw_workqueues().
@@ -3552,8 +3651,8 @@ void freeze_workqueues_begin(void)
* Grabs and releases workqueue_lock.
*
* RETURNS:
- * %true if some freezeable workqueues are still busy. %false if
- * freezing is complete.
+ * %true if some freezable workqueues are still busy. %false if freezing
+ * is complete.
*/
bool freeze_workqueues_busy(void)
{
@@ -3573,7 +3672,7 @@ bool freeze_workqueues_busy(void)
list_for_each_entry(wq, &workqueues, list) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ if (!cwq || !(wq->flags & WQ_FREEZABLE))
continue;
BUG_ON(cwq->nr_active < 0);
@@ -3618,7 +3717,7 @@ void thaw_workqueues(void)
list_for_each_entry(wq, &workqueues, list) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ if (!cwq || !(wq->flags & WQ_FREEZABLE))
continue;
/* restore max_active and repopulate worklist */
@@ -3692,7 +3791,10 @@ static int __init init_workqueues(void)
system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
- BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
+ system_freezable_wq = alloc_workqueue("events_freezable",
+ WQ_FREEZABLE, 0);
+ BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
+ !system_unbound_wq || !system_freezable_wq);
return 0;
}
early_initcall(init_workqueues);