summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/auditsc.c8
-rw-r--r--kernel/compat.c63
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/cpuset.c10
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/debug/debug_core.c53
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/fork.c75
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/Kconfig2
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/debug.h38
-rw-r--r--kernel/irq/irqdesc.c1
-rw-r--r--kernel/irq/irqdomain.c47
-rw-r--r--kernel/irq/manage.c46
-rw-r--r--kernel/irq/pm.c7
-rw-r--r--kernel/irq/resend.c7
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/itimer.c8
-rw-r--r--kernel/kmod.c117
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/padata.c13
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/params.c62
-rw-r--r--kernel/power/hibernate.c18
-rw-r--r--kernel/power/process.c8
-rw-r--r--kernel/power/qos.c50
-rw-r--r--kernel/power/suspend.c7
-rw-r--r--kernel/power/swap.c28
-rw-r--r--kernel/power/user.c10
-rw-r--r--kernel/printk.c1390
-rw-r--r--kernel/rcupdate.c28
-rw-r--r--kernel/rcutiny_plugin.h16
-rw-r--r--kernel/rcutorture.c257
-rw-r--r--kernel/rcutree.c333
-rw-r--r--kernel/rcutree.h23
-rw-r--r--kernel/rcutree_plugin.h154
-rw-r--r--kernel/rcutree_trace.c4
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/core.c27
-rw-r--r--kernel/sched/fair.c18
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/seccomp.c458
-rw-r--r--kernel/signal.c9
-rw-r--r--kernel/smp.c27
-rw-r--r--kernel/smpboot.c62
-rw-r--r--kernel/smpboot.h18
-rw-r--r--kernel/srcu.c548
-rw-r--r--kernel/sys.c12
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/alarmtimer.c4
-rw-r--r--kernel/time/tick-broadcast.c11
-rw-r--r--kernel/time/tick-sched.c4
-rw-r--r--kernel/timer.c12
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c18
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_events.c5
-rw-r--r--kernel/trace/trace_export.c1
-rw-r--r--kernel/trace/trace_output.c5
-rw-r--r--kernel/trace/trace_workqueue.c300
-rw-r--r--kernel/workqueue.c21
66 files changed, 3146 insertions, 1360 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9..6c07f30fa9b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34ea..4b96415527b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
+#include <linux/compat.h>
#include "audit.h"
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", SIGKILL);
+ audit_log_abend(ab, "seccomp", signr);
audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809..d2c67aa49ae 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ memcpy(blocked->sig, &set, sizeof(set));
+}
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+asmlinkage long compat_sys_sigprocmask(int how,
+ compat_old_sigset_t __user *nset,
+ compat_old_sigset_t __user *oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e5702..0e6353cf147 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include "smpboot.h"
+
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct task_struct *idle;
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
cpu_hotplug_begin();
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
}
/* Arch-specific enabling code. */
- ret = __cpu_up(cpu);
+ ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
out_notify:
if (ret != 0)
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
cpu_hotplug_done();
return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2382683617a..8c8bd652dd1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = {
* are online. If none are online, walk up the cpuset hierarchy
* until we find one that does have some online cpus. If we get
* all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_mask.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
@@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -2138,7 +2138,7 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
diff --git a/kernel/cred.c b/kernel/cred.c
index 97b36eeca4c..e70683d9ec3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -386,6 +386,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
+ p->replacement_session_keyring = NULL;
+
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1dc53bae56e..0557f24c6bc 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -160,37 +160,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
* Weak aliases for breakpoint management,
* can be overriden by architectures when needed:
*/
-int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
if (err)
return err;
-
- return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
- BREAK_INSTR_SIZE);
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ return err;
}
-int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- return probe_kernel_write((char *)addr,
- (char *)bundle, BREAK_INSTR_SIZE);
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
int __weak kgdb_validate_break_address(unsigned long addr)
{
- char tmp_variable[BREAK_INSTR_SIZE];
+ struct kgdb_bkpt tmp;
int err;
- /* Validate setting the breakpoint and then removing it. In the
+ /* Validate setting the breakpoint and then removing it. If the
* remove fails, the kernel needs to emit a bad message because we
* are deep trouble not being able to put things back the way we
* found them.
*/
- err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+ tmp.bpt_addr = addr;
+ err = kgdb_arch_set_breakpoint(&tmp);
if (err)
return err;
- err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+ err = kgdb_arch_remove_breakpoint(&tmp);
if (err)
printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
"memory destroyed at: %lx", addr);
@@ -234,7 +236,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
*/
int dbg_activate_sw_breakpoints(void)
{
- unsigned long addr;
int error;
int ret = 0;
int i;
@@ -243,16 +244,15 @@ int dbg_activate_sw_breakpoints(void)
if (kgdb_break[i].state != BP_SET)
continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_set_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
if (error) {
ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+ printk(KERN_INFO "KGDB: BP install failed: %lx",
+ kgdb_break[i].bpt_addr);
continue;
}
- kgdb_flush_swbreak_addr(addr);
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
kgdb_break[i].state = BP_ACTIVE;
}
return ret;
@@ -301,7 +301,6 @@ int dbg_set_sw_break(unsigned long addr)
int dbg_deactivate_sw_breakpoints(void)
{
- unsigned long addr;
int error;
int ret = 0;
int i;
@@ -309,15 +308,14 @@ int dbg_deactivate_sw_breakpoints(void)
for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
if (kgdb_break[i].state != BP_ACTIVE)
continue;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+ printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
ret = error;
}
- kgdb_flush_swbreak_addr(addr);
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
kgdb_break[i].state = BP_SET;
}
return ret;
@@ -351,7 +349,6 @@ int kgdb_isremovedbreak(unsigned long addr)
int dbg_remove_all_break(void)
{
- unsigned long addr;
int error;
int i;
@@ -359,12 +356,10 @@ int dbg_remove_all_break(void)
for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
if (kgdb_break[i].state != BP_ACTIVE)
goto setundefined;
- addr = kgdb_break[i].bpt_addr;
- error = kgdb_arch_remove_breakpoint(addr,
- kgdb_break[i].saved_instr);
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error)
printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
- addr);
+ kgdb_break[i].bpt_addr);
setundefined:
kgdb_break[i].state = BP_UNDEFINED;
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 9b5f17da1c5..bb9520f0f6f 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -743,7 +743,7 @@ kdb_printit:
kdb_input_flush();
c = console_drivers;
- if (!dbg_io_ops->is_console) {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(moreprompt);
cp = moreprompt;
while (len--) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a6a9ec4cd8f..fd126f82b57 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3183,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(event, func);
func(event);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
- perf_event_for_each_child(event, func);
+ perf_event_for_each_child(sibling, func);
mutex_unlock(&ctx->mutex);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff1..ad54c833116 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -47,6 +48,7 @@
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
@@ -111,32 +113,67 @@ int nr_processes(void)
return total;
}
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct_node(node) \
- kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
-# define free_task_struct(tsk) \
- kmem_cache_free(task_struct_cachep, (tsk))
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+void __weak arch_release_task_struct(struct task_struct *tsk) { }
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+ arch_release_task_struct(tsk);
+ kmem_cache_free(task_struct_cachep, tsk);
+}
#endif
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti) { }
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
-#ifdef CONFIG_DEBUG_STACK_USAGE
- gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
- gfp_t mask = GFP_KERNEL;
-#endif
- struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
+ arch_release_thread_info(ti);
free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+ arch_release_thread_info(ti);
+ kmem_cache_free(thread_info_cache, ti);
+}
+
+void thread_info_cache_init(void)
+{
+ thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ THREAD_SIZE, 0, NULL);
+ BUG_ON(thread_info_cache == NULL);
+}
+# endif
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -170,6 +207,7 @@ void free_task(struct task_struct *tsk)
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -203,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
void __init fork_init(unsigned long mempages)
{
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
@@ -1162,6 +1194,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1464,6 +1497,8 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
+ if (unlikely(clone_flags & CLONE_NEWPID))
+ pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c21449f85a2..6df614912b9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
- if (sysctl_hung_task_panic)
+ if (sysctl_hung_task_panic) {
+ trigger_all_cpu_backtrace();
panic("hung_task: blocked tasks");
+ }
}
/*
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index cf1a4a68ce4..d1a758bc972 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -62,7 +62,7 @@ config IRQ_DOMAIN_DEBUG
help
This option will show the mapping relationship between hardware irq
numbers and Linux irq numbers. The mapping is exposed via debugfs
- in the file "virq_mapping".
+ in the file "irq_domain_mapping".
If you don't know what this means you don't need it.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6080f6bc8c3..fc275e4f629 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
* If its disabled or no action available
* keep it masked and get out of here
*/
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
handle_irq_event(desc);
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
out_unlock:
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL(handle_edge_irq);
#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
/**
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 97a8bfadc88..e75e29e4434 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,10 +4,10 @@
#include <linux/kallsyms.h>
-#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
-#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
+#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
+#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
/* FIXME */
-#define PD(f) do { } while (0)
+#define ___PD(f) do { } while (0)
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
print_symbol("%s\n", (unsigned long)desc->action->handler);
}
- P(IRQ_LEVEL);
- P(IRQ_PER_CPU);
- P(IRQ_NOPROBE);
- P(IRQ_NOREQUEST);
- P(IRQ_NOTHREAD);
- P(IRQ_NOAUTOEN);
+ ___P(IRQ_LEVEL);
+ ___P(IRQ_PER_CPU);
+ ___P(IRQ_NOPROBE);
+ ___P(IRQ_NOREQUEST);
+ ___P(IRQ_NOTHREAD);
+ ___P(IRQ_NOAUTOEN);
- PS(IRQS_AUTODETECT);
- PS(IRQS_REPLAY);
- PS(IRQS_WAITING);
- PS(IRQS_PENDING);
+ ___PS(IRQS_AUTODETECT);
+ ___PS(IRQS_REPLAY);
+ ___PS(IRQS_WAITING);
+ ___PS(IRQS_PENDING);
- PD(IRQS_INPROGRESS);
- PD(IRQS_DISABLED);
- PD(IRQS_MASKED);
+ ___PD(IRQS_INPROGRESS);
+ ___PD(IRQS_DISABLED);
+ ___PD(IRQS_MASKED);
}
-#undef P
-#undef PS
-#undef PD
+#undef ___P
+#undef ___PS
+#undef ___PD
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95e..192a302d6cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
+EXPORT_SYMBOL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3601f3fbf67..0e0ba5f840b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,7 +23,6 @@ static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
-static unsigned int irq_virq_count = NR_IRQS;
static struct irq_domain *irq_default_domain;
/**
@@ -184,13 +183,16 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
}
struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
+ unsigned int max_irq,
const struct irq_domain_ops *ops,
void *host_data)
{
struct irq_domain *domain = irq_domain_alloc(of_node,
IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
- if (domain)
+ if (domain) {
+ domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
irq_domain_add(domain);
+ }
return domain;
}
@@ -262,22 +264,6 @@ void irq_set_default_host(struct irq_domain *domain)
irq_default_domain = domain;
}
-/**
- * irq_set_virq_count() - Set the maximum number of linux irqs
- * @count: number of linux irqs, capped with NR_IRQS
- *
- * This is mainly for use by platforms like iSeries who want to program
- * the virtual irq number in the controller to avoid the reverse mapping
- */
-void irq_set_virq_count(unsigned int count)
-{
- pr_debug("irq: Trying to set virq count to %d\n", count);
-
- BUG_ON(count < NUM_ISA_INTERRUPTS);
- if (count < NR_IRQS)
- irq_virq_count = count;
-}
-
static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
irq_hw_number_t hwirq)
{
@@ -320,13 +306,12 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
pr_debug("irq: create_direct virq allocation failed\n");
return 0;
}
- if (virq >= irq_virq_count) {
+ if (virq >= domain->revmap_data.nomap.max_irq) {
pr_err("ERROR: no free irqs available below %i maximum\n",
- irq_virq_count);
+ domain->revmap_data.nomap.max_irq);
irq_free_desc(virq);
return 0;
}
-
pr_debug("irq: create_direct obtained virq %d\n", virq);
if (irq_setup_virq(domain, virq, virq)) {
@@ -350,7 +335,8 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
unsigned int irq_create_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- unsigned int virq, hint;
+ unsigned int hint;
+ int virq;
pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -377,13 +363,13 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
return irq_domain_legacy_revmap(domain, hwirq);
/* Allocate a virtual interrupt number */
- hint = hwirq % irq_virq_count;
+ hint = hwirq % nr_irqs;
if (hint == 0)
hint++;
virq = irq_alloc_desc_from(hint, 0);
- if (!virq)
+ if (virq <= 0)
virq = irq_alloc_desc_from(1, 0);
- if (!virq) {
+ if (virq <= 0) {
pr_debug("irq: -> virq allocation failed\n");
return 0;
}
@@ -515,7 +501,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
unsigned int i;
- unsigned int hint = hwirq % irq_virq_count;
+ unsigned int hint = hwirq % nr_irqs;
/* Look for default domain if nececssary */
if (domain == NULL)
@@ -536,7 +522,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
if (data && (data->domain == domain) && (data->hwirq == hwirq))
return i;
i++;
- if (i >= irq_virq_count)
+ if (i >= nr_irqs)
i = 1;
} while(i != hint);
return 0;
@@ -642,8 +628,9 @@ static int virq_debug_show(struct seq_file *m, void *private)
void *data;
int i;
- seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq",
- "chip name", "chip data", "domain name");
+ seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq",
+ "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
+ "domain name");
for (i = 1; i < nr_irqs; i++) {
desc = irq_to_desc(i);
@@ -666,7 +653,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
seq_printf(m, "%-15s ", p);
data = irq_desc_get_chip_data(desc);
- seq_printf(m, "0x%16p ", data);
+ seq_printf(m, data ? "0x%p " : " %p ", data);
if (desc->irq_data.domain && desc->irq_data.domain->of_node)
p = desc->irq_data.domain->of_node->full_name;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 89a3ea82569..585f6381f8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("No set_type function for IRQ %d (%s)\n", irq,
- chip ? (chip->name ? : "unknown") : "unknown");
+ pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq,
+ chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
ret = 0;
break;
default:
- pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
+ pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",
flags, irq, chip->irq_set_type);
}
if (unmask)
@@ -837,8 +837,7 @@ void exit_irq_thread(void)
action = kthread_data(tsk);
- printk(KERN_ERR
- "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+ pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
desc = irq_to_desc(action->irq);
@@ -878,7 +877,6 @@ static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
- const char *old_name = NULL;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
cpumask_var_t mask;
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
- old_name = old->name;
+ ((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;
- }
/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* all existing action->thread_mask bits.
*/
new->thread_mask = 1 << ffz(thread_mask);
+
+ } else if (new->handler == irq_default_primary_handler) {
+ /*
+ * The interrupt was requested with handler = NULL, so
+ * we use the default primary handler for it. But it
+ * does not have the oneshot flag set. In combination
+ * with level interrupts this is deadly, because the
+ * default primary handler just wakes the thread, then
+ * the irq lines is reenabled, but the device still
+ * has the level irq asserted. Rinse and repeat....
+ *
+ * While this works for edge type interrupts, we play
+ * it safe and reject unconditionally because we can't
+ * say for sure which type this interrupt really
+ * has. The type flags are unreliable as the
+ * underlying chip implementation can override them.
+ */
+ pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
+ irq);
+ ret = -EINVAL;
+ goto out_mask;
}
if (!shared) {
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
- pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
+ pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",
irq, nmsk, omsk);
}
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
return 0;
mismatch:
-#ifdef CONFIG_DEBUG_SHIRQ
if (!(new->flags & IRQF_PROBE_SHARED)) {
- printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
- if (old_name)
- printk(KERN_ERR "current handler: %s\n", old_name);
+ pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
+ irq, new->flags, new->name, old->flags, old->name);
+#ifdef CONFIG_DEBUG_SHIRQ
dump_stack();
- }
#endif
+ }
ret = -EBUSY;
out_mask:
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a..cb228bf2176 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
+ /*
+ * Only interrupts which are marked as wakeup source
+ * and have not been disabled before the suspend check
+ * can abort suspend.
+ */
if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->istate & IRQS_PENDING)
+ if (desc->depth == 1 && desc->istate & IRQS_PENDING)
return -EBUSY;
continue;
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c..6454db7b6a4 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
- * active.
+ * active. Clear the pending bit so suspend/resume does not
+ * get confused.
*/
- if (irq_settings_is_level(desc))
+ if (irq_settings_is_level(desc)) {
+ desc->istate &= ~IRQS_PENDING;
return;
+ }
if (desc->istate & IRQS_REPLAY)
return;
if (desc->istate & IRQS_PENDING) {
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c3c46c72046..1588e3b2871 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -5,11 +5,13 @@
* context. The enqueueing is NMI-safe.
*/
+#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/irqflags.h>
#include <asm/processor.h>
/*
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 22000c3db0d..8d262b46757 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -284,8 +284,12 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
if (value) {
if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
return -EFAULT;
- } else
- memset((char *) &set_buffer, 0, sizeof(set_buffer));
+ } else {
+ memset(&set_buffer, 0, sizeof(set_buffer));
+ printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+ " Misfeature support will be removed\n",
+ current->comm);
+ }
error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
if (error || !ovalue)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 957a7aab8eb..05698a7415f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -322,7 +322,7 @@ static void __call_usermodehelper(struct work_struct *work)
* land has been frozen during a system-wide hibernation or suspend operation).
* Should always be manipulated under umhelper_sem acquired for write.
*/
-static int usermodehelper_disabled = 1;
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
@@ -334,32 +334,110 @@ static atomic_t running_helpers = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
* Time to wait for running_helpers to become zero before the setting of
* usermodehelper_disabled in usermodehelper_disable() fails
*/
#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-void read_lock_usermodehelper(void)
+int usermodehelper_read_trylock(void)
{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
+
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
}
-EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
-void read_unlock_usermodehelper(void)
+void usermodehelper_read_unlock(void)
{
up_read(&umhelper_sem);
}
-EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
/**
- * usermodehelper_disable - prevent new helpers from being started
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
*/
-int usermodehelper_disable(void)
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
+ */
+int __usermodehelper_disable(enum umh_disable_depth depth)
{
long retval;
+ if (!depth)
+ return -EINVAL;
+
down_write(&umhelper_sem);
- usermodehelper_disabled = 1;
+ usermodehelper_disabled = depth;
up_write(&umhelper_sem);
/*
@@ -374,31 +452,10 @@ int usermodehelper_disable(void)
if (retval)
return 0;
- down_write(&umhelper_sem);
- usermodehelper_disabled = 0;
- up_write(&umhelper_sem);
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
return -EAGAIN;
}
-/**
- * usermodehelper_enable - allow new helpers to be started again
- */
-void usermodehelper_enable(void)
-{
- down_write(&umhelper_sem);
- usermodehelper_disabled = 0;
- up_write(&umhelper_sem);
-}
-
-/**
- * usermodehelper_is_disabled - check if new helpers are allowed to be started
- */
-bool usermodehelper_is_disabled(void)
-{
- return usermodehelper_disabled;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
-
static void helper_lock(void)
{
atomic_inc(&running_helpers);
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e42..a4e60973ca7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2953,7 +2953,7 @@ static struct module *load_module(void __user *umod,
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, NULL);
+ -32768, 32767, &ddebug_dyndbg_module_param_cb);
if (err < 0)
goto unlink;
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f10eb285ec..89fe3d1b9ef 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,6 +1,8 @@
/*
* padata.c - generic interface to process data streams in parallel
*
+ * See Documentation/padata.txt for an api documentation.
+ *
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
*
@@ -354,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
return -ENOMEM;
- cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
+ cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
free_cpumask_var(pd->cpumask.cbcpu);
return -ENOMEM;
}
- cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
+ cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
return 0;
}
@@ -564,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
static bool padata_validate_cpumask(struct padata_instance *pinst,
const struct cpumask *cpumask)
{
- if (!cpumask_intersects(cpumask, cpu_active_mask)) {
+ if (!cpumask_intersects(cpumask, cpu_online_mask)) {
pinst->flags |= PADATA_INVALID;
return false;
}
@@ -678,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd;
- if (cpumask_test_cpu(cpu, cpu_active_mask)) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask)) {
pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
pinst->cpumask.cbcpu);
if (!pd)
@@ -746,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
return -ENOMEM;
padata_replace(pinst, pd);
+
+ cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
+ cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
}
return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index 80aed44e345..8ed89a175d7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -97,7 +97,7 @@ void panic(const char *fmt, ...)
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!oops_in_progress)
+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
dump_stack();
#endif
diff --git a/kernel/params.c b/kernel/params.c
index f37d8263134..ed35345be53 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)
static int parse_one(char *param,
char *val,
+ const char *doing,
const struct kernel_param *params,
unsigned num_params,
s16 min_level,
s16 max_level,
- int (*handle_unknown)(char *param, char *val))
+ int (*handle_unknown)(char *param, char *val,
+ const char *doing))
{
unsigned int i;
int err;
@@ -104,8 +106,8 @@ static int parse_one(char *param,
if (!val && params[i].ops->set != param_set_bool
&& params[i].ops->set != param_set_bint)
return -EINVAL;
- pr_debug("They are equal! Calling %p\n",
- params[i].ops->set);
+ pr_debug("handling %s with %p\n", param,
+ params[i].ops->set);
mutex_lock(&param_lock);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
@@ -114,11 +116,11 @@ static int parse_one(char *param,
}
if (handle_unknown) {
- pr_debug("Unknown argument: calling %p\n", handle_unknown);
- return handle_unknown(param, val);
+ pr_debug("doing %s: %s='%s'\n", doing, param, val);
+ return handle_unknown(param, val, doing);
}
- pr_debug("Unknown argument `%s'\n", param);
+ pr_debug("Unknown argument '%s'\n", param);
return -ENOENT;
}
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
-int parse_args(const char *name,
+int parse_args(const char *doing,
char *args,
const struct kernel_param *params,
unsigned num,
s16 min_level,
s16 max_level,
- int (*unknown)(char *param, char *val))
+ int (*unknown)(char *param, char *val, const char *doing))
{
char *param, *val;
- pr_debug("Parsing ARGS: %s\n", args);
-
/* Chew leading spaces */
args = skip_spaces(args);
+ if (*args)
+ pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
+
while (*args) {
int ret;
int irq_was_disabled;
args = next_arg(args, &param, &val);
irq_was_disabled = irqs_disabled();
- ret = parse_one(param, val, params, num,
+ ret = parse_one(param, val, doing, params, num,
min_level, max_level, unknown);
- if (irq_was_disabled && !irqs_disabled()) {
- printk(KERN_WARNING "parse_args(): option '%s' enabled "
- "irq's!\n", param);
- }
+ if (irq_was_disabled && !irqs_disabled())
+ pr_warn("%s: option '%s' enabled irq's!\n",
+ doing, param);
+
switch (ret) {
case -ENOENT:
- printk(KERN_ERR "%s: Unknown parameter `%s'\n",
- name, param);
+ pr_err("%s: Unknown parameter `%s'\n", doing, param);
return ret;
case -ENOSPC:
- printk(KERN_ERR
- "%s: `%s' too large for parameter `%s'\n",
- name, val ?: "", param);
+ pr_err("%s: `%s' too large for parameter `%s'\n",
+ doing, val ?: "", param);
return ret;
case 0:
break;
default:
- printk(KERN_ERR
- "%s: `%s' invalid for parameter `%s'\n",
- name, val ?: "", param);
+ pr_err("%s: `%s' invalid for parameter `%s'\n",
+ doing, val ?: "", param);
return ret;
}
}
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
if (strlen(val) > 1024) {
- printk(KERN_ERR "%s: string parameter too long\n",
- kp->name);
+ pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
@@ -400,8 +399,7 @@ static int param_array(const char *name,
int len;
if (*num == max) {
- printk(KERN_ERR "%s: can only take %i arguments\n",
- name, max);
+ pr_err("%s: can only take %i arguments\n", name, max);
return -EINVAL;
}
len = strcspn(val, ",");
@@ -420,8 +418,7 @@ static int param_array(const char *name,
} while (save == ',');
if (*num < min) {
- printk(KERN_ERR "%s: needs at least %i arguments\n",
- name, min);
+ pr_err("%s: needs at least %i arguments\n", name, min);
return -EINVAL;
}
return 0;
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
const struct kparam_string *kps = kp->str;
if (strlen(val)+1 > kps->maxlen) {
- printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+ pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
#endif
if (err) {
kobject_put(&mk->kobj);
- printk(KERN_ERR
- "Module '%s' failed add to sysfs, error number %d\n",
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
name, err);
- printk(KERN_ERR
- "The system will be unstable now.\n");
return NULL;
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0a186cfde78..e09dfbfeece 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -16,7 +16,6 @@
#include <linux/string.h>
#include <linux/device.h>
#include <linux/async.h>
-#include <linux/kmod.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -611,14 +610,10 @@ int hibernate(void)
if (error)
goto Exit;
- error = usermodehelper_disable();
- if (error)
- goto Exit;
-
/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
if (error)
- goto Enable_umh;
+ goto Exit;
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
@@ -661,8 +656,6 @@ int hibernate(void)
Free_bitmaps:
free_basic_memory_bitmaps();
- Enable_umh:
- usermodehelper_enable();
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
@@ -777,15 +770,9 @@ static int software_resume(void)
if (error)
goto close_finish;
- error = usermodehelper_disable();
- if (error)
- goto close_finish;
-
error = create_basic_memory_bitmaps();
- if (error) {
- usermodehelper_enable();
+ if (error)
goto close_finish;
- }
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
@@ -806,7 +793,6 @@ static int software_resume(void)
thaw_processes();
Done:
free_basic_memory_bitmaps();
- usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
pm_restore_console();
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0d2aeb22610..19db29f6755 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,6 +16,7 @@
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/workqueue.h>
+#include <linux/kmod.h>
/*
* Timeout for stopping processes
@@ -122,6 +123,10 @@ int freeze_processes(void)
{
int error;
+ error = __usermodehelper_disable(UMH_FREEZING);
+ if (error)
+ return error;
+
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
@@ -130,6 +135,7 @@ int freeze_processes(void)
error = try_to_freeze_tasks(true);
if (!error) {
printk("done.");
+ __usermodehelper_set_disable_depth(UMH_DISABLED);
oom_killer_disable();
}
printk("\n");
@@ -187,6 +193,8 @@ void thaw_processes(void)
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+ usermodehelper_enable();
+
schedule();
printk("done.\n");
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index d6d6dbd1ecc..6a031e68402 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -230,6 +230,21 @@ int pm_qos_request_active(struct pm_qos_request *req)
EXPORT_SYMBOL_GPL(pm_qos_request_active);
/**
+ * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
+ * @work: work struct for the delayed work (timeout)
+ *
+ * This cancels the timeout request by falling back to the default at timeout.
+ */
+static void pm_qos_work_fn(struct work_struct *work)
+{
+ struct pm_qos_request *req = container_of(to_delayed_work(work),
+ struct pm_qos_request,
+ work);
+
+ pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+}
+
+/**
* pm_qos_add_request - inserts new qos request into the list
* @req: pointer to a preallocated handle
* @pm_qos_class: identifies which list of qos request to use
@@ -253,6 +268,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
return;
}
req->pm_qos_class = pm_qos_class;
+ INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
&req->node, PM_QOS_ADD_REQ, value);
}
@@ -279,6 +295,9 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
+
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
@@ -287,6 +306,34 @@ void pm_qos_update_request(struct pm_qos_request *req,
EXPORT_SYMBOL_GPL(pm_qos_update_request);
/**
+ * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
+ * @req : handle to list element holding a pm_qos request to use
+ * @new_value: defines the temporal qos request
+ * @timeout_us: the effective duration of this qos request in usecs.
+ *
+ * After timeout_us, this qos request is cancelled automatically.
+ */
+void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
+ unsigned long timeout_us)
+{
+ if (!req)
+ return;
+ if (WARN(!pm_qos_request_active(req),
+ "%s called for unknown object.", __func__))
+ return;
+
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
+
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
+
+ schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
+}
+
+/**
* pm_qos_remove_request - modifies an existing qos request
* @req: handle to request list element
*
@@ -305,6 +352,9 @@ void pm_qos_remove_request(struct pm_qos_request *req)
return;
}
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
+
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_REMOVE_REQ,
PM_QOS_DEFAULT_VALUE);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 88e5c967370..396d262b8fd 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,7 +12,6 @@
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/init.h>
-#include <linux/kmod.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
@@ -102,17 +101,12 @@ static int suspend_prepare(void)
if (error)
goto Finish;
- error = usermodehelper_disable();
- if (error)
- goto Finish;
-
error = suspend_freeze_processes();
if (!error)
return 0;
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
- usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
@@ -259,7 +253,6 @@ int suspend_devices_and_enter(suspend_state_t state)
static void suspend_finish(void)
{
suspend_thaw_processes();
- usermodehelper_enable();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8742fd013a9..eef311a58a6 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -51,6 +51,23 @@
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
+/*
+ * Number of free pages that are not high.
+ */
+static inline unsigned long low_free_pages(void)
+{
+ return nr_free_pages() - nr_free_highpages();
+}
+
+/*
+ * Number of pages required to be kept free while writing the image. Always
+ * half of all available low pages before the writing starts.
+ */
+static inline unsigned long reqd_free_pages(void)
+{
+ return low_free_pages() / 2;
+}
+
struct swap_map_page {
sector_t entries[MAP_PAGE_ENTRIES];
sector_t next_swap;
@@ -72,7 +89,7 @@ struct swap_map_handle {
sector_t cur_swap;
sector_t first_sector;
unsigned int k;
- unsigned long nr_free_pages, written;
+ unsigned long reqd_free_pages;
u32 crc32;
};
@@ -316,8 +333,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
goto err_rel;
}
handle->k = 0;
- handle->nr_free_pages = nr_free_pages() >> 1;
- handle->written = 0;
+ handle->reqd_free_pages = reqd_free_pages();
handle->first_sector = handle->cur_swap;
return 0;
err_rel:
@@ -352,11 +368,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
handle->cur_swap = offset;
handle->k = 0;
}
- if (bio_chain && ++handle->written > handle->nr_free_pages) {
+ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
error = hib_wait_on_bio_chain(bio_chain);
if (error)
goto out;
- handle->written = 0;
+ handle->reqd_free_pages = reqd_free_pages();
}
out:
return error;
@@ -618,7 +634,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
* Adjust number of free pages after all allocations have been done.
* We don't want to run out of pages when writing.
*/
- handle->nr_free_pages = nr_free_pages() >> 1;
+ handle->reqd_free_pages = reqd_free_pages();
/*
* Start the CRC32 thread.
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 33c4329205a..91b0fd021a9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,7 +12,6 @@
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
-#include <linux/kmod.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
@@ -222,14 +221,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
sys_sync();
printk("done.\n");
- error = usermodehelper_disable();
- if (error)
- break;
-
error = freeze_processes();
- if (error)
- usermodehelper_enable();
- else
+ if (!error)
data->frozen = 1;
break;
@@ -238,7 +231,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
pm_restore_gfp_mask();
thaw_processes();
- usermodehelper_enable();
data->frozen = 0;
break;
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d3..32462d2b364 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/rculist.h>
+#include <linux/poll.h>
#include <asm/uaccess.h>
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
{
}
-#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);
static int console_locked, console_suspended;
/*
- * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
- * It is also used in interesting ways to provide interlocking in
- * console_unlock();.
- */
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-#define LOG_BUF_MASK (log_buf_len-1)
-#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
-
-/*
- * The indices into log_buf are not constrained to log_buf_len - they
- * must be masked before subscripting
- */
-static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
-static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
-static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
-
-/*
* If exclusive_console is non-NULL then only this console is to be printed to.
*/
static struct console *exclusive_console;
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+/*
+ * The printk log buffer consists of a chain of concatenated variable
+ * length records. Every record starts with a record header, containing
+ * the overall length of the record.
+ *
+ * The heads to the first and last entry in the buffer, as well as the
+ * sequence numbers of these both entries are maintained when messages
+ * are stored..
+ *
+ * If the heads indicate available messages, the length in the header
+ * tells the start next message. A length == 0 for the next message
+ * indicates a wrap-around to the beginning of the buffer.
+ *
+ * Every record carries the monotonic timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual
+ * kernel messages use LOG_KERN; userspace-injected messages always carry
+ * a matching syslog facility, by default LOG_USER. The origin of every
+ * message can be reliably determined that way.
+ *
+ * The human readable log message directly follows the message header. The
+ * length of the message text is stored in the header, the stored message
+ * is not terminated.
+ *
+ * Optionally, a message can carry a dictionary of properties (key/value pairs),
+ * to provide userspace with a machine-readable message context.
+ *
+ * Examples for well-defined, commonly used property names are:
+ * DEVICE=b12:8 device identifier
+ * b12:8 block dev_t
+ * c127:3 char dev_t
+ * n8 netdev ifindex
+ * +sound:card0 subsystem:devname
+ * SUBSYSTEM=pci driver-core subsystem name
+ *
+ * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
+ * follows directly after a '=' character. Every property is terminated by
+ * a '\0' character. The last property is not terminated.
+ *
+ * Example of a message structure:
+ * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
+ * 0008 34 00 record is 52 bytes long
+ * 000a 0b 00 text is 11 bytes long
+ * 000c 1f 00 dictionary is 23 bytes long
+ * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
+ * 0010 69 74 27 73 20 61 20 6c "it's a l"
+ * 69 6e 65 "ine"
+ * 001b 44 45 56 49 43 "DEVIC"
+ * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
+ * 52 49 56 45 52 3d 62 75 "RIVER=bu"
+ * 67 "g"
+ * 0032 00 00 00 padding to next message header
+ *
+ * The 'struct log' buffer header must never be directly exported to
+ * userspace, it is a kernel-private implementation detail that might
+ * need to be changed in the future, when the requirements change.
+ *
+ * /dev/kmsg exports the structured data in the following line format:
+ * "level,sequnum,timestamp;<message text>\n"
+ *
+ * The optional key/value pairs are attached as continuation lines starting
+ * with a space character and terminated by a newline. All possible
+ * non-prinatable characters are escaped in the "\xff" notation.
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
+ */
+
+struct log {
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 len; /* length of entire record */
+ u16 text_len; /* length of text buffer */
+ u16 dict_len; /* length of dictionary buffer */
+ u16 level; /* syslog level + facility */
+};
+
+/*
+ * The logbuf_lock protects kmsg buffer, indices, counters. It is also
+ * used in interesting ways to provide interlocking in console_unlock();
+ */
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
+
+/* the next printk record to read by syslog(READ) or /proc/kmsg */
+static u64 syslog_seq;
+static u32 syslog_idx;
+
+/* index and sequence number of the first record stored in the buffer */
+static u64 log_first_seq;
+static u32 log_first_idx;
+
+/* index and sequence number of the next record to store in the buffer */
+static u64 log_next_seq;
#ifdef CONFIG_PRINTK
+static u32 log_next_idx;
+
+/* the next printk record to read after the last 'clear' command */
+static u64 clear_seq;
+static u32 clear_idx;
+
+#define LOG_LINE_MAX 1024
-static char __log_buf[__LOG_BUF_LEN];
+/* record buffer */
+#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define LOG_ALIGN 4
+#else
+#define LOG_ALIGN 8
+#endif
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
-static int log_buf_len = __LOG_BUF_LEN;
-static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
-static int saved_console_loglevel = -1;
+static u32 log_buf_len = __LOG_BUF_LEN;
+
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int logbuf_cpu = UINT_MAX;
+
+/* human readable text of the record */
+static char *log_text(const struct log *msg)
+{
+ return (char *)msg + sizeof(struct log);
+}
+
+/* optional key/value pair dictionary attached to the record */
+static char *log_dict(const struct log *msg)
+{
+ return (char *)msg + sizeof(struct log) + msg->text_len;
+}
+
+/* get record by index; idx must point to valid msg */
+static struct log *log_from_idx(u32 idx)
+{
+ struct log *msg = (struct log *)(log_buf + idx);
+
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer.
+ */
+ if (!msg->len)
+ return (struct log *)log_buf;
+ return msg;
+}
+
+/* get next record; idx must point to valid msg */
+static u32 log_next(u32 idx)
+{
+ struct log *msg = (struct log *)(log_buf + idx);
+
+ /* length == 0 indicates the end of the buffer; wrap */
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer as *this* one, and
+ * return the one after that.
+ */
+ if (!msg->len) {
+ msg = (struct log *)log_buf;
+ return msg->len;
+ }
+ return idx + msg->len;
+}
+
+/* insert record into the buffer, discard old ones, update heads */
+static void log_store(int facility, int level,
+ const char *dict, u16 dict_len,
+ const char *text, u16 text_len)
+{
+ struct log *msg;
+ u32 size, pad_len;
+
+ /* number of '\0' padding bytes to next message */
+ size = sizeof(struct log) + text_len + dict_len;
+ pad_len = (-size) & (LOG_ALIGN - 1);
+ size += pad_len;
+
+ while (log_first_seq < log_next_seq) {
+ u32 free;
+
+ if (log_next_idx > log_first_idx)
+ free = max(log_buf_len - log_next_idx, log_first_idx);
+ else
+ free = log_first_idx - log_next_idx;
+
+ if (free > size + sizeof(struct log))
+ break;
+
+ /* drop old messages until we have enough contiuous space */
+ log_first_idx = log_next(log_first_idx);
+ log_first_seq++;
+ }
+
+ if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
+ /*
+ * This message + an additional empty header does not fit
+ * at the end of the buffer. Add an empty header with len == 0
+ * to signify a wrap around.
+ */
+ memset(log_buf + log_next_idx, 0, sizeof(struct log));
+ log_next_idx = 0;
+ }
+
+ /* fill message */
+ msg = (struct log *)(log_buf + log_next_idx);
+ memcpy(log_text(msg), text, text_len);
+ msg->text_len = text_len;
+ memcpy(log_dict(msg), dict, dict_len);
+ msg->dict_len = dict_len;
+ msg->level = (facility << 3) | (level & 7);
+ msg->ts_nsec = local_clock();
+ memset(log_dict(msg) + dict_len, 0, pad_len);
+ msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
+
+ /* insert message */
+ log_next_idx += msg->len;
+ log_next_seq++;
+}
+
+/* /dev/kmsg - userspace message inject/listen interface */
+struct devkmsg_user {
+ u64 seq;
+ u32 idx;
+ struct mutex lock;
+ char buf[8192];
+};
+
+static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ char *buf, *line;
+ int i;
+ int level = default_message_loglevel;
+ int facility = 1; /* LOG_USER */
+ size_t len = iov_length(iv, count);
+ ssize_t ret = len;
+
+ if (len > LOG_LINE_MAX)
+ return -EINVAL;
+ buf = kmalloc(len+1, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ line = buf;
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
+ goto out;
+ line += iv[i].iov_len;
+ }
+
+ /*
+ * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
+ * the decimal value represents 32bit, the lower 3 bit are the log
+ * level, the rest are the log facility.
+ *
+ * If no prefix or no userspace facility is specified, we
+ * enforce LOG_USER, to be able to reliably distinguish
+ * kernel-generated messages from userspace-injected ones.
+ */
+ line = buf;
+ if (line[0] == '<') {
+ char *endp = NULL;
+
+ i = simple_strtoul(line+1, &endp, 10);
+ if (endp && endp[0] == '>') {
+ level = i & 7;
+ if (i >> 3)
+ facility = i >> 3;
+ endp++;
+ len -= endp - line;
+ line = endp;
+ }
+ }
+ line[len] = '\0';
+
+ printk_emit(facility, level, NULL, 0, "%s", line);
+out:
+ kfree(buf);
+ return ret;
+}
+
+static ssize_t devkmsg_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct devkmsg_user *user = file->private_data;
+ struct log *msg;
+ u64 ts_usec;
+ size_t i;
+ size_t len;
+ ssize_t ret;
+
+ if (!user)
+ return -EBADF;
+
+ mutex_lock(&user->lock);
+ raw_spin_lock(&logbuf_lock);
+ while (user->seq == log_next_seq) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ raw_spin_unlock(&logbuf_lock);
+ goto out;
+ }
+
+ raw_spin_unlock(&logbuf_lock);
+ ret = wait_event_interruptible(log_wait,
+ user->seq != log_next_seq);
+ if (ret)
+ goto out;
+ raw_spin_lock(&logbuf_lock);
+ }
+
+ if (user->seq < log_first_seq) {
+ /* our last seen message is gone, return error and reset */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ ret = -EPIPE;
+ raw_spin_unlock(&logbuf_lock);
+ goto out;
+ }
+
+ msg = log_from_idx(user->idx);
+ ts_usec = msg->ts_nsec;
+ do_div(ts_usec, 1000);
+ len = sprintf(user->buf, "%u,%llu,%llu;",
+ msg->level, user->seq, ts_usec);
+
+ /* escape non-printable characters */
+ for (i = 0; i < msg->text_len; i++) {
+ unsigned char c = log_text(msg)[i];
+
+ if (c < ' ' || c >= 128)
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ else
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+
+ if (msg->dict_len) {
+ bool line = true;
+
+ for (i = 0; i < msg->dict_len; i++) {
+ unsigned char c = log_dict(msg)[i];
+
+ if (line) {
+ user->buf[len++] = ' ';
+ line = false;
+ }
+
+ if (c == '\0') {
+ user->buf[len++] = '\n';
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 128) {
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ continue;
+ }
+
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+ }
+
+ user->idx = log_next(user->idx);
+ user->seq++;
+ raw_spin_unlock(&logbuf_lock);
+
+ if (len > count) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(buf, user->buf, len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = len;
+out:
+ mutex_unlock(&user->lock);
+ return ret;
+}
+
+static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct devkmsg_user *user = file->private_data;
+ loff_t ret = 0;
+
+ if (!user)
+ return -EBADF;
+ if (offset)
+ return -ESPIPE;
+
+ raw_spin_lock(&logbuf_lock);
+ switch (whence) {
+ case SEEK_SET:
+ /* the first record */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ break;
+ case SEEK_DATA:
+ /*
+ * The first record after the last SYSLOG_ACTION_CLEAR,
+ * like issued by 'dmesg -c'. Reading /dev/kmsg itself
+ * changes no global state, and does not clear anything.
+ */
+ user->idx = clear_idx;
+ user->seq = clear_seq;
+ break;
+ case SEEK_END:
+ /* after the last record */
+ user->idx = log_next_idx;
+ user->seq = log_next_seq;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ raw_spin_unlock(&logbuf_lock);
+ return ret;
+}
+
+static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+{
+ struct devkmsg_user *user = file->private_data;
+ int ret = 0;
+
+ if (!user)
+ return POLLERR|POLLNVAL;
+
+ poll_wait(file, &log_wait, wait);
+
+ raw_spin_lock(&logbuf_lock);
+ if (user->seq < log_next_seq) {
+ /* return error when data has vanished underneath us */
+ if (user->seq < log_first_seq)
+ ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ ret = POLLIN|POLLRDNORM;
+ }
+ raw_spin_unlock(&logbuf_lock);
+
+ return ret;
+}
+
+static int devkmsg_open(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user;
+ int err;
+
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ return 0;
+
+ err = security_syslog(SYSLOG_ACTION_READ_ALL);
+ if (err)
+ return err;
+
+ user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ if (!user)
+ return -ENOMEM;
+
+ mutex_init(&user->lock);
+
+ raw_spin_lock(&logbuf_lock);
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ raw_spin_unlock(&logbuf_lock);
+
+ file->private_data = user;
+ return 0;
+}
+
+static int devkmsg_release(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user = file->private_data;
+
+ if (!user)
+ return 0;
+
+ mutex_destroy(&user->lock);
+ kfree(user);
+ return 0;
+}
+
+const struct file_operations kmsg_fops = {
+ .open = devkmsg_open,
+ .read = devkmsg_read,
+ .aio_write = devkmsg_writev,
+ .llseek = devkmsg_llseek,
+ .poll = devkmsg_poll,
+ .release = devkmsg_release,
+};
#ifdef CONFIG_KEXEC
/*
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;
void log_buf_kexec_setup(void)
{
VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_end);
VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(logged_chars);
+ VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(log_next_idx);
}
#endif
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);
void __init setup_log_buf(int early)
{
unsigned long flags;
- unsigned start, dest_idx, offset;
char *new_log_buf;
int free;
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early)
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_end;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
-
- log_buf[dest_idx] = __log_buf[log_idx_mask];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
+ free = __LOG_BUF_LEN - log_next_idx;
+ memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
pr_info("log_buf_len: %d\n", log_buf_len);
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)
return 0;
}
+#if defined(CONFIG_PRINTK_TIME)
+static bool printk_time = 1;
+#else
+static bool printk_time;
+#endif
+module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+
+static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
+{
+ size_t len = 0;
+
+ if (syslog) {
+ if (buf) {
+ len += sprintf(buf, "<%u>", msg->level);
+ } else {
+ len += 3;
+ if (msg->level > 9)
+ len++;
+ if (msg->level > 99)
+ len++;
+ }
+ }
+
+ if (printk_time) {
+ if (buf) {
+ unsigned long long ts = msg->ts_nsec;
+ unsigned long rem_nsec = do_div(ts, 1000000000);
+
+ len += sprintf(buf + len, "[%5lu.%06lu] ",
+ (unsigned long) ts, rem_nsec / 1000);
+ } else {
+ len += 15;
+ }
+ }
+
+ return len;
+}
+
+static size_t msg_print_text(const struct log *msg, bool syslog,
+ char *buf, size_t size)
+{
+ const char *text = log_text(msg);
+ size_t text_size = msg->text_len;
+ size_t len = 0;
+
+ do {
+ const char *next = memchr(text, '\n', text_size);
+ size_t text_len;
+
+ if (next) {
+ text_len = next - text;
+ next++;
+ text_size -= next - text;
+ } else {
+ text_len = text_size;
+ }
+
+ if (buf) {
+ if (print_prefix(msg, syslog, NULL) +
+ text_len + 1>= size - len)
+ break;
+
+ len += print_prefix(msg, syslog, buf + len);
+ memcpy(buf + len, text, text_len);
+ len += text_len;
+ buf[len++] = '\n';
+ } else {
+ /* SYSLOG_ACTION_* buffer size only calculation */
+ len += print_prefix(msg, syslog, NULL);
+ len += text_len + 1;
+ }
+
+ text = next;
+ } while (text);
+
+ return len;
+}
+
+static int syslog_print(char __user *buf, int size)
+{
+ char *text;
+ struct log *msg;
+ int len;
+
+ text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ }
+ msg = log_from_idx(syslog_idx);
+ len = msg_print_text(msg, true, text, LOG_LINE_MAX);
+ syslog_idx = log_next(syslog_idx);
+ syslog_seq++;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ if (len > 0 && copy_to_user(buf, text, len))
+ len = -EFAULT;
+
+ kfree(text);
+ return len;
+}
+
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+ char *text;
+ int len = 0;
+
+ text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (buf) {
+ u64 next_seq;
+ u64 seq;
+ u32 idx;
+
+ if (clear_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+ seq = clear_seq;
+ idx = clear_idx;
+ while (seq < log_next_seq) {
+ struct log *msg = log_from_idx(idx);
+
+ len += msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
+ seq = clear_seq;
+ idx = clear_idx;
+ while (len > size && seq < log_next_seq) {
+ struct log *msg = log_from_idx(idx);
+
+ len -= msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
+
+ /* last message in this dump */
+ next_seq = log_next_seq;
+
+ len = 0;
+ while (len >= 0 && seq < next_seq) {
+ struct log *msg = log_from_idx(idx);
+ int textlen;
+
+ textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
+ idx = log_next(idx);
+ seq++;
+
+ raw_spin_unlock_irq(&logbuf_lock);
+ if (copy_to_user(buf + len, text, textlen))
+ len = -EFAULT;
+ else
+ len += textlen;
+ raw_spin_lock_irq(&logbuf_lock);
+
+ if (seq < log_first_seq) {
+ /* messages are gone, move to next one */
+ seq = log_first_seq;
+ idx = log_first_idx;
+ }
+ }
+ }
+
+ if (clear) {
+ clear_seq = log_next_seq;
+ clear_idx = log_next_idx;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ kfree(text);
+ return len;
+}
+
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
- unsigned i, j, limit, count;
- int do_clear = 0;
- char c;
+ bool clear = false;
+ static int saved_console_loglevel = -1;
int error;
error = check_syslog_permissions(type, from_file);
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
goto out;
}
error = wait_event_interruptible(log_wait,
- (log_start - log_end));
+ syslog_seq != log_next_seq);
if (error)
goto out;
- i = 0;
- raw_spin_lock_irq(&logbuf_lock);
- while (!error && (log_start != log_end) && i < len) {
- c = LOG_BUF(log_start);
- log_start++;
- raw_spin_unlock_irq(&logbuf_lock);
- error = __put_user(c,buf);
- buf++;
- i++;
- cond_resched();
- raw_spin_lock_irq(&logbuf_lock);
- }
- raw_spin_unlock_irq(&logbuf_lock);
- if (!error)
- error = i;
+ error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
- do_clear = 1;
+ clear = true;
/* FALL THRU */
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
error = -EFAULT;
goto out;
}
- count = len;
- if (count > log_buf_len)
- count = log_buf_len;
- raw_spin_lock_irq(&logbuf_lock);
- if (count > logged_chars)
- count = logged_chars;
- if (do_clear)
- logged_chars = 0;
- limit = log_end;
- /*
- * __put_user() could sleep, and while we sleep
- * printk() could overwrite the messages
- * we try to copy to user space. Therefore
- * the messages are copied in reverse. <manfreds>
- */
- for (i = 0; i < count && !error; i++) {
- j = limit-1-i;
- if (j + log_buf_len < log_end)
- break;
- c = LOG_BUF(j);
- raw_spin_unlock_irq(&logbuf_lock);
- error = __put_user(c,&buf[count-1-i]);
- cond_resched();
- raw_spin_lock_irq(&logbuf_lock);
- }
- raw_spin_unlock_irq(&logbuf_lock);
- if (error)
- break;
- error = i;
- if (i != count) {
- int offset = count-error;
- /* buffer overflow during copy, correct user buffer. */
- for (i = 0; i < error; i++) {
- if (__get_user(c,&buf[i+offset]) ||
- __put_user(c,&buf[i])) {
- error = -EFAULT;
- break;
- }
- cond_resched();
- }
- }
+ error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
- logged_chars = 0;
- break;
+ syslog_print_all(NULL, 0, true);
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
if (saved_console_loglevel == -1)
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- error = log_end - log_start;
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ }
+ if (from_file) {
+ /*
+ * Short-cut for poll(/"proc/kmsg") which simply checks
+ * for pending data, not the size; return the count of
+ * records, not the length.
+ */
+ error = log_next_idx - syslog_idx;
+ } else {
+ u64 seq;
+ u32 idx;
+
+ error = 0;
+ seq = syslog_seq;
+ idx = syslog_idx;
+ while (seq < log_next_seq) {
+ struct log *msg = log_from_idx(idx);
+
+ error += msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])
{
syslog_data[0] = log_buf;
syslog_data[1] = log_buf + log_buf_len;
- syslog_data[2] = log_buf + log_end -
- (logged_chars < log_buf_len ? logged_chars : log_buf_len);
- syslog_data[3] = log_buf + log_end;
+ syslog_data[2] = log_buf + log_first_idx;
+ syslog_data[3] = log_buf + log_next_idx;
}
#endif /* CONFIG_KGDB_KDB */
-/*
- * Call the console drivers on a range of log_buf
- */
-static void __call_console_drivers(unsigned start, unsigned end)
-{
- struct console *con;
-
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if ((con->flags & CON_ENABLED) && con->write &&
- (cpu_online(smp_processor_id()) ||
- (con->flags & CON_ANYTIME)))
- con->write(con, &LOG_BUF(start), end - start);
- }
-}
-
static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
"print all kernel messages to the console.");
/*
- * Write out chars from start to end - 1 inclusive
- */
-static void _call_console_drivers(unsigned start,
- unsigned end, int msg_log_level)
-{
- trace_console(&LOG_BUF(0), start, end, log_buf_len);
-
- if ((msg_log_level < console_loglevel || ignore_loglevel) &&
- console_drivers && start != end) {
- if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
- /* wrapped write */
- __call_console_drivers(start & LOG_BUF_MASK,
- log_buf_len);
- __call_console_drivers(0, end & LOG_BUF_MASK);
- } else {
- __call_console_drivers(start, end);
- }
- }
-}
-
-/*
- * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
- * lower 3 bit are the log level, the rest are the log facility. In case
- * userspace passes usual userspace syslog messages to /dev/kmsg or
- * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
- * to extract the correct log level for in-kernel processing, and not mangle
- * the original value.
- *
- * If a prefix is found, the length of the prefix is returned. If 'level' is
- * passed, it will be filled in with the log level without a possible facility
- * value. If 'special' is passed, the special printk prefix chars are accepted
- * and returned. If no valid header is found, 0 is returned and the passed
- * variables are not touched.
- */
-static size_t log_prefix(const char *p, unsigned int *level, char *special)
-{
- unsigned int lev = 0;
- char sp = '\0';
- size_t len;
-
- if (p[0] != '<' || !p[1])
- return 0;
- if (p[2] == '>') {
- /* usual single digit level number or special char */
- switch (p[1]) {
- case '0' ... '7':
- lev = p[1] - '0';
- break;
- case 'c': /* KERN_CONT */
- case 'd': /* KERN_DEFAULT */
- sp = p[1];
- break;
- default:
- return 0;
- }
- len = 3;
- } else {
- /* multi digit including the level and facility number */
- char *endp = NULL;
-
- lev = (simple_strtoul(&p[1], &endp, 10) & 7);
- if (endp == NULL || endp[0] != '>')
- return 0;
- len = (endp + 1) - p;
- }
-
- /* do not accept special char if not asked for */
- if (sp && !special)
- return 0;
-
- if (special) {
- *special = sp;
- /* return special char, do not touch level */
- if (sp)
- return len;
- }
-
- if (level)
- *level = lev;
- return len;
-}
-
-/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(unsigned start, unsigned end)
+static void call_console_drivers(int level, const char *text, size_t len)
{
- unsigned cur_index, start_print;
- static int msg_level = -1;
+ struct console *con;
- BUG_ON(((int)(start - end)) > 0);
+ trace_console(text, 0, len, len);
- cur_index = start;
- start_print = start;
- while (cur_index != end) {
- if (msg_level < 0 && ((end - cur_index) > 2)) {
- /* strip log prefix */
- cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
- start_print = cur_index;
- }
- while (cur_index != end) {
- char c = LOG_BUF(cur_index);
-
- cur_index++;
- if (c == '\n') {
- if (msg_level < 0) {
- /*
- * printk() has already given us loglevel tags in
- * the buffer. This code is here in case the
- * log buffer has wrapped right round and scribbled
- * on those tags
- */
- msg_level = default_message_loglevel;
- }
- _call_console_drivers(start_print, cur_index, msg_level);
- msg_level = -1;
- start_print = cur_index;
- break;
- }
- }
- }
- _call_console_drivers(start_print, end, msg_level);
-}
+ if (level >= console_loglevel && !ignore_loglevel)
+ return;
+ if (!console_drivers)
+ return;
-static void emit_log_char(char c)
-{
- LOG_BUF(log_end) = c;
- log_end++;
- if (log_end - log_start > log_buf_len)
- log_start = log_end - log_buf_len;
- if (log_end - con_start > log_buf_len)
- con_start = log_end - log_buf_len;
- if (logged_chars < log_buf_len)
- logged_chars++;
+ for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (!con->write)
+ continue;
+ if (!cpu_online(smp_processor_id()) &&
+ !(con->flags & CON_ANYTIME))
+ continue;
+ con->write(con, text, len);
+ }
}
/*
@@ -700,16 +1183,6 @@ static void zap_locks(void)
sema_init(&console_sem, 1);
}
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time = 0;
-#endif
-module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
-
-static bool always_kmsg_dump;
-module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
-
/* Check if we have any console registered that can be called early in boot. */
static int have_callable_console(void)
{
@@ -722,51 +1195,6 @@ static int have_callable_console(void)
return 0;
}
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the output and
- * call the console drivers. If we fail to get the semaphore we place the output
- * into the log buffer and return. The current holder of the console_sem will
- * notice the new output in console_unlock(); and will send it to the
- * consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-
-asmlinkage int printk(const char *fmt, ...)
-{
- va_list args;
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
- va_start(args, fmt);
- r = vprintk(fmt, args);
- va_end(args);
-
- return r;
-}
-
-/* cpu currently holding logbuf_lock */
-static volatile unsigned int printk_cpu = UINT_MAX;
-
/*
* Can we actually use the console at this time on this cpu?
*
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)
retval = 0;
}
}
- printk_cpu = UINT_MAX;
+ logbuf_cpu = UINT_MAX;
if (wake)
up(&console_sem);
raw_spin_unlock(&logbuf_lock);
return retval;
}
-static const char recursion_bug_msg [] =
- KERN_CRIT "BUG: recent printk recursion!\n";
-static int recursion_bug;
-static int new_text_line = 1;
-static char printk_buf[1024];
int printk_delay_msec __read_mostly;
@@ -836,15 +1259,23 @@ static inline void printk_delay(void)
}
}
-asmlinkage int vprintk(const char *fmt, va_list args)
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
{
- int printed_len = 0;
- int current_log_level = default_message_loglevel;
+ static int recursion_bug;
+ static char cont_buf[LOG_LINE_MAX];
+ static size_t cont_len;
+ static int cont_level;
+ static struct task_struct *cont_task;
+ static char textbuf[LOG_LINE_MAX];
+ char *text = textbuf;
+ size_t text_len;
unsigned long flags;
int this_cpu;
- char *p;
- size_t plen;
- char special;
+ bool newline = false;
+ bool prefix = false;
+ int printed_len = 0;
boot_delay_msec();
printk_delay();
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
/*
* Ouch, printk recursed into itself!
*/
- if (unlikely(printk_cpu == this_cpu)) {
+ if (unlikely(logbuf_cpu == this_cpu)) {
/*
* If a crash is occurring during printk() on this CPU,
* then try to get the crash message out but make sure
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)
lockdep_off();
raw_spin_lock(&logbuf_lock);
- printk_cpu = this_cpu;
+ logbuf_cpu = this_cpu;
if (recursion_bug) {
+ static const char recursion_msg[] =
+ "BUG: recent printk recursion!";
+
recursion_bug = 0;
- strcpy(printk_buf, recursion_bug_msg);
- printed_len = strlen(recursion_bug_msg);
+ printed_len += strlen(recursion_msg);
+ /* emit KERN_CRIT message */
+ log_store(0, 2, NULL, 0, recursion_msg, printed_len);
}
- /* Emit the output into the temporary buffer */
- printed_len += vscnprintf(printk_buf + printed_len,
- sizeof(printk_buf) - printed_len, fmt, args);
- p = printk_buf;
+ /*
+ * The printf needs to come first; we need the syslog
+ * prefix which might be passed-in as a parameter.
+ */
+ text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
- /* Read log level and handle special printk prefix */
- plen = log_prefix(p, &current_log_level, &special);
- if (plen) {
- p += plen;
+ /* mark and strip a trailing newline */
+ if (text_len && text[text_len-1] == '\n') {
+ text_len--;
+ newline = true;
+ }
- switch (special) {
- case 'c': /* Strip <c> KERN_CONT, continue line */
- plen = 0;
- break;
- case 'd': /* Strip <d> KERN_DEFAULT, start new line */
- plen = 0;
- default:
- if (!new_text_line) {
- emit_log_char('\n');
- new_text_line = 1;
- }
+ /* strip syslog prefix and extract log level or control flags */
+ if (text[0] == '<' && text[1] && text[2] == '>') {
+ switch (text[1]) {
+ case '0' ... '7':
+ if (level == -1)
+ level = text[1] - '0';
+ case 'd': /* KERN_DEFAULT */
+ prefix = true;
+ case 'c': /* KERN_CONT */
+ text += 3;
+ text_len -= 3;
}
}
- /*
- * Copy the output into log_buf. If the caller didn't provide
- * the appropriate log prefix, we insert them here
- */
- for (; *p; p++) {
- if (new_text_line) {
- new_text_line = 0;
-
- if (plen) {
- /* Copy original log prefix */
- int i;
-
- for (i = 0; i < plen; i++)
- emit_log_char(printk_buf[i]);
- printed_len += plen;
- } else {
- /* Add log prefix */
- emit_log_char('<');
- emit_log_char(current_log_level + '0');
- emit_log_char('>');
- printed_len += 3;
- }
+ if (level == -1)
+ level = default_message_loglevel;
- if (printk_time) {
- /* Add the current time stamp */
- char tbuf[50], *tp;
- unsigned tlen;
- unsigned long long t;
- unsigned long nanosec_rem;
-
- t = cpu_clock(printk_cpu);
- nanosec_rem = do_div(t, 1000000000);
- tlen = sprintf(tbuf, "[%5lu.%06lu] ",
- (unsigned long) t,
- nanosec_rem / 1000);
-
- for (tp = tbuf; tp < tbuf + tlen; tp++)
- emit_log_char(*tp);
- printed_len += tlen;
- }
+ if (dict) {
+ prefix = true;
+ newline = true;
+ }
- if (!*p)
- break;
+ if (!newline) {
+ if (cont_len && (prefix || cont_task != current)) {
+ /*
+ * Flush earlier buffer, which is either from a
+ * different thread, or when we got a new prefix.
+ */
+ log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
+ cont_len = 0;
}
- emit_log_char(*p);
- if (*p == '\n')
- new_text_line = 1;
+ if (!cont_len) {
+ cont_level = level;
+ cont_task = current;
+ }
+
+ /* buffer or append to earlier buffer from the same thread */
+ if (cont_len + text_len > sizeof(cont_buf))
+ text_len = sizeof(cont_buf) - cont_len;
+ memcpy(cont_buf + cont_len, text, text_len);
+ cont_len += text_len;
+ } else {
+ if (cont_len && cont_task == current) {
+ if (prefix) {
+ /*
+ * New prefix from the same thread; flush. We
+ * either got no earlier newline, or we race
+ * with an interrupt.
+ */
+ log_store(facility, cont_level,
+ NULL, 0, cont_buf, cont_len);
+ cont_len = 0;
+ }
+
+ /* append to the earlier buffer and flush */
+ if (cont_len + text_len > sizeof(cont_buf))
+ text_len = sizeof(cont_buf) - cont_len;
+ memcpy(cont_buf + cont_len, text, text_len);
+ cont_len += text_len;
+ log_store(facility, cont_level,
+ NULL, 0, cont_buf, cont_len);
+ cont_len = 0;
+ cont_task = NULL;
+ printed_len = cont_len;
+ } else {
+ /* ordinary single and terminated line */
+ log_store(facility, level,
+ dict, dictlen, text, text_len);
+ printed_len = text_len;
+ }
}
/*
- * Try to acquire and then immediately release the
- * console semaphore. The release will do all the
- * actual magic (print out buffers, wake up klogd,
- * etc).
+ * Try to acquire and then immediately release the console semaphore.
+ * The release will print out buffers and wake up /dev/kmsg and syslog()
+ * users.
*
- * The console_trylock_for_printk() function
- * will release 'logbuf_lock' regardless of whether it
- * actually gets the semaphore or not.
+ * The console_trylock_for_printk() function will release 'logbuf_lock'
+ * regardless of whether it actually gets the console semaphore or not.
*/
if (console_trylock_for_printk(this_cpu))
console_unlock();
@@ -974,16 +1418,81 @@ out_restore_irqs:
return printed_len;
}
-EXPORT_SYMBOL(printk);
-EXPORT_SYMBOL(vprintk);
+EXPORT_SYMBOL(vprintk_emit);
-#else
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, -1, NULL, 0, fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
-static void call_console_drivers(unsigned start, unsigned end)
+asmlinkage int printk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, ...)
{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
+ va_end(args);
+
+ return r;
}
+EXPORT_SYMBOL(printk_emit);
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
+ * This is printk(). It can be called from any context. We want it to work.
+ *
+ * We try to grab the console_lock. If we succeed, it's easy - we log the
+ * output and call the console drivers. If we fail to get the semaphore, we
+ * place the output into the log buffer and return. The current holder of
+ * the console_sem will notice the new output in console_unlock(); and will
+ * send it to the consoles before releasing the lock.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
+ */
+asmlinkage int printk(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ va_start(args, fmt);
+ r = vkdb_printf(fmt, args);
+ va_end(args);
+ return r;
+ }
#endif
+ va_start(args, fmt);
+ r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(printk);
+
+#else
+
+#define LOG_LINE_MAX 0
+static struct log *log_from_idx(u32 idx) { return NULL; }
+static u32 log_next(u32 idx) { return 0; }
+static void call_console_drivers(int level, const char *text, size_t len) {}
+static size_t msg_print_text(const struct log *msg, bool syslog,
+ char *buf, size_t size) { return 0; }
+
+#endif /* CONFIG_PRINTK */
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
@@ -1217,7 +1726,7 @@ int is_console_locked(void)
}
/*
- * Delayed printk facility, for scheduler-internal messages:
+ * Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_BUF_SIZE 512
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void)
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
}
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+
/**
* console_unlock - unlock the console system
*
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void)
* by printk(). If this is the case, console_unlock(); emits
* the output prior to releasing the lock.
*
- * If there is output waiting for klogd, we wake it up.
+ * If there is output waiting, we wake /dev/kmsg and syslog() users.
*
* console_unlock(); may be called from any context.
*/
void console_unlock(void)
{
+ static u64 seen_seq;
unsigned long flags;
- unsigned _con_start, _log_end;
- unsigned wake_klogd = 0, retry = 0;
+ bool wake_klogd = false;
+ bool retry;
if (console_suspended) {
up(&console_sem);
@@ -1281,17 +1795,38 @@ void console_unlock(void)
console_may_schedule = 0;
again:
- for ( ; ; ) {
+ for (;;) {
+ struct log *msg;
+ static char text[LOG_LINE_MAX];
+ size_t len;
+ int level;
+
raw_spin_lock_irqsave(&logbuf_lock, flags);
- wake_klogd |= log_start - log_end;
- if (con_start == log_end)
- break; /* Nothing to print */
- _con_start = con_start;
- _log_end = log_end;
- con_start = log_end; /* Flush */
+ if (seen_seq != log_next_seq) {
+ wake_klogd = true;
+ seen_seq = log_next_seq;
+ }
+
+ if (console_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ }
+
+ if (console_seq == log_next_seq)
+ break;
+
+ msg = log_from_idx(console_idx);
+ level = msg->level & 7;
+
+ len = msg_print_text(msg, false, text, sizeof(text));
+
+ console_idx = log_next(console_idx);
+ console_seq++;
raw_spin_unlock(&logbuf_lock);
+
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(_con_start, _log_end);
+ call_console_drivers(level, text, len);
start_critical_timings();
local_irq_restore(flags);
}
@@ -1312,8 +1847,7 @@ again:
* flush, no worries.
*/
raw_spin_lock(&logbuf_lock);
- if (con_start != log_end)
- retry = 1;
+ retry = console_seq != log_next_seq;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
if (retry && console_trylock())
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)
* for us.
*/
raw_spin_lock_irqsave(&logbuf_lock, flags);
- con_start = log_start;
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
+static bool always_kmsg_dump;
+module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
+
/**
* kmsg_dump - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
*/
void kmsg_dump(enum kmsg_dump_reason reason)
{
- unsigned long end;
- unsigned chars;
+ u64 idx;
struct kmsg_dumper *dumper;
const char *s1, *s2;
unsigned long l1, l2;
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)
/* Theoretically, the log could move on after we do this, but
there's not a lot we can do about that. The new messages
will overwrite the start of what we dump. */
+
raw_spin_lock_irqsave(&logbuf_lock, flags);
- end = log_end & LOG_BUF_MASK;
- chars = logged_chars;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (syslog_seq < log_first_seq)
+ idx = syslog_idx;
+ else
+ idx = log_first_idx;
- if (chars > end) {
- s1 = log_buf + log_buf_len - chars + end;
- l1 = chars - end;
+ if (idx > log_next_idx) {
+ s1 = log_buf;
+ l1 = log_next_idx;
- s2 = log_buf;
- l2 = end;
+ s2 = log_buf + idx;
+ l2 = log_buf_len - idx;
} else {
s1 = "";
l1 = 0;
- s2 = log_buf + end - chars;
- l2 = chars;
+ s2 = log_buf + idx;
+ l2 = log_next_idx - idx;
}
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc2..95cba41ce1e 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
#include "rcu.h"
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (likely(list_empty(&current->rcu_node_entry)))
+ return;
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+ __rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb6..fc31a2d6510 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
}
-/*
- * Check for a task exiting while in a preemptible -RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6..e66b34ab755 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
static int fqs_holdoff; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(n_barrier_cbs, int, 0444);
+MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
module_param(onoff_interval, int, 0444);
MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
static struct task_struct *onoff_task;
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
static long n_offline_successes;
static long n_online_attempts;
static long n_online_successes;
+static long n_barrier_attempts;
+static long n_barrier_successes;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
+static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
+static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
int (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
+ void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
.completed = rcu_torture_completed,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
+ .call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
.completed = rcu_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
.completed = rcu_no_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu_expedited,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_bh_torture_deferred_free,
.sync = synchronize_rcu_bh,
+ .call = call_rcu_bh,
.cb_barrier = rcu_barrier_bh,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu_bh,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu_bh_expedited,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
return srcu_batches_completed(&srcu_ctl);
}
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+ call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
static void srcu_torture_synchronize(void)
{
synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
- cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+ cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
}
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
+ .deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .call = NULL,
.cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu"
};
+static struct rcu_torture_ops srcu_sync_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .call = NULL,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu_sync"
+};
+
static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
{
return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock_raw,
.completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
+ .deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .call = NULL,
.cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu_raw"
};
+static struct rcu_torture_ops srcu_raw_sync_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock_raw,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock_raw,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .call = NULL,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu_raw_sync"
+};
+
static void srcu_torture_synchronize_expedited(void)
{
synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
.completed = srcu_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = srcu_torture_synchronize_expedited,
+ .call = NULL,
.cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
"rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
"rtmbe: %d rtbke: %ld rtbre: %ld "
"rtbf: %ld rtb: %ld nt: %ld "
- "onoff: %ld/%ld:%ld/%ld",
+ "onoff: %ld/%ld:%ld/%ld "
+ "barrier: %ld/%ld:%ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
n_online_successes,
n_online_attempts,
n_offline_successes,
- n_offline_attempts);
+ n_offline_attempts,
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_barrier_error != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_failure != 0)
- cnt += sprintf(&page[cnt], " !!!");
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- if (i > 1) {
+ n_rcu_torture_boost_failure != 0 ||
+ i > 1) {
cnt += sprintf(&page[cnt], "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
/* This must be outside of the mutex, otherwise deadlock! */
kthread_stop(t);
+ boost_tasks[cpu] = NULL;
}
static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
return;
VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
kthread_stop(onoff_task);
+ onoff_task = NULL;
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void
+static int
rcu_torture_onoff_init(void)
{
+ return 0;
}
static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
return;
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
kthread_stop(stall_task);
+ stall_task = NULL;
+}
+
+/* Callback function for RCU barrier testing. */
+void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+ atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+ long myid = (long)arg;
+ struct rcu_head rcu;
+
+ init_rcu_head_on_stack(&rcu);
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, 19);
+ do {
+ wait_event(barrier_cbs_wq[myid],
+ atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
+ kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP);
+ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ break;
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ if (atomic_dec_and_test(&barrier_cbs_count))
+ wake_up(&barrier_wq);
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(1);
+ cur_ops->cb_barrier();
+ destroy_rcu_head_on_stack(&rcu);
+ return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+ int i;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+ do {
+ atomic_set(&barrier_cbs_invoked, 0);
+ atomic_set(&barrier_cbs_count, n_barrier_cbs);
+ /* wake_up() path contains the required barriers. */
+ for (i = 0; i < n_barrier_cbs; i++)
+ wake_up(&barrier_cbs_wq[i]);
+ wait_event(barrier_wq,
+ atomic_read(&barrier_cbs_count) == 0 ||
+ kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP);
+ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ break;
+ n_barrier_attempts++;
+ cur_ops->cb_barrier();
+ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+ n_rcu_torture_barrier_error++;
+ WARN_ON_ONCE(1);
+ }
+ n_barrier_successes++;
+ schedule_timeout_interruptible(HZ / 10);
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(1);
+ return 0;
+}
+
+/* Initialize RCU barrier testing. */
+static int rcu_torture_barrier_init(void)
+{
+ int i;
+ int ret;
+
+ if (n_barrier_cbs == 0)
+ return 0;
+ if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ " Call or barrier ops missing for %s,\n",
+ torture_type, cur_ops->name);
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ " RCU barrier testing omitted from run.\n",
+ torture_type);
+ return 0;
+ }
+ atomic_set(&barrier_cbs_count, 0);
+ atomic_set(&barrier_cbs_invoked, 0);
+ barrier_cbs_tasks =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ GFP_KERNEL);
+ barrier_cbs_wq =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+ GFP_KERNEL);
+ if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+ return -ENOMEM;
+ for (i = 0; i < n_barrier_cbs; i++) {
+ init_waitqueue_head(&barrier_cbs_wq[i]);
+ barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ "rcu_torture_barrier_cbs");
+ if (IS_ERR(barrier_cbs_tasks[i])) {
+ ret = PTR_ERR(barrier_cbs_tasks[i]);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
+ barrier_cbs_tasks[i] = NULL;
+ return ret;
+ }
+ }
+ barrier_task = kthread_run(rcu_torture_barrier, NULL,
+ "rcu_torture_barrier");
+ if (IS_ERR(barrier_task)) {
+ ret = PTR_ERR(barrier_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
+ barrier_task = NULL;
+ }
+ return 0;
+}
+
+/* Clean up after RCU barrier testing. */
+static void rcu_torture_barrier_cleanup(void)
+{
+ int i;
+
+ if (barrier_task != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
+ kthread_stop(barrier_task);
+ barrier_task = NULL;
+ }
+ if (barrier_cbs_tasks != NULL) {
+ for (i = 0; i < n_barrier_cbs; i++) {
+ if (barrier_cbs_tasks[i] != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
+ kthread_stop(barrier_cbs_tasks[i]);
+ barrier_cbs_tasks[i] = NULL;
+ }
+ }
+ kfree(barrier_cbs_tasks);
+ barrier_cbs_tasks = NULL;
+ }
+ if (barrier_cbs_wq != NULL) {
+ kfree(barrier_cbs_wq);
+ barrier_cbs_wq = NULL;
+ }
}
static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
fullstop = FULLSTOP_RMMOD;
mutex_unlock(&fullstop_mutex);
unregister_reboot_notifier(&rcutorture_shutdown_nb);
+ rcu_torture_barrier_cleanup();
rcu_torture_stall_cleanup();
if (stutter_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
kthread_stop(shutdown_task);
}
+ shutdown_task = NULL;
rcu_torture_onoff_cleanup();
/* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup)
cur_ops->cleanup();
- if (atomic_read(&n_rcu_torture_error))
+ if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else if (n_online_successes != n_online_attempts ||
n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
int i;
int cpu;
int firsterr = 0;
+ int retval;
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
&rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
- &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
+ &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
+ &srcu_raw_sync_ops, &srcu_expedited_ops,
&sched_ops, &sched_sync_ops, &sched_expedited_ops, };
mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
test_boost_duration = 2;
if ((test_boost == 1 && cur_ops->can_boost) ||
test_boost == 2) {
- int retval;
boost_starttime = jiffies + test_boost_interval * HZ;
register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
goto unwind;
}
}
- rcu_torture_onoff_init();
+ i = rcu_torture_onoff_init();
+ if (i != 0) {
+ firsterr = i;
+ goto unwind;
+ }
register_reboot_notifier(&rcutorture_shutdown_nb);
- rcu_torture_stall_init();
+ i = rcu_torture_stall_init();
+ if (i != 0) {
+ firsterr = i;
+ goto unwind;
+ }
+ retval = rcu_torture_barrier_init();
+ if (retval != 0) {
+ firsterr = retval;
+ goto unwind;
+ }
rcutorture_record_test_transition();
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 1050d6d3922..0da7b88d92d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+ .orphan_nxttail = &structname##_state.orphan_nxtlist, \
+ .orphan_donetail = &structname##_state.orphan_donelist, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
+/* State information for rcu_barrier() and friends. */
+
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
{
trace_rcu_utilization("Start context switch");
rcu_sched_qs(cpu);
- rcu_preempt_note_context_switch(cpu);
trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Also record a quiescent state for this CPU for the current grace period.
- * Synchronization and interrupt disabling are not required because
- * this function executes in stop_machine() context. Therefore, cleanup
- * operations that might block must be done later from the CPU_DEAD
- * notifier.
- *
- * Note that the outgoing CPU's bit has already been cleared in the
- * cpu_online_mask. This allows us to randomly pick a callback
- * destination from the bits set in that mask.
+ * Send the specified CPU's RCU callbacks to the orphanage. The
+ * specified CPU must be offline, and the caller must hold the
+ * ->onofflock.
*/
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+ struct rcu_node *rnp, struct rcu_data *rdp)
{
int i;
- unsigned long mask;
- int receive_cpu = cpumask_any(cpu_online_mask);
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
- /* First, adjust the counts. */
+ /*
+ * Orphan the callbacks. First adjust the counts. This is safe
+ * because ->onofflock excludes _rcu_barrier()'s adoption of
+ * the callbacks, thus no memory barrier is required.
+ */
if (rdp->nxtlist != NULL) {
- receive_rdp->qlen_lazy += rdp->qlen_lazy;
- receive_rdp->qlen += rdp->qlen;
+ rsp->qlen_lazy += rdp->qlen_lazy;
+ rsp->qlen += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
rdp->qlen = 0;
}
/*
- * Next, move ready-to-invoke callbacks to be invoked on some
- * other CPU. These will not be required to pass through another
- * grace period: They are done, regardless of CPU.
+ * Next, move those callbacks still needing a grace period to
+ * the orphanage, where some other CPU will pick them up.
+ * Some of the callbacks might have gone partway through a grace
+ * period, but that is too bad. They get to start over because we
+ * cannot assume that grace periods are synchronized across CPUs.
+ * We don't bother updating the ->nxttail[] array yet, instead
+ * we just reset the whole thing later on.
*/
- if (rdp->nxtlist != NULL &&
- rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
- struct rcu_head *oldhead;
- struct rcu_head **oldtail;
- struct rcu_head **newtail;
-
- oldhead = rdp->nxtlist;
- oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
- *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
- newtail = rdp->nxttail[RCU_DONE_TAIL];
- for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
- if (receive_rdp->nxttail[i] == oldtail)
- receive_rdp->nxttail[i] = newtail;
- if (rdp->nxttail[i] == newtail)
- rdp->nxttail[i] = &rdp->nxtlist;
- }
+ if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+ *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+ rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
}
/*
- * Finally, put the rest of the callbacks at the end of the list.
- * The ones that made it partway through get to start over: We
- * cannot assume that grace periods are synchronized across CPUs.
- * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
- * this does not seem compelling. Not yet, anyway.)
+ * Then move the ready-to-invoke callbacks to the orphanage,
+ * where some other CPU will pick them up. These will not be
+ * required to pass though another grace period: They are done.
*/
if (rdp->nxtlist != NULL) {
- *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
- receive_rdp->nxttail[RCU_NEXT_TAIL] =
- rdp->nxttail[RCU_NEXT_TAIL];
- receive_rdp->n_cbs_adopted += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
-
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ *rsp->orphan_donetail = rdp->nxtlist;
+ rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
}
+ /* Finally, initialize the rcu_data structure's list to empty. */
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage. The caller must hold the ->onofflock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+ int i;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+
/*
- * Record a quiescent state for the dying CPU. This is safe
- * only because we have already cleared out the callbacks.
- * (Otherwise, the RCU core might try to schedule the invocation
- * of callbacks on this now-offline CPU, which would be bad.)
+ * If there is an rcu_barrier() operation in progress, then
+ * only the task doing that operation is permitted to adopt
+ * callbacks. To do otherwise breaks rcu_barrier() and friends
+ * by causing them to fail to wait for the callbacks in the
+ * orphanage.
*/
- mask = rdp->grpmask; /* rnp->grplo is constant. */
+ if (rsp->rcu_barrier_in_progress &&
+ rsp->rcu_barrier_in_progress != current)
+ return;
+
+ /* Do the accounting first. */
+ rdp->qlen_lazy += rsp->qlen_lazy;
+ rdp->qlen += rsp->qlen;
+ rdp->n_cbs_adopted += rsp->qlen;
+ rsp->qlen_lazy = 0;
+ rsp->qlen = 0;
+
+ /*
+ * We do not need a memory barrier here because the only way we
+ * can get here if there is an rcu_barrier() in flight is if
+ * we are the task doing the rcu_barrier().
+ */
+
+ /* First adopt the ready-to-invoke callbacks. */
+ if (rsp->orphan_donelist != NULL) {
+ *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+ for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = rsp->orphan_donetail;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+
+ /* And then adopt the callbacks that still need a grace period. */
+ if (rsp->orphan_nxtlist != NULL) {
+ *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+ rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+ RCU_TRACE(unsigned long mask);
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+ RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
"cpuofl");
- rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
- /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
}
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
+ * this fact from process context. Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them, if there is no _rcu_barrier() instance running.
* There can only be one CPU hotplug operation at a time, so no other
* CPU can be attempting to update rcu_cpu_kthread_task.
*/
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
unsigned long mask;
int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
rcu_stop_cpu_kthread(cpu);
rcu_node_kthread_setaffinity(rnp, -1);
- /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
+ /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
/* Exclude any attempts to start a new grace period. */
raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+ rcu_adopt_orphan_cbs(rsp);
+
/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_is_callbacks_kthread());
/* Update count, and requeue any remaining callbacks. */
- rdp->qlen_lazy -= count_lazy;
- rdp->qlen -= count;
- rdp->n_cbs_invoked += count;
if (list != NULL) {
*tail = rdp->nxtlist;
rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
else
break;
}
+ smp_mb(); /* List handling before counting for rcu_barrier(). */
+ rdp->qlen_lazy -= count_lazy;
+ rdp->qlen -= count;
+ rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1820,15 +1875,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* a quiescent state betweentimes.
*/
local_irq_save(flags);
- WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
rdp->qlen++;
if (lazy)
rdp->qlen_lazy++;
+ else
+ rcu_idle_count_callbacks_posted();
+ smp_mb(); /* Count before adding callback for rcu_barrier(). */
+ *rdp->nxttail[RCU_NEXT_TAIL] = head;
+ rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1894,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ *
+ * Of course, sampling num_online_cpus() with preemption enabled can
+ * give erroneous results if there are concurrent CPU-hotplug operations.
+ * For example, given a demonic sequence of preemptions in num_online_cpus()
+ * and CPU-hotplug operations, there could be two or more CPUs online at
+ * all times, but num_online_cpus() might well return one (or even zero).
+ *
+ * However, all such demonic sequences require at least one CPU-offline
+ * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
+ * is only a problem if there is an RCU read-side critical section executing
+ * throughout. But RCU-sched and RCU-bh read-side critical sections
+ * disable either preemption or bh, which prevents a CPU from going offline.
+ * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
+ * that there is only one CPU when in fact there was more than one throughout
+ * is when there were no RCU readers in the system. If there are no
+ * RCU readers, the grace period by definition can be of zero length,
+ * regardless of the number of online CPUs.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+ might_sleep(); /* Check for RCU read-side critical section. */
+ return num_online_cpus() <= 1;
+}
+
/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
@@ -2167,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
rcu_preempt_cpu_has_callbacks(cpu);
}
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
+/*
+ * RCU callback function for _rcu_barrier(). If we are last, wake
+ * up the task executing _rcu_barrier().
+ */
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2201,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
void (*call_rcu_func)(struct rcu_head *head,
void (*func)(struct rcu_head *head)))
{
- BUG_ON(in_interrupt());
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_head rh;
+
+ init_rcu_head_on_stack(&rh);
+
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rcu_barrier_mutex);
- init_completion(&rcu_barrier_completion);
+
+ smp_mb(); /* Prevent any prior operations from leaking in. */
+
/*
- * Initialize rcu_barrier_cpu_count to 1, then invoke
- * rcu_barrier_func() on each CPU, so that each CPU also has
- * incremented rcu_barrier_cpu_count. Only then is it safe to
- * decrement rcu_barrier_cpu_count -- otherwise the first CPU
- * might complete its grace period before all of the other CPUs
- * did their increment, causing this function to return too
- * early. Note that on_each_cpu() disables irqs, which prevents
- * any CPUs from coming online or going offline until each online
- * CPU has queued its RCU-barrier callback.
+ * Initialize the count to one rather than to zero in order to
+ * avoid a too-soon return to zero in case of a short grace period
+ * (or preemption of this task). Also flag this task as doing
+ * an rcu_barrier(). This will prevent anyone else from adopting
+ * orphaned callbacks, which could cause otherwise failure if a
+ * CPU went offline and quickly came back online. To see this,
+ * consider the following sequence of events:
+ *
+ * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
+ * 2. CPU 1 goes offline, orphaning its callbacks.
+ * 3. CPU 0 adopts CPU 1's orphaned callbacks.
+ * 4. CPU 1 comes back online.
+ * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
+ * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
+ * us -- but before CPU 1's orphaned callbacks are invoked!!!
*/
+ init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 1);
- on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
+ raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ rsp->rcu_barrier_in_progress = current;
+ raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+
+ /*
+ * Force every CPU with callbacks to register a new callback
+ * that will tell us when all the preceding callbacks have
+ * been invoked. If an offline CPU has callbacks, wait for
+ * it to either come back online or to finish orphaning those
+ * callbacks.
+ */
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (cpu_is_offline(cpu)) {
+ preempt_enable();
+ while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
+ schedule_timeout_interruptible(1);
+ } else if (ACCESS_ONCE(rdp->qlen)) {
+ smp_call_function_single(cpu, rcu_barrier_func,
+ (void *)call_rcu_func, 1);
+ preempt_enable();
+ } else {
+ preempt_enable();
+ }
+ }
+
+ /*
+ * Now that all online CPUs have rcu_barrier_callback() callbacks
+ * posted, we can adopt all of the orphaned callbacks and place
+ * an rcu_barrier_callback() callback after them. When that is done,
+ * we are guaranteed to have an rcu_barrier_callback() callback
+ * following every callback that could possibly have been
+ * registered before _rcu_barrier() was called.
+ */
+ raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ rcu_adopt_orphan_cbs(rsp);
+ rsp->rcu_barrier_in_progress = NULL;
+ raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ atomic_inc(&rcu_barrier_cpu_count);
+ smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
+ call_rcu_func(&rh, rcu_barrier_callback);
+
+ /*
+ * Now that we have an rcu_barrier_callback() callback on each
+ * CPU, and thus each counted, remove the initial count.
+ */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
+
+ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rcu_barrier_completion);
+
+ /* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_barrier_mutex);
+
+ destroy_rcu_head_on_stack(&rh);
}
/**
@@ -2418,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
for (i = NUM_RCU_LVLS - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = RCU_FANOUT_LEAF;
+ rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a407..7f5d138dedf 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
#include <linux/seqlock.h>
/*
- * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
+ * CONFIG_RCU_FANOUT_LEAF.
* In theory, it should be possible to add more levels straightforwardly.
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#if CONFIG_RCU_FANOUT > 16
-#define RCU_FANOUT_LEAF 16
-#else /* #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
-#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
raw_spinlock_t onofflock; /* exclude on/offline and */
/* starting new GP. */
+ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ /* need a grace period. */
+ struct rcu_head **orphan_nxttail; /* Tail of above. */
+ struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ /* are ready to invoke. */
+ struct rcu_head **orphan_donetail; /* Tail of above. */
+ long qlen_lazy; /* Number of lazy callbacks. */
+ long qlen; /* Total number of callbacks. */
+ struct task_struct *rcu_barrier_in_progress;
+ /* Task doing rcu_barrier(), */
+ /* or NULL if no barrier. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
+static void rcu_idle_count_callbacks_posted(void);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816b..2411000d986 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = __this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
* means that we continue to block the current grace period.
*/
local_irq_save(flags);
- rcu_preempt_qs(cpu);
+ rcu_preempt_qs(smp_processor_id());
local_irq_restore(flags);
}
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}
-/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Because preemptible RCU does not exist, we never have to check for
- * CPUs being in quiescent states.
- */
-static void rcu_preempt_note_context_switch(int cpu)
-{
-}
-
-/*
* Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
{
}
+/*
+ * Don't bother keeping a running count of the number of RCU callbacks
+ * posted because CONFIG_RCU_FAST_NO_HZ=n.
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+}
+
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+/* Loop counter for rcu_prepare_for_idle(). */
static DEFINE_PER_CPU(int, rcu_dyntick_drain);
+/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
-static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
-static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
+/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
+static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
+/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
+static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
+/* Enable special processing on first attempt to enter dyntick-idle mode. */
+static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
+/* Running count of non-lazy callbacks posted, never decremented. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
+/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
/*
* Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
*/
int rcu_needs_cpu(int cpu)
{
+ /* Flag a new idle sojourn to the idle-entry state machine. */
+ per_cpu(rcu_idle_first_pass, cpu) = 1;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(cpu))
return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
}
/*
+ * Handler for smp_call_function_single(). The only point of this
+ * handler is to wake the CPU up, so the handler does only tracing.
+ */
+void rcu_idle_demigrate(void *unused)
+{
+ trace_rcu_prep_idle("Demigrate");
+}
+
+/*
* Timer handler used to force CPU to start pushing its remaining RCU
* callbacks in the case where it entered dyntick-idle mode with callbacks
* pending. The hander doesn't really need to do anything because the
* real work is done upon re-entry to idle, or by the next scheduling-clock
* interrupt should idle not be re-entered.
+ *
+ * One special case: the timer gets migrated without awakening the CPU
+ * on which the timer was scheduled on. In this case, we must wake up
+ * that CPU. We do so with smp_call_function_single().
*/
-static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
+static void rcu_idle_gp_timer_func(unsigned long cpu_in)
{
+ int cpu = (int)cpu_in;
+
trace_rcu_prep_idle("Timer");
- return HRTIMER_NORESTART;
+ if (cpu != smp_processor_id())
+ smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
+ else
+ WARN_ON_ONCE(1); /* Getting here can hang the system... */
}
/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
*/
static void rcu_prepare_for_idle_init(int cpu)
{
- static int firsttime = 1;
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
-
- hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtp->function = rcu_idle_gp_timer_func;
- if (firsttime) {
- unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
-
- rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
- upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
- rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
- firsttime = 0;
- }
+ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+ setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
+ rcu_idle_gp_timer_func, cpu);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
+ per_cpu(rcu_idle_first_pass, cpu) = 1;
}
/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
*/
static void rcu_cleanup_after_idle(int cpu)
{
- hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
+ del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
+ trace_rcu_prep_idle("Cleanup after idle");
}
/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
*/
static void rcu_prepare_for_idle(int cpu)
{
+ struct timer_list *tp;
+
+ /*
+ * If this is an idle re-entry, for example, due to use of
+ * RCU_NONIDLE() or the new idle-loop tracing API within the idle
+ * loop, then don't take any state-machine actions, unless the
+ * momentary exit from idle queued additional non-lazy callbacks.
+ * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
+ * pending.
+ */
+ if (!per_cpu(rcu_idle_first_pass, cpu) &&
+ (per_cpu(rcu_nonlazy_posted, cpu) ==
+ per_cpu(rcu_nonlazy_posted_snap, cpu))) {
+ if (rcu_cpu_has_callbacks(cpu)) {
+ tp = &per_cpu(rcu_idle_gp_timer, cpu);
+ mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+ }
+ return;
+ }
+ per_cpu(rcu_idle_first_pass, cpu) = 0;
+ per_cpu(rcu_nonlazy_posted_snap, cpu) =
+ per_cpu(rcu_nonlazy_posted, cpu) - 1;
+
/*
* If there are no callbacks on this CPU, enter dyntick-idle mode.
* Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
if (rcu_cpu_has_nonlazy_callbacks(cpu))
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_gp_wait, HRTIMER_MODE_REL);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) =
+ jiffies + RCU_IDLE_GP_DELAY;
else
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) =
+ jiffies + RCU_IDLE_LAZY_GP_DELAY;
+ tp = &per_cpu(rcu_idle_gp_timer, cpu);
+ mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+ per_cpu(rcu_nonlazy_posted_snap, cpu) =
+ per_cpu(rcu_nonlazy_posted, cpu);
return; /* Nothing more to do immediately. */
} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
/* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
trace_rcu_prep_idle("Callbacks drained");
}
+/*
+ * Keep a running count of the number of non-lazy callbacks posted
+ * on this CPU. This running counter (which is never decremented) allows
+ * rcu_prepare_for_idle() to detect when something out of the idle loop
+ * posts a callback, even if an equal number of callbacks are invoked.
+ * Of course, callbacks should only be posted from within a trace event
+ * designed to be called from idle or from within RCU_NONIDLE().
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+ __this_cpu_add(rcu_nonlazy_posted, 1);
+}
+
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
+ struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
- sprintf(cp, "drain=%d %c timer=%lld",
+ sprintf(cp, "drain=%d %c timer=%lu",
per_cpu(rcu_dyntick_drain, cpu),
per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
- hrtimer_active(hrtp)
- ? ktime_to_us(hrtimer_get_remaining(hrtp))
- : -1);
+ timer_pending(tltp) ? tltp->expires - jiffies : -1);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff4..d4bc16ddd1d 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->completed, gpnum, rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh);
+ rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a..173ea52f3af 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
-
-
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index afc6d7e7155..03667c3fdb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
#include "sched.h"
#include "../workqueue_sched.h"
+#include "../smpboot.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -2083,6 +2084,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
#endif
/* Here we just switch the register state and the stack. */
+ rcu_switch_from(prev);
switch_to(prev, next, prev);
barrier();
@@ -6382,6 +6384,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sg)
return -ENOMEM;
+ sg->next = sg;
+
*per_cpu_ptr(sdd->sg, j) = sg;
sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6405,16 +6409,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- kfree(*per_cpu_ptr(sdd->sg, j));
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgp)
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
+ sdd->sd = NULL;
free_percpu(sdd->sg);
+ sdd->sg = NULL;
free_percpu(sdd->sgp);
+ sdd->sgp = NULL;
}
}
@@ -7049,6 +7063,7 @@ void __init sched_init(void)
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
#endif
init_sched_fair_class();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f..e9553640c1c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se))
- list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
@@ -3215,6 +3215,8 @@ static int move_one_task(struct lb_env *env)
static unsigned long task_h_load(struct task_struct *p);
+static const unsigned int sched_nr_migrate_break = 32;
+
/*
* move_tasks tries to move up to load_move weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
@@ -3242,7 +3244,7 @@ static int move_tasks(struct lb_env *env)
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sysctl_sched_nr_migrate;
+ env->loop_break += sched_nr_migrate_break;
env->flags |= LBF_NEED_BREAK;
break;
}
@@ -3252,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
load = task_h_load(p);
- if (load < 16 && !env->sd->nr_balance_failed)
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
if ((load / 2) > env->load_move)
@@ -4407,7 +4409,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.idle = idle,
- .loop_break = sysctl_sched_nr_migrate,
+ .loop_break = sched_nr_migrate_break,
};
cpumask_copy(cpus, cpu_active_mask);
@@ -4445,10 +4447,10 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.load_move = imbalance;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
- env.loop_max = busiest->nr_running;
+ env.load_move = imbalance;
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
local_irq_save(flags);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d..de00a486c5c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895e..ee376beedaf 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
- * This defines a simple but solid secure-computing mode.
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ * of Berkeley Packet Filters/Linux Socket Filters.
*/
+#include <linux/atomic.h>
#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <asm/syscall.h>
+#include <linux/filter.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ * get/put helpers should be used when accessing an instance
+ * outside of a lifetime-guarded section. In general, this
+ * is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insns: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer. For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory. This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+ atomic_t usage;
+ struct seccomp_filter *prev;
+ unsigned short len; /* Instruction count */
+ struct sock_filter insns[];
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+/**
+ * get_u32 - returns a u32 offset into data
+ * @data: a unsigned 64 bit value
+ * @index: 0 or 1 to return the first or second 32-bits
+ *
+ * This inline exists to hide the length of unsigned long. If a 32-bit
+ * unsigned long is passed in, it will be extended and the top 32-bits will be
+ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
+ * properly returned.
+ *
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static inline u32 get_u32(u64 data, int index)
+{
+ return ((u32 *)&data)[index];
+}
+
+/* Helper for bpf_load below. */
+#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
+/**
+ * bpf_load: checks and returns a pointer to the requested offset
+ * @off: offset into struct seccomp_data to load from
+ *
+ * Returns the requested 32-bits of data.
+ * seccomp_check_filter() should assure that @off is 32-bit aligned
+ * and not out of bounds. Failure to do so is a BUG.
+ */
+u32 seccomp_bpf_load(int off)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ if (off == BPF_DATA(nr))
+ return syscall_get_nr(current, regs);
+ if (off == BPF_DATA(arch))
+ return syscall_get_arch(current, regs);
+ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
+ unsigned long value;
+ int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
+ int index = !!(off % sizeof(u64));
+ syscall_get_arguments(current, regs, arg, 1, &value);
+ return get_u32(value, index);
+ }
+ if (off == BPF_DATA(instruction_pointer))
+ return get_u32(KSTK_EIP(current), 0);
+ if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
+ return get_u32(KSTK_EIP(current), 1);
+ /* seccomp_check_filter should make this impossible. */
+ BUG();
+}
+
+/**
+ * seccomp_check_filter - verify seccomp filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Takes a previously checked filter (by sk_chk_filter) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load. It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+ int pc;
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_S_LD_W_ABS:
+ ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct seccomp_data) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_S_LD_W_LEN:
+ ftest->code = BPF_S_LD_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ case BPF_S_LDX_W_LEN:
+ ftest->code = BPF_S_LDX_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_S_RET_K:
+ case BPF_S_RET_A:
+ case BPF_S_ALU_ADD_K:
+ case BPF_S_ALU_ADD_X:
+ case BPF_S_ALU_SUB_K:
+ case BPF_S_ALU_SUB_X:
+ case BPF_S_ALU_MUL_K:
+ case BPF_S_ALU_MUL_X:
+ case BPF_S_ALU_DIV_X:
+ case BPF_S_ALU_AND_K:
+ case BPF_S_ALU_AND_X:
+ case BPF_S_ALU_OR_K:
+ case BPF_S_ALU_OR_X:
+ case BPF_S_ALU_LSH_K:
+ case BPF_S_ALU_LSH_X:
+ case BPF_S_ALU_RSH_K:
+ case BPF_S_ALU_RSH_X:
+ case BPF_S_ALU_NEG:
+ case BPF_S_LD_IMM:
+ case BPF_S_LDX_IMM:
+ case BPF_S_MISC_TAX:
+ case BPF_S_MISC_TXA:
+ case BPF_S_ALU_DIV_K:
+ case BPF_S_LD_MEM:
+ case BPF_S_LDX_MEM:
+ case BPF_S_ST:
+ case BPF_S_STX:
+ case BPF_S_JMP_JA:
+ case BPF_S_JMP_JEQ_K:
+ case BPF_S_JMP_JEQ_X:
+ case BPF_S_JMP_JGE_K:
+ case BPF_S_JMP_JGE_X:
+ case BPF_S_JMP_JGT_K:
+ case BPF_S_JMP_JGT_X:
+ case BPF_S_JMP_JSET_K:
+ case BPF_S_JMP_JSET_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(int syscall)
+{
+ struct seccomp_filter *f;
+ u32 ret = SECCOMP_RET_ALLOW;
+
+ /* Ensure unexpected behavior doesn't result in failing open. */
+ if (WARN_ON(current->seccomp.filter == NULL))
+ return SECCOMP_RET_KILL;
+
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+ * value always takes priority (ignoring the DATA).
+ */
+ for (f = current->seccomp.filter; f; f = f->prev) {
+ u32 cur_ret = sk_run_filter(NULL, f->insns);
+ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ ret = cur_ret;
+ }
+ return ret;
+}
+
+/**
+ * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * @fprog: BPF program to install
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+static long seccomp_attach_filter(struct sock_fprog *fprog)
+{
+ struct seccomp_filter *filter;
+ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
+ unsigned long total_insns = fprog->len;
+ long ret;
+
+ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ for (filter = current->seccomp.filter; filter; filter = filter->prev)
+ total_insns += filter->len + 4; /* include a 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /*
+ * Installing a seccomp filter requires that the task have
+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+ * This avoids scenarios where unprivileged tasks can affect the
+ * behavior of privileged children.
+ */
+ if (!current->no_new_privs &&
+ security_capable_noaudit(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN) != 0)
+ return -EACCES;
+
+ /* Allocate a new seccomp_filter */
+ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ return -ENOMEM;
+ atomic_set(&filter->usage, 1);
+ filter->len = fprog->len;
+
+ /* Copy the instructions from fprog. */
+ ret = -EFAULT;
+ if (copy_from_user(filter->insns, fprog->filter, fp_size))
+ goto fail;
+
+ /* Check and rewrite the fprog via the skb checker */
+ ret = sk_chk_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /* Check and rewrite the fprog for seccomp use */
+ ret = seccomp_check_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+ return 0;
+fail:
+ kfree(filter);
+ return ret;
+}
+
+/**
+ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+long seccomp_attach_user_filter(char __user *user_filter)
+{
+ struct sock_fprog fprog;
+ long ret = -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+ goto out;
+ ret = seccomp_attach_filter(&fprog);
+out:
+ return ret;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ if (!orig)
+ return;
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&orig->usage);
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* Clean up single-reference branches iteratively. */
+ while (orig && atomic_dec_and_test(&orig->usage)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ kfree(freeme);
+ }
+}
+
+/**
+ * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+static void seccomp_send_sigsys(int syscall, int reason)
+{
+ struct siginfo info;
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch(current, task_pt_regs(current));
+ info.si_syscall = syscall;
+ force_sig_info(SIGSYS, &info, current);
+}
+#endif /* CONFIG_SECCOMP_FILTER */
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
};
#endif
-void __secure_computing(int this_syscall)
+int __secure_computing(int this_syscall)
{
int mode = current->seccomp.mode;
- int * syscall;
+ int exit_sig = 0;
+ int *syscall;
+ u32 ret;
switch (mode) {
- case 1:
+ case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
#endif
do {
if (*syscall == this_syscall)
- return;
+ return 0;
} while (*++syscall);
+ exit_sig = SIGKILL;
+ ret = SECCOMP_RET_KILL;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER: {
+ int data;
+ ret = seccomp_run_filters(this_syscall);
+ data = ret & SECCOMP_RET_DATA;
+ ret &= SECCOMP_RET_ACTION;
+ switch (ret) {
+ case SECCOMP_RET_ERRNO:
+ /* Set the low-order 16-bits as a errno. */
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+ case SECCOMP_RET_TRACE:
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+ goto skip;
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ break;
+ return 0;
+ case SECCOMP_RET_ALLOW:
+ return 0;
+ case SECCOMP_RET_KILL:
+ default:
+ break;
+ }
+ exit_sig = SIGSYS;
break;
+ }
+#endif
default:
BUG();
}
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall);
- do_exit(SIGKILL);
+ audit_seccomp(this_syscall, exit_sig, ret);
+ do_exit(exit_sig);
+#ifdef CONFIG_SECCOMP_FILTER
+skip:
+ audit_seccomp(this_syscall, exit_sig, ret);
+#endif
+ return -1;
}
long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
return current->seccomp.mode;
}
-long prctl_set_seccomp(unsigned long seccomp_mode)
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * This function may be called repeatedly with a @seccomp_mode of
+ * SECCOMP_MODE_FILTER to install additional filters. Every filter
+ * successfully installed will be evaluated (in reverse order) for each system
+ * call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
- long ret;
+ long ret = -EINVAL;
- /* can set it only once to be even more secure */
- ret = -EPERM;
- if (unlikely(current->seccomp.mode))
+ if (current->seccomp.mode &&
+ current->seccomp.mode != seccomp_mode)
goto out;
- ret = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ ret = 0;
#ifdef TIF_NOTSC
disable_TSC();
#endif
- ret = 0;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ ret = seccomp_attach_user_filter(filter);
+ if (ret)
+ goto out;
+ break;
+#endif
+ default:
+ goto out;
}
- out:
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+out:
return ret;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d..1a006b5d9d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -160,7 +160,7 @@ void recalc_sigpending(void)
#define SYNCHRONOUS_MASK \
(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
- sigmask(SIGTRAP) | sigmask(SIGFPE))
+ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
int next_signal(struct sigpending *pending, sigset_t *mask)
{
@@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
+#ifdef __ARCH_SIGSYS
+ case __SI_SYS:
+ err |= __put_user(from->si_call_addr, &to->si_call_addr);
+ err |= __put_user(from->si_syscall, &to->si_syscall);
+ err |= __put_user(from->si_arch, &to->si_arch);
+ break;
+#endif
default: /* this is just in case for now ... */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
diff --git a/kernel/smp.c b/kernel/smp.c
index 2f8b10ecf75..d0ae5b24875 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,8 @@
#include <linux/smp.h>
#include <linux/cpu.h>
+#include "smpboot.h"
+
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
static struct {
struct list_head queue;
@@ -669,6 +671,8 @@ void __init smp_init(void)
{
unsigned int cpu;
+ idle_threads_init();
+
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
}
}
EXPORT_SYMBOL(on_each_cpu_cond);
+
+static void do_nothing(void *unused)
+{
+}
+
+/**
+ * kick_all_cpus_sync - Force all cpus out of idle
+ *
+ * Used to synchronize the update of pm_idle function pointer. It's
+ * called after the pointer is updated and returns after the dummy
+ * callback function has been executed on all cpus. The execution of
+ * the function can only happen on the remote cpus after they have
+ * left the idle function which had been called via pm_idle function
+ * pointer. So it's guaranteed that nothing uses the previous pointer
+ * anymore.
+ */
+void kick_all_cpus_sync(void)
+{
+ /* Make sure the change is visible before we kick the cpus */
+ smp_mb();
+ smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 00000000000..e1a797e028a
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,62 @@
+/*
+ * Common SMP CPU bringup/teardown functions
+ */
+#include <linux/err.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+#include "smpboot.h"
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+/*
+ * For the hotplug case we keep the task structs around and reuse
+ * them.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+
+struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk)
+ return ERR_PTR(-ENOMEM);
+ init_idle(tsk, cpu);
+ return tsk;
+}
+
+void __init idle_thread_set_boot_cpu(void)
+{
+ per_cpu(idle_threads, smp_processor_id()) = current;
+}
+
+static inline void idle_init(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk) {
+ tsk = fork_idle(cpu);
+ if (IS_ERR(tsk))
+ pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
+ else
+ per_cpu(idle_threads, cpu) = tsk;
+ }
+}
+
+/**
+ * idle_thread_init - Initialize the idle thread for a cpu
+ * @cpu: The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
+void __init idle_threads_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != smp_processor_id())
+ idle_init(cpu);
+ }
+}
+#endif
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 00000000000..80c0acfb847
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,18 @@
+#ifndef SMPBOOT_H
+#define SMPBOOT_H
+
+struct task_struct;
+
+int smpboot_prepare(unsigned int cpu);
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+struct task_struct *idle_thread_get(unsigned int cpu);
+void idle_thread_set_boot_cpu(void);
+void idle_threads_init(void);
+#else
+static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
+static inline void idle_thread_set_boot_cpu(void) { }
+static inline void idle_threads_init(void) { }
+#endif
+
+#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f..2095be3318d 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
#include <linux/delay.h>
#include <linux/srcu.h>
+/*
+ * Initialize an rcu_batch structure to empty.
+ */
+static inline void rcu_batch_init(struct rcu_batch *b)
+{
+ b->head = NULL;
+ b->tail = &b->head;
+}
+
+/*
+ * Enqueue a callback onto the tail of the specified rcu_batch structure.
+ */
+static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
+{
+ *b->tail = head;
+ b->tail = &head->next;
+}
+
+/*
+ * Is the specified rcu_batch structure empty?
+ */
+static inline bool rcu_batch_empty(struct rcu_batch *b)
+{
+ return b->tail == &b->head;
+}
+
+/*
+ * Remove the callback at the head of the specified rcu_batch structure
+ * and return a pointer to it, or return NULL if the structure is empty.
+ */
+static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
+{
+ struct rcu_head *head;
+
+ if (rcu_batch_empty(b))
+ return NULL;
+
+ head = b->head;
+ b->head = head->next;
+ if (b->tail == &head->next)
+ rcu_batch_init(b);
+
+ return head;
+}
+
+/*
+ * Move all callbacks from the rcu_batch structure specified by "from" to
+ * the structure specified by "to".
+ */
+static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
+{
+ if (!rcu_batch_empty(from)) {
+ *to->tail = from->head;
+ to->tail = from->tail;
+ rcu_batch_init(from);
+ }
+}
+
+/* single-thread state-machine */
+static void process_srcu(struct work_struct *work);
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->completed = 0;
- mutex_init(&sp->mutex);
+ spin_lock_init(&sp->queue_lock);
+ sp->running = false;
+ rcu_batch_init(&sp->batch_queue);
+ rcu_batch_init(&sp->batch_check0);
+ rcu_batch_init(&sp->batch_check1);
+ rcu_batch_init(&sp->batch_done);
+ INIT_DELAYED_WORK(&sp->work, process_srcu);
sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
return sp->per_cpu_ref ? 0 : -ENOMEM;
}
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * srcu_readers_active_idx -- returns approximate number of readers
- * active on the specified rank of per-CPU counters.
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
*/
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
+ * Returns approximate number of readers active on the specified rank
+ * of the per-CPU ->c[] counters.
+ */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
{
int cpu;
- int sum;
+ unsigned long sum = 0;
+ unsigned long t;
- sum = 0;
- for_each_possible_cpu(cpu)
- sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ sum += t;
+ }
return sum;
}
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero. An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement. This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+ unsigned long seq;
+
+ seq = srcu_readers_seq_idx(sp, idx);
+
+ /*
+ * The following smp_mb() A pairs with the smp_mb() B located in
+ * __srcu_read_lock(). This pairing ensures that if an
+ * __srcu_read_lock() increments its counter after the summation
+ * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+ * critical section will see any changes made prior to the start
+ * of the current SRCU grace period.
+ *
+ * Also, if the above call to srcu_readers_seq_idx() saw the
+ * increment of ->seq[], then the call to srcu_readers_active_idx()
+ * must see the increment of ->c[].
+ */
+ smp_mb(); /* A */
+
+ /*
+ * Note that srcu_readers_active_idx() can incorrectly return
+ * zero even though there is a pre-existing reader throughout.
+ * To see this, suppose that task A is in a very long SRCU
+ * read-side critical section that started on CPU 0, and that
+ * no other reader exists, so that the sum of the counters
+ * is equal to one. Then suppose that task B starts executing
+ * srcu_readers_active_idx(), summing up to CPU 1, and then that
+ * task C starts reading on CPU 0, so that its increment is not
+ * summed, but finishes reading on CPU 2, so that its decrement
+ * -is- summed. Then when task B completes its sum, it will
+ * incorrectly get zero, despite the fact that task A has been
+ * in its SRCU read-side critical section the whole time.
+ *
+ * We therefore do a validation step should srcu_readers_active_idx()
+ * return zero.
+ */
+ if (srcu_readers_active_idx(sp, idx) != 0)
+ return false;
+
+ /*
+ * The remainder of this function is the validation step.
+ * The following smp_mb() D pairs with the smp_mb() C in
+ * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
+ * by srcu_readers_active_idx() above, then any destructive
+ * operation performed after the grace period will happen after
+ * the corresponding SRCU read-side critical section.
+ *
+ * Note that there can be at most NR_CPUS worth of readers using
+ * the old index, which is not enough to overflow even a 32-bit
+ * integer. (Yes, this does mean that systems having more than
+ * a billion or so CPUs need to be 64-bit systems.) Therefore,
+ * the sum of the ->seq[] counters cannot possibly overflow.
+ * Therefore, the only way that the return values of the two
+ * calls to srcu_readers_seq_idx() can be equal is if there were
+ * no increments of the corresponding rank of ->seq[] counts
+ * in the interim. But the missed-increment scenario laid out
+ * above includes an increment of the ->seq[] counter by
+ * the corresponding __srcu_read_lock(). Therefore, if this
+ * scenario occurs, the return values from the two calls to
+ * srcu_readers_seq_idx() will differ, and thus the validation
+ * step below suffices.
+ */
+ smp_mb(); /* D */
+
+ return srcu_readers_seq_idx(sp, idx) == seq;
+}
+
/**
* srcu_readers_active - returns approximate number of readers.
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
*/
static int srcu_readers_active(struct srcu_struct *sp)
{
- return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ }
+ return sum;
}
/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
preempt_disable();
- idx = sp->completed & 0x1;
- barrier(); /* ensure compiler looks -once- at sp->completed. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ idx = rcu_dereference_index_check(sp->completed,
+ rcu_read_lock_sched_held()) & 0x1;
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
preempt_enable();
return idx;
}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
preempt_disable();
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
* we repeatedly block for 1-millisecond time periods. This approach
* has done well in testing, so there is no need for a config parameter.
*/
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+#define SRCU_RETRY_CHECK_DELAY 5
+#define SYNCHRONIZE_SRCU_TRYCOUNT 2
+#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ * @@@ Wait until all pre-existing readers complete. Such readers
+ * will have used the index specified by "idx".
+ * the caller should ensures the ->completed is not changed while checking
+ * and idx = (->completed & 1) ^ 1
*/
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
{
- int idx;
-
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
-
- idx = sp->completed;
- mutex_lock(&sp->mutex);
+ for (;;) {
+ if (srcu_readers_active_idx_check(sp, idx))
+ return true;
+ if (--trycount <= 0)
+ return false;
+ udelay(SRCU_RETRY_CHECK_DELAY);
+ }
+}
- /*
- * Check to see if someone else did the work for us while we were
- * waiting to acquire the lock. We need -two- advances of
- * the counter, not just one. If there was but one, we might have
- * shown up -after- our helper's first synchronize_sched(), thus
- * having failed to prevent CPU-reordering races with concurrent
- * srcu_read_unlock()s on other CPUs (see comment below). So we
- * either (1) wait for two or (2) supply the second ourselves.
- */
+/*
+ * Increment the ->completed counter so that future SRCU readers will
+ * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+ sp->completed++;
+}
- if ((sp->completed - idx) >= 2) {
- mutex_unlock(&sp->mutex);
- return;
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ unsigned long flags;
+
+ head->next = NULL;
+ head->func = func;
+ spin_lock_irqsave(&sp->queue_lock, flags);
+ rcu_batch_queue(&sp->batch_queue, head);
+ if (!sp->running) {
+ sp->running = true;
+ queue_delayed_work(system_nrt_wq, &sp->work, 0);
}
+ spin_unlock_irqrestore(&sp->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
- sync_func(); /* Force memory barrier on all CPUs. */
+struct rcu_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
- /*
- * The preceding synchronize_sched() ensures that any CPU that
- * sees the new value of sp->completed will also see any preceding
- * changes to data structures made by this CPU. This prevents
- * some other CPU from reordering the accesses in its SRCU
- * read-side critical section to precede the corresponding
- * srcu_read_lock() -- ensuring that such references will in
- * fact be protected.
- *
- * So it is now safe to do the flip.
- */
+/*
+ * Awaken the corresponding synchronize_srcu() instance now that a
+ * grace period has elapsed.
+ */
+static void wakeme_after_rcu(struct rcu_head *head)
+{
+ struct rcu_synchronize *rcu;
- idx = sp->completed & 0x1;
- sp->completed++;
+ rcu = container_of(head, struct rcu_synchronize, head);
+ complete(&rcu->completion);
+}
- sync_func(); /* Force memory barrier on all CPUs. */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
+static void srcu_reschedule(struct srcu_struct *sp);
- /*
- * At this point, because of the preceding synchronize_sched(),
- * all srcu_read_lock() calls using the old counters have completed.
- * Their corresponding critical sections might well be still
- * executing, but the srcu_read_lock() primitives themselves
- * will have finished executing. We initially give readers
- * an arbitrarily chosen 10 microseconds to get out of their
- * SRCU read-side critical sections, then loop waiting 1/HZ
- * seconds per iteration. The 10-microsecond value has done
- * very well in testing.
- */
-
- if (srcu_readers_active_idx(sp, idx))
- udelay(SYNCHRONIZE_SRCU_READER_DELAY);
- while (srcu_readers_active_idx(sp, idx))
- schedule_timeout_interruptible(1);
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
+{
+ struct rcu_synchronize rcu;
+ struct rcu_head *head = &rcu.head;
+ bool done = false;
- sync_func(); /* Force memory barrier on all CPUs. */
+ rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
+ !lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
- /*
- * The preceding synchronize_sched() forces all srcu_read_unlock()
- * primitives that were executing concurrently with the preceding
- * for_each_possible_cpu() loop to have completed by this point.
- * More importantly, it also forces the corresponding SRCU read-side
- * critical sections to have also completed, and the corresponding
- * references to SRCU-protected data items to be dropped.
- *
- * Note:
- *
- * Despite what you might think at first glance, the
- * preceding synchronize_sched() -must- be within the
- * critical section ended by the following mutex_unlock().
- * Otherwise, a task taking the early exit can race
- * with a srcu_read_unlock(), which might have executed
- * just before the preceding srcu_readers_active() check,
- * and whose CPU might have reordered the srcu_read_unlock()
- * with the preceding critical section. In this case, there
- * is nothing preventing the synchronize_sched() task that is
- * taking the early exit from freeing a data structure that
- * is still being referenced (out of order) by the task
- * doing the srcu_read_unlock().
- *
- * Alternatively, the comparison with "2" on the early exit
- * could be changed to "3", but this increases synchronize_srcu()
- * latency for bulk loads. So the current code is preferred.
- */
+ init_completion(&rcu.completion);
+
+ head->next = NULL;
+ head->func = wakeme_after_rcu;
+ spin_lock_irq(&sp->queue_lock);
+ if (!sp->running) {
+ /* steal the processing owner */
+ sp->running = true;
+ rcu_batch_queue(&sp->batch_check0, head);
+ spin_unlock_irq(&sp->queue_lock);
+
+ srcu_advance_batches(sp, trycount);
+ if (!rcu_batch_empty(&sp->batch_done)) {
+ BUG_ON(sp->batch_done.head != head);
+ rcu_batch_dequeue(&sp->batch_done);
+ done = true;
+ }
+ /* give the processing owner to work_struct */
+ srcu_reschedule(sp);
+ } else {
+ rcu_batch_queue(&sp->batch_queue, head);
+ spin_unlock_irq(&sp->queue_lock);
+ }
- mutex_unlock(&sp->mutex);
+ if (!done)
+ wait_for_completion(&rcu.completion);
}
/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, synchronize_sched);
+ __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
* synchronize_srcu_expedited - Brute-force SRCU grace period
* @sp: srcu_struct with which to synchronize.
*
- * Wait for an SRCU grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_srcu_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_srcu() instead.
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
*
* Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock. It is also illegal to call
+ * that is acquired by a CPU-hotplug notifier. It is also illegal to call
* synchronize_srcu_expedited() from the corresponding SRCU read-side
* critical section; doing so will result in deadlock. However, it is
* perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
*/
void synchronize_srcu_expedited(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, synchronize_sched_expedited);
+ __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+ synchronize_srcu(sp);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/**
* srcu_batches_completed - return batches completed.
* @sp: srcu_struct on which to report batch completion.
*
* Report the number of batches, correlated with, but not necessarily
* precisely the same as, the number of grace periods that have elapsed.
*/
-
long srcu_batches_completed(struct srcu_struct *sp)
{
return sp->completed;
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+#define SRCU_CALLBACK_BATCH 10
+#define SRCU_INTERVAL 1
+
+/*
+ * Move any new SRCU callbacks to the first stage of the SRCU grace
+ * period pipeline.
+ */
+static void srcu_collect_new(struct srcu_struct *sp)
+{
+ if (!rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
+ spin_unlock_irq(&sp->queue_lock);
+ }
+}
+
+/*
+ * Core SRCU state machine. Advance callbacks from ->batch_check0 to
+ * ->batch_check1 and then to ->batch_done as readers drain.
+ */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
+{
+ int idx = 1 ^ (sp->completed & 1);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->completed for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ */
+
+ if (rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_check1))
+ return; /* no callbacks need to be advanced */
+
+ if (!try_check_zero(sp, idx, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have already done with their
+ * first zero check and flip back when they were enqueued on
+ * ->batch_check0 in a previous invocation of srcu_advance_batches().
+ * (Presumably try_check_zero() returned false during that
+ * invocation, leaving the callbacks stranded on ->batch_check1.)
+ * They are therefore ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+
+ if (rcu_batch_empty(&sp->batch_check0))
+ return; /* no callbacks need to be advanced */
+ srcu_flip(sp);
+
+ /*
+ * The callbacks in ->batch_check0 just finished their
+ * first check zero and flip, so move them to ->batch_check1
+ * for future checking on the other idx.
+ */
+ rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
+
+ /*
+ * SRCU read-side critical sections are normally short, so check
+ * at least twice in quick succession after a flip.
+ */
+ trycount = trycount < 2 ? 2 : trycount;
+ if (!try_check_zero(sp, idx^1, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have now waited for all
+ * pre-existing readers using both idx values. They are therefore
+ * ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue.
+ */
+static void srcu_invoke_callbacks(struct srcu_struct *sp)
+{
+ int i;
+ struct rcu_head *head;
+
+ for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
+ head = rcu_batch_dequeue(&sp->batch_done);
+ if (!head)
+ break;
+ local_bh_disable();
+ head->func(head);
+ local_bh_enable();
+ }
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp)
+{
+ bool pending = true;
+
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ sp->running = false;
+ pending = false;
+ }
+ spin_unlock_irq(&sp->queue_lock);
+ }
+
+ if (pending)
+ queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+static void process_srcu(struct work_struct *work)
+{
+ struct srcu_struct *sp;
+
+ sp = container_of(work, struct srcu_struct, work.work);
+
+ srcu_collect_new(sp);
+ srcu_advance_batches(sp, 1);
+ srcu_invoke_callbacks(sp);
+ srcu_reschedule(sp);
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e..ba0ae8eea6f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = prctl_get_seccomp();
break;
case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2);
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;
case PR_GET_TSC:
error = GET_TSC_CTL(arg2);
@@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = put_user(me->signal->is_child_subreaper,
(int __user *) arg2);
break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ current->no_new_privs = 1;
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return current->no_new_privs ? 1 : 0;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 52b3a06a02f..4ab11879aeb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,7 +170,7 @@ static int proc_taint(struct ctl_table *table, int write,
#endif
#ifdef CONFIG_PRINTK
-static int proc_dmesg_restrict(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
@@ -703,7 +703,7 @@ static struct ctl_table kern_table[] = {
.data = &dmesg_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
.extra1 = &zero,
.extra2 = &one,
},
@@ -712,7 +712,7 @@ static struct ctl_table kern_table[] = {
.data = &kptr_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dmesg_restrict,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
.extra1 = &zero,
.extra2 = &two,
},
@@ -1943,7 +1943,7 @@ static int proc_taint(struct ctl_table *table, int write,
}
#ifdef CONFIG_PRINTK
-static int proc_dmesg_restrict(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2cf9cc7aa10..a20dc8a3c94 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,6 +1,10 @@
#
# Timer subsystem related configuration options
#
+
+# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
+# only related to the tick functionality. Oneshot clockevent devices
+# are supported independ of this.
config TICK_ONESHOT
bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7..aa27d391bfc 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* If one has not already been chosen, it checks to see if a
* functional rtc device is available.
*/
-static struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
struct rtc_device *ret;
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-static inline struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
return NULL;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e883f57a3cd..f113755695e 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -346,7 +346,8 @@ int tick_resume_broadcast(void)
tick_get_broadcast_mask());
break;
case TICKDEV_MODE_ONESHOT:
- broadcast = tick_resume_broadcast_oneshot(bc);
+ if (!cpumask_empty(tick_get_broadcast_mask()))
+ broadcast = tick_resume_broadcast_oneshot(bc);
break;
}
}
@@ -373,6 +374,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
{
struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+
return clockevents_program_event(bc, expires, force);
}
@@ -531,7 +535,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
bc->event_handler = tick_handle_oneshot_broadcast;
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
/* Take the do_timer update */
tick_do_timer_cpu = cpu;
@@ -549,6 +552,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
to_cpumask(tmpmask));
if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
tick_broadcast_init_next_event(to_cpumask(tmpmask),
tick_next_period);
tick_broadcast_set_event(tick_next_period, 1);
@@ -575,15 +579,12 @@ void tick_broadcast_switch_to_oneshot(void)
unsigned long flags;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
- if (cpumask_empty(tick_get_broadcast_mask()))
- goto end;
tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
bc = tick_broadcast_device.evtdev;
if (bc)
tick_broadcast_setup_oneshot(bc);
-end:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3526038f283..6a3a5b9ff56 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_get_expires(&ts->sched_timer), 0))
break;
}
- /* Update jiffies and reread time */
- tick_do_update_jiffies64(now);
+ /* Reread time and update jiffies */
now = ktime_get();
+ tick_do_update_jiffies64(now);
}
}
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888..09de9a941cd 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
*
* mod_timer_pinned() is a way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
- * and not allow the timer to be migrated to a different CPU.
+ * and to ensure that the timer is scheduled on the current CPU.
+ *
+ * Note that this does not prevent the timer from being migrated
+ * when the current CPU goes offline. If this is a problem for
+ * you, use CPU-hotplug notifiers to handle it correctly, for
+ * example, cancelling the timer when the corresponding CPU goes
+ * offline.
*
* mod_timer_pinned(timer, expires) is equivalent to:
*
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
* warnings as well as problems when looking into
* timer->lockdep_map, make a copy and use that here.
*/
- struct lockdep_map lockdep_map = timer->lockdep_map;
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
/*
* Couple the lock chain with the lock chain at
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5e..b3afe0e76f7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
obj-$(CONFIG_EVENT_TRACING) += blktrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cdea7b56b0c..c0bd0308741 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_trace_remove);
-static int blk_dropped_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
-
- return 0;
-}
-
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
@@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
- .open = blk_dropped_open,
+ .open = simple_open,
.read = blk_dropped_read,
.llseek = default_llseek,
};
-static int blk_msg_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
-
- return 0;
-}
-
static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
size_t count, loff_t *ppos)
{
@@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
static const struct file_operations blk_msg_fops = {
.owner = THIS_MODULE,
- .open = blk_msg_open,
+ .open = simple_open,
.write = blk_msg_write,
.llseek = noop_llseek,
};
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ed7b5d1e12f..2a22255c101 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4629,7 +4629,8 @@ static ssize_t
rb_simple_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct ring_buffer *buffer = filp->private_data;
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer *buffer = tr->buffer;
char buf[64];
int r;
@@ -4647,7 +4648,8 @@ static ssize_t
rb_simple_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct ring_buffer *buffer = filp->private_data;
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer *buffer = tr->buffer;
unsigned long val;
int ret;
@@ -4734,7 +4736,7 @@ static __init int tracer_init_debugfs(void)
&trace_clock_fops);
trace_create_file("tracing_on", 0644, d_tracer,
- global_trace.buffer, &rb_simple_fops);
+ &global_trace, &rb_simple_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 95059f091a2..f95d65da6db 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -836,11 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[];
filter)
#include "trace_entries.h"
-#ifdef CONFIG_FUNCTION_TRACER
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
int perf_ftrace_event_register(struct ftrace_event_call *call,
enum trace_reg type, void *data);
#else
#define perf_ftrace_event_register NULL
-#endif /* CONFIG_FUNCTION_TRACER */
+#endif
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 079a93ae8a9..29111da1d10 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
if (!call->name || !call->class || !call->class->reg)
continue;
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ continue;
+
if (match &&
strcmp(match, call->name) != 0 &&
strcmp(match, call->class->system) != 0)
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return -1;
}
- if (call->class->reg)
+ if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
trace_create_file("enable", 0644, call->dir, call,
enable);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 3dd15e8bc85..e039906b037 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \
.event.type = etype, \
.class = &event_class_ftrace_##call, \
.print_fmt = print, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
struct ftrace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 859fae6b182..df611a0e76c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter)
{
u64 next_ts;
int ret;
+ /* trace_find_next_entry will reset ent_size */
+ int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent,
*next_entry = trace_find_next_entry(iter, NULL,
@@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter)
unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
unsigned long rel_usecs;
+ /* Restore the original ent_size */
+ iter->ent_size = ent_size;
+
if (!next_entry)
next_ts = iter->ts;
rel_usecs = ns2usecs(next_ts - iter->ts);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a472..00000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Workqueue statistical tracer.
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-
-#include <trace/events/workqueue.h>
-#include <linux/list.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/kref.h>
-#include "trace_stat.h"
-#include "trace.h"
-
-
-/* A cpu workqueue thread */
-struct cpu_workqueue_stats {
- struct list_head list;
- struct kref kref;
- int cpu;
- pid_t pid;
-/* Can be inserted from interrupt or user context, need to be atomic */
- atomic_t inserted;
-/*
- * Don't need to be atomic, works are serialized in a single workqueue thread
- * on a single CPU.
- */
- unsigned int executed;
-};
-
-/* List of workqueue threads on one cpu */
-struct workqueue_global_stats {
- struct list_head list;
- spinlock_t lock;
-};
-
-/* Don't need a global lock because allocated before the workqueues, and
- * never freed.
- */
-static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
-#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
-
-static void cpu_workqueue_stat_free(struct kref *kref)
-{
- kfree(container_of(kref, struct cpu_workqueue_stats, kref));
-}
-
-/* Insertion of a work */
-static void
-probe_workqueue_insertion(void *ignore,
- struct task_struct *wq_thread,
- struct work_struct *work)
-{
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
- if (node->pid == wq_thread->pid) {
- atomic_inc(&node->inserted);
- goto found;
- }
- }
- pr_debug("trace_workqueue: entry not found\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Execution of a work */
-static void
-probe_workqueue_execution(void *ignore,
- struct task_struct *wq_thread,
- struct work_struct *work)
-{
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
- if (node->pid == wq_thread->pid) {
- node->executed++;
- goto found;
- }
- }
- pr_debug("trace_workqueue: entry not found\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(void *ignore,
- struct task_struct *wq_thread, int cpu)
-{
- struct cpu_workqueue_stats *cws;
- unsigned long flags;
-
- WARN_ON(cpu < 0);
-
- /* Workqueues are sometimes created in atomic context */
- cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
- if (!cws) {
- pr_warning("trace_workqueue: not enough memory\n");
- return;
- }
- INIT_LIST_HEAD(&cws->list);
- kref_init(&cws->kref);
- cws->cpu = cpu;
- cws->pid = wq_thread->pid;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Destruction of a cpu workqueue thread */
-static void
-probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
-{
- /* Workqueue only execute on one cpu */
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node, *next;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
- list) {
- if (node->pid == wq_thread->pid) {
- list_del(&node->list);
- kref_put(&node->kref, cpu_workqueue_stat_free);
- goto found;
- }
- }
-
- pr_debug("trace_workqueue: don't find workqueue to destroy\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
-}
-
-static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
-{
- unsigned long flags;
- struct cpu_workqueue_stats *ret = NULL;
-
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-
- if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
- ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
- struct cpu_workqueue_stats, list);
- kref_get(&ret->kref);
- }
-
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
- return ret;
-}
-
-static void *workqueue_stat_start(struct tracer_stat *trace)
-{
- int cpu;
- void *ret = NULL;
-
- for_each_possible_cpu(cpu) {
- ret = workqueue_stat_start_cpu(cpu);
- if (ret)
- return ret;
- }
- return NULL;
-}
-
-static void *workqueue_stat_next(void *prev, int idx)
-{
- struct cpu_workqueue_stats *prev_cws = prev;
- struct cpu_workqueue_stats *ret;
- int cpu = prev_cws->cpu;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
- do {
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- return NULL;
- } while (!(ret = workqueue_stat_start_cpu(cpu)));
- return ret;
- } else {
- ret = list_entry(prev_cws->list.next,
- struct cpu_workqueue_stats, list);
- kref_get(&ret->kref);
- }
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
- return ret;
-}
-
-static int workqueue_stat_show(struct seq_file *s, void *p)
-{
- struct cpu_workqueue_stats *cws = p;
- struct pid *pid;
- struct task_struct *tsk;
-
- pid = find_get_pid(cws->pid);
- if (pid) {
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (tsk) {
- seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
- atomic_read(&cws->inserted), cws->executed,
- tsk->comm);
- put_task_struct(tsk);
- }
- put_pid(pid);
- }
-
- return 0;
-}
-
-static void workqueue_stat_release(void *stat)
-{
- struct cpu_workqueue_stats *node = stat;
-
- kref_put(&node->kref, cpu_workqueue_stat_free);
-}
-
-static int workqueue_stat_headers(struct seq_file *s)
-{
- seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
- seq_printf(s, "# | | | |\n");
- return 0;
-}
-
-struct tracer_stat workqueue_stats __read_mostly = {
- .name = "workqueues",
- .stat_start = workqueue_stat_start,
- .stat_next = workqueue_stat_next,
- .stat_show = workqueue_stat_show,
- .stat_release = workqueue_stat_release,
- .stat_headers = workqueue_stat_headers
-};
-
-
-int __init stat_workqueue_init(void)
-{
- if (register_stat_tracer(&workqueue_stats)) {
- pr_warning("Unable to register workqueue stat tracer\n");
- return 1;
- }
-
- return 0;
-}
-fs_initcall(stat_workqueue_init);
-
-/*
- * Workqueues are created very early, just after pre-smp initcalls.
- * So we must register our tracepoints at this stage.
- */
-int __init trace_workqueue_early_init(void)
-{
- int ret, cpu;
-
- for_each_possible_cpu(cpu) {
- spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
- INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
- }
-
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
- if (ret)
- goto out;
-
- ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
- if (ret)
- goto no_insertion;
-
- ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
- if (ret)
- goto no_execution;
-
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
- if (ret)
- goto no_creation;
-
- return 0;
-
-no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
-no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
-no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
-out:
- pr_warning("trace_workqueue: unable to trace workqueues\n");
-
- return 1;
-}
-early_initcall(trace_workqueue_early_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c0..9a3128dc67d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
cwq = get_cwq(gcwq->cpu, wq);
trace_workqueue_queue_work(cpu, cwq, work);
- BUG_ON(!list_empty(&work->entry));
+ if (WARN_ON(!list_empty(&work->entry))) {
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+ return;
+ }
cwq->nr_in_flight[cwq->work_color]++;
work_flags = work_color_to_flags(cwq->work_color);
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
} else
wake_up_all(&gcwq->trustee_wait);
- /* sanity check nr_running */
- WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+ /*
+ * Sanity check nr_running. Because trustee releases gcwq->lock
+ * between setting %WORKER_ROGUE and zapping nr_running, the
+ * warning may trigger spuriously. Check iff trustee is idle.
+ */
+ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+ gcwq->nr_workers == gcwq->nr_idle &&
atomic_read(get_gcwq_nr_running(gcwq->cpu)));
}
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)
* lock freed" warnings as well as problems when looking into
* work->lockdep_map, make a copy and use that here.
*/
- struct lockdep_map lockdep_map = work->lockdep_map;
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
/*
* A single work shouldn't be executed concurrently by
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)
{
struct wq_barrier barr;
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+
if (start_flush_work(work, &barr, true)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);