Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            1
-rw-r--r--  kernel/bounds.c            2
-rw-r--r--  kernel/capability.c       96
-rw-r--r--  kernel/cgroup.c           14
-rw-r--r--  kernel/cpu.c              11
-rw-r--r--  kernel/cpuset.c           80
-rw-r--r--  kernel/crash_dump.c       34
-rw-r--r--  kernel/cred.c              6
-rw-r--r--  kernel/debug/gdbstub.c    30
-rw-r--r--  kernel/exit.c              1
-rw-r--r--  kernel/fork.c            154
-rw-r--r--  kernel/futex.c            11
-rw-r--r--  kernel/futex_compat.c     11
-rw-r--r--  kernel/groups.c            2
-rw-r--r--  kernel/kallsyms.c         14
-rw-r--r--  kernel/kthread.c          31
-rw-r--r--  kernel/module.c            4
-rw-r--r--  kernel/nsproxy.c           4
-rw-r--r--  kernel/panic.c            10
-rw-r--r--  kernel/pid_namespace.c    11
-rw-r--r--  kernel/power/block_io.c    2
-rw-r--r--  kernel/printk.c           36
-rw-r--r--  kernel/ptrace.c           27
-rw-r--r--  kernel/res_counter.c      14
-rw-r--r--  kernel/sched.c            21
-rw-r--r--  kernel/signal.c           46
-rw-r--r--  kernel/smp.c              81
-rw-r--r--  kernel/softirq.c           5
-rw-r--r--  kernel/stop_machine.c      6
-rw-r--r--  kernel/sys.c              77
-rw-r--r--  kernel/sysctl.c           35
-rw-r--r--  kernel/sysctl_check.c     10
-rw-r--r--  kernel/taskstats.c         2
-rw-r--r--  kernel/trace/blktrace.c   15
-rw-r--r--  kernel/uid16.c             2
-rw-r--r--  kernel/user.c              8
-rw-r--r--  kernel/utsname.c          12
-rw-r--r--  kernel/watchdog.c         27
-rw-r--r--  kernel/workqueue.c         6
39 files changed, 651 insertions(+), 308 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba3..85cbfb31e73 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c13..0c9b862292b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
void foo(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c..bf0c734d0c1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/*
@@ -290,6 +291,60 @@ error:
}
/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+ int ret = security_real_capable(t, &init_user_ns, cap);
+
+ return (ret == 0);
+}
+
+/**
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret = security_real_capable(t, ns, cap);
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+ return (ret == 0);
+}
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -299,17 +354,48 @@ error:
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
-int capable(int cap)
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The user namespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
BUG();
}
- if (security_capable(current_cred(), cap) == 0) {
+ if (security_capable(ns, current_cred(), cap) == 0) {
current->flags |= PF_SUPERPRIV;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+ return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
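
The new helpers shift permission checks from "does the caller have this capability at all" to "does the caller have it in the user namespace that owns the target task". A minimal sketch of the conversion pattern, assuming a hypothetical caller may_inspect() (the real conversions appear in kernel/ptrace.c and kernel/sched.c below):

    /* sketch only: may_inspect() is hypothetical */
    static int may_inspect(struct task_struct *task)
    {
            /* before: if (!capable(CAP_SYS_PTRACE)) return -EPERM; */
            if (!task_ns_capable(task, CAP_SYS_PTRACE))
                    return -EPERM;  /* no CAP_SYS_PTRACE in task's user_ns */
            return 0;
    }
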
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 95362d15128..e31b220a743 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
/* Update the css_set linked lists if we're using them */
write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
+ list_del_init(&cgrp->release_list);
spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
+ list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
- list_del(&ss->sibling);
+ list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
@@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
+ list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc555614..c95fc4df0fa 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
{
BUG_ON(cpu_notify(val, v));
}
-
EXPORT_SYMBOL(register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
return 0;
}
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
+
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- printk("%s: attempt to bring up CPU %u failed\n",
+ printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
goto out_notify;
}
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk("Enabling non-boot CPUs ...\n");
+ printk(KERN_INFO "Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
error = _cpu_up(cpu, 1);
if (!error) {
- printk("CPU%d is up\n", cpu);
+ printk(KERN_INFO "CPU%d is up\n", cpu);
continue;
}
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e9818903..33eee16addb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;
+ static nodemask_t newmems; /* protected by cgroup_mutex */
cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
-
- cpuset_change_task_nodemask(p, newmems);
+ guarantee_online_mems(cs, &newmems);
- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &newmems);
mm = get_task_mm(p);
if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
- if (from == NULL || to == NULL)
- goto alloc_fail;
+ static nodemask_t to; /* protected by cgroup_mutex */
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
} else {
guarantee_online_cpus(cs, cpus_attach);
}
- guarantee_online_mems(cs, to);
+ guarantee_online_mems(cs, &to);
/* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
+ cpuset_attach_task(tsk, &to, cs);
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
+ cpuset_attach_task(c, &to, cs);
}
rcu_read_unlock();
}
/* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
- *to = cs->mems_allowed;
+ to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(from);
- NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1596,26 @@ out:
* across a page fault.
*/
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
- int ret;
+ size_t count;
mutex_lock(&callback_mutex);
- ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+ count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
- return ret;
+ return count;
}
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
- NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
- int retval;
-
- if (mask == NULL)
- return -ENOMEM;
+ size_t count;
mutex_lock(&callback_mutex);
- *mask = cs->mems_allowed;
+ count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
- retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
- NODEMASK_FREE(mask);
-
- return retval;
+ return count;
}
static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
cs = cgroup_cs(cgroup);
parent_cs = cgroup_cs(parent);
+ mutex_lock(&callback_mutex);
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+ mutex_unlock(&callback_mutex);
return;
}
@@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
@@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
- *oldmems = cp->mems_allowed;
+ oldmems = cp->mems_allowed;
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
@@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, oldmems, NULL);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
}
/*
@@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
cgroup_lock();
switch (action) {
case MEM_ONLINE:
- *oldmems = top_cpuset.mems_allowed;
+ oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
break;
case MEM_OFFLINE:
/*
@@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
}
cgroup_unlock();
- NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
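
The recurring change in this file replaces NODEMASK_ALLOC() (a heap allocation that can fail at an inconvenient point) with a function-local static nodemask_t, trading allocation for serialization: every caller must hold cgroup_mutex. A sketch of the pattern, with hypothetical names:

    static void example_update(struct cpuset *cs)
    {
            static nodemask_t mask; /* protected by cgroup_mutex */

            /* struct assignment replaces alloc + copy and cannot fail */
            mask = cs->mems_allowed;
            /* using &mask is safe: cgroup_mutex serializes all callers */
    }
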
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 00000000000..5f85690285d
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 2343c132c5a..5557b55048d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
}
EXPORT_SYMBOL(set_create_files_as);
+struct user_namespace *current_user_ns(void)
+{
+ return _current_user_ns();
+}
+EXPORT_SYMBOL(current_user_ns);
+
#ifdef CONFIG_DEBUG_CREDENTIALS
bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe..a11db956dd6 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
put_packet(remcom_out_buffer);
return 0;
}
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+ unsigned char checksum, ch, buffer[3];
+ int loop;
+
+ buffer[0] = 'W';
+ buffer[1] = hex_asc_hi(status);
+ buffer[2] = hex_asc_lo(status);
+
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+
+ for (loop = 0; loop < 3; loop++) {
+ ch = buffer[loop];
+ checksum += ch;
+ dbg_io_ops->write_char(ch);
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+ /* make sure the output is flushed, lest the bootloader clobber it */
+ dbg_io_ops->flush();
+}
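
The bytes emitted follow the GDB remote serial protocol: a packet is $<payload>#<checksum>, where the checksum is the payload summed modulo 256 and sent as two lowercase hex digits. Worked example for an exit status of 0:

    payload:   "W00"                        ('W' = process-exited packet)
    checksum:  0x57 + 0x30 + 0x30 = 0xb7    (sum of payload bytes, mod 256)
    on wire:   $W00#b7
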
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b..6a488ad2dce 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl));
+ WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 05b92c45701..e7548dee636 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
+#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
}
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node) \
+ kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk) \
+ kmem_cache_free(task_struct_cachep, (tsk))
static struct kmem_cache *task_struct_cachep;
#endif
#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
gfp_t mask = GFP_KERNEL | __GFP_ZERO;
#else
gfp_t mask = GFP_KERNEL;
#endif
- return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+ return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
@@ -249,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
struct task_struct *tsk;
struct thread_info *ti;
unsigned long *stackend;
-
+ int node = tsk_fork_get_node(orig);
int err;
prepare_to_copy(orig);
- tsk = alloc_task_struct();
+ tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
- ti = alloc_thread_info(tsk);
+ ti = alloc_thread_info_node(tsk, node);
if (!ti) {
free_task_struct(tsk);
return NULL;
@@ -1181,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}
p->pid = pid_nr(pid);
@@ -1205,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+ p->plug = NULL;
+#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
@@ -1290,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
tracehook_finish_clone(p, clone_flags, trace);
if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
+ if (is_child_reaper(pid))
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
@@ -1513,38 +1516,24 @@ void __init proc_caches_init(void)
}
/*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
*/
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
{
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ return -EINVAL;
/*
- * If unsharing a thread from a thread group, must also
- * unshare vm.
- */
- if (*flags_ptr & CLONE_THREAD)
- *flags_ptr |= CLONE_VM;
-
- /*
- * If unsharing vm, must also unshare signal handlers.
- */
- if (*flags_ptr & CLONE_VM)
- *flags_ptr |= CLONE_SIGHAND;
-
- /*
- * If unsharing namespace, must also unshare filesystem information.
+ * Not implemented, but pretend it works if there is nothing to
+ * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+ * needs to unshare vm.
*/
- if (*flags_ptr & CLONE_NEWNS)
- *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
- if (unshare_flags & CLONE_THREAD)
- return -EINVAL;
+ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+ /* FIXME: get_task_mm() increments ->mm_users */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+ }
return 0;
}
@@ -1571,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
- struct sighand_struct *sigh = current->sighand;
-
- if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
- return -EINVAL;
- else
- return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
- struct mm_struct *mm = current->mm;
-
- if ((unshare_flags & CLONE_VM) &&
- (mm && atomic_read(&mm->mm_users) > 1)) {
- return -EINVAL;
- }
-
- return 0;
-}
-
-/*
* Unshare file descriptor table if it is being shared
*/
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1626,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
*/
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
- int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct sighand_struct *new_sigh = NULL;
- struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
+ int err;
- check_unshare_flags(&unshare_flags);
-
- /* Return -EINVAL for all unsupported flags */
- err = -EINVAL;
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ err = check_unshare_flags(unshare_flags);
+ if (err)
goto bad_unshare_out;
/*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (unshare_flags & CLONE_NEWNS)
+ unshare_flags |= CLONE_FS;
+ /*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable.
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- if ((err = unshare_thread(unshare_flags)))
- goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
- goto bad_unshare_cleanup_thread;
- if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_fs;
- if ((err = unshare_vm(unshare_flags, &new_mm)))
- goto bad_unshare_cleanup_sigh;
+ goto bad_unshare_out;
if ((err = unshare_fd(unshare_flags, &new_fd)))
- goto bad_unshare_cleanup_vm;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
new_fs)))
goto bad_unshare_cleanup_fd;
- if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1690,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_mm) {
- mm = current->mm;
- active_mm = current->active_mm;
- current->mm = new_mm;
- current->active_mm = new_mm;
- if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- atomic_dec(&mm->oom_disable_count);
- atomic_inc(&new_mm->oom_disable_count);
- }
- activate_mm(active_mm, new_mm);
- new_mm = mm;
- }
-
if (new_fd) {
fd = current->files;
current->files = new_fd;
@@ -1719,20 +1659,10 @@ bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
-bad_unshare_cleanup_vm:
- if (new_mm)
- mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
- if (new_sigh)
- if (atomic_dec_and_test(&new_sigh->count))
- kmem_cache_free(sighand_cachep, new_sigh);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);
-bad_unshare_cleanup_thread:
bad_unshare_out:
return err;
}
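
From userspace, unshare() is reached through the glibc wrapper; note that CLONE_NEWNS still implies CLONE_FS, now applied in the syscall body rather than in check_unshare_flags(). A minimal sketch with illustrative flags:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* CLONE_NEWNS implies CLONE_FS inside the kernel */
            if (unshare(CLONE_NEWUTS | CLONE_NEWNS) == -1)
                    perror("unshare");      /* unsupported mixes fail -EINVAL */
            return 0;
    }
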
diff --git a/kernel/futex.c b/kernel/futex.c
index 823aae3e2a9..dfb924ffe65 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
+ /* If victim is in a different user_ns, then uids are not
+ comparable, so we must have CAP_SYS_PTRACE */
+ if (cred->user->user_ns != pcred->user->user_ns) {
+ if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+ goto err_unlock;
+ goto ok;
+ }
+ /* If victim is in the same user_ns, then uids are comparable */
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
+ !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
goto err_unlock;
+ok:
head = p->robust_list;
rcu_read_unlock();
}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5..5f9e689dc8f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
+ /* If victim is in a different user_ns, then uids are not
+ comparable, so we must have CAP_SYS_PTRACE */
+ if (cred->user->user_ns != pcred->user->user_ns) {
+ if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+ goto err_unlock;
+ goto ok;
+ }
+ /* If victim is in the same user_ns, then uids are comparable */
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
+ !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
goto err_unlock;
+ok:
head = p->compat_robust_list;
rcu_read_unlock();
}
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf..1cc476d52dd 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!nsown_capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 59e879929b1..079f1d39a8b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
- return in_gate_area_no_task(addr);
+ return in_gate_area_no_mm(addr);
}
static inline int is_kernel(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
- return in_gate_area_no_task(addr);
+ return in_gate_area_no_mm(addr);
}
static int is_ksym_addr(unsigned long addr)
@@ -515,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%0*lx %c %s\t[%s]\n",
- (int)(2 * sizeof(void *)),
- iter->value, type, iter->name, iter->module_name);
+ seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ type, iter->name, iter->module_name);
} else
- seq_printf(m, "%0*lx %c %s\n",
- (int)(2 * sizeof(void *)),
- iter->value, iter->type, iter->name);
+ seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ iter->type, iter->name);
return 0;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a..684ab3f7dd7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
+ int node;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
do_exit(ret);
}
+/* called from do_fork() to get node information for the task about to be created */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+ if (tsk == kthreadd_task)
+ return tsk->pref_node_fork;
+#endif
+ return numa_node_id();
+}
+
static void create_kthread(struct kthread_create_info *create)
{
int pid;
+#ifdef CONFIG_NUMA
+ current->pref_node_fork = create->node;
+#endif
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
@@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create)
}
/**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
+ * @node: memory node number.
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
* it. See also kthread_run().
*
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
@@ -129,15 +146,17 @@ static void create_kthread(struct kthread_create_info *create)
*
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
-struct task_struct *kthread_create(int (*threadfn)(void *data),
- void *data,
- const char namefmt[],
- ...)
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+ void *data,
+ int node,
+ const char namefmt[],
+ ...)
{
struct kthread_create_info create;
create.threadfn = threadfn;
create.data = data;
+ create.node = node;
init_completion(&create.done);
spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
return create.result;
}
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
/**
* kthread_bind - bind a just-created kthread to a cpu.
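
A usage sketch for the renamed helper, assuming a hypothetical thread function my_thread_fn(); per the kerneldoc above, pass -1 as @node when the thread has no CPU affinity. This mirrors the converted call sites in kernel/softirq.c and kernel/stop_machine.c below:

    struct task_struct *p;

    p = kthread_create_on_node(my_thread_fn, data,
                               cpu_to_node(cpu),        /* or -1 */
                               "mythread/%d", cpu);
    if (!IS_ERR(p)) {
            kthread_bind(p, cpu);
            wake_up_process(p);     /* threads are created stopped */
    }
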
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94b..1f9f7bc56ca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
{
struct module_sect_attr *sattr =
container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%lx\n", sattr->address);
+ return sprintf(buf, "0x%pK\n", (void *)sattr->address);
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading":
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%p", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->module_core);
/* Taints info */
if (mod->taints)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26..a05d191ffdd 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_ns;
}
- new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ new_nsp->uts_ns = copy_utsname(flags, tsk);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
- new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a170..69231670eb9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0..e9c9adc84ca 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i;
+ int i, err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+ err = pid_ns_prepare_proc(ns);
+ if (err)
+ goto out_put_parent_pid_ns;
+
return ns;
+out_put_parent_pid_ns:
+ put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
out:
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df..d09dd10c5a5 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
static int submit(int rw, struct block_device *bdev, sector_t sector,
struct page *page, struct bio **bio_chain)
{
- const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
+ const int bio_rw = rw | REQ_SYNC;
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/printk.c b/kernel/printk.c
index 33284adb218..da8ca817eae 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
/* We show everything that is MORE important than this.. */
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
* Array of consoles built from command line options (console=)
*/
struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
struct console *con;
for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
if ((con->flags & CON_ENABLED) && con->write &&
(cpu_online(smp_processor_id()) ||
(con->flags & CON_ANYTIME)))
@@ -1230,6 +1237,11 @@ void console_unlock(void)
local_irq_restore(flags);
}
console_locked = 0;
+
+ /* Release the exclusive_console once it is used */
+ if (unlikely(exclusive_console))
+ exclusive_console = NULL;
+
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
@@ -1316,6 +1328,18 @@ void console_start(struct console *console)
}
EXPORT_SYMBOL(console_start);
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+ keep_bootcon = 1;
+ printk(KERN_INFO "debug: skip boot console de-registration.\n");
+
+ return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -1452,6 +1476,12 @@ void register_console(struct console *newcon)
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
+ /*
+ * We're about to replay the log buffer. Only do this to the
+ * just-registered console to avoid excessive message spam to
+ * the already-registered consoles.
+ */
+ exclusive_console = newcon;
}
console_unlock();
console_sysfs_notify();
@@ -1463,7 +1493,9 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+ if (bcon &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+ !keep_bootcon) {
/* we need to iterate through twice, to make sure we print
* everything out, before we unregister the console(s)
*/
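
keep_bootcon is a debugging aid for consoles that die during the handover from the boot console to the real one: with it, the boot console stays registered alongside the newly registered console. An illustrative command line (devices are examples only):

    console=ttyS0,115200 earlyprintk=serial keep_bootcon
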
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b36..0fc1eed28d2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return 0;
rcu_read_lock();
tcred = __task_cred(task);
- if ((cred->uid != tcred->euid ||
- cred->uid != tcred->suid ||
- cred->uid != tcred->uid ||
- cred->gid != tcred->egid ||
- cred->gid != tcred->sgid ||
- cred->gid != tcred->gid) &&
- !capable(CAP_SYS_PTRACE)) {
- rcu_read_unlock();
- return -EPERM;
- }
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ goto ok;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ goto ok;
+ rcu_read_unlock();
+ return -EPERM;
+ok:
rcu_read_unlock();
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !capable(CAP_SYS_PTRACE))
+ if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
return -EPERM;
return security_ptrace_access_check(task, mode);
@@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task)
goto unlock_tasklist;
task->ptrace = PT_PTRACED;
- if (capable(CAP_SYS_PTRACE))
+ if (task_ns_capable(task, CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768..34683efa2cc 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
pos, buf, s - buf);
}
+#if BITS_PER_LONG == 32
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ ret = *res_counter_member(counter, member);
+ spin_unlock_irqrestore(&counter->lock, flags);
+
+ return ret;
+}
+#else
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
return *res_counter_member(counter, member);
}
+#endif
int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
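
The 32-bit variant exists because a 64-bit load takes two instructions there, so an unlocked reader can pair halves from two different writes; the spinlock makes the read atomic against concurrent charge/uncharge. A worked illustration, with values chosen so both halves change:

    /*
     * writer:  0x00000000ffffffff -> 0x0000000100000000
     * torn unlocked read could observe:
     *   new low + old high = 0x0000000000000000
     *   old low + new high = 0x00000001ffffffff
     */
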
diff --git a/kernel/sched.c b/kernel/sched.c
index a172494a9a6..ae659b99ce7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4115,6 +4115,16 @@ need_resched:
switch_count = &prev->nvcsw;
}
+ /*
+ * If we are going to sleep and we have plugged IO queued, make
+ * sure to submit it to avoid deadlocks.
+ */
+ if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+ raw_spin_unlock(&rq->lock);
+ blk_flush_plug(prev);
+ raw_spin_lock(&rq->lock);
+ }
+
pre_schedule(rq, prev);
if (unlikely(!rq->nr_running))
@@ -4892,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
+ if (cred->user->user_ns == pcred->user->user_ns)
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ else
+ match = false;
rcu_read_unlock();
return match;
}
@@ -5221,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
@@ -5525,6 +5538,7 @@ void __sched io_schedule(void)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
@@ -5540,6 +5554,7 @@ long __sched io_schedule_timeout(long timeout)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdc..324eff5468a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,33 @@ static inline bool si_fromuser(const struct siginfo *info)
}
/*
+ * called with RCU read lock from check_kill_permission()
+ */
+static int kill_ok_by_cred(struct task_struct *t)
+{
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = __task_cred(t);
+
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->euid == tcred->suid ||
+ cred->euid == tcred->uid ||
+ cred->uid == tcred->suid ||
+ cred->uid == tcred->uid))
+ return 1;
+
+ if (ns_capable(tcred->user->user_ns, CAP_KILL))
+ return 1;
+
+ return 0;
+}
+
+/*
* Bad permissions for sending the signal
* - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
{
- const struct cred *cred, *tcred;
struct pid *sid;
int error;
@@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
if (error)
return error;
- cred = current_cred();
- tcred = __task_cred(t);
if (!same_thread_group(current, t) &&
- (cred->euid ^ tcred->suid) &&
- (cred->euid ^ tcred->uid) &&
- (cred->uid ^ tcred->suid) &&
- (cred->uid ^ tcred->uid) &&
- !capable(CAP_KILL)) {
+ !kill_ok_by_cred(t)) {
switch (sig) {
case SIGCONT:
sid = task_session(t);
@@ -2421,9 +2435,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return -EFAULT;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info.si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info.si_code != SI_QUEUE) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info.si_code < 0);
return -EPERM;
+ }
info.si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2455,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info->si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info->si_code != SI_QUEUE) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
+ }
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
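
After this change a forged si_code is rejected; a legitimate caller uses SI_QUEUE, which is what sigqueue() generates. A minimal userspace sketch via the raw syscall (glibc provides no wrapper; pid and payload here are illustrative):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long send_info(pid_t pid)
    {
            siginfo_t info = { 0 };

            info.si_code = SI_QUEUE;        /* any other value: -EPERM */
            info.si_pid = getpid();
            info.si_uid = getuid();
            info.si_value.sival_int = 42;
            return syscall(SYS_rt_sigqueueinfo, pid, SIGUSR1, &info);
    }
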
diff --git a/kernel/smp.c b/kernel/smp.c
index 7cbd0f293df..73a19519355 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void)
}
#endif /* USE_GENERIC_SMP_HELPERS */
+/* Setup configured maximum number of CPUs to activate */
+unsigned int setup_max_cpus = NR_CPUS;
+EXPORT_SYMBOL(setup_max_cpus);
+
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+
+void __weak arch_disable_smp_support(void) { }
+
+static int __init nosmp(char *str)
+{
+ setup_max_cpus = 0;
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("nosmp", nosmp);
+
+/* this is a hard limit */
+static int __init nrcpus(char *str)
+{
+ int nr_cpus;
+
+ get_option(&str, &nr_cpus);
+ if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ nr_cpu_ids = nr_cpus;
+
+ return 0;
+}
+
+early_param("nr_cpus", nrcpus);
+
+static int __init maxcpus(char *str)
+{
+ get_option(&str, &setup_max_cpus);
+ if (setup_max_cpus == 0)
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("maxcpus", maxcpus);
+
+/* Setup number of possible processor ids */
+int nr_cpu_ids __read_mostly = NR_CPUS;
+EXPORT_SYMBOL(nr_cpu_ids);
+
+/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
+void __init setup_nr_cpu_ids(void)
+{
+ nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+}
+
+/* Called by boot processor to activate the rest. */
+void __init smp_init(void)
+{
+ unsigned int cpu;
+
+ /* FIXME: This should be done in userspace --RR */
+ for_each_present_cpu(cpu) {
+ if (num_online_cpus() >= setup_max_cpus)
+ break;
+ if (!cpu_online(cpu))
+ cpu_up(cpu);
+ }
+
+ /* Any cleanup work */
+ printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+ smp_cpus_done(setup_max_cpus);
+}
+
/*
* Call a function on all processors. May be used during early boot while
* early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
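
The three parameters consolidated into this file differ in scope; illustrative values:

    nosmp         boot with one CPU and disable SMP support entirely
    maxcpus=4     activate at most 4 CPUs at boot (more can be onlined later)
    nr_cpus=4     hard-cap nr_cpu_ids; cannot be raised after boot
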
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 56e5dec837f..735d8709517 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+ p = kthread_create_on_node(run_ksoftirqd,
+ hcpu,
+ cpu_to_node(hotcpu),
+ "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03be..e3516b29076 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
BUG_ON(stopper->thread || stopper->enabled ||
!list_empty(&stopper->works));
- p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
- cpu);
+ p = kthread_create_on_node(cpu_stopper_thread,
+ stopper,
+ cpu_to_node(cpu),
+ "migration/%d", cpu);
if (IS_ERR(p))
return notifier_from_errno(PTR_ERR(p));
get_task_struct(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1ad48b3b906..af468edf096 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -120,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
void (*pm_power_off_prepare)(void);
/*
+ * Returns true if current's euid is same as p's uid or euid,
+ * or has CAP_SYS_NICE to p's user_ns.
+ *
+ * Called with rcu_read_lock, creds are safe
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred = __task_cred(p);
+
+ if (pcred->user->user_ns == cred->user->user_ns &&
+ (pcred->uid == cred->euid ||
+ pcred->euid == cred->euid))
+ return true;
+ if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+ return true;
+ return false;
+}
+
+/*
* set the priority of a task
* - the caller must hold the RCU read lock
*/
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
- const struct cred *cred = current_cred(), *pcred = __task_cred(p);
int no_nice;
- if (pcred->uid != cred->euid &&
- pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+ if (!set_one_prio_perm(p)) {
error = -EPERM;
goto out;
}
@@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
if (rgid != (gid_t) -1) {
if (old->gid == rgid ||
old->egid == rgid ||
- capable(CAP_SETGID))
+ nsown_capable(CAP_SETGID))
new->gid = rgid;
else
goto error;
@@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
if (old->gid == egid ||
old->egid == egid ||
old->sgid == egid ||
- capable(CAP_SETGID))
+ nsown_capable(CAP_SETGID))
new->egid = egid;
else
goto error;
@@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
old = current_cred();
retval = -EPERM;
- if (capable(CAP_SETGID))
+ if (nsown_capable(CAP_SETGID))
new->gid = new->egid = new->sgid = new->fsgid = gid;
else if (gid == old->gid || gid == old->sgid)
new->egid = new->fsgid = gid;
@@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
new->uid = ruid;
if (old->uid != ruid &&
old->euid != ruid &&
- !capable(CAP_SETUID))
+ !nsown_capable(CAP_SETUID))
goto error;
}
@@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
if (old->uid != euid &&
old->euid != euid &&
old->suid != euid &&
- !capable(CAP_SETUID))
+ !nsown_capable(CAP_SETUID))
goto error;
}
@@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
old = current_cred();
retval = -EPERM;
- if (capable(CAP_SETUID)) {
+ if (nsown_capable(CAP_SETUID)) {
new->suid = new->uid = uid;
if (uid != old->uid) {
retval = set_user(new);
@@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
old = current_cred();
retval = -EPERM;
- if (!capable(CAP_SETUID)) {
+ if (!nsown_capable(CAP_SETUID)) {
if (ruid != (uid_t) -1 && ruid != old->uid &&
ruid != old->euid && ruid != old->suid)
goto error;
@@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
old = current_cred();
retval = -EPERM;
- if (!capable(CAP_SETGID)) {
+ if (!nsown_capable(CAP_SETGID)) {
if (rgid != (gid_t) -1 && rgid != old->gid &&
rgid != old->egid && rgid != old->sgid)
goto error;
@@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
if (uid == old->uid || uid == old->euid ||
uid == old->suid || uid == old->fsuid ||
- capable(CAP_SETUID)) {
+ nsown_capable(CAP_SETUID)) {
if (uid != old_fsuid) {
new->fsuid = uid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
if (gid == old->gid || gid == old->egid ||
gid == old->sgid || gid == old->fsgid ||
- capable(CAP_SETGID)) {
+ nsown_capable(CAP_SETGID)) {
if (gid != old_fsgid) {
new->fsgid = gid;
goto change_okay;
@@ -1181,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
+
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
down_write(&uts_sem);
@@ -1230,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
@@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader);
if (new_rlim) {
+ /* Keep the capable check against init_user_ns until
+ cgroups can contain all limits */
if (new_rlim->rlim_max > rlim->rlim_max &&
!capable(CAP_SYS_RESOURCE))
retval = -EPERM;
@@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
{
const struct cred *cred = current_cred(), *tcred;
- tcred = __task_cred(task);
- if (current != task &&
- (cred->uid != tcred->euid ||
- cred->uid != tcred->suid ||
- cred->uid != tcred->uid ||
- cred->gid != tcred->egid ||
- cred->gid != tcred->sgid ||
- cred->gid != tcred->gid) &&
- !capable(CAP_SYS_RESOURCE)) {
- return -EPERM;
- }
+ if (current == task)
+ return 0;
- return 0;
+ tcred = __task_cred(task);
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ return 0;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+ return 0;
+
+ return -EPERM;
}
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 40245d69760..c0bb32414b1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -706,7 +712,7 @@ static struct ctl_table kern_table[] = {
.data = &kptr_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dmesg_restrict,
.extra1 = &zero,
.extra2 = &two,
},
@@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "panic_on_oom",
.data = &sysctl_panic_on_oom,
.maxlen = sizeof(sysctl_panic_on_oom),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "oom_kill_allocating_task",
@@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = {
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "dirty_background_ratio",
@@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = {
.data = &dirty_expire_interval,
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "nr_pdflush_threads",
@@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &three,
},
#ifdef CONFIG_COMPACTION
{
@@ -2385,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
struct do_proc_dointvec_minmax_conv_param {
int *min;
int *max;
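
With the handler swap above, kptr_restrict keeps its 0-2 range but writes now additionally require CAP_SYS_ADMIN. Illustrative runtime usage:

    # echo 1 > /proc/sys/kernel/kptr_restrict     (write needs CAP_SYS_ADMIN)
    # cat /proc/modules                           (%pK addresses read back as
                                                   zeros for unprivileged users)
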
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c..4e4932a7b36 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
const char *fail = NULL;
if (table->parent) {
- if (table->procname && !table->parent->procname)
+ if (!table->parent->procname)
set_fail(&fail, table, "Parent without procname");
}
- if (!table->procname)
- set_fail(&fail, table, "No procname");
if (table->child) {
if (table->data)
set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
set_fail(&fail, table, "No maxlen");
}
#ifdef CONFIG_PROC_SYSCTL
- if (table->procname && !table->proc_handler)
+ if (!table->proc_handler)
set_fail(&fail, table, "No proc_handler");
#endif
-#if 0
- if (!table->procname && table->proc_handler)
- set_fail(&fail, table, "proc_handler without procname");
-#endif
sysctl_check_leaf(namespaces, table, &fail);
}
if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58..9ffea360a77 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
goto err_cgroup_ops;
family_registered = 1;
- printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
+ pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
return 0;
err_cgroup_ops:
genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f3..7aa40f8e182 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
*
**/
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
+ u32 what)
{
struct blk_trace *bt = q->blk_trace;
- int rw = rq->cmd_flags & 0x03;
if (likely(!bt))
return;
- if (rq->cmd_flags & REQ_DISCARD)
- rw |= REQ_DISCARD;
-
- if (rq->cmd_flags & REQ_SECURE)
- rw |= REQ_SECURE;
-
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
- what, rq->errors, 0, NULL);
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d8..51c6e89e861 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!nsown_capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781d..9e03e9c1df8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
#include <linux/module.h>
#include <linux/user_namespace.h>
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
struct user_namespace init_user_ns = {
.kref = {
- .refcount = ATOMIC_INIT(2),
+ .refcount = ATOMIC_INIT(3),
},
.creator = &root_user,
};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
*/
static DEFINE_SPINLOCK(uidhash_lock);
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
struct user_struct root_user = {
.__count = ATOMIC_INIT(2),
.processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea5..44646179eab 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
#include <linux/utsname.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/user_namespace.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
* @old_ns: namespace to clone
* Return NULL on error (failure to kmalloc), new ns otherwise
*/
-static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+ struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
up_read(&uts_sem);
return ns;
}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
+struct uts_namespace *copy_utsname(unsigned long flags,
+ struct task_struct *tsk)
{
+ struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
if (!(flags & CLONE_NEWUTS))
return old_ns;
- new_ns = clone_uts_ns(old_ns);
+ new_ns = clone_uts_ns(tsk, old_ns);
put_uts_ns(old_ns);
return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
struct uts_namespace *ns;
ns = container_of(kref, struct uts_namespace, kref);
+ put_user_ns(ns->user_ns);
kfree(ns);
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c5..140dce75045 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic;
+static int hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static int __init hardlockup_panic_setup(char *str)
{
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
watchdog_enabled = 0;
return 1;
@@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu)
static int watchdog_enable(int cpu)
{
struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err;
+ int err = 0;
/* enable the perf event */
err = watchdog_nmi_enable(cpu);
- if (err)
- return err;
+
+ /* Regardless of err above, fall through and start softlockup */
/* create the watchdog thread */
if (!p) {
p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
if (IS_ERR(p)) {
printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
- return PTR_ERR(p);
+ if (!err)
+ /* if hardlockup hasn't already set this */
+ err = PTR_ERR(p);
+ goto out;
}
kthread_bind(p, cpu);
per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +441,8 @@ static int watchdog_enable(int cpu)
wake_up_process(p);
}
- return 0;
+out:
+ return err;
}
static void watchdog_disable(int cpu)
@@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
- return notifier_from_errno(err);
+
+ /*
+ * hardlockup and softlockup are not important enough
+ * to block cpu bring up. Just always succeed and
+ * rely on printk output to flag problems.
+ */
+ return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5ca7ce9ce75..04ef830690e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
worker->id = id;
if (!on_unbound_cpu)
- worker->task = kthread_create(worker_thread, worker,
- "kworker/%u:%d", gcwq->cpu, id);
+ worker->task = kthread_create_on_node(worker_thread,
+ worker,
+ cpu_to_node(gcwq->cpu),
+ "kworker/%u:%d", gcwq->cpu, id);
else
worker->task = kthread_create(worker_thread, worker,
"kworker/u:%d", id);