diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 5 | ||||
-rw-r--r-- | kernel/cgroup.c | 20 | ||||
-rw-r--r-- | kernel/cpu.c | 44 | ||||
-rw-r--r-- | kernel/cpu_pm.c | 16 | ||||
-rw-r--r-- | kernel/cred.c | 9 | ||||
-rw-r--r-- | kernel/exit.c | 13 | ||||
-rw-r--r-- | kernel/fork.c | 28 | ||||
-rw-r--r-- | kernel/irq/manage.c | 80 | ||||
-rw-r--r-- | kernel/kallsyms.c | 32 | ||||
-rw-r--r-- | kernel/kcmp.c | 196 | ||||
-rw-r--r-- | kernel/kmod.c | 30 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 13 | ||||
-rw-r--r-- | kernel/res_counter.c | 10 | ||||
-rw-r--r-- | kernel/resource.c | 4 | ||||
-rw-r--r-- | kernel/signal.c | 11 | ||||
-rw-r--r-- | kernel/sys.c | 213 | ||||
-rw-r--r-- | kernel/sys_ni.c | 3 | ||||
-rw-r--r-- | kernel/task_work.c | 84 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 5 |
19 files changed, 636 insertions, 180 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 296132c19a57..c0cc67ad764c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o \ + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0c6af34d500..0f3527d6184a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5132,7 +5132,7 @@ EXPORT_SYMBOL_GPL(css_depth); * @root: the css supporsed to be an ancestor of the child. * * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * this function reads css->id, the caller must hold rcu_read_lock(). * But, considering usual usage, the csses should be valid objects after test. * Assuming that the caller will do some action to the child if this returns * returns true, the caller must take "child";s reference count. 
@@ -5144,18 +5144,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, { struct css_id *child_id; struct css_id *root_id; - bool ret = true; - rcu_read_lock(); child_id = rcu_dereference(child->id); + if (!child_id) + return false; root_id = rcu_dereference(root->id); - if (!child_id - || !root_id - || (child_id->depth < root_id->depth) - || (child_id->stack[root_id->depth] != root_id->id)) - ret = false; - rcu_read_unlock(); - return ret; + if (!root_id) + return false; + if (child_id->depth < root_id->depth) + return false; + if (child_id->stack[root_id->depth] != root_id->id) + return false; + return true; } void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e6353cf147a..a4eb5227a19e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,7 +10,10 @@ #include <linux/sched.h> #include <linux/unistd.h> #include <linux/cpu.h> +#include <linux/oom.h> +#include <linux/rcupdate.h> #include <linux/export.h> +#include <linux/bug.h> #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> @@ -173,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +/** + * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU + * @cpu: a CPU id + * + * This function walks all processes, finds a valid mm struct for each one and + * then clears a corresponding bit in mm's cpumask. While this all sounds + * trivial, there are various non-obvious corner cases, which this function + * tries to solve in a safe manner. + * + * Also note that the function uses a somewhat relaxed locking scheme, so it may + * be called only for an already offlined CPU. + */ +void clear_tasks_mm_cpumask(int cpu) +{ + struct task_struct *p; + + /* + * This function is called after the cpu is taken down and marked + * offline, so its not like new tasks will ever get this cpu set in + * their mm mask. 
-- Peter Zijlstra + * Thus, we may use rcu_read_lock() here, instead of grabbing + * full-fledged tasklist_lock. + */ + WARN_ON(cpu_online(cpu)); + rcu_read_lock(); + for_each_process(p) { + struct task_struct *t; + + /* + * Main thread might exit, but other threads may still have + * a valid mm. Find one. + */ + t = find_lock_task_mm(p); + if (!t) + continue; + cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); + task_unlock(t); + } + rcu_read_unlock(); +} + static inline void check_for_tasks(int cpu) { struct task_struct *p; diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); /** - * cpm_pm_enter - CPU low power entry notifier + * cpu_pm_enter - CPU low power entry notifier * * Notifies listeners that a single CPU is entering a low power state that may * cause some blocks in the same power domain as the cpu to reset. @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); * Must be called on the affected CPU with interrupts disabled. Platform is * responsible for ensuring that cpu_pm_enter is not called twice on the same * CPU before cpu_pm_exit is called. Notified drivers can include VFP - * co-processor, interrupt controller and it's PM extensions, local CPU + * co-processor, interrupt controller and its PM extensions, local CPU * timers context save/restore which shouldn't be interrupted. Hence it * must be called with interrupts disabled. * @@ -115,13 +115,13 @@ int cpu_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_pm_enter); /** - * cpm_pm_exit - CPU low power exit notifier + * cpu_pm_exit - CPU low power exit notifier * * Notifies listeners that a single CPU is exiting a low power state that may * have caused some blocks in the same power domain as the cpu to reset. 
* * Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. @@ -139,7 +139,7 @@ int cpu_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_pm_exit); /** - * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * cpu_cluster_pm_enter - CPU cluster low power entry notifier * * Notifies listeners that all cpus in a power domain are entering a low power * state that may cause some blocks in the same power domain to reset. @@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); * Must be called after cpu_pm_enter has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Must be called with interrupts disabled. @@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); /** - * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * cpu_cluster_pm_exit - CPU cluster low power exit notifier * * Notifies listeners that all cpus in a power domain are exiting form a * low power state that may have caused some blocks in the same power domain @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * Must be called after cpu_pm_exit has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. 
Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. diff --git a/kernel/cred.c b/kernel/cred.c index 430557ea488f..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -207,13 +207,6 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); - - cred = (struct cred *) tsk->replacement_session_keyring; - if (cred) { - tsk->replacement_session_keyring = NULL; - validate_creds(cred); - put_cred(cred); - } } /** @@ -396,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - p->replacement_session_keyring = NULL; - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17a..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -884,9 +884,9 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -946,12 +946,13 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * an exiting task cleaning up the robust pi futexes, and in + * task_work_add() to avoid the race with exit_task_work(). 
*/ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_irq_thread(); + exit_task_work(tsk); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -1214,7 +1215,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); + uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) diff --git a/kernel/fork.c b/kernel/fork.c index 47b4e4f379f9..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -386,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len; + len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; @@ -614,7 +615,6 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } - put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); @@ -787,9 +787,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - if (tsk->vfork_done) - complete_vfork_done(tsk); - /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave @@ -810,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } tsk->clear_child_tid = NULL; } + + /* + * All done, finally we can wake up parent and return this mm to him. + * Also kthread_stop() uses this completion for synchronization. 
+ */ + if (tsk->vfork_done) + complete_vfork_done(tsk); } /* @@ -831,10 +835,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif @@ -913,10 +913,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) goto fail_nomem; good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - tsk->mm = mm; tsk->active_mm = mm; return 0; @@ -984,9 +980,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; + ioc_task_link(ioc); + tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) @@ -1420,6 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..ea0c6c2ae6f7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -7,6 +7,8 @@ * This file contains driver APIs to the irq subsystem. */ +#define pr_fmt(fmt) "genirq: " fmt + #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> @@ -14,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/task_work.h> #include "internals.h" @@ -565,7 +568,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? 
*/ - pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", irq, chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -600,7 +603,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, ret = 0; break; default: - pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } if (unmask) @@ -773,11 +776,39 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } +static void irq_thread_dtor(struct task_work *unused) +{ + struct task_struct *tsk = current; + struct irq_desc *desc; + struct irqaction *action; + + if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) + return; + + action = kthread_data(tsk); + + pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + + + desc = irq_to_desc(action->irq); + /* + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. + */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); + + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action); +} + /* * Interrupt handler thread */ static int irq_thread(void *data) { + struct task_work on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -793,7 +824,9 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irq_thread = 1; + + init_task_work(&on_exit_work, irq_thread_dtor, NULL); + task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; @@ -815,44 +848,11 @@ static int irq_thread(void *data) * cannot touch the oneshot mask at this point anymore as * __setup_irq() might have given out currents thread_mask * again. 
- * - * Clear irq_thread. Otherwise exit_irq_thread() would make - * fuzz about an active irq thread going into nirvana. */ - current->irq_thread = 0; + task_work_cancel(current, irq_thread_dtor); return 0; } -/* - * Called from do_exit() - */ -void exit_irq_thread(void) -{ - struct task_struct *tsk = current; - struct irq_desc *desc; - struct irqaction *action; - - if (!tsk->irq_thread) - return; - - action = kthread_data(tsk); - - pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - - desc = irq_to_desc(action->irq); - - /* - * If IRQTF_RUNTHREAD is set, we need to decrement - * desc->threads_active and wake possible waiters. - */ - if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - wake_threads_waitq(desc); - - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action); -} - static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) @@ -1044,7 +1044,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * has. The type flags are unreliable as the * underlying chip implementation can override them. */ - pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", + pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", irq); ret = -EINVAL; goto out_mask; @@ -1095,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", + pr_warning("irq %d uses trigger mode %u; requested %u\n", irq, nmsk, omsk); } @@ -1133,7 +1133,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { - pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", + pr_err("Flags mismatch irq %d. %08x (%s) vs. 
%08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 079f1d39a8b8..2169feeba529 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, - int symbol_offset) + int symbol_offset, int add_offset) { char *modname; const char *name; @@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (name != buffer) strcpy(buffer, name); len = strlen(buffer); - buffer += len; offset -= symbol_offset; + if (add_offset) + len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); + if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); - else - len += sprintf(buffer, "+%#lx/%#lx", offset, size); + len += sprintf(buffer + len, " [%s]", modname); return len; } @@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, */ int sprint_symbol(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, 0); + return __sprint_symbol(buffer, address, 0, 1); } - EXPORT_SYMBOL_GPL(sprint_symbol); /** + * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name + * and module name to @buffer if possible. If no symbol was found, just saves + * its @address as is. + * + * This function returns the number of bytes stored in @buffer. 
+ */ +int sprint_symbol_no_offset(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0, 0); +} +EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); + +/** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup @@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); */ int sprint_backtrace(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, -1); + return __sprint_symbol(buffer, address, -1, 1); } /* Look up a kernel symbol and print it to the kernel messages. */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/fdtable.h> +#include <linux/string.h> +#include <linux/random.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/cache.h> +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/kcmp.h> + +#include <asm/unistd.h> + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointer values and compare the products instead. + * + * The obfuscation is done in two steps. First we xor the kernel pointer with + * a random value, which puts pointer into a new position in a reordered space. + * Secondly we multiply the xor product with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique even after the high bits are truncated, since any odd number is + * relatively prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme.
+ */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only. + */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. 
+ */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -221,13 +221,12 @@ fail: return 0; } -void call_usermodehelper_freeinfo(struct subprocess_info *info) +static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } 
-EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { @@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * depth: New value to assign to usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. @@ -479,6 +478,7 @@ static void helper_unlock(void) * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. */ +static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask) { @@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, out: return sub_info; } -EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_setfns - set a cleanup/init function @@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ +static void call_usermodehelper_setfns(struct subprocess_info *info, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, info->init = init; info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). 
*/ +static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -576,7 +576,25 @@ unlock: helper_unlock(); return retval; } -EXPORT_SYMBOL(call_usermodehelper_exec); + +int call_usermodehelper_fns( + char *path, char **argv, char **envp, int wait, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask); + + if (info == NULL) + return -ENOMEM; + + call_usermodehelper_setfns(info, init, cleanup, data); + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper_fns); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..16b20e38c4a1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; - struct task_struct *task; + struct task_struct *task, *me = current; + + /* Ignore SIGCHLD causing any terminated children to autoreap */ + spin_lock_irq(&me->sighand->siglock); + me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; + spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. 
@@ -191,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -218,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = { }, { } }; - static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +#endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { @@ -253,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + +#ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); +#endif return 0; } diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bebe2b170d49..ad581aa2369a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -94,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) counter->usage -= val; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +void res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { + for (c = counter; c != top; c = c->parent) { spin_lock(&c->lock); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); @@ -108,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) local_irq_restore(flags); } +void res_counter_uncharge(struct res_counter *counter, unsigned long val) +{ + res_counter_uncharge_until(counter, NULL, val); +} static inline unsigned long long * res_counter_member(struct res_counter *counter, int member) diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -515,8 +515,8 @@ out: * 
@root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate + * @min: minimum boundary to allocate + * @max: maximum boundary to allocate * @align: alignment requested, in bytes * @alignf: alignment function, optional, called if not NULL * @alignf_data: arbitrary data to pass to the @alignf function diff --git a/kernel/signal.c b/kernel/signal.c index f7b418217633..08dfbd748cd2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1656,19 +1656,18 @@ bool do_notify_parent(struct task_struct *tsk, int sig) info.si_signo = sig; info.si_errno = 0; /* - * we are under tasklist_lock here so our parent is tied to - * us and cannot exit and release its namespace. + * We are under tasklist_lock here so our parent is tied to + * us and cannot change. * - * the only it can is to switch its nsproxy with sys_unshare, - * bu uncharing pid namespaces is not allowed, so we'll always - * see relevant namespace + * task_active_pid_ns will always return the same pid namespace + * until a task passes through release_task. 
* * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/kernel/sys.c b/kernel/sys.c index 6df42624e454..9ff89cb9657a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,8 @@ #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> +#include <linux/file.h> +#include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> @@ -1378,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; + uts_proc_notify(UTS_PROC_HOSTNAME); } - uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1429,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; + uts_proc_notify(UTS_PROC_DOMAINNAME); } - uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } @@ -1784,77 +1786,102 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE +static bool vma_flags_mismatch(struct vm_area_struct *vma, + unsigned long required, + unsigned long banned) +{ + return (vma->vm_flags & required) != required || + (vma->vm_flags & banned); +} + +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct file *exe_file; + struct dentry *dentry; + int err; + + /* + * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's + * remain. So perform a quick test first. 
+ */ + if (mm->num_exe_file_vmas) + return -EBUSY; + + exe_file = fget(fd); + if (!exe_file) + return -EBADF; + + dentry = exe_file->f_path.dentry; + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(dentry->d_inode->i_mode) || + exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = inode_permission(dentry->d_inode, MAY_EXEC); + if (err) + goto exit; + + /* + * The symlink can be changed only once, just to disallow arbitrary + * transitions malicious software might bring in. This means one + * could make a snapshot over all processes running and monitor + * /proc/pid/exe changes to notice unusual activity if needed. + */ + down_write(&mm->mmap_sem); + if (likely(!mm->exe_file)) + set_mm_exe_file(mm, exe_file); + else + err = -EBUSY; + up_write(&mm->mmap_sem); + +exit: + fput(exe_file); + return err; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { unsigned long rlim = rlimit(RLIMIT_DATA); - unsigned long vm_req_flags; - unsigned long vm_bad_flags; - struct vm_area_struct *vma; - int error = 0; struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int error; - if (arg4 | arg5) + if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) return -EINVAL; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (addr >= TASK_SIZE) return -EINVAL; + error = -EINVAL; + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); - if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { - /* It must be existing VMA */ - if (!vma || vma->vm_start > addr) - goto out; - } - - error = -EINVAL; switch (opt) { case PR_SET_MM_START_CODE: + mm->start_code = addr; + break; case PR_SET_MM_END_CODE: - vm_req_flags = VM_READ | VM_EXEC; - vm_bad_flags = VM_WRITE | VM_MAYSHARE; - - if 
((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_CODE) - mm->start_code = addr; - else - mm->end_code = addr; + mm->end_code = addr; break; - case PR_SET_MM_START_DATA: - case PR_SET_MM_END_DATA: - vm_req_flags = VM_READ | VM_WRITE; - vm_bad_flags = VM_EXEC | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_DATA) - mm->start_data = addr; - else - mm->end_data = addr; + mm->start_data = addr; break; - - case PR_SET_MM_START_STACK: - -#ifdef CONFIG_STACK_GROWSUP - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; -#else - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; -#endif - if ((vma->vm_flags & vm_req_flags) != vm_req_flags) - goto out; - - mm->start_stack = addr; + case PR_SET_MM_END_DATA: + mm->end_data = addr; break; case PR_SET_MM_START_BRK: @@ -1881,16 +1908,77 @@ static int prctl_set_mm(int opt, unsigned long addr, mm->brk = addr; break; + /* + * If command line arguments and environment + * are placed somewhere else on stack, we can + * set them up here, ARG_START/END to setup + * command line argumets and ENV_START/END + * for environment. 
+ */ + case PR_SET_MM_START_STACK: + case PR_SET_MM_ARG_START: + case PR_SET_MM_ARG_END: + case PR_SET_MM_ENV_START: + case PR_SET_MM_ENV_END: + if (!vma) { + error = -EFAULT; + goto out; + } +#ifdef CONFIG_STACK_GROWSUP + if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) +#else + if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) +#endif + goto out; + if (opt == PR_SET_MM_START_STACK) + mm->start_stack = addr; + else if (opt == PR_SET_MM_ARG_START) + mm->arg_start = addr; + else if (opt == PR_SET_MM_ARG_END) + mm->arg_end = addr; + else if (opt == PR_SET_MM_ENV_START) + mm->env_start = addr; + else if (opt == PR_SET_MM_ENV_END) + mm->env_end = addr; + break; + + /* + * This doesn't move auxiliary vector itself + * since it's pinned to mm_struct, but allow + * to fill vector with new values. It's up + * to a caller to provide sane values here + * otherwise user space tools which use this + * vector might be unhappy. + */ + case PR_SET_MM_AUXV: { + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (arg4 > sizeof(user_auxv)) + goto out; + up_read(&mm->mmap_sem); + + if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, arg4); + task_unlock(current); + + return 0; + } default: - error = -EINVAL; goto out; } error = 0; - out: up_read(&mm->mmap_sem); - return error; } #else /* CONFIG_CHECKPOINT_RESTORE */ @@ -2114,7 +2202,6 @@ int orderly_poweroff(bool force) NULL }; int ret = -ENOMEM; - struct subprocess_info *info; if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", @@ -2122,18 +2209,16 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - 
argv_free(argv); - goto out; - } - - call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + NULL, argv_cleanup, NULL); +out: + if (likely(!ret)) + return 0; - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + if (ret == -ENOMEM) + argv_free(argv); - out: - if (ret && force) { + if (force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c @@ -0,0 +1,84 @@ +#include <linux/spinlock.h> +#include <linux/task_work.h> +#include <linux/tracehook.h> + +int +task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +{ + unsigned long flags; + int err = -ESRCH; + +#ifndef TIF_NOTIFY_RESUME + if (notify) + return -ENOTSUPP; +#endif + /* + * We must not insert the new work if the task has already passed + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() + * and check PF_EXITING under pi_lock. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (likely(!(task->flags & PF_EXITING))) { + hlist_add_head(&twork->hlist, &task->task_works); + err = 0; + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). 
*/ + if (likely(!err) && notify) + set_notify_resume(task); + return err; +} + +struct task_work * +task_work_cancel(struct task_struct *task, task_work_func_t func) +{ + unsigned long flags; + struct task_work *twork; + struct hlist_node *pos; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { + if (twork->func == func) { + hlist_del(&twork->hlist); + goto found; + } + } + twork = NULL; + found: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + return twork; +} + +void task_work_run(void) +{ + struct task_struct *task = current; + struct hlist_head task_works; + struct hlist_node *pos; + + raw_spin_lock_irq(&task->pi_lock); + hlist_move_list(&task->task_works, &task_works); + raw_spin_unlock_irq(&task->pi_lock); + + if (unlikely(hlist_empty(&task_works))) + return; + /* + * We use hlist to save the space in task_struct, but we want fifo. + * Find the last entry, the list should be short, then process them + * in reverse order. + */ + for (pos = task_works.first; pos->next; pos = pos->next) + ; + + for (;;) { + struct hlist_node **pprev = pos->pprev; + struct task_work *twork = container_of(pos, struct task_work, + hlist); + twork->func(twork); + + if (pprev == &task_works.first) + break; + pos = container_of(pprev, struct hlist_node, next); + } +} diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6420cda62336..1d0f6a8a0e5e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1486,6 +1486,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!buffer) return size; + /* Make sure the requested buffer exists */ + if (cpu_id != RING_BUFFER_ALL_CPUS && + !cpumask_test_cpu(cpu_id, buffer->cpumask)) + return size; + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; |