summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/Kconfig17
-rw-r--r--include/linux/Kbuild1
-rw-r--r--include/linux/seccomp.h76
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/seccomp.c396
-rw-r--r--kernel/sys.c2
6 files changed, 472 insertions, 23 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..91c2c730fc1a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -216,4 +216,21 @@ config HAVE_CMPXCHG_DOUBLE
config ARCH_WANT_OLD_COMPAT_IPC
bool
+config HAVE_ARCH_SECCOMP_FILTER
+ bool
+ help
+ This symbol should be selected by an architecure if it provides
+ asm/syscall.h, specifically syscall_get_arguments() and
+ syscall_get_arch().
+
+config SECCOMP_FILTER
+ def_bool y
+ depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
+ help
+ Enable tasks to build secure computing environments defined
+ in terms of Berkeley Packet Filter programs which implement
+ task-defined system call filtering polices.
+
+ See Documentation/prctl/seccomp_filter.txt for details.
+
source "kernel/gcov/Kconfig"
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 3c9b616c834a..5c93d6c5d591 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -332,6 +332,7 @@ header-y += scc.h
header-y += sched.h
header-y += screen_info.h
header-y += sdla.h
+header-y += seccomp.h
header-y += securebits.h
header-y += selinux_netlink.h
header-y += sem.h
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index d61f27fcaa97..86bb68fc7683 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,14 +1,67 @@
#ifndef _LINUX_SECCOMP_H
#define _LINUX_SECCOMP_H
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+
+/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
+#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
+#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
+#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
+
+/*
+ * All BPF programs must return a 32-bit value.
+ * The bottom 16-bits are reserved for future use.
+ * The upper 16-bits are ordered from least permissive values to most.
+ *
+ * The ordering ensures that a min_t() over composed return values always
+ * selects the least permissive choice.
+ */
+#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+
+/* Masks for the return value sections. */
+#define SECCOMP_RET_ACTION 0x7fff0000U
+#define SECCOMP_RET_DATA 0x0000ffffU
+
+/**
+ * struct seccomp_data - the format the BPF program executes over.
+ * @nr: the system call number
+ * @arch: indicates system call convention as an AUDIT_ARCH_* value
+ * as defined in <linux/audit.h>.
+ * @instruction_pointer: at the time of the system call.
+ * @args: up to 6 system call arguments always stored as 64-bit values
+ * regardless of the architecture.
+ */
+struct seccomp_data {
+ int nr;
+ __u32 arch;
+ __u64 instruction_pointer;
+ __u64 args[6];
+};
+#ifdef __KERNEL__
#ifdef CONFIG_SECCOMP
#include <linux/thread_info.h>
#include <asm/seccomp.h>
+struct seccomp_filter;
+/**
+ * struct seccomp - the state of a seccomp'ed process
+ *
+ * @mode: indicates one of the valid values above for controlled
+ * system calls available to a process.
+ * @filter: The metadata and ruleset for determining what system calls
+ * are allowed for a task.
+ *
+ * @filter must only be accessed from the context of current as there
+ * is no locking.
+ */
struct seccomp {
int mode;
+ struct seccomp_filter *filter;
};
extern void __secure_computing(int);
@@ -19,7 +72,7 @@ static inline void secure_computing(int this_syscall)
}
extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+extern long prctl_set_seccomp(unsigned long, char __user *);
static inline int seccomp_mode(struct seccomp *s)
{
@@ -31,15 +84,16 @@ static inline int seccomp_mode(struct seccomp *s)
#include <linux/errno.h>
struct seccomp { };
+struct seccomp_filter { };
-#define secure_computing(x) do { } while (0)
+#define secure_computing(x) 0
static inline long prctl_get_seccomp(void)
{
return -EINVAL;
}
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
{
return -EINVAL;
}
@@ -48,7 +102,21 @@ static inline int seccomp_mode(struct seccomp *s)
{
return 0;
}
-
#endif /* CONFIG_SECCOMP */
+#ifdef CONFIG_SECCOMP_FILTER
+extern void put_seccomp_filter(struct task_struct *tsk);
+extern void get_seccomp_filter(struct task_struct *tsk);
+extern u32 seccomp_bpf_load(int off);
+#else /* CONFIG_SECCOMP_FILTER */
+static inline void put_seccomp_filter(struct task_struct *tsk)
+{
+ return;
+}
+static inline void get_seccomp_filter(struct task_struct *tsk)
+{
+ return;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+#endif /* __KERNEL__ */
#endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..f7cf6fb107ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -170,6 +171,7 @@ void free_task(struct task_struct *tsk)
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -1162,6 +1164,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..0aeec1960f91 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,343 @@
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
- * This defines a simple but solid secure-computing mode.
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ * of Berkeley Packet Filters/Linux Socket Filters.
*/
+#include <linux/atomic.h>
#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <asm/syscall.h>
+#include <linux/filter.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ * get/put helpers should be used when accessing an instance
+ * outside of a lifetime-guarded section. In general, this
+ * is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insns: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer. For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory. This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+ atomic_t usage;
+ struct seccomp_filter *prev;
+ unsigned short len; /* Instruction count */
+ struct sock_filter insns[];
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+static void seccomp_filter_log_failure(int syscall)
+{
+ int compat = 0;
+#ifdef CONFIG_COMPAT
+ compat = is_compat_task();
+#endif
+ pr_info("%s[%d]: %ssystem call %d blocked at 0x%lx\n",
+ current->comm, task_pid_nr(current),
+ (compat ? "compat " : ""),
+ syscall, KSTK_EIP(current));
+}
+
+/**
+ * get_u32 - returns a u32 offset into data
+ * @data: a unsigned 64 bit value
+ * @index: 0 or 1 to return the first or second 32-bits
+ *
+ * This inline exists to hide the length of unsigned long. If a 32-bit
+ * unsigned long is passed in, it will be extended and the top 32-bits will be
+ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
+ * properly returned.
+ *
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static inline u32 get_u32(u64 data, int index)
+{
+ return ((u32 *)&data)[index];
+}
+
+/* Helper for bpf_load below. */
+#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
+/**
+ * bpf_load: checks and returns a pointer to the requested offset
+ * @off: offset into struct seccomp_data to load from
+ *
+ * Returns the requested 32-bits of data.
+ * seccomp_check_filter() should assure that @off is 32-bit aligned
+ * and not out of bounds. Failure to do so is a BUG.
+ */
+u32 seccomp_bpf_load(int off)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ if (off == BPF_DATA(nr))
+ return syscall_get_nr(current, regs);
+ if (off == BPF_DATA(arch))
+ return syscall_get_arch(current, regs);
+ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
+ unsigned long value;
+ int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
+ int index = !!(off % sizeof(u64));
+ syscall_get_arguments(current, regs, arg, 1, &value);
+ return get_u32(value, index);
+ }
+ if (off == BPF_DATA(instruction_pointer))
+ return get_u32(KSTK_EIP(current), 0);
+ if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
+ return get_u32(KSTK_EIP(current), 1);
+ /* seccomp_check_filter should make this impossible. */
+ BUG();
+}
+
+/**
+ * seccomp_check_filter - verify seccomp filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Takes a previously checked filter (by sk_chk_filter) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load. It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+ int pc;
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_S_LD_W_ABS:
+ ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct seccomp_data) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_S_LD_W_LEN:
+ ftest->code = BPF_S_LD_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ case BPF_S_LDX_W_LEN:
+ ftest->code = BPF_S_LDX_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_S_RET_K:
+ case BPF_S_RET_A:
+ case BPF_S_ALU_ADD_K:
+ case BPF_S_ALU_ADD_X:
+ case BPF_S_ALU_SUB_K:
+ case BPF_S_ALU_SUB_X:
+ case BPF_S_ALU_MUL_K:
+ case BPF_S_ALU_MUL_X:
+ case BPF_S_ALU_DIV_X:
+ case BPF_S_ALU_AND_K:
+ case BPF_S_ALU_AND_X:
+ case BPF_S_ALU_OR_K:
+ case BPF_S_ALU_OR_X:
+ case BPF_S_ALU_LSH_K:
+ case BPF_S_ALU_LSH_X:
+ case BPF_S_ALU_RSH_K:
+ case BPF_S_ALU_RSH_X:
+ case BPF_S_ALU_NEG:
+ case BPF_S_LD_IMM:
+ case BPF_S_LDX_IMM:
+ case BPF_S_MISC_TAX:
+ case BPF_S_MISC_TXA:
+ case BPF_S_ALU_DIV_K:
+ case BPF_S_LD_MEM:
+ case BPF_S_LDX_MEM:
+ case BPF_S_ST:
+ case BPF_S_STX:
+ case BPF_S_JMP_JA:
+ case BPF_S_JMP_JEQ_K:
+ case BPF_S_JMP_JEQ_X:
+ case BPF_S_JMP_JGE_K:
+ case BPF_S_JMP_JGE_X:
+ case BPF_S_JMP_JGT_K:
+ case BPF_S_JMP_JGT_X:
+ case BPF_S_JMP_JSET_K:
+ case BPF_S_JMP_JSET_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(int syscall)
+{
+ struct seccomp_filter *f;
+ u32 ret = SECCOMP_RET_KILL;
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+ * value always takes priority.
+ */
+ for (f = current->seccomp.filter; f; f = f->prev) {
+ ret = sk_run_filter(NULL, f->insns);
+ if (ret != SECCOMP_RET_ALLOW)
+ break;
+ }
+ return ret;
+}
+
+/**
+ * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * @fprog: BPF program to install
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+static long seccomp_attach_filter(struct sock_fprog *fprog)
+{
+ struct seccomp_filter *filter;
+ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
+ unsigned long total_insns = fprog->len;
+ long ret;
+
+ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ for (filter = current->seccomp.filter; filter; filter = filter->prev)
+ total_insns += filter->len + 4; /* include a 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /*
+ * Installing a seccomp filter requires that the task have
+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+ * This avoids scenarios where unprivileged tasks can affect the
+ * behavior of privileged children.
+ */
+ if (!current->no_new_privs &&
+ security_capable_noaudit(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN) != 0)
+ return -EACCES;
+
+ /* Allocate a new seccomp_filter */
+ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ return -ENOMEM;
+ atomic_set(&filter->usage, 1);
+ filter->len = fprog->len;
+
+ /* Copy the instructions from fprog. */
+ ret = -EFAULT;
+ if (copy_from_user(filter->insns, fprog->filter, fp_size))
+ goto fail;
+
+ /* Check and rewrite the fprog via the skb checker */
+ ret = sk_chk_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /* Check and rewrite the fprog for seccomp use */
+ ret = seccomp_check_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+ return 0;
+fail:
+ kfree(filter);
+ return ret;
+}
+
+/**
+ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+long seccomp_attach_user_filter(char __user *user_filter)
+{
+ struct sock_fprog fprog;
+ long ret = -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+ goto out;
+ ret = seccomp_attach_filter(&fprog);
+out:
+ return ret;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ if (!orig)
+ return;
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&orig->usage);
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* Clean up single-reference branches iteratively. */
+ while (orig && atomic_dec_and_test(&orig->usage)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ kfree(freeme);
+ }
+}
+#endif /* CONFIG_SECCOMP_FILTER */
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -34,10 +361,11 @@ static int mode1_syscalls_32[] = {
void __secure_computing(int this_syscall)
{
int mode = current->seccomp.mode;
- int * syscall;
+ int exit_sig = 0;
+ int *syscall;
switch (mode) {
- case 1:
+ case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (is_compat_task())
@@ -47,7 +375,16 @@ void __secure_computing(int this_syscall)
if (*syscall == this_syscall)
return;
} while (*++syscall);
+ exit_sig = SIGKILL;
break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW)
+ return;
+ seccomp_filter_log_failure(this_syscall);
+ exit_sig = SIGSYS;
+ break;
+#endif
default:
BUG();
}
@@ -56,7 +393,7 @@ void __secure_computing(int this_syscall)
dump_stack();
#endif
audit_seccomp(this_syscall);
- do_exit(SIGKILL);
+ do_exit(exit_sig);
}
long prctl_get_seccomp(void)
@@ -64,25 +401,48 @@ long prctl_get_seccomp(void)
return current->seccomp.mode;
}
-long prctl_set_seccomp(unsigned long seccomp_mode)
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * This function may be called repeatedly with a @seccomp_mode of
+ * SECCOMP_MODE_FILTER to install additional filters. Every filter
+ * successfully installed will be evaluated (in reverse order) for each system
+ * call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
- long ret;
+ long ret = -EINVAL;
- /* can set it only once to be even more secure */
- ret = -EPERM;
- if (unlikely(current->seccomp.mode))
+ if (current->seccomp.mode &&
+ current->seccomp.mode != seccomp_mode)
goto out;
- ret = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ ret = 0;
#ifdef TIF_NOTSC
disable_TSC();
#endif
- ret = 0;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ ret = seccomp_attach_user_filter(filter);
+ if (ret)
+ goto out;
+ break;
+#endif
+ default:
+ goto out;
}
- out:
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+out:
return ret;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index b82568b7d201..ba0ae8eea6fb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = prctl_get_seccomp();
break;
case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2);
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;
case PR_GET_TSC:
error = GET_TSC_CTL(arg2);