From 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:57 -0700 Subject: bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 kernel/bpf/syscall.c (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,169 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} -- cgit v1.2.3 From db20fd2b01087bdfbe30bce314a198eefedcc42e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:59 -0700 Subject: bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 ++ include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/syscall.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48014a71f0fe..2887f3f9da59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -9,6 +9,7 @@ #include #include +#include struct bpf_map; @@ -17,6 +18,12 @@ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_free)(struct bpf_map *); + int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + + /* funcs callable from userspace and from eBPF programs */ + void *(*map_lookup_elem)(struct bpf_map *map, void *key); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_delete_elem)(struct bpf_map *map, void *key); }; struct bpf_map { @@ -37,5 +44,6 @@ struct bpf_map_type_list { void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); +struct bpf_map *bpf_map_get(struct fd f); #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f58a10f9670c..395cabd2ca0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -70,6 +70,35 @@ enum bpf_cmd { * map is deleted when fd is closed */ BPF_MAP_CREATE, + + /* lookup key in a given map + * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero and stores found elem into value + * or negative error + */ + BPF_MAP_LOOKUP_ELEM, + + /* create or update key/value pair in a given map + * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero or negative error + */ + BPF_MAP_UPDATE_ELEM, + + /* find and delete elem by key in a given map + * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key + * returns zero or negative error + */ + BPF_MAP_DELETE_ELEM, + + /* lookup key in a given map and return next key + * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->next_key + * returns zero and stores next key or negative error + */ + BPF_MAP_GET_NEXT_KEY, }; enum bpf_map_type { @@ -83,6 +112,15 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + }; } __attribute__((aligned(8))); #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 428a0e23adc0..f94349ecaf61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -13,6 +13,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -111,6 +112,228 @@ free_map: return err; } +/* if error is returned, fd is released. + * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ESRCH; + rcu_read_lock(); + value = map->ops->map_lookup_elem(map, key); + if (!value) + goto err_unlock; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto err_unlock; + + err = 0; + +err_unlock: + rcu_read_unlock(); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_CREATE: err = map_create(&attr); break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:00 -0700 Subject: bpf: expand BPF syscall with program load/unload eBPF programs are similar to kernel modules. They are loaded by the user process and automatically unloaded when process exits. Each eBPF program is a safe run-to-completion set of instructions. eBPF verifier statically determines that the program terminates and is safe to execute. The following syscall wrapper can be used to load the program: int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } where 'insns' is an array of eBPF instructions and 'license' is a string that must be GPL compatible to call helper functions marked gpl_only Upon succesful load the syscall returns prog_fd. Use close(prog_fd) to unload the program. User space tests and examples follow in the later patches Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 38 +++++++++++ include/linux/filter.h | 8 +-- include/uapi/linux/bpf.h | 26 ++++++++ kernel/bpf/core.c | 29 +++++---- kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 246 insertions(+), 20 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2887f3f9da59..92979182be81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL + * instructions after verifying + */ +struct bpf_func_proto { + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + bool gpl_only; +}; + +struct bpf_verifier_ops { + /* return eBPF function prototype for verification */ + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +}; + +struct bpf_prog_type_list { + struct list_head list_node; + struct bpf_verifier_ops *ops; + enum bpf_prog_type type; +}; + +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + +struct bpf_prog; + +struct bpf_prog_aux { + atomic_t refcnt; + bool is_gpl_compatible; + enum bpf_prog_type prog_type; + struct bpf_verifier_ops *ops; + struct bpf_map **used_maps; + u32 used_map_cnt; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_prog_put(struct bpf_prog *prog); +struct bpf_prog *bpf_prog_get(u32 ufd); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a0bc6d134d7..4ffc0958d85e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,6 +21,7 @@ struct sk_buff; struct sock; struct seccomp_data; +struct bpf_prog_aux; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -300,17 +301,12 @@ struct bpf_binary_header { u8 image[]; }; -struct bpf_work_struct { - struct bpf_prog *prog; - struct work_struct work; -}; - struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct bpf_work_struct *work; /* Deferred free work struct */ + struct bpf_prog_aux *aux; /* Auxiliary fields */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395cabd2ca0a..424f442016e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,12 +99,23 @@ enum bpf_cmd { * returns zero and stores next key or negative error */ BPF_MAP_GET_NEXT_KEY, + + /* verify and load eBPF program + * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) + * Using attr->prog_type, attr->insns, attr->license + * returns fd or negative error + */ + BPF_PROG_LOAD, }; enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, }; +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -121,6 +132,21 @@ union bpf_attr { __aligned_u64 next_key; }; }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + }; } __attribute__((aligned(8))); +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +enum bpf_func_id { + BPF_FUNC_unspec, + __BPF_FUNC_MAX_ID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b7002488251..f0c30c59b317 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); @@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); - if (ws == NULL) { + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { vfree(fp); return NULL; } fp->pages = size / PAGE_SIZE; - fp->work = ws; + fp->aux = aux; return fp; } @@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; - /* We keep fp->work from fp_old around in the new + /* We keep fp->aux from fp_old around in the new * reallocated structure. */ - fp_old->work = NULL; + fp_old->aux = NULL; __bpf_prog_free(fp_old); } @@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->work); + kfree(fp->aux); vfree(fp); } EXPORT_SYMBOL_GPL(__bpf_prog_free); @@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static void bpf_prog_free_deferred(struct work_struct *work) { - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; - ws = container_of(work, struct bpf_work_struct, work); - bpf_jit_free(ws->prog); + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); } /* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - struct bpf_work_struct *ws = fp->work; + struct bpf_prog_aux *aux = fp->aux; - INIT_WORK(&ws->work, bpf_prog_free_deferred); - ws->prog = fp; - schedule_work(&ws->work); + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f94349ecaf61..0afb4eaa1887 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static LIST_HEAD(bpf_map_types); @@ -334,6 +336,166 @@ err_put: return err; } +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD license + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + /* err = bpf_check(prog, tb); */ + + if (err < 0) + goto free_used_maps; + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 0a542a86d73b1577e7d4f55fc95dcffd3fe62643 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:01 -0700 Subject: bpf: handle pseudo BPF_CALL insn in native eBPF programs userspace is using pseudo BPF_CALL instructions which encode one of 'enum bpf_func_id' inside insn->imm field. Verifier checks that program using correct function arguments to given func_id. If all checks passed, kernel needs to fixup BPF_CALL->imm fields by replacing func_id with in-kernel function pointer. eBPF interpreter just calls the function. In-kernel eBPF users continue to use generic BPF_CALL. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0afb4eaa1887..b513659d120f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -357,6 +357,40 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl) list_add(&tl->list_node, &bpf_prog_types); } +/* fixup insn->imm field of bpf_call instructions: + * if (insn->imm == BPF_FUNC_map_lookup_elem) + * insn->imm = bpf_map_lookup_elem - __bpf_call_base; + * else if (insn->imm == BPF_FUNC_map_update_elem) + * insn->imm = bpf_map_update_elem - __bpf_call_base; + * else ... + * + * this function is called after eBPF program passed verification + */ +static void fixup_bpf_calls(struct bpf_prog *prog) +{ + const struct bpf_func_proto *fn; + int i; + + for (i = 0; i < prog->len; i++) { + struct bpf_insn *insn = &prog->insnsi[i]; + + if (insn->code == (BPF_JMP | BPF_CALL)) { + /* we reach here when program has bpf_call instructions + * and it passed bpf_check(), means that + * ops->get_func_proto must have been supplied, check it + */ + BUG_ON(!prog->aux->ops->get_func_proto); + + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + BUG_ON(!fn->func); + insn->imm = fn->func - __bpf_call_base; + } + } +} + /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { @@ -478,6 +512,9 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; + /* fixup BPF_CALL->imm field */ + fixup_bpf_calls(prog); + /* eBPF program is ready to be JITed */ bpf_prog_select_runtime(prog); -- cgit v1.2.3 From 51580e798cb61b0fc63fa3aa6c5c975375aa0550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:02 -0700 Subject: bpf: verifier (add docs) this patch adds all of eBPF verfier documentation and empty bpf_check() The end goal for the verifier is to statically check safety of the program. Verifier will catch: - loops - out of range jumps - unreachable instructions - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention More details in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 224 ++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 133 +++++++++++++++++++++ 5 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/verifier.c (limited to 'kernel/bpf/syscall.c') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 4a01d71785e9..5ce4d07406a5 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF verifier +------------- +The safety of the eBPF program is determined in two steps. + +First step does DAG check to disallow loops and other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though classic BPF checker allows them) + +Second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX. +If verifier sees an insn that does R2=R1, then R2 has now type +PTR_TO_CTX as well and can be used on the right hand side of expression. +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE, +since addition of two valid pointers makes invalid pointer. +(In 'secure' mode verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If register was never written to, it's not readable: + bpf_mov R0 = R2 + bpf_exit +will be rejected, since R2 is unreadable at the start of the program. + +After kernel function call, R1-R5 are reset to unreadable and +R0 has a return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked. +For example: + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context') +A callback is used to customize verifier to restrict eBPF program access to only +certain fields within ctx structure with specified size and alignment. + +For example, the following insn: + bpf_ld R0 = *(u32 *)(R6 + 8) +intends to load a word from address R6 + 8 and store it into R0 +If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=FRAME_PTR, then access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow eBPF program to read data from stack only after +it wrote into it. +Classic BPF verifier does similar check with M[0-15] memory slots. +For example: + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit +is invalid program. +Though R10 is correct read-only register and has type FRAME_PTR +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto() +The eBPF verifier will check that registers match argument constraints. +After the call register R0 will be set to return type of the function. + +Function calls is a main mechanism to extend functionality of eBPF programs. +Socket filters may let programs to call one set of functions, whereas tracing +filters may allow completely different set. + +If a function made accessible to eBPF program, it needs to be thought through +from safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this by two stage verifier: classic BPF verifier is followed +by seccomp verifier. In case of eBPF one configurable verifier is shared for +all use cases. + +See details of eBPF verifier in kernel/bpf/verifier.c + eBPF maps --------- 'maps' is a generic storage of different types for sharing data between kernel @@ -1040,6 +1133,137 @@ The map is defined by: . key size in bytes . value size in bytes +Understanding eBPF verifier messages +------------------------------------ + +The following are few examples of invalid eBPF programs and verifier error +messages as seen in the log: + +Program with unreachable instructions: +static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), +}; +Error: + unreachable insn 1 + +Program that reads uninitialized register: + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), +Error: + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds: + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check return value of map_lookup_elem() before accessing +map element: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of 'if' branch, but fails +to do so in the other side of 'if' branch: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 92979182be81..9dfeb36f8971 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -83,5 +83,7 @@ struct bpf_prog_aux { void bpf_prog_put(struct bpf_prog *prog); struct bpf_prog *bpf_prog_get(u32 ufd); +/* verify correctness of eBPF program */ +int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e9f7334ed07a..3c726b0995b7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o syscall.o +obj-y := core.o syscall.o verifier.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b513659d120f..74b3628c5fdb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -507,7 +507,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - /* err = bpf_check(prog, tb); */ + err = bpf_check(prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..d6f9c3d6b4d7 --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,133 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. + * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all pathes through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if total number of + * insn is less then 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit. + * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. + * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. + * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one of such argument constraints. + * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. + * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. + * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. + * If it's ok, then verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function + * returns ether pointer to map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable. + */ + +int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +{ + int ret = -EINVAL; + + return ret; +} -- cgit v1.2.3 From cbd357008604925355ae7b54a09137dabb81b580 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:03 -0700 Subject: bpf: verifier (add ability to receive verification log) add optional attributes for BPF_PROG_LOAD syscall: union bpf_attr { struct { ... __u32 log_level; /* verbosity level of eBPF verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied 'char *buffer' */ }; }; when log_level > 0 the verifier will return its verification log in the user supplied buffer 'log_buf' which can be used by program author to analyze why verifier rejected given program. 'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt provides several examples of these messages, like the program: BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), will be rejected with the following multi-line message in log_buf: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 0 4: (85) call 1 5: (15) if r0 == 0x0 goto pc+1 R0=map_ptr R10=fp 6: (7a) *(u64 *)(r0 +4) = 0 misaligned access off 4 size 8 The format of the output can change at any time as verifier evolves. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 + kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/syscall.c') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 424f442016e7..31b0ac208a52 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -138,6 +138,9 @@ union bpf_attr { __u32 insn_cnt; __aligned_u64 insns; __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 74b3628c5fdb..ba61c8c16032 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD license +#define BPF_PROG_LOAD_LAST_FIELD log_buf static int bpf_prog_load(union bpf_attr *attr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6f9c3d6b4d7..871edc1f2e1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,9 +125,244 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct verifier_env { +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) +{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { + char __user *log_ubuf = NULL; + struct verifier_env *env; int ret = -EINVAL; + if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + /* ret = do_check(env); */ + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + kfree(env); + mutex_unlock(&bpf_verifier_lock); return ret; } -- cgit v1.2.3