diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 1 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 160 | ||||
-rw-r--r-- | kernel/bpf/core.c | 12 | ||||
-rw-r--r-- | kernel/bpf/cpumask.c | 476 | ||||
-rw-r--r-- | kernel/bpf/offload.c | 419 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 34 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 544 |
7 files changed, 1399 insertions, 247 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a12e6b400a2..02242614dcc7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 578cee398550..47b8cb96f2c2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -337,6 +337,12 @@ const char *btf_type_str(const struct btf_type *t) #define BTF_SHOW_NAME_SIZE 80 /* + * The suffix of a type that indicates it cannot alias another type when + * comparing BTF IDs for kfunc invocations. + */ +#define NOCAST_ALIAS_SUFFIX "___init" + +/* * Common data to all BTF show operations. Private show functions can add * their own data to a structure containing a struct btf_show and consult it * in the show callback. See btf_type_show() below. @@ -1397,12 +1403,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; - /* btf verifier prints all types it is processing via - * btf_verifier_log_type(..., fmt = NULL). - * Skip those prints for in-kernel BTF verification. - */ - if (log->level == BPF_LOG_KERNEL && !fmt) - return; + if (log->level == BPF_LOG_KERNEL) { + /* btf verifier prints all types it is processing via + * btf_verifier_log_type(..., fmt = NULL). + * Skip those prints for in-kernel BTF verification. + */ + if (!fmt) + return; + + /* Skip logging when loading module BTF with mismatches permitted */ + if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + return; + } __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, @@ -1441,8 +1453,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; - if (log->level == BPF_LOG_KERNEL && !fmt) - return; + if (log->level == BPF_LOG_KERNEL) { + if (!fmt) + return; + + /* Skip logging when loading module BTF with mismatches permitted */ + if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + return; + } + /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in @@ -7261,11 +7280,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, } btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); if (IS_ERR(btf)) { - pr_warn("failed to validate module [%s] BTF: %ld\n", - mod->name, PTR_ERR(btf)); kfree(btf_mod); - if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); err = PTR_ERR(btf); + } else { + pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); + } goto out; } err = btf_alloc_id(btf); @@ -8211,3 +8233,119 @@ out: } return err; } + +bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off) +{ + struct btf *btf = reg->btf; + const struct btf_type *walk_type, *safe_type; + const char *tname; + char safe_tname[64]; + long ret, safe_id; + const struct btf_member *member, *m_walk = NULL; + u32 i; + const char *walk_name; + + walk_type = btf_type_by_id(btf, reg->btf_id); + if (!walk_type) + return false; + + tname = btf_name_by_offset(btf, walk_type->name_off); + + ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname); + if (ret < 0) + return false; + + safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); + if (safe_id < 0) + return false; + + safe_type = btf_type_by_id(btf, safe_id); + if (!safe_type) + return false; + + for_each_member(i, walk_type, member) { + u32 moff; + + /* We're looking for the PTR_TO_BTF_ID member in the struct + * type we're walking which matches the specified offset. + * Below, we'll iterate over the fields in the safe variant of + * the struct and see if any of them has a matching type / + * name. + */ + moff = __btf_member_bit_offset(walk_type, member) / 8; + if (off == moff) { + m_walk = member; + break; + } + } + if (m_walk == NULL) + return false; + + walk_name = __btf_name_by_offset(btf, m_walk->name_off); + for_each_member(i, safe_type, member) { + const char *m_name = __btf_name_by_offset(btf, member->name_off); + + /* If we match on both type and name, the field is considered trusted. */ + if (m_walk->type == member->type && !strcmp(walk_name, m_name)) + return true; + } + + return false; +} + +bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, + const struct btf *reg_btf, u32 reg_id, + const struct btf *arg_btf, u32 arg_id) +{ + const char *reg_name, *arg_name, *search_needle; + const struct btf_type *reg_type, *arg_type; + int reg_len, arg_len, cmp_len; + size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char); + + reg_type = btf_type_by_id(reg_btf, reg_id); + if (!reg_type) + return false; + + arg_type = btf_type_by_id(arg_btf, arg_id); + if (!arg_type) + return false; + + reg_name = btf_name_by_offset(reg_btf, reg_type->name_off); + arg_name = btf_name_by_offset(arg_btf, arg_type->name_off); + + reg_len = strlen(reg_name); + arg_len = strlen(arg_name); + + /* Exactly one of the two type names may be suffixed with ___init, so + * if the strings are the same size, they can't possibly be no-cast + * aliases of one another. If you have two of the same type names, e.g. + * they're both nf_conn___init, it would be improper to return true + * because they are _not_ no-cast aliases, they are the same type. + */ + if (reg_len == arg_len) + return false; + + /* Either of the two names must be the other name, suffixed with ___init. */ + if ((reg_len != arg_len + pattern_len) && + (arg_len != reg_len + pattern_len)) + return false; + + if (reg_len < arg_len) { + search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX); + cmp_len = reg_len; + } else { + search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX); + cmp_len = arg_len; + } + + if (!search_needle) + return false; + + /* ___init suffix must come at the end of the name */ + if (*(search_needle + pattern_len) != '\0') + return false; + + return !strncmp(reg_name, arg_name, cmp_len); +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba3fff17e2f9..16da51093aff 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ - if (!bpf_prog_is_dev_bound(fp->aux)) { + if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; @@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) - bpf_prog_offload_destroy(aux->prog); + bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c new file mode 100644 index 000000000000..25355a0a367a --- /dev/null +++ b/kernel/bpf/cpumask.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2023 Meta, Inc */ +#include <linux/bpf.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/cpumask.h> + +/** + * struct bpf_cpumask - refcounted BPF cpumask wrapper structure + * @cpumask: The actual cpumask embedded in the struct. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. + * + * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This + * is done to avoid confusing the verifier due to the typedef of cpumask_var_t + * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See + * the details in <linux/cpumask.h>. The consequence is that this structure is + * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is + * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory + * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK + * not being defined, the structure is the same size regardless. + */ +struct bpf_cpumask { + cpumask_t cpumask; + refcount_t usage; +}; + +static struct bpf_mem_alloc bpf_cpumask_ma; + +static bool cpu_valid(u32 cpu) +{ + return cpu < nr_cpu_ids; +} + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global kfuncs as their definitions will be in BTF"); + +/** + * bpf_cpumask_create() - Create a mutable BPF cpumask. + * + * Allocates a cpumask that can be queried, mutated, acquired, and released by + * a BPF program. The cpumask returned by this function must either be embedded + * in a map as a kptr, or freed with bpf_cpumask_release(). + * + * bpf_cpumask_create() allocates memory using the BPF memory allocator, and + * will not block. It may return NULL if no memory is available. + */ +struct bpf_cpumask *bpf_cpumask_create(void) +{ + struct bpf_cpumask *cpumask; + + cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask)); + if (!cpumask) + return NULL; + + memset(cpumask, 0, sizeof(*cpumask)); + refcount_set(&cpumask->usage, 1); + + return cpumask; +} + +/** + * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask. + * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted + * pointer. + * + * Acquires a reference to a BPF cpumask. The cpumask returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_cpumask_release(). + */ +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) +{ + refcount_inc(&cpumask->usage); + return cpumask; +} + +/** + * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask + * stored in a map. + * @cpumaskp: A pointer to a BPF cpumask map value. + * + * Attempts to acquire a reference to a BPF cpumask stored in a map value. The + * cpumask returned by this function must either be embedded in a map as a + * kptr, or freed with bpf_cpumask_release(). This function may return NULL if + * no BPF cpumask was found in the specified map value. + */ +struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp) +{ + struct bpf_cpumask *cpumask; + + /* The BPF memory allocator frees memory backing its caches in an RCU + * callback. Thus, we can safely use RCU to ensure that the cpumask is + * safe to read. + */ + rcu_read_lock(); + + cpumask = READ_ONCE(*cpumaskp); + if (cpumask && !refcount_inc_not_zero(&cpumask->usage)) + cpumask = NULL; + + rcu_read_unlock(); + return cpumask; +} + +/** + * bpf_cpumask_release() - Release a previously acquired BPF cpumask. + * @cpumask: The cpumask being released. + * + * Releases a previously acquired reference to a BPF cpumask. When the final + * reference of the BPF cpumask has been released, it is subsequently freed in + * an RCU callback in the BPF memory allocator. + */ +void bpf_cpumask_release(struct bpf_cpumask *cpumask) +{ + if (!cpumask) + return; + + if (refcount_dec_and_test(&cpumask->usage)) { + migrate_disable(); + bpf_mem_free(&bpf_cpumask_ma, cpumask); + migrate_enable(); + } +} + +/** + * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask. + * @cpumask: The cpumask being queried. + * + * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask + * pointer may be safely passed to this function. + */ +u32 bpf_cpumask_first(const struct cpumask *cpumask) +{ + return cpumask_first(cpumask); +} + +/** + * bpf_cpumask_first_zero() - Get the index of the first unset bit in the + * cpumask. + * @cpumask: The cpumask being queried. + * + * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask + * pointer may be safely passed to this function. + */ +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) +{ + return cpumask_first_zero(cpumask); +} + +/** + * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask. + * @cpu: The CPU to be set in the cpumask. + * @cpumask: The BPF cpumask in which a bit is being set. + */ +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return; + + cpumask_set_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask. + * @cpu: The CPU to be cleared from the cpumask. + * @cpumask: The BPF cpumask in which a bit is being cleared. + */ +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return; + + cpumask_clear_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask. + * @cpu: The CPU being queried for. + * @cpumask: The cpumask being queried for containing a CPU. + * + * Return: + * * true - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu. + */ +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return false; + + return cpumask_test_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask. + * @cpu: The CPU being set and queried for. + * @cpumask: The BPF cpumask being set and queried for containing a CPU. + * + * Return: + * * true - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is invalid. + */ +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return false; + + return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF + * cpumask. + * @cpu: The CPU being cleared and queried for. + * @cpumask: The BPF cpumask being cleared and queried for containing a CPU. + * + * Return: + * * true - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is invalid. + */ +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return false; + + return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask. + * @cpumask: The BPF cpumask having all of its bits set. + */ +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) +{ + cpumask_setall((struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask. + * @cpumask: The BPF cpumask being cleared. + */ +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) +{ + cpumask_clear((struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_and() - AND two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true - @dst has at least one bit set following the operation + * * false - @dst is empty following the operation + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_and(struct bpf_cpumask *dst, + const struct cpumask *src1, + const struct cpumask *src2) +{ + return cpumask_and((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_or() - OR two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +void bpf_cpumask_or(struct bpf_cpumask *dst, + const struct cpumask *src1, + const struct cpumask *src2) +{ + cpumask_or((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_xor() - XOR two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +void bpf_cpumask_xor(struct bpf_cpumask *dst, + const struct cpumask *src1, + const struct cpumask *src2) +{ + cpumask_xor((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_equal() - Check two cpumasks for equality. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true - @src1 and @src2 have the same bits set. + * * false - @src1 and @src2 differ in at least one bit. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_equal(src1, src2); +} + +/** + * bpf_cpumask_intersects() - Check two cpumasks for overlap. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true - @src1 and @src2 have at least one of the same bits set. + * * false - @src1 and @src2 don't have any of the same bits set. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_intersects(src1, src2); +} + +/** + * bpf_cpumask_subset() - Check if a cpumask is a subset of another. + * @src1: The first cpumask being checked as a subset. + * @src2: The second cpumask being checked as a superset. + * + * Return: + * * true - All of the bits of @src1 are set in @src2. + * * false - At least one bit in @src1 is not set in @src2. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_subset(src1, src2); +} + +/** + * bpf_cpumask_empty() - Check if a cpumask is empty. + * @cpumask: The cpumask being checked. + * + * Return: + * * true - None of the bits in @cpumask are set. + * * false - At least one bit in @cpumask is set. + * + * A struct bpf_cpumask pointer may be safely passed to @cpumask. + */ +bool bpf_cpumask_empty(const struct cpumask *cpumask) +{ + return cpumask_empty(cpumask); +} + +/** + * bpf_cpumask_full() - Check if a cpumask has all bits set. + * @cpumask: The cpumask being checked. + * + * Return: + * * true - All of the bits in @cpumask are set. + * * false - At least one bit in @cpumask is cleared. + * + * A struct bpf_cpumask pointer may be safely passed to @cpumask. + */ +bool bpf_cpumask_full(const struct cpumask *cpumask) +{ + return cpumask_full(cpumask); +} + +/** + * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask. + * @dst: The BPF cpumask being copied into. + * @src: The cpumask being copied. + * + * A struct bpf_cpumask pointer may be safely passed to @src. + */ +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) +{ + cpumask_copy((struct cpumask *)dst, src); +} + +/** + * bpf_cpumask_any() - Return a random set CPU from a cpumask. + * @cpumask: The cpumask being queried. + * + * Return: + * * A random set bit within [0, num_cpus) if at least one bit is set. + * * >= num_cpus if no bit is set. + * + * A struct bpf_cpumask pointer may be safely passed to @src. + */ +u32 bpf_cpumask_any(const struct cpumask *cpumask) +{ + return cpumask_any(cpumask); +} + +/** + * bpf_cpumask_any_and() - Return a random set CPU from the AND of two + * cpumasks. + * @src1: The first cpumask. + * @src2: The second cpumask. + * + * Return: + * * A random set bit within [0, num_cpus) if at least one bit is set. + * * >= num_cpus if no bit is set. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_any_and(src1, src2); +} + +__diag_pop(); + +BTF_SET8_START(cpumask_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS) +BTF_SET8_END(cpumask_kfunc_btf_ids) + +static const struct btf_kfunc_id_set cpumask_kfunc_set = { + .owner = THIS_MODULE, + .set = &cpumask_kfunc_btf_ids, +}; + +BTF_ID_LIST(cpumask_dtor_ids) +BTF_ID(struct, bpf_cpumask) +BTF_ID(func, bpf_cpumask_release) + +static int __init cpumask_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc cpumask_dtors[] = { + { + .btf_id = cpumask_dtor_ids[0], + .kfunc_btf_id = cpumask_dtor_ids[1] + }, + }; + + ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, + ARRAY_SIZE(cpumask_dtors), + THIS_MODULE); +} + +late_initcall(cpumask_kfunc_init); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 190d9f9dc987..88aae38fde66 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -41,7 +41,7 @@ struct bpf_offload_dev { struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; - struct bpf_offload_dev *offdev; + struct bpf_offload_dev *offdev; /* NULL when bound-only */ struct list_head progs; struct list_head maps; struct list_head offdev_netdevs; @@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = { }; static struct rhashtable offdevs; -static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev) { lockdep_assert_held(&bpf_devs_lock); - if (!offdevs_inited) - return NULL; return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); } -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; - struct bpf_prog_offload *offload; int err; - if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && - attr->prog_type != BPF_PROG_TYPE_XDP) - return -EINVAL; + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; - if (attr->prog_flags) - return -EINVAL; + ondev->netdev = netdev; + ondev->offdev = offdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_free; + } + + if (offdev) + list_add(&ondev->offdev_netdevs, &offdev->netdevs); + return 0; + +err_free: + kfree(ondev); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_prog_offload *offload = prog->aux->offload; + + if (offload->dev_state) + offload->offdev->ops->destroy(prog); + + list_del_init(&offload->offloads); + kfree(offload); + prog->aux->offload = NULL; +} + +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) +{ + struct bpf_offload_netdev *ondev, *altdev = NULL; + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + + ASSERT_RTNL(); + + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + return; + + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + + /* Try to move the objects to another netdev of the device */ + if (offdev) { + list_del(&ondev->offdev_netdevs); + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + } + + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } + + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +} + +static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev) +{ + struct bpf_offload_netdev *ondev; + struct bpf_prog_offload *offload; + int err; offload = kzalloc(sizeof(*offload), GFP_USER); if (!offload) return -ENOMEM; offload->prog = prog; + offload->netdev = netdev; - offload->netdev = dev_get_by_index(current->nsproxy->net_ns, - attr->prog_ifindex); - err = bpf_dev_offload_check(offload->netdev); - if (err) - goto err_maybe_put; - - down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offload->netdev); if (!ondev) { - err = -EINVAL; - goto err_unlock; + if (bpf_prog_is_offloaded(prog->aux)) { + err = -EINVAL; + goto err_free; + } + + /* When only binding to the device, explicitly + * create an entry in the hashtable. + */ + err = __bpf_offload_dev_netdev_register(NULL, offload->netdev); + if (err) + goto err_free; + ondev = bpf_offload_find_netdev(offload->netdev); } offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); - dev_put(offload->netdev); - up_write(&bpf_devs_lock); return 0; -err_unlock: - up_write(&bpf_devs_lock); -err_maybe_put: - if (offload->netdev) - dev_put(offload->netdev); +err_free: kfree(offload); return err; } +int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net_device *netdev; + int err; + + if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && + attr->prog_type != BPF_PROG_TYPE_XDP) + return -EINVAL; + + if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) + return -EINVAL; + + if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && + attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) + return -EINVAL; + + netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); + if (!netdev) + return -EINVAL; + + err = bpf_dev_offload_check(netdev); + if (err) + goto out; + + prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY); + + down_write(&bpf_devs_lock); + err = __bpf_prog_dev_bound_init(prog, netdev); + up_write(&bpf_devs_lock); + +out: + dev_put(netdev); + return err; +} + +int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) +{ + int err; + + if (!bpf_prog_is_dev_bound(old_prog->aux)) + return 0; + + if (bpf_prog_is_offloaded(old_prog->aux)) + return -EINVAL; + + new_prog->aux->dev_bound = old_prog->aux->dev_bound; + new_prog->aux->offload_requested = old_prog->aux->offload_requested; + + down_write(&bpf_devs_lock); + if (!old_prog->aux->offload) { + err = -EINVAL; + goto out; + } + + err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev); + +out: + up_write(&bpf_devs_lock); + return err; +} + int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; @@ -209,24 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) up_read(&bpf_devs_lock); } -static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +void bpf_prog_dev_bound_destroy(struct bpf_prog *prog) { - struct bpf_prog_offload *offload = prog->aux->offload; - - if (offload->dev_state) - offload->offdev->ops->destroy(prog); - - list_del_init(&offload->offloads); - kfree(offload); - prog->aux->offload = NULL; -} + struct bpf_offload_netdev *ondev; + struct net_device *netdev; -void bpf_prog_offload_destroy(struct bpf_prog *prog) -{ + rtnl_lock(); down_write(&bpf_devs_lock); - if (prog->aux->offload) + if (prog->aux->offload) { + list_del_init(&prog->aux->offload->offloads); + + netdev = prog->aux->offload->netdev; __bpf_prog_offload_destroy(prog); + + ondev = bpf_offload_find_netdev(netdev); + if (!ondev->offdev && list_empty(&ondev->progs)) + __bpf_offload_dev_netdev_unregister(NULL, netdev); + } up_write(&bpf_devs_lock); + rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) @@ -340,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; -static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, - enum bpf_netdev_command cmd) -{ - struct netdev_bpf data = {}; - struct net_device *netdev; - - ASSERT_RTNL(); - - data.command = cmd; - data.offmap = offmap; - /* Caller must make sure netdev is valid */ - netdev = offmap->netdev; - - return netdev->netdev_ops->ndo_bpf(netdev, &data); -} - struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; @@ -405,15 +549,6 @@ err_unlock: return ERR_PTR(err); } -static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) -{ - WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); - /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ - bpf_map_free_id(&offmap->map, true); - list_del_init(&offmap->offloads); - offmap->netdev = NULL; -} - void bpf_map_offload_map_free(struct bpf_map *map) { struct bpf_offloaded_map *offmap = map_to_offmap(map); @@ -573,12 +708,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) } EXPORT_SYMBOL_GPL(bpf_offload_dev_match); +bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) +{ + bool ret; + + if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux)) + return false; + + down_read(&bpf_devs_lock); + ret = lhs->aux->offload && rhs->aux->offload && + lhs->aux->offload->netdev && + lhs->aux->offload->netdev == rhs->aux->offload->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; bool ret; - if (!bpf_map_is_dev_bound(map)) + if (!bpf_map_is_offloaded(map)) return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); @@ -592,32 +743,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { - struct bpf_offload_netdev *ondev; int err; - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); - if (!ondev) - return -ENOMEM; - - ondev->netdev = netdev; - ondev->offdev = offdev; - INIT_LIST_HEAD(&ondev->progs); - INIT_LIST_HEAD(&ondev->maps); - down_write(&bpf_devs_lock); - err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); - if (err) { - netdev_warn(netdev, "failed to register for BPF offload\n"); - goto err_unlock_free; - } - - list_add(&ondev->offdev_netdevs, &offdev->netdevs); + err = __bpf_offload_dev_netdev_register(offdev, netdev); up_write(&bpf_devs_lock); - return 0; - -err_unlock_free: - up_write(&bpf_devs_lock); - kfree(ondev); return err; } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); @@ -625,43 +755,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { - struct bpf_offload_netdev *ondev, *altdev; - struct bpf_offloaded_map *offmap, *mtmp; - struct bpf_prog_offload *offload, *ptmp; - - ASSERT_RTNL(); - down_write(&bpf_devs_lock); - ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); - if (WARN_ON(!ondev)) - goto unlock; - - WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); - list_del(&ondev->offdev_netdevs); - - /* Try to move the objects to another netdev of the device */ - altdev = list_first_entry_or_null(&offdev->netdevs, - struct bpf_offload_netdev, - offdev_netdevs); - if (altdev) { - list_for_each_entry(offload, &ondev->progs, offloads) - offload->netdev = altdev->netdev; - list_splice_init(&ondev->progs, &altdev->progs); - - list_for_each_entry(offmap, &ondev->maps, offloads) - offmap->netdev = altdev->netdev; - list_splice_init(&ondev->maps, &altdev->maps); - } else { - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); - } - - WARN_ON(!list_empty(&ondev->progs)); - WARN_ON(!list_empty(&ondev->maps)); - kfree(ondev); -unlock: + __bpf_offload_dev_netdev_unregister(offdev, netdev); up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); @@ -670,18 +765,6 @@ struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; - int err; - - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) { - up_write(&bpf_devs_lock); - return ERR_PTR(err); - } - offdevs_inited = true; - } - up_write(&bpf_devs_lock); offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); if (!offdev) @@ -707,3 +790,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) return offdev->priv; } EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); + +void bpf_dev_bound_netdev_unregister(struct net_device *dev) +{ + struct bpf_offload_netdev *ondev; + + ASSERT_RTNL(); + + down_write(&bpf_devs_lock); + ondev = bpf_offload_find_netdev(dev); + if (ondev && !ondev->offdev) + __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev); + up_write(&bpf_devs_lock); +} + +int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, + struct bpf_prog_aux *prog_aux) +{ + if (!bpf_prog_is_dev_bound(prog_aux)) { + bpf_log(log, "metadata kfuncs require device-bound program\n"); + return -EINVAL; + } + + if (bpf_prog_is_offloaded(prog_aux)) { + bpf_log(log, "metadata kfuncs can't be offloaded\n"); + return -EINVAL; + } + + return 0; +} + +void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) +{ + const struct xdp_metadata_ops *ops; + void *p = NULL; + + /* We don't hold bpf_devs_lock while resolving several + * kfuncs and can race with the unregister_netdevice(). + * We rely on bpf_dev_bound_match() check at attach + * to render this program unusable. + */ + down_read(&bpf_devs_lock); + if (!prog->aux->offload) + goto out; + + ops = prog->aux->offload->netdev->xdp_metadata_ops; + if (!ops) + goto out; + + if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) + p = ops->xmo_rx_timestamp; + else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) + p = ops->xmo_rx_hash; +out: + up_read(&bpf_devs_lock); + + return p; +} + +static int __init bpf_offload_init(void) +{ + return rhashtable_init(&offdevs, &offdevs_params); +} + +late_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bf384b3346d6..99417b387547 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, int err; /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { @@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, void *ptr; int err; - if (bpf_map_is_dev_bound(map)) + if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); @@ -1483,7 +1483,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; } else if (IS_FD_PROG_ARRAY(map) || @@ -1547,7 +1547,7 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); goto out; } @@ -1605,7 +1605,7 @@ int generic_map_delete_batch(struct bpf_map *map, map->key_size)) break; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); break; } @@ -1851,7 +1851,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - if (!bpf_map_is_dev_bound(map)) { + if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); @@ -1944,7 +1944,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (!ops) return -EINVAL; - if (!bpf_prog_is_dev_bound(prog->aux)) + if (!bpf_prog_is_offloaded(prog->aux)) prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; @@ -2245,7 +2245,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog, if (prog->type != *attach_type) return false; - if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) + if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) return false; return true; @@ -2481,7 +2481,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | - BPF_F_XDP_HAS_FRAGS)) + BPF_F_XDP_HAS_FRAGS | + BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2565,7 +2566,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; - prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -2589,7 +2590,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { - err = bpf_prog_offload_init(prog, attr); + err = bpf_prog_dev_bound_init(prog, attr); + if (err) + goto free_prog_sec; + } + + if (type == BPF_PROG_TYPE_EXT && dst_prog && + bpf_prog_is_dev_bound(dst_prog->aux)) { + err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) goto free_prog_sec; } @@ -3987,7 +3995,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; @@ -4215,7 +4223,7 @@ static int bpf_map_get_info_by_fd(struct file *file, } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba62f98d3c59..4cc0e70ee71e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -255,6 +255,7 @@ struct bpf_call_arg_meta { int mem_size; u64 msize_max_value; int ref_obj_id; + int dynptr_id; int map_uid; int func_id; struct btf *btf; @@ -638,31 +639,57 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } -static int get_spi(s32 off) +static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; } +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { - int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; - /* We need to check that slots between [spi - nr_slots + 1, spi] are - * within [0, allocated_stack). - * - * Please note that the spi grows downwards. For example, a dynptr - * takes the size of two stack slots; the first slot will be at - * spi and the second slot will be at spi - 1. - */ - return spi - nr_slots + 1 >= 0 && spi < allocated_slots; + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; } -static struct bpf_func_state *func(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg) +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_verifier_state *cur = env->cur_state; + int off, spi; - return cur->frame[reg->frameno]; + if (!tnum_is_const(reg->var_off)) { + verbose(env, "dynptr has to be at a constant offset\n"); + return -EINVAL; + } + + off = reg->off + reg->var_off.value; + if (off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + + spi = __get_spi(off); + if (spi < 1) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + + if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS)) + return -ERANGE; + return spi; } static const char *kernel_type_name(const struct btf* btf, u32 id) @@ -727,37 +754,58 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot); + bool first_slot, int dynptr_id); static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, +static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, + struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, enum bpf_dynptr_type type) { - __mark_dynptr_reg(sreg1, type, true); - __mark_dynptr_reg(sreg2, type, false); + int id = ++env->id_gen; + + __mark_dynptr_reg(sreg1, type, true, id); + __mark_dynptr_reg(sreg2, type, false, id); } -static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, +static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, enum bpf_dynptr_type type) { - __mark_dynptr_reg(reg, type, true); + __mark_dynptr_reg(reg, type, true, ++env->id_gen); } +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; - int spi, i, id; - - spi = get_spi(reg->off); - - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) - return -EINVAL; + int spi, i, id, err; + + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + /* We cannot assume both spi and spi - 1 belong to the same dynptr, + * hence we need to call destroy_if_dynptr_stack_slot twice for both, + * to ensure that for the following example: + * [d1][d1][d2][d2] + * spi 3 2 1 0 + * So marking spi = 2 should lead to destruction of both d1 and d2. In + * case they do belong to same dynptr, second call won't see slot_type + * as STACK_DYNPTR and will simply skip destruction. + */ + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + err = destroy_if_dynptr_stack_slot(env, state, spi - 1); + if (err) + return err; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_DYNPTR; @@ -768,7 +816,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { @@ -781,6 +829,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } @@ -789,10 +840,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, i; - spi = get_spi(reg->off); - - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) - return -EINVAL; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_INVALID; @@ -805,43 +855,133 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? + * + * While we don't allow reading STACK_INVALID, it is still possible to + * do <8 byte writes marking some but not all slots as STACK_MISC. Then, + * helpers or insns can do partial read of that part without failing, + * but check_stack_range_initialized, check_stack_read_var_off, and + * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of + * the slot conservatively. Hence we need to prevent those liveness + * marking walks. + * + * This was not a problem before because STACK_INVALID is only set by + * default (where the default reg state has its reg->parent as NULL), or + * in clean_live_states after REG_LIVE_DONE (at which point + * mark_reg_read won't walk reg->parent chain), but not randomly during + * verifier state exploration (like we did above). Hence, for our case + * parentage chain will still be live (i.e. reg->parent may be + * non-NULL), while earlier reg->parent was NULL, so we need + * REG_LIVE_WRITTEN to screen off read marker propagation when it is + * done later on reads or by mark_dynptr_read as well to unnecessary + * mark registers in verifier state. + */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } -static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi) { - struct bpf_func_state *state = func(env, reg); - int spi, i; + struct bpf_func_state *fstate; + struct bpf_reg_state *dreg; + int i, dynptr_id; - if (reg->type == CONST_PTR_TO_DYNPTR) - return false; + /* We always ensure that STACK_DYNPTR is never set partially, + * hence just checking for slot_type[0] is enough. This is + * different for STACK_SPILL, where it may be only set for + * 1 byte, so code has to use is_spilled_reg. + */ + if (state->stack[spi].slot_type[0] != STACK_DYNPTR) + return 0; - spi = get_spi(reg->off); - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) - return true; + /* Reposition spi to first slot */ + if (!state->stack[spi].spilled_ptr.dynptr.first_slot) + spi = spi + 1; + + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; + } + + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); + /* Writing partially to one dynptr stack slot destroys both. */ for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_DYNPTR || - state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) - return false; + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; } + dynptr_id = state->stack[spi].spilled_ptr.id; + /* Invalidate any slices associated with this dynptr */ + bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ + /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ + if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) + continue; + if (dreg->dynptr_id == dynptr_id) { + if (!env->allow_ptr_leaks) + __mark_reg_not_init(env, dreg); + else + __mark_reg_unknown(env, dreg); + } + })); + + /* Do not release reference state, we are destroying dynptr on stack, + * not using some helper to release it. Just reset register. + */ + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Same reason as unmark_stack_slots_dynptr above */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi) +{ + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + + /* For -ERANGE (i.e. spi not falling into allocated stack slots), we + * will do check_mem_access to check and update stack bounds later, so + * return true for that case. + */ + if (spi < 0) + return spi == -ERANGE; + /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see + * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to + * ensure dynptr objects at the slots we are touching are completely + * destructed before we reinitialize them for a new one. For referenced + * ones, destroy_if_dynptr_stack_slot returns an error early instead of + * delaying it until the end where the user will get "Unreleased + * reference" error. + */ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi) { struct bpf_func_state *state = func(env, reg); - int spi; int i; /* This already represents first slot of initialized bpf_dynptr */ if (reg->type == CONST_PTR_TO_DYNPTR) return true; - spi = get_spi(reg->off); - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.dynptr.first_slot) + if (spi < 0) + return false; + if (!state->stack[spi].spilled_ptr.dynptr.first_slot) return false; for (i = 0; i < BPF_REG_SIZE; i++) { @@ -868,7 +1008,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg if (reg->type == CONST_PTR_TO_DYNPTR) { return reg->dynptr.type == dynptr_type; } else { - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return false; return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; } } @@ -1449,7 +1591,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot) + bool first_slot, int dynptr_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1457,6 +1599,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty */ __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; + /* Give each dynptr a unique id to uniquely associate slices to it. */ + reg->id = dynptr_id; reg->dynptr.type = type; reg->dynptr.first_slot = first_slot; } @@ -2189,6 +2333,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + if (bpf_dev_bound_kfunc_id(func_id)) { + err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); + if (err) + return err; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; desc->imm = call_imm; @@ -2390,6 +2540,32 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, ret; + + /* For CONST_PTR_TO_DYNPTR, it must have already been done by + * check_reg_arg in check_helper_call and mark_btf_func_reg_size in + * check_kfunc_call. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return 0; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + /* Caller ensures dynptr is valid and initialized, which means spi is in + * bounds and spi is the first dynptr slot. Simply mark stack slot as + * read. + */ + ret = mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + if (ret) + return ret; + return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr, + state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. @@ -3311,6 +3487,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, env->insn_aux_data[insn_idx].sanitize_stack_spill = true; } + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { @@ -3424,6 +3604,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, if (err) return err; + for (i = min_off; i < max_off; i++) { + int spi; + + spi = __get_spi(i); + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + } /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { @@ -4763,6 +4951,25 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) return 0; } +#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields) + +BTF_TYPE_SAFE_NESTED(struct task_struct) { + const cpumask_t *cpus_ptr; +}; + +static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + int off) +{ + /* If its parent is not trusted, it can't regain its trusted status. */ + if (!is_trusted_reg(reg)) + return false; + + BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); + + return btf_nested_type_is_trusted(&env->log, reg, off); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -4851,10 +5058,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; - /* By default any pointer obtained from walking a trusted pointer is - * no longer trusted except the rcu case below. + /* By default any pointer obtained from walking a trusted pointer is no + * longer trusted, unless the field being accessed has explicitly been + * marked as inheriting its parent's state of trust. + * + * An RCU-protected pointer can also be deemed trusted if we are in an + * RCU read region. This case is handled below. */ - flag &= ~PTR_TRUSTED; + if (nested_ptr_is_trusted(env, reg, off)) + flag |= PTR_TRUSTED; + else + flag &= ~PTR_TRUSTED; if (flag & MEM_RCU) { /* Mark value register as MEM_RCU only if it is protected by @@ -5451,6 +5665,31 @@ static int check_stack_range_initialized( } if (meta && meta->raw_mode) { + /* Ensure we won't be overwriting dynptrs when simulating byte + * by byte access in check_helper_call using meta.access_size. + * This would be a problem if we have a helper in the future + * which takes: + * + * helper(uninit_mem, len, dynptr) + * + * Now, uninint_mem may overlap with dynptr pointer. Hence, it + * may end up writing to dynptr itself when touching memory from + * arg 1. This can be relaxed on a case by case basis for known + * safe cases, but reject due to the possibilitiy of aliasing by + * default. + */ + for (i = min_off; i < max_off + access_size; i++) { + int stack_off = -i - 1; + + spi = __get_spi(i); + /* raw_mode may write past allocated_stack */ + if (state->allocated_stack <= stack_off) + continue; + if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { + verbose(env, "potential write to dynptr at off=%d disallowed\n", i); + return -EACCES; + } + } meta->access_size = access_size; meta->regno = regno; return 0; @@ -5938,6 +6177,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + int spi = 0; /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -5948,12 +6188,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to * check_func_arg_reg_off's logic. We only need to check offset - * alignment for PTR_TO_STACK. + * and its alignment for PTR_TO_STACK. */ - if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); - return -EINVAL; + if (reg->type == PTR_TO_STACK) { + spi = dynptr_get_spi(env, reg); + if (spi < 0 && spi != -ERANGE) + return spi; } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -5970,7 +6212,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, * to. */ if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { + if (!is_dynptr_reg_valid_uninit(env, reg, spi)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL; } @@ -5985,13 +6227,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, meta->uninit_dynptr_regno = regno; } else /* MEM_RDONLY and None case from above */ { + int err; + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); return -EINVAL; } - if (!is_dynptr_reg_valid_init(env, reg)) { + if (!is_dynptr_reg_valid_init(env, reg, spi)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", regno); @@ -6018,6 +6262,10 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, err_extra, regno); return -EINVAL; } + + err = mark_dynptr_read(env, reg); + if (err) + return err; } return 0; } @@ -6355,15 +6603,29 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, } } -static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->ref_obj_id; + return reg->id; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + return state->stack[spi].spilled_ptr.id; +} + +static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi; - spi = get_spi(reg->off); + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; return state->stack[spi].spilled_ptr.ref_obj_id; } @@ -6437,9 +6699,8 @@ skip_type_check: * PTR_TO_STACK. */ if (reg->type == PTR_TO_STACK) { - spi = get_spi(reg->off); - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.ref_obj_id) { + spi = dynptr_get_spi(env, reg); + if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { verbose(env, "arg %d is an unacquired reference\n", regno); return -EINVAL; } @@ -7421,7 +7682,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); + mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7927,13 +8188,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + int id, ref_obj_id; + + if (meta.dynptr_id) { + verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + return -EFAULT; + } if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - meta.ref_obj_id = dynptr_ref_obj_id(env, reg); + id = dynptr_id(env, reg); + if (id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + return id; + } + + ref_obj_id = dynptr_ref_obj_id(env, reg); + if (ref_obj_id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + return ref_obj_id; + } + + meta.dynptr_id = id; + meta.ref_obj_id = ref_obj_id; break; } } @@ -8089,6 +8369,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } + if (is_dynptr_ref_function(func_id)) + regs[BPF_REG_0].dynptr_id = meta.dynptr_id; + if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -8545,9 +8828,37 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } - if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id)) + /* Enforce strict type matching for calls to kfuncs that are acquiring + * or releasing a reference, or are no-cast aliases. We do _not_ + * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * as we want to enable BPF programs to pass types that are bitwise + * equivalent without forcing them to explicitly cast with something + * like bpf_cast_to_kern_ctx(). + * + * For example, say we had a type like the following: + * + * struct bpf_cpumask { + * cpumask_t cpumask; + * refcount_t usage; + * }; + * + * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed + * to a struct cpumask, so it would be safe to pass a struct + * bpf_cpumask * to a kfunc expecting a struct cpumask *. + * + * The philosophy here is similar to how we allow scalars of different + * types to be passed to kfuncs as long as the size is the same. The + * only difference here is that we're simply allowing + * btf_struct_ids_match() to walk the struct at the 0th offset, and + * resolve types. + */ + if (is_kfunc_acquire(meta) || + (is_kfunc_release(meta) && reg->ref_obj_id) || + btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; + WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off); + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { @@ -8891,6 +9202,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } + if (is_kfunc_trusted_args(meta) && + (register_is_null(reg) || type_may_be_null(reg->type))) { + verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + return -EACCES; + } + if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -13223,10 +13540,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return false; if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) continue; - if (!is_spilled_reg(&old->stack[spi])) - continue; - if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap)) + /* Both old and cur are having same slot_type */ + switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -13237,7 +13553,30 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * such verifier states are not equivalent. * return false to continue verification of this path */ + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) + return false; + break; + case STACK_DYNPTR: + { + const struct bpf_reg_state *old_reg, *cur_reg; + + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + if (old_reg->dynptr.type != cur_reg->dynptr.type || + old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; + } + case STACK_MISC: + case STACK_ZERO: + case STACK_INVALID: + continue; + /* Ensure that new unhandled slot types return false by default */ + default: return false; + } } return true; } @@ -13834,7 +14173,7 @@ static int do_check(struct bpf_verifier_env *env) env->prev_log_len = env->log.len_used; } - if (bpf_prog_is_dev_bound(env->prog->aux)) { + if (bpf_prog_is_offloaded(env->prog->aux)) { err = bpf_prog_offload_verify_insn(env, env->insn_idx, env->prev_insn_idx); if (err) @@ -14314,7 +14653,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; @@ -14795,7 +15134,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) unsigned int orig_prog_len = env->prog->len; int err; - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); err = bpf_remove_insns(env->prog, off, cnt); @@ -14876,7 +15215,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) else continue; - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_replace_insn(env, i, &ja); memcpy(insn, &ja, sizeof(ja)); @@ -15063,7 +15402,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; @@ -15463,7 +15802,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) int err = 0; if (env->prog->jit_requested && - !bpf_prog_is_dev_bound(env->prog->aux)) { + !bpf_prog_is_offloaded(env->prog->aux)) { err = jit_subprogs(env); if (err == 0) return 0; @@ -15507,12 +15846,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; + void *xdp_kfunc; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); return -EINVAL; } + *cnt = 0; + + if (bpf_dev_bound_kfunc_id(insn->imm)) { + xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm); + if (xdp_kfunc) { + insn->imm = BPF_CALL_IMM(xdp_kfunc); + return 0; + } + + /* fallback to default kfunc when not supported by netdev */ + } + /* insn->imm has the btf func_id. Replace it with * an address (relative to __bpf_call_base). */ @@ -15523,7 +15875,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - *cnt = 0; insn->imm = desc->imm; if (insn->off) return 0; @@ -16449,7 +16800,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } if (st_ops->check_member) { - int err = st_ops->check_member(t, member); + int err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", @@ -16530,6 +16881,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; + if (bpf_prog_is_dev_bound(prog->aux) && + !bpf_prog_dev_bound_match(prog, tgt_prog)) { + bpf_log(log, "Target program bound device mismatch"); + return -EINVAL; + } + for (i = 0; i < aux->func_info_cnt; i++) if (aux->func_info[i].type_id == btf_id) { subprog = i; @@ -16751,6 +17108,24 @@ BTF_ID(func, rcu_read_unlock_strict) #endif BTF_SET_END(btf_id_deny) +static bool can_be_sleepable(struct bpf_prog *prog) +{ + if (prog->type == BPF_PROG_TYPE_TRACING) { + switch (prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + case BPF_TRACE_ITER: + return true; + default: + return false; + } + } + return prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || + prog->type == BPF_PROG_TYPE_STRUCT_OPS; +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -16769,9 +17144,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { - verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); + if (prog->aux->sleepable && !can_be_sleepable(prog)) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } @@ -16950,7 +17324,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) if (ret < 0) goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { + if (bpf_prog_is_offloaded(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; @@ -16963,7 +17337,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) ret = do_check_subprogs(env); ret = ret ?: do_check_main(env); - if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) + if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: @@ -16998,7 +17372,7 @@ skip_full_check: /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ - if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) { ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret : false; |