Diffstat (limited to 'kernel/bpf')
-rw-r--r--   kernel/bpf/Makefile     1
-rw-r--r--   kernel/bpf/btf.c      160
-rw-r--r--   kernel/bpf/core.c      12
-rw-r--r--   kernel/bpf/cpumask.c  476
-rw-r--r--   kernel/bpf/offload.c  419
-rw-r--r--   kernel/bpf/syscall.c   34
-rw-r--r--   kernel/bpf/verifier.c 544
7 files changed, 1399 insertions, 247 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3a12e6b400a2..02242614dcc7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 578cee398550..47b8cb96f2c2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -337,6 +337,12 @@ const char *btf_type_str(const struct btf_type *t)
#define BTF_SHOW_NAME_SIZE 80
/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
* Common data to all BTF show operations. Private show functions can add
* their own data to a structure containing a struct btf_show and consult it
* in the show callback. See btf_type_show() below.
@@ -1397,12 +1403,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- /* btf verifier prints all types it is processing via
- * btf_verifier_log_type(..., fmt = NULL).
- * Skip those prints for in-kernel BTF verification.
- */
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
@@ -1441,8 +1453,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -7261,11 +7280,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
if (IS_ERR(btf)) {
- pr_warn("failed to validate module [%s] BTF: %ld\n",
- mod->name, PTR_ERR(btf));
kfree(btf_mod);
- if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
goto out;
}
err = btf_alloc_id(btf);
@@ -8211,3 +8233,119 @@ out:
}
return err;
}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member, *m_walk = NULL;
+ u32 i;
+ const char *walk_name;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+ if (ret < 0)
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, walk_type, member) {
+ u32 moff;
+
+ /* We're looking for the PTR_TO_BTF_ID member in the struct
+ * type we're walking which matches the specified offset.
+ * Below, we'll iterate over the fields in the safe variant of
+ * the struct and see if any of them has a matching type /
+ * name.
+ */
+ moff = __btf_member_bit_offset(walk_type, member) / 8;
+ if (off == moff) {
+ m_walk = member;
+ break;
+ }
+ }
+ if (m_walk == NULL)
+ return false;
+
+ walk_name = __btf_name_by_offset(btf, m_walk->name_off);
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+
+ /* If we match on both type and name, the field is considered trusted. */
+ if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* One of the two names must be the other name suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
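
Not part of the patch: a standalone userspace sketch of the ___init name-matching rule enforced by btf_type_ids_nocast_alias() above. The helper name and the sample strings are illustrative only; in the kernel the names come from BTF, not from string literals.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NOCAST_ALIAS_SUFFIX "___init"

/* Returns true iff exactly one of the two names is the other one with the
 * ___init suffix appended, mirroring the kernel check above.
 */
static bool names_are_nocast_aliases(const char *reg_name, const char *arg_name)
{
	size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - 1;
	size_t reg_len = strlen(reg_name), arg_len = strlen(arg_name);
	const char *longer = reg_len > arg_len ? reg_name : arg_name;
	size_t cmp_len = reg_len > arg_len ? arg_len : reg_len;
	const char *suffix;

	/* Same length means same name or unrelated names, never aliases. */
	if (reg_len == arg_len)
		return false;

	/* The lengths must differ by exactly the suffix length. */
	if (reg_len != arg_len + pattern_len && arg_len != reg_len + pattern_len)
		return false;

	/* The suffix must terminate the longer name. */
	suffix = strstr(longer, NOCAST_ALIAS_SUFFIX);
	if (!suffix || suffix[pattern_len] != '\0')
		return false;

	/* The shorter name must be a prefix of the longer one. */
	return !strncmp(reg_name, arg_name, cmp_len);
}

int main(void)
{
	printf("%d\n", names_are_nocast_aliases("nf_conn", "nf_conn___init"));        /* 1 */
	printf("%d\n", names_are_nocast_aliases("nf_conn___init", "nf_conn___init")); /* 0: same type */
	printf("%d\n", names_are_nocast_aliases("nf_conn", "tcp_sock"));              /* 0 */
	return 0;
}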
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba3fff17e2f9..16da51093aff 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
if (fp->kprobe_override)
return false;
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
@@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
@@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..25355a0a367a
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global kfuncs as their definitions will be in BTF");
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ */
+struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask));
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ */
+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
+ * stored in a map.
+ * @cpumaskp: A pointer to a BPF cpumask map value.
+ *
+ * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
+ * cpumask returned by this function must either be embedded in a map as a
+ * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
+ * no BPF cpumask was found in the specified map value.
+ */
+struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* The BPF memory allocator frees memory backing its caches in an RCU
+ * callback. Thus, we can safely use RCU to ensure that the cpumask is
+ * safe to read.
+ */
+ rcu_read_lock();
+
+ cpumask = READ_ONCE(*cpumaskp);
+ if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
+ cpumask = NULL;
+
+ rcu_read_unlock();
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (!cpumask)
+ return;
+
+ if (refcount_dec_and_test(&cpumask->usage)) {
+ migrate_disable();
+ bpf_mem_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
+ }
+}
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask being cleared.
+ */
+void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+ cpumask_clear((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_and() - AND two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+bool bpf_cpumask_and(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_and((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - OR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+void bpf_cpumask_or(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_or((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - XOR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+void bpf_cpumask_xor(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_xor((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Check two cpumasks for equality.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_equal(src1, src2);
+}
+
+/**
+ * bpf_cpumask_intersects() - Check two cpumasks for overlap.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_intersects(src1, src2);
+}
+
+/**
+ * bpf_cpumask_subset() - Check if a cpumask is a subset of another.
+ * @src1: The first cpumask being checked as a subset.
+ * @src2: The second cpumask being checked as a superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_subset(src1, src2);
+}
+
+/**
+ * bpf_cpumask_empty() - Check if a cpumask is empty.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+ return cpumask_empty(cpumask);
+}
+
+/**
+ * bpf_cpumask_full() - Check if a cpumask has all bits set.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+ return cpumask_full(cpumask);
+}
+
+/**
+ * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask.
+ * @dst: The BPF cpumask being copied into.
+ * @src: The cpumask being copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+ cpumask_copy((struct cpumask *)dst, src);
+}
+
+/**
+ * bpf_cpumask_any() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+u32 bpf_cpumask_any(const struct cpumask *cpumask)
+{
+ return cpumask_any(cpumask);
+}
+
+/**
+ * bpf_cpumask_any_and() - Return a random set CPU from the AND of two
+ * cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_any_and(src1, src2);
+}
+
+__diag_pop();
+
+BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_SET8_END(cpumask_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release)
+
+static int __init cpumask_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+ {
+ .btf_id = cpumask_dtor_ids[0],
+ .kfunc_btf_id = cpumask_dtor_ids[1]
+ },
+ };
+
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
+ ARRAY_SIZE(cpumask_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
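
Not part of the patch: a minimal BPF-side sketch of how a tracing program might use the kfuncs registered above, assuming a vmlinux.h generated from a kernel carrying this change and a recent libbpf. The attach point and program name are only illustrative; the kfunc prototypes mirror the definitions in this file.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(cpumask_example, struct task_struct *task, u64 clone_flags)
{
	struct bpf_cpumask *mask;

	/* Allocated from the BPF memory allocator; may fail, never blocks. */
	mask = bpf_cpumask_create();
	if (!mask)
		return 0;

	bpf_cpumask_set_cpu(0, mask);

	/* A struct bpf_cpumask * can be passed where a const struct cpumask *
	 * is expected, since the cpumask is the first member of the wrapper.
	 */
	if (bpf_cpumask_test_cpu(0, (const struct cpumask *)mask))
		bpf_printk("CPU 0 is set");

	/* Must either be stored in a map as a kptr or released. */
	bpf_cpumask_release(mask);
	return 0;
}

char _license[] SEC("license") = "GPL";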
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 190d9f9dc987..88aae38fde66 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -41,7 +41,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map, true);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
offload = kzalloc(sizeof(*offload), GFP_USER);
if (!offload)
return -ENOMEM;
offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offload->netdev);
if (!ondev) {
- err = -EINVAL;
- goto err_unlock;
+ if (bpf_prog_is_offloaded(prog->aux)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
}
offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
kfree(offload);
return err;
}
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,24 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock();
down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
__bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -340,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -405,15 +549,6 @@ err_unlock:
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -573,12 +708,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -592,32 +743,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
- up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -625,43 +755,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -670,18 +765,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -707,3 +790,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+ if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
+ p = ops->xmo_rx_timestamp;
+ else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
+ p = ops->xmo_rx_hash;
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+late_initcall(bpf_offload_init);
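
Not part of the patch: a hedged userspace sketch of loading an XDP program that is bound to a device without being offloaded, i.e. the new BPF_F_XDP_DEV_BOUND_ONLY path handled by bpf_prog_dev_bound_init() above. Error handling is trimmed and the instructions are assumed to come from elsewhere (normally a compiled object file).

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>
#include <net/if.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* Returns a prog fd on success, -1 on error (errno set by the syscall). */
int load_dev_bound_xdp(const char *ifname, const struct bpf_insn *insns,
		       __u32 insn_cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_XDP;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	/* Bind the program to this netdev... */
	attr.prog_ifindex = if_nametoindex(ifname);
	/* ...but keep it JITed on the host instead of offloading it. */
	attr.prog_flags = BPF_F_XDP_DEV_BOUND_ONLY;

	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}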
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bf384b3346d6..99417b387547 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
@@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -1483,7 +1483,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1547,7 +1547,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1605,7 +1605,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1851,7 +1851,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- if (!bpf_map_is_dev_bound(map)) {
+ if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
@@ -1944,7 +1944,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -2245,7 +2245,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -2481,7 +2481,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32 |
- BPF_F_XDP_HAS_FRAGS))
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2565,7 +2566,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
- prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
@@ -2589,7 +2590,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog_sec;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
goto free_prog_sec;
}
@@ -3987,7 +3995,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -4215,7 +4223,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
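
Not part of the patch: a hedged libbpf-side sketch of the new freplace path above, where a BPF_PROG_TYPE_EXT program loaded against a dev-bound XDP target inherits the device binding via bpf_prog_dev_bound_inherit(). The program name "freplace_rx" and target function "xdp_rx" are purely illustrative.

#include <errno.h>
#include <bpf/libbpf.h>

/* obj holds a SEC("freplace") program; target_prog_fd is the fd of an
 * already-loaded, dev-bound XDP program. After bpf_object__load(), the
 * kernel copies the target's dev_bound/offload_requested state into the
 * new program.
 */
int load_freplace_for_dev_bound(struct bpf_object *obj, int target_prog_fd)
{
	struct bpf_program *prog;
	int err;

	prog = bpf_object__find_program_by_name(obj, "freplace_rx");
	if (!prog)
		return -ENOENT;

	/* Resolve attach_btf_id from the target program's BTF. */
	err = bpf_program__set_attach_target(prog, target_prog_fd, "xdp_rx");
	if (err)
		return err;

	return bpf_object__load(obj);
}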
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ba62f98d3c59..4cc0e70ee71e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int dynptr_id;
int map_uid;
int func_id;
struct btf *btf;
@@ -638,31 +639,57 @@ static void print_liveness(struct bpf_verifier_env *env,
verbose(env, "D");
}
-static int get_spi(s32 off)
+static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
}
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
+}
+
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
- int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
- /* We need to check that slots between [spi - nr_slots + 1, spi] are
- * within [0, allocated_stack).
- *
- * Please note that the spi grows downwards. For example, a dynptr
- * takes the size of two stack slots; the first slot will be at
- * spi and the second slot will be at spi - 1.
- */
- return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *cur = env->cur_state;
+ int off, spi;
- return cur->frame[reg->frameno];
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "dynptr has to be at a constant offset\n");
+ return -EINVAL;
+ }
+
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ return -EINVAL;
+ }
+
+ spi = __get_spi(off);
+ if (spi < 1) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS))
+ return -ERANGE;
+ return spi;
}
static const char *kernel_type_name(const struct btf* btf, u32 id)
@@ -727,37 +754,58 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
enum bpf_dynptr_type type,
- bool first_slot);
+ bool first_slot, int dynptr_id);
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
-static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1,
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
struct bpf_reg_state *sreg2,
enum bpf_dynptr_type type)
{
- __mark_dynptr_reg(sreg1, type, true);
- __mark_dynptr_reg(sreg2, type, false);
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
}
-static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
enum bpf_dynptr_type type)
{
- __mark_dynptr_reg(reg, type, true);
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
}
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type type;
- int spi, i, id;
-
- spi = get_spi(reg->off);
-
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return -EINVAL;
+ int spi, i, id, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot twice for both,
+ * to ensure that for the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_DYNPTR;
@@ -768,7 +816,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (type == BPF_DYNPTR_TYPE_INVALID)
return -EINVAL;
- mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr,
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
&state->stack[spi - 1].spilled_ptr, type);
if (dynptr_type_refcounted(type)) {
@@ -781,6 +829,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
return 0;
}
@@ -789,10 +840,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
struct bpf_func_state *state = func(env, reg);
int spi, i;
- spi = get_spi(reg->off);
-
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return -EINVAL;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_INVALID;
@@ -805,43 +855,133 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
+ *
+ * While we don't allow reading STACK_INVALID, it is still possible to
+ * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
+ * helpers or insns can do partial read of that part without failing,
+ * but check_stack_range_initialized, check_stack_read_var_off, and
+ * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
+ * the slot conservatively. Hence we need to prevent those liveness
+ * marking walks.
+ *
+ * This was not a problem before because STACK_INVALID is only set by
+ * default (where the default reg state has its reg->parent as NULL), or
+ * in clean_live_states after REG_LIVE_DONE (at which point
+ * mark_reg_read won't walk reg->parent chain), but not randomly during
+ * verifier state exploration (like we did above). Hence, for our case
+ * the parentage chain will still be live (i.e. reg->parent may be
+ * non-NULL), while earlier reg->parent was NULL, so we need
+ * REG_LIVE_WRITTEN to screen off read marker propagation when it is
+ * done later on reads or by mark_dynptr_read, which would otherwise
+ * unnecessarily mark registers in the verifier state.
+ */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
return 0;
}
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
{
- struct bpf_func_state *state = func(env, reg);
- int spi, i;
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
- if (reg->type == CONST_PTR_TO_DYNPTR)
- return false;
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
- spi = get_spi(reg->off);
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return true;
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+ /* Writing partially to one dynptr stack slot destroys both. */
for (i = 0; i < BPF_REG_SIZE; i++) {
- if (state->stack[spi].slot_type[i] == STACK_DYNPTR ||
- state->stack[spi - 1].slot_type[i] == STACK_DYNPTR)
- return false;
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id) {
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, dreg);
+ else
+ __mark_reg_unknown(env, dreg);
+ }
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Same reason as unmark_stack_slots_dynptr above */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi)
+{
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ if (spi < 0)
+ return spi == -ERANGE;
+ /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
+ * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
+ * ensure dynptr objects at the slots we are touching are completely
+ * destructed before we reinitialize them for a new one. For referenced
+ * ones, destroy_if_dynptr_stack_slot returns an error early instead of
+ * delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi)
{
struct bpf_func_state *state = func(env, reg);
- int spi;
int i;
/* This already represents first slot of initialized bpf_dynptr */
if (reg->type == CONST_PTR_TO_DYNPTR)
return true;
- spi = get_spi(reg->off);
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.dynptr.first_slot)
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
return false;
for (i = 0; i < BPF_REG_SIZE; i++) {
@@ -868,7 +1008,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
if (reg->type == CONST_PTR_TO_DYNPTR) {
return reg->dynptr.type == dynptr_type;
} else {
- spi = get_spi(reg->off);
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
}
}
@@ -1449,7 +1591,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
- bool first_slot)
+ bool first_slot, int dynptr_id)
{
/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
* callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
@@ -1457,6 +1599,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty
*/
__mark_reg_known_zero(reg);
reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
reg->dynptr.type = type;
reg->dynptr.first_slot = first_slot;
}
@@ -2189,6 +2333,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
desc->imm = call_imm;
@@ -2390,6 +2540,32 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ret;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
+ if (ret)
+ return ret;
+ return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
+ state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -3311,6 +3487,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
}
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
@@ -3424,6 +3604,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
if (err)
return err;
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
@@ -4763,6 +4951,25 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
+#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields)
+
+BTF_TYPE_SAFE_NESTED(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+};
+
+static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ int off)
+{
+ /* If its parent is not trusted, it can't regain its trusted status. */
+ if (!is_trusted_reg(reg))
+ return false;
+
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
+
+ return btf_nested_type_is_trusted(&env->log, reg, off);
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
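[Editorial illustration, not part of the patch] To show what the safe-fields annotation above enables from the BPF program side: walking a trusted struct task_struct pointer to its cpus_ptr field now yields a pointer that keeps PTR_TRUSTED, so it can be passed to kfuncs that require trusted arguments. The bpf_cpumask_test_cpu() kfunc is assumed from the cpumask kfuncs added in kernel/bpf/cpumask.c by this series.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* kfunc prototype, assumed from kernel/bpf/cpumask.c */
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(check_cpu0, struct task_struct *task, u64 clone_flags)
{
	/* task is trusted; task->cpus_ptr inherits PTR_TRUSTED because
	 * cpus_ptr is listed in BTF_TYPE_SAFE_NESTED(struct task_struct).
	 */
	if (bpf_cpumask_test_cpu(0, task->cpus_ptr))
		bpf_printk("new task may run on CPU 0");
	return 0;
}

char LICENSE[] SEC("license") = "GPL";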
@@ -4851,10 +5058,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (type_flag(reg->type) & PTR_UNTRUSTED)
flag |= PTR_UNTRUSTED;
- /* By default any pointer obtained from walking a trusted pointer is
- * no longer trusted except the rcu case below.
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust.
+ *
+ * An RCU-protected pointer can also be deemed trusted if we are inside
+ * an RCU read-side critical section. That case is handled below.
*/
- flag &= ~PTR_TRUSTED;
+ if (nested_ptr_is_trusted(env, reg, off))
+ flag |= PTR_TRUSTED;
+ else
+ flag &= ~PTR_TRUSTED;
if (flag & MEM_RCU) {
/* Mark value register as MEM_RCU only if it is protected by
@@ -5451,6 +5665,31 @@ static int check_stack_range_initialized(
}
if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte-by-byte
+ * access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninit_mem may overlap with the dynptr pointer. Hence, it
+ * may end up writing to the dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case-by-case basis for known safe
+ * cases, but reject by default due to the possibility of aliasing.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
meta->access_size = access_size;
meta->regno = regno;
return 0;
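[Editorial illustration, not part of the patch] As a concrete, purely hypothetical example of the aliasing described in the comment above: no in-tree helper currently has this argument layout, but if one did, the verifier would now reject a call whose raw-mode memory argument overlaps the dynptr's own stack slots. The helper name bpf_copy_into() is invented; the ringbuf helpers are existing UAPI helpers.

/* Hypothetical helper for illustration only:
 *   long bpf_copy_into(void *uninit_mem, u32 len, struct bpf_dynptr *ptr);
 * ringbuf is a BPF_MAP_TYPE_RINGBUF map declared elsewhere in the program.
 */
struct bpf_dynptr dptr;

bpf_ringbuf_reserve_dynptr(&ringbuf, 8, 0, &dptr);
/* uninit_mem aliases dptr's stack slots here, so the simulated
 * byte-by-byte write could clobber the dynptr itself; the verifier
 * reports "potential write to dynptr at off=... disallowed".
 */
bpf_copy_into((void *)&dptr, sizeof(dptr), &dptr);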
@@ -5938,6 +6177,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int spi = 0;
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -5948,12 +6188,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
}
/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
* check_func_arg_reg_off's logic. We only need to check offset
- * alignment for PTR_TO_STACK.
+ * and its alignment for PTR_TO_STACK.
*/
- if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) {
- verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off);
- return -EINVAL;
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 && spi != -ERANGE)
+ return spi;
}
+
/* MEM_UNINIT - Points to memory that is an appropriate candidate for
* constructing a mutable bpf_dynptr object.
*
@@ -5970,7 +6212,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
* to.
*/
if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
return -EINVAL;
}
@@ -5985,13 +6227,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
meta->uninit_dynptr_regno = regno;
} else /* MEM_RDONLY and None case from above */ {
+ int err;
+
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
return -EINVAL;
}
- if (!is_dynptr_reg_valid_init(env, reg)) {
+ if (!is_dynptr_reg_valid_init(env, reg, spi)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
regno);
@@ -6018,6 +6262,10 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
err_extra, regno);
return -EINVAL;
}
+
+ err = mark_dynptr_read(env, reg);
+ if (err)
+ return err;
}
return 0;
}
@@ -6355,15 +6603,29 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
}
}
-static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
- return reg->ref_obj_id;
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.id;
+}
+
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
- spi = get_spi(reg->off);
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
return state->stack[spi].spilled_ptr.ref_obj_id;
}
@@ -6437,9 +6699,8 @@ skip_type_check:
* PTR_TO_STACK.
*/
if (reg->type == PTR_TO_STACK) {
- spi = get_spi(reg->off);
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.ref_obj_id) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
verbose(env, "arg %d is an unacquired reference\n", regno);
return -EINVAL;
}
@@ -7421,7 +7682,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
* callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
*/
__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
- mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
@@ -7927,13 +8188,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
if (arg_type_is_dynptr(fn->arg_type[i])) {
struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+ int id, ref_obj_id;
+
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
if (meta.ref_obj_id) {
verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
return -EFAULT;
}
- meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
break;
}
}
@@ -8089,6 +8369,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
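[Editorial illustration, not part of the patch] The dynptr_id propagated to R0 above is what ties slices returned by bpf_dynptr_data() back to their parent dynptr, so invalidating the dynptr also invalidates its slices. A rough BPF-side sketch of the pattern this tracks (the helpers named are existing UAPI helpers; ringbuf is a BPF_MAP_TYPE_RINGBUF map declared elsewhere):

struct bpf_dynptr dptr;
void *slice;

bpf_ringbuf_reserve_dynptr(&ringbuf, 16, 0, &dptr);
slice = bpf_dynptr_data(&dptr, 0, 8);  /* R0 inherits dptr's dynptr_id */
bpf_ringbuf_discard_dynptr(&dptr, 0);  /* releases and invalidates dptr */
if (slice)
	*(char *)slice = 0;            /* rejected: slice shares dptr's id */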
@@ -8545,9 +8828,37 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
}
- if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id))
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if (is_kfunc_acquire(meta) ||
+ (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
+ WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
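[Editorial illustration, not part of the patch] From the BPF program side, the bitwise-equivalence case described in the comment above looks roughly like the following sketch, assuming the bpf_cpumask_*() kfuncs added in kernel/bpf/cpumask.c by this series:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(cpumask_equiv, struct task_struct *task, u64 clone_flags)
{
	struct bpf_cpumask *mask = bpf_cpumask_create();

	if (!mask)
		return 0;
	/* struct bpf_cpumask * is accepted where struct cpumask * is
	 * expected: cpumask_t sits at offset 0, so the non-strict
	 * btf_struct_ids_match() walk resolves the types.
	 */
	bpf_cpumask_test_cpu(0, (const struct cpumask *)mask);
	/* bpf_cpumask_release() is a release kfunc, so the exact
	 * struct bpf_cpumask * type is required here (strict matching).
	 */
	bpf_cpumask_release(mask);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";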
@@ -8891,6 +9202,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
+ if (is_kfunc_trusted_args(meta) &&
+ (register_is_null(reg) || type_may_be_null(reg->type))) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
if (reg->ref_obj_id) {
if (is_kfunc_release(meta) && meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -13223,10 +13540,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (!is_spilled_reg(&old->stack[spi]))
- continue;
- if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap))
+ /* Both old and cur have the same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -13237,7 +13553,30 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ {
+ const struct bpf_reg_state *old_reg, *cur_reg;
+
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ }
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
return false;
+ }
}
return true;
}
@@ -13834,7 +14173,7 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_log_len = env->log.len_used;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
@@ -14314,7 +14653,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -14795,7 +15134,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
err = bpf_remove_insns(env->prog, off, cnt);
@@ -14876,7 +15215,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -15063,7 +15402,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -15463,7 +15802,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -15507,12 +15846,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
+ void *xdp_kfunc;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
return -EINVAL;
}
+ *cnt = 0;
+
+ if (bpf_dev_bound_kfunc_id(insn->imm)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
+ if (xdp_kfunc) {
+ insn->imm = BPF_CALL_IMM(xdp_kfunc);
+ return 0;
+ }
+
+ /* fall back to the default kfunc when not supported by the netdev */
+ }
+
/* insn->imm has the btf func_id. Replace it with
* an address (relative to __bpf_call_base).
*/
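[Editorial illustration, not part of the patch] For context, the device-bound branch above is what lets XDP metadata kfuncs resolve to a netdev-specific implementation at fixup time, with the default kfunc as a fallback. A rough usage sketch, assuming the bpf_xdp_metadata_rx_timestamp() kfunc from the XDP metadata work that this dev-bound infrastructure supports:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) __ksym;

SEC("xdp")
int rx_tstamp(struct xdp_md *ctx)
{
	u64 ts = 0;

	/* If the program was loaded dev-bound (BPF_F_XDP_DEV_BOUND_ONLY
	 * plus an ifindex) and the driver implements this kfunc, insn->imm
	 * is patched to the driver's implementation; otherwise the default
	 * kfunc runs and returns an error.
	 */
	if (!bpf_xdp_metadata_rx_timestamp(ctx, &ts) && ts)
		bpf_printk("rx hw timestamp: %llu", ts);
	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";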
@@ -15523,7 +15875,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
- *cnt = 0;
insn->imm = desc->imm;
if (insn->off)
return 0;
@@ -16449,7 +16800,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member);
+ int err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -16530,6 +16881,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
subprog = i;
@@ -16751,6 +17108,24 @@ BTF_ID(func, rcu_read_unlock_strict)
#endif
BTF_SET_END(btf_id_deny)
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING) {
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ case BPF_TRACE_ITER:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return prog->type == BPF_PROG_TYPE_LSM ||
+ prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -16769,9 +17144,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
- verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
+ if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
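[Editorial illustration, not part of the patch] With the refactored check above, sleepable programs are limited to the enumerated tracing attach types and are now additionally permitted for struct_ops programs. For reference, a sleepable iterator looks roughly like the following sketch, using libbpf's ".s" section suffix; BPF_SEQ_PRINTF and the iter context types come from the existing bpf_iter infrastructure.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("iter.s/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;

	if (!task)
		return 0;
	/* Sleepable context: may-fault helpers such as
	 * bpf_copy_from_user() are usable here.
	 */
	BPF_SEQ_PRINTF(ctx->meta->seq, "%d %s\n", task->pid, task->comm);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";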
@@ -16950,7 +17324,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
@@ -16963,7 +17337,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
@@ -16998,7 +17372,7 @@ skip_full_check:
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;