diff options
Diffstat (limited to 'kernel')
89 files changed, 3791 insertions, 2022 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 78701ea37c97..c6b299a6b786 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -kheaders.md5 -timeconst.h -hz.bc +/config_data +/kheaders.md5 diff --git a/kernel/Makefile b/kernel/Makefile index e8a6715f38dc..4df609be42d0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -142,10 +142,15 @@ obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o $(obj)/configs.o: $(obj)/config_data.gz -targets += config_data.gz -$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE +targets += config_data config_data.gz +$(obj)/config_data.gz: $(obj)/config_data FORCE $(call if_changed,gzip) +filechk_cat = cat $< + +$(obj)/config_data: $(KCONFIG_CONFIG) FORCE + $(call filechk,cat) + $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz diff --git a/kernel/async.c b/kernel/async.c index 33258e6e20f8..b8d7a663497f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; +static long long microseconds_since(ktime_t start) +{ + ktime_t now = ktime_get(); + return ktime_to_ns(ktime_sub(now, start)) >> 10; +} + static async_cookie_t lowest_in_progress(struct async_domain *domain) { struct async_entry *first = NULL; @@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t calltime, delta, rettime; + ktime_t calltime; /* 1) run (and print duration) */ - if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pS @ %i\n", - (long long)entry->cookie, - entry->func, task_pid_nr(current)); - calltime = ktime_get(); - } + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, + entry->func, task_pid_nr(current)); + calltime = ktime_get(); + entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state < SYSTEM_RUNNING) { - rettime = ktime_get(); - delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", - (long long)entry->cookie, - entry->func, - (long long)ktime_to_ns(delta) >> 10); - } + + pr_debug("initcall %lli_%pS returned after %lld usecs\n", + (long long)entry->cookie, entry->func, + microseconds_since(calltime)); /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); @@ -246,24 +246,6 @@ void async_synchronize_full(void) EXPORT_SYMBOL_GPL(async_synchronize_full); /** - * async_unregister_domain - ensure no more anonymous waiters on this domain - * @domain: idle domain to flush out of any async_synchronize_full instances - * - * async_synchronize_{cookie|full}_domain() are not flushed since callers - * of these routines should know the lifetime of @domain - * - * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing - */ -void async_unregister_domain(struct async_domain *domain) -{ - spin_lock_irq(&async_lock); - WARN_ON(!domain->registered || !list_empty(&domain->pending)); - domain->registered = 0; - spin_unlock_irq(&async_lock); -} -EXPORT_SYMBOL_GPL(async_unregister_domain); - -/** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: the domain to synchronize * @@ -287,23 +269,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t starttime, delta, endtime; + ktime_t starttime; - if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("async_waiting @ %i\n", task_pid_nr(current)); - starttime = ktime_get(); - } + pr_debug("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state < SYSTEM_RUNNING) { - endtime = ktime_get(); - delta = ktime_sub(endtime, starttime); - - pr_debug("async_continuing @ %i after %lli usec\n", - task_pid_nr(current), - (long long)ktime_to_ns(delta) >> 10); - } + pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), + microseconds_since(starttime)); } EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6b..7f33098ca63f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1f8453343bf2..3c4105603f9d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; +static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + u32 i, key, num_elems = 0; + struct bpf_array *array; + bool is_percpu; + u64 ret = 0; + void *val; + + if (flags != 0) + return -EINVAL; + + is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + array = container_of(map, struct bpf_array, map); + if (is_percpu) + migrate_disable(); + for (i = 0; i < map->max_entries; i++) { + if (is_percpu) + val = this_cpu_ptr(array->pptrs[i]); + else + val = array->value + array->elem_size * i; + num_elems++; + key = i; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)&key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + break; + } + + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, @@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, .iter_seq_info = &iter_seq_info, @@ -660,6 +698,10 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, .iter_seq_info = &iter_seq_info, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index b58b2efb9b43..2921ca39a93e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static int inode_storage_map_btf_id; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e7..b305270b7a4b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; + unsigned long flags; if (unlikely(!selem_linked_to_storage(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference(selem->local_storage); - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) kfree_rcu(local_storage, rcu); @@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; + unsigned long flags; if (unlikely(!selem_linked_to_map(selem))) /* selem has already be unlinked from smap */ @@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) smap = rcu_dereference(SDATA(selem)->smap); b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + unsigned long flags; - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem) @@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, sdata = SDATA(selem); if (cacheit_lockit) { + unsigned long flags; + /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; @@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; + unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } unlock: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return SDATA(selem); unlock_err: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return ERR_PTR(err); } @@ -468,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map_bucket *b; @@ -497,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + __this_cpu_inc(*busy_counter); + } bpf_selem_unlink(selem); + if (busy_counter) { + __this_cpu_dec(*busy_counter); + migrate_enable(); + } cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0ff58259ccf8..5efb2b24012c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -67,7 +67,7 @@ BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags) BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm) -const static struct bpf_func_proto bpf_bprm_opts_set_proto = { +static const struct bpf_func_proto bpf_bprm_opts_set_proto = { .func = bpf_bprm_opts_set, .gpl_only = false, .ret_type = RET_INTEGER, @@ -88,7 +88,7 @@ static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog) BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) -const static struct bpf_func_proto bpf_ima_inode_hash_proto = { +static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .func = bpf_ima_inode_hash, .gpl_only = false, .ret_type = RET_INTEGER, @@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e0da0258b732..3ce75758d394 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -15,21 +15,41 @@ #include <linux/bpf_local_storage.h> #include <linux/filter.h> #include <uapi/linux/btf.h> -#include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> DEFINE_BPF_STORAGE_CACHE(task_cache); +static DEFINE_PER_CPU(int, bpf_task_storage_busy); + +static void bpf_task_storage_lock(void) +{ + migrate_disable(); + __this_cpu_inc(bpf_task_storage_busy); +} + +static void bpf_task_storage_unlock(void) +{ + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); +} + +static bool bpf_task_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); + return false; + } + return true; +} + static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; - struct bpf_storage_blob *bsb; - bsb = bpf_task(task); - if (!bsb) - return NULL; - return &bsb->storage; + return &task->bpf_storage; } static struct bpf_local_storage_data * @@ -38,13 +58,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, { struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - struct bpf_storage_blob *bsb; - - bsb = bpf_task(task); - if (!bsb) - return NULL; - task_storage = rcu_dereference(bsb->storage); + task_storage = rcu_dereference(task->bpf_storage); if (!task_storage) return NULL; @@ -57,16 +72,12 @@ void bpf_task_storage_free(struct task_struct *task) struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct bpf_storage_blob *bsb; struct hlist_node *n; - - bsb = bpf_task(task); - if (!bsb) - return; + unsigned long flags; rcu_read_lock(); - local_storage = rcu_dereference(bsb->storage); + local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) { rcu_read_unlock(); return; @@ -81,7 +92,8 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_bh(&local_storage->lock); + bpf_task_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -90,7 +102,8 @@ void bpf_task_storage_free(struct task_struct *task) free_task_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, false); } - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_task_storage_unlock(); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -123,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); + bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: @@ -150,13 +165,15 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task || !task_storage_ptr(task)) { + if (!task) { err = -ENOENT; goto out; } + bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags); + bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -199,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); err = task_storage_delete(task, map); + bpf_task_storage_unlock(); out: put_pid(pid); return err; @@ -213,44 +232,47 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; - /* explicitly check that the task_storage_ptr is not - * NULL as task_storage_lookup returns NULL in this case and - * bpf_local_storage_update expects the owner to have a - * valid storage pointer. - */ - if (!task || !task_storage_ptr(task)) + if (!task) + return (unsigned long)NULL; + + if (!bpf_task_storage_trylock()) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); if (sdata) - return (unsigned long)sdata->data; + goto unlock; - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* only allocate new storage, when the task is refcounted */ + if (refcount_read(&task->usage) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); - return IS_ERR(sdata) ? (unsigned long)NULL : - (unsigned long)sdata->data; - } - return (unsigned long)NULL; +unlock: + bpf_task_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + int ret; + if (!task) return -EINVAL; + if (!bpf_task_storage_trylock()) + return -EBUSY; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - return task_storage_delete(task, map); + ret = task_storage_delete(task, map); + bpf_task_storage_unlock(); + return ret; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -276,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } static int task_storage_map_btf_id; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b1a76fe046cb..0600ed325fa0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -173,7 +173,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x8f00ffff +#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -280,9 +280,10 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; -static const char *btf_type_str(const struct btf_type *t) +const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } @@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: return true; } @@ -787,7 +789,6 @@ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, while (btf_type_is_modifier(t) && BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { - id = t->type; t = btf_type_by_id(btf, t->type); } @@ -1704,6 +1705,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: size = type->size; goto resolved; @@ -1849,7 +1851,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env, return -EINVAL; } -/* Used for ptr, array and struct/union type members. +/* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. */ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, @@ -3675,6 +3677,81 @@ static const struct btf_kind_operations datasec_ops = { .show = btf_datasec_show, }; +static s32 btf_float_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && + t->size != 16) { + btf_verifier_log_type(env, t, "Invalid type_size"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static int btf_float_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u64 start_offset_bytes; + u64 end_offset_bytes; + u64 misalign_bits; + u64 align_bytes; + u64 align_bits; + + /* Different architectures have different alignment requirements, so + * here we check only for the reasonable minimum. This way we ensure + * that types after CO-RE can pass the kernel BTF verifier. + */ + align_bytes = min_t(u64, sizeof(void *), member_type->size); + align_bits = align_bytes * BITS_PER_BYTE; + div64_u64_rem(member->offset, align_bits, &misalign_bits); + if (misalign_bits) { + btf_verifier_log_member(env, struct_type, member, + "Member is not properly aligned"); + return -EINVAL; + } + + start_offset_bytes = member->offset / BITS_PER_BYTE; + end_offset_bytes = start_offset_bytes + member_type->size; + if (end_offset_bytes > struct_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static void btf_float_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u", t->size); +} + +static const struct btf_kind_operations float_ops = { + .check_meta = btf_float_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_float_check_member, + .check_kflag_member = btf_generic_check_kflag_member, + .log_details = btf_float_log, + .show = btf_df_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -3808,6 +3885,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, + [BTF_KIND_FLOAT] = &float_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -4298,7 +4376,7 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_LINK_TYPE static const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { @@ -4592,8 +4670,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ - nr_args = t ? btf_type_vlen(t) : 5; + /* if (t == NULL) Fall back to default BPF prog with + * MAX_BPF_FUNC_REG_ARGS u64 arguments. + */ + nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -4649,7 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } else { if (!t) - /* Default prog with 5 args */ + /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } @@ -5100,12 +5180,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (!func) { /* BTF function prototype doesn't match the verifier types. - * Fall back to 5 u64 args. + * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < 5; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) m->arg_size[i] = 8; m->ret_size = 8; - m->nr_args = 5; + m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); @@ -5281,121 +5361,190 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -/* Compare BTF of a function with given bpf_reg_state. - * Returns: - * EFAULT - there is a verifier bug. Abort verification. - * EINVAL - there is a type mismatch or BTF is not available. - * 0 - BTF matches with what bpf_reg_state expects. - * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. - */ -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + +static int btf_check_func_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs, + bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; - struct bpf_prog *prog = env->prog; - struct btf *btf = prog->aux->btf; - const struct btf_param *args; + const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; - u32 i, nargs, btf_id, type_size; - const char *tname; - bool is_global; - - if (!prog->aux->func_info) - return -EINVAL; - - btf_id = prog->aux->func_info[subprog].type_id; - if (!btf_id) - return -EFAULT; - - if (prog->aux->func_info_aux[subprog].unreliable) - return -EINVAL; + const struct btf_param *args; + u32 i, nargs, ref_id; - t = btf_type_by_id(btf, btf_id); + t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { /* These checks were already done by the verifier while loading - * struct bpf_func_info + * struct bpf_func_info or in add_kfunc_call(). */ - bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", - subprog); + bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", + func_id); return -EFAULT; } - tname = btf_name_by_offset(btf, t->name_off); + func_name = btf_name_by_offset(btf, t->name_off); t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid BTF of func %s\n", tname); + bpf_log(log, "Invalid BTF of func %s\n", func_name); return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); - goto out; + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, + MAX_BPF_FUNC_REG_ARGS); + return -EINVAL; } - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *reg = ®s[i + 1]; + u32 regno = i + 1; + struct bpf_reg_state *reg = ®s[regno]; - t = btf_type_by_id(btf, args[i].type); - while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + if (btf_type_is_scalar(t)) { if (reg->type == SCALAR_VALUE) continue; - bpf_log(log, "R%d is not a scalar\n", i + 1); - goto out; + bpf_log(log, "R%d is not a scalar\n", regno); + return -EINVAL; } - if (btf_type_is_ptr(t)) { + + if (!btf_type_is_ptr(t)) { + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + if (btf_is_kernel(btf)) { + const struct btf_type *reg_ref_t; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (reg->type == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else if (reg2btf_ids[reg->type]) { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[reg->type]; + } else { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", + func_name, i, + btf_type_str(ref_t), ref_tname, regno); + return -EINVAL; + } + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, + ®_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, + reg_ref_t->name_off); + if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, + reg->off, btf, ref_id)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", + func_name, i, + btf_type_str(ref_t), ref_tname, + regno, btf_type_str(reg_ref_t), + reg_ref_tname); + return -EINVAL; + } + } else if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ - if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)]); - goto out; - } - if (check_ctx_reg(env, reg, i + 1)) - goto out; - continue; + if (reg->type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_type_str(t)); + return -EINVAL; } + if (check_ctx_reg(env, reg, regno)) + return -EINVAL; + } else if (ptr_to_mem_ok) { + const struct btf_type *resolve_ret; + u32 type_size; - if (!is_global) - goto out; - - t = btf_type_skip_modifiers(btf, t->type, NULL); - - ref_t = btf_resolve_size(btf, t, &type_size); - if (IS_ERR(ref_t)) { + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); + if (IS_ERR(resolve_ret)) { bpf_log(log, - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), - PTR_ERR(ref_t)); - goto out; + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(ref_t), ref_tname, + PTR_ERR(resolve_ret)); + return -EINVAL; } - if (check_mem_reg(env, reg, i + 1, type_size)) - goto out; - - continue; + if (check_mem_reg(env, reg, regno, type_size)) + return -EINVAL; + } else { + return -EINVAL; } - bpf_log(log, "Unrecognized arg#%d type %s\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)]); - goto out; } + return 0; -out: +} + +/* Compare BTF of a function with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); + /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. * In such cases mark the function as unreliable from BTF point of view. */ - prog->aux->func_info_aux[subprog].unreliable = true; - return -EINVAL; + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs) +{ + return btf_check_func_arg_match(env, btf, func_id, regs, false); } /* Convert BTF of a function into bpf_reg_state if possible @@ -5458,9 +5607,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", - tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", + tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 75244ecb2389..5e31ee9f7512 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -143,25 +143,25 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; - prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, - sizeof(*prog->aux->jited_linfo), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, + sizeof(*prog->aux->jited_linfo), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } -void bpf_prog_free_jited_linfo(struct bpf_prog *prog) +void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { - kfree(prog->aux->jited_linfo); - prog->aux->jited_linfo = NULL; -} + if (prog->aux->jited_linfo && + (!prog->jited || !prog->aux->jited_linfo[0])) { + kvfree(prog->aux->jited_linfo); + prog->aux->jited_linfo = NULL; + } -void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) -{ - if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) - bpf_prog_free_jited_linfo(prog); + kfree(prog->aux->kfunc_tab); + prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array @@ -217,12 +217,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } -void bpf_prog_free_linfo(struct bpf_prog *prog) -{ - bpf_prog_free_jited_linfo(prog); - kvfree(prog->aux->linfo); -} - struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { @@ -1369,11 +1363,10 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions - * @stack: is the eBPF storage stack * * Decode and execute eBPF instructions. */ -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z @@ -1707,7 +1700,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ - return ___bpf_prog_run(regs, insn, stack); \ + return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size @@ -1724,7 +1717,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ - return ___bpf_prog_run(regs, insn, stack); \ + return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) @@ -1849,9 +1842,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ + bool jit_needed = false; + if (fp->bpf_func) goto finalize; + if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || + bpf_prog_has_kfunc_call(fp)) + jit_needed = true; + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant @@ -1866,14 +1865,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) return fp; fp = bpf_int_jit_compile(fp); - if (!fp->jited) { - bpf_prog_free_jited_linfo(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON + bpf_prog_jit_attempt_done(fp); + if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; -#endif - } else { - bpf_prog_free_unused_jited_linfo(fp); } } else { *err = bpf_prog_offload_compile(fp); @@ -2354,6 +2349,11 @@ bool __weak bpf_jit_needs_zext(void) return false; } +bool __weak bpf_jit_supports_kfunc_call(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5d1469de6921..5dd3e866599a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -27,7 +27,7 @@ #include <linux/capability.h> #include <trace/events/xdp.h> -#include <linux/netdevice.h> /* netif_receive_skb_core */ +#include <linux/netdevice.h> /* netif_receive_skb_list */ #include <linux/etherdevice.h> /* eth_type_trans */ /* General idea: XDP packets getting XDP redirected to another CPU, @@ -252,11 +252,12 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ + unsigned int kmem_alloc_drops = 0, sched = 0; gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; int i, n, m, nframes; + LIST_HEAD(list); /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -297,7 +298,7 @@ static int cpu_map_kthread_run(void *data) if (unlikely(m == 0)) { for (i = 0; i < nframes; i++) skbs[i] = NULL; /* effect: xdp_return_frame */ - drops += nframes; + kmem_alloc_drops += nframes; } } @@ -305,7 +306,6 @@ static int cpu_map_kthread_run(void *data) for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; - int ret; skb = __xdp_build_skb_from_frame(xdpf, skb, xdpf->dev_rx); @@ -314,13 +314,13 @@ static int cpu_map_kthread_run(void *data) continue; } - /* Inject into network stack */ - ret = netif_receive_skb_core(skb); - if (ret == NET_RX_DROP) - drops++; + list_add_tail(&skb->list, &list); } + netif_receive_skb_list(&list); + /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); + trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, + sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map) * complete. */ - bpf_clear_redirect_map(map); synchronize_rcu(); /* For cpu_map the remote CPUs can still be using the entries @@ -563,7 +562,7 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -600,6 +599,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); +} + static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -612,6 +616,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_cpu_map", .map_btf_id = &cpu_map_btf_id, + .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 85d9d1b72a33..aa516472ce46 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -258,7 +257,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -330,7 +329,7 @@ bool dev_map_can_have_prog(struct bpf_map *map) static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; - int sent = 0, drops = 0, err = 0; + int sent = 0, err = 0; int i; if (unlikely(!bq->count)) @@ -344,29 +343,23 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); if (sent < 0) { + /* If ndo_xdp_xmit fails with an errno, no frames have + * been xmit'ed. + */ err = sent; sent = 0; - goto error; } - drops = bq->count - sent; -out: - bq->count = 0; - trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); - bq->dev_rx = NULL; - __list_del_clearprev(&bq->flush_node); - return; -error: - /* If ndo_xdp_xmit fails with an errno, no frames have been - * xmit'ed and it's our responsibility to them free all. + /* If not all frames have been transmitted, it is our + * responsibility to free them */ - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; + for (i = sent; unlikely(i < bq->count); i++) + xdp_return_frame_rx_napi(bq->q[i]); - xdp_return_frame_rx_napi(xdpf); - drops++; - } - goto out; + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err); + bq->dev_rx = NULL; + bq->count = 0; + __list_del_clearprev(&bq->flush_node); } /* __dev_flush is called from xdp_do_flush() which _must_ be signaled @@ -392,7 +385,7 @@ void __dev_flush(void) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -735,6 +728,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); +} + static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -747,6 +750,7 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_btf_id, + .map_redirect = dev_map_redirect, }; static int dev_map_hash_map_btf_id; @@ -761,6 +765,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_hash_map_btf_id, + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index faa54d58972c..bbfc6bb79240 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs, { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - if (insn->src_reg != BPF_PSEUDO_CALL && + if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; - if (cbs && cbs->cb_call) - return cbs->cb_call(cbs->private_data, insn); + if (cbs && cbs->cb_call) { + const char *res; + + res = cbs->cb_call(cbs->private_data, insn); + if (res) + return res; + } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + snprintf(buff, len, "kernel-function"); return buff; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d63912e73ad9..d7ebb12ffffc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -31,7 +31,7 @@ /* * The bucket lock has two protection scopes: * - * 1) Serializing concurrent operations from BPF programs on differrent + * 1) Serializing concurrent operations from BPF programs on different * CPUs * * 2) Serializing concurrent operations from BPF programs and sys_bpf() @@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; +static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + u32 roundup_key_size; + int i, num_elems = 0; + void __percpu *pptr; + struct bucket *b; + void *key, *val; + bool is_percpu; + u64 ret = 0; + + if (flags != 0) + return -EINVAL; + + is_percpu = htab_is_percpu(htab); + + roundup_key_size = round_up(map->key_size, 8); + /* disable migration so percpu value prepared here will be the + * same as the one seen by the bpf program with bpf_map_lookup_elem(). + */ + if (is_percpu) + migrate_disable(); + for (i = 0; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + rcu_read_lock(); + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + key = elem->key; + if (is_percpu) { + /* current cpu value for percpu map */ + pptr = htab_elem_get_ptr(elem, map->key_size); + val = this_cpu_ptr(pptr); + } else { + val = elem->key + roundup_key_size; + } + num_elems++; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) { + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } +out: + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, @@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, @@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, @@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..544773970dbc 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage*, - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DECLARE_PER_CPU(struct bpf_cgroup_storage_info, + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; + struct bpf_cgroup_storage *storage = NULL; void *ptr; + int i; - storage = this_cpu_read(bpf_cgroup_storage[stype]); + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + continue; + + storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); + break; + } if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; @@ -662,6 +669,322 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; +static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + fallthrough; +#endif + case 'k': + return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + case 'u': + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + } + + return -EINVAL; +} + +/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p + */ +#define MAX_PRINTF_BUF_LEN 512 + +struct bpf_printf_buf { + char tmp_buf[MAX_PRINTF_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); +static DEFINE_PER_CPU(int, bpf_printf_buf_used); + +static int try_get_fmt_tmp_buf(char **tmp_buf) +{ + struct bpf_printf_buf *bufs; + int used; + + preempt_disable(); + used = this_cpu_inc_return(bpf_printf_buf_used); + if (WARN_ON_ONCE(used > 1)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + return -EBUSY; + } + bufs = this_cpu_ptr(&bpf_printf_buf); + *tmp_buf = bufs->tmp_buf; + + return 0; +} + +void bpf_bprintf_cleanup(void) +{ + if (this_cpu_read(bpf_printf_buf_used)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + } +} + +/* + * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers + * + * Returns a negative value if fmt is an invalid format string or 0 otherwise. + * + * This can be used in two ways: + * - Format string verification only: when bin_args is NULL + * - Arguments preparation: in addition to the above verification, it writes in + * bin_args a binary representation of arguments usable by bstr_printf where + * pointers from BPF have been sanitized. + * + * In argument preparation mode, if 0 is returned, safe temporary buffers are + * allocated and bpf_bprintf_cleanup should be called to free them after use. + */ +int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 **bin_args, u32 num_args) +{ + char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; + size_t sizeof_cur_arg, sizeof_cur_ip; + int err, i, num_spec = 0; + u64 cur_arg; + char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; + + fmt_end = strnchr(fmt, fmt_size, 0); + if (!fmt_end) + return -EINVAL; + fmt_size = fmt_end - fmt; + + if (bin_args) { + if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) + return -EBUSY; + + tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN; + *bin_args = (u32 *)tmp_buf; + } + + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (num_spec >= num_args) { + err = -EINVAL; + goto out; + } + + /* The string is zero-terminated so if fmt[i] != 0, we can + * always access fmt[i + 1], in the worst case it will be a 0 + */ + i++; + + /* skip optional "[0 +-][num]" width formatting field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 'p') { + sizeof_cur_arg = sizeof(long); + + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 1]; + i += 2; + goto fmt_str; + } + + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + fmt[i + 1] == 'x' || fmt[i + 1] == 's' || + fmt[i + 1] == 'S') { + /* just kernel pointers */ + if (tmp_buf) + cur_arg = raw_args[num_spec]; + i++; + goto nocopy_fmt; + } + + if (fmt[i + 1] == 'B') { + if (tmp_buf) { + err = snprintf(tmp_buf, + (tmp_buf_end - tmp_buf), + "%pB", + (void *)(long)raw_args[num_spec]); + tmp_buf += (err + 1); + } + + i++; + num_spec++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || + (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { + err = -EINVAL; + goto out; + } + + i += 2; + if (!tmp_buf) + goto nocopy_fmt; + + sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; + if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { + err = -ENOSPC; + goto out; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, + sizeof_cur_ip); + if (err < 0) + memset(cur_ip, 0, sizeof_cur_ip); + + /* hack: bstr_printf expects IP addresses to be + * pre-formatted as strings, ironically, the easiest way + * to do that is to call snprintf. + */ + ip_spec[2] = fmt[i - 1]; + ip_spec[3] = fmt[i]; + err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, + ip_spec, &cur_ip); + + tmp_buf += err + 1; + num_spec++; + + continue; + } else if (fmt[i] == 's') { + fmt_ptype = fmt[i]; +fmt_str: + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) { + err = -EINVAL; + goto out; + } + + if (!tmp_buf) + goto nocopy_fmt; + + if (tmp_buf_end == tmp_buf) { + err = -ENOSPC; + goto out; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, + fmt_ptype, + tmp_buf_end - tmp_buf); + if (err < 0) { + tmp_buf[0] = '\0'; + err = 1; + } + + tmp_buf += err; + num_spec++; + + continue; + } + + sizeof_cur_arg = sizeof(int); + + if (fmt[i] == 'l') { + sizeof_cur_arg = sizeof(long); + i++; + } + if (fmt[i] == 'l') { + sizeof_cur_arg = sizeof(long long); + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && + fmt[i] != 'x' && fmt[i] != 'X') { + err = -EINVAL; + goto out; + } + + if (tmp_buf) + cur_arg = raw_args[num_spec]; +nocopy_fmt: + if (tmp_buf) { + tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); + if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { + err = -ENOSPC; + goto out; + } + + if (sizeof_cur_arg == 8) { + *(u32 *)tmp_buf = *(u32 *)&cur_arg; + *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); + } else { + *(u32 *)tmp_buf = (u32)(long)cur_arg; + } + tmp_buf += sizeof_cur_arg; + } + num_spec++; + } + + err = 0; +out: + if (err) + bpf_bprintf_cleanup(); + return err; +} + +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + int err, num_args; + u32 *bin_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args); + if (err < 0) + return err; + + err = bstr_printf(str, str_size, fmt, bin_args); + + bpf_bprintf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -708,6 +1031,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } @@ -748,6 +1073,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d2de2abec35b..b4ebd60a6c16 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -816,8 +816,6 @@ static int __init bpf_init(void) { int ret; - mutex_init(&bpf_preload_lock); - ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2d4f9ac12377..bd11db9774c3 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,10 +9,11 @@ #include <linux/slab.h> #include <uapi/linux/btf.h> -DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); - #ifdef CONFIG_CGROUP_BPF +DEFINE_PER_CPU(struct bpf_cgroup_storage_info, + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); + #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index cec792a17e5f..1b7b8a6f34ee 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -726,6 +726,9 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, + .map_delete_batch = generic_map_delete_batch, .map_check_btf = trie_check_btf, .map_btf_name = "lpm_trie", .map_btf_id = &trie_map_btf_id, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 250503482cda..941ca06d9dfa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1694,7 +1694,9 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); - bpf_prog_free_linfo(prog); + kvfree(prog->aux->jited_linfo); + kvfree(prog->aux->linfo); + kfree(prog->aux->kfunc_tab); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); @@ -2549,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link); info->tracing.attach_type = tr_link->attach_type; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &info->tracing.target_obj_id, + &info->tracing.target_btf_id); return 0; } @@ -2643,14 +2648,25 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * target_btf_id using the link_create API. * * - if tgt_prog == NULL when this function was called using the old - * raw_tracepoint_open API, and we need a target from prog->aux - * - * The combination of no saved target in prog->aux, and no target - * specified on load is illegal, and we reject that here. + * raw_tracepoint_open API, and we need a target from prog->aux + * + * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program + * was detached and is going for re-attachment. */ if (!prog->aux->dst_trampoline && !tgt_prog) { - err = -ENOENT; - goto out_unlock; + /* + * Allow re-attach for TRACING and LSM programs. If it's + * currently linked, bpf_trampoline_link_prog will fail. + * EXT programs need to specify tgt_prog_fd, so they + * re-attach in separate code path. + */ + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + err = -EINVAL; + goto out_unlock; + } + btf_id = prog->aux->attach_btf_id; + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } if (!prog->aux->dst_trampoline || @@ -2946,6 +2962,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4aa8b52adf25..2d44b5aa0057 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -444,7 +444,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr); if (err) { - hlist_del(&prog->aux->tramp_hlist); + hlist_del_init(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; } out: @@ -467,7 +467,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) tr->extension_prog = NULL; goto out; } - hlist_del(&prog->aux->tramp_hlist); + hlist_del_init(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(tr); out: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0399ac092b36..757476c91c98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,18 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_KFUNC_CALL; +} + +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +260,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; @@ -390,6 +403,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, env->prev_linfo = linfo; } +static void verbose_invalid_scalar(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct tnum *range, const char *ctx, + const char *reg_name) +{ + char tn_buf[48]; + + verbose(env, "At %s the register %s ", ctx, reg_name); + if (!tnum_is_unknown(reg->var_off)) { + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } + tnum_strn(tn_buf, sizeof(tn_buf), *range); + verbose(env, " should have been in %s\n", tn_buf); +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -409,6 +440,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -451,7 +483,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -541,6 +574,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -612,6 +647,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1362,9 +1398,7 @@ static bool __reg64_bound_s32(s64 a) static bool __reg64_bound_u32(u64 a) { - if (a > U32_MIN && a < U32_MAX) - return true; - return false; + return a > U32_MIN && a < U32_MAX; } static void __reg_combine_64_into_32(struct bpf_reg_state *reg) @@ -1375,10 +1409,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) reg->s32_min_value = (s32)reg->smin_value; reg->s32_max_value = (s32)reg->smax_value; } - if (__reg64_bound_u32(reg->umin_value)) + if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) { reg->u32_min_value = (u32)reg->umin_value; - if (__reg64_bound_u32(reg->umax_value)) reg->u32_max_value = (u32)reg->umax_value; + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), @@ -1519,39 +1553,210 @@ static int add_subprog(struct bpf_verifier_env *env, int off) } ret = find_subprog(env, off); if (ret >= 0) - return 0; + return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; } + /* determine subprog starts. The end is one before the next starts */ env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); + return env->subprog_cnt - 1; +} + +struct bpf_kfunc_desc { + struct btf_func_model func_model; + u32 func_id; + s32 imm; +}; + +#define MAX_KFUNC_DESCS 256 +struct bpf_kfunc_desc_tab { + struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; + u32 nr_descs; +}; + +static int kfunc_desc_cmp_by_id(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + /* func_id is not greater than BTF_MAX_TYPE */ + return d0->func_id - d1->func_id; +} + +static const struct bpf_kfunc_desc * +find_kfunc_desc(const struct bpf_prog *prog, u32 func_id) +{ + struct bpf_kfunc_desc desc = { + .func_id = func_id, + }; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + return bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_id); +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id) +{ + const struct btf_type *func, *func_proto; + struct bpf_kfunc_desc_tab *tab; + struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_desc *desc; + const char *func_name; + unsigned long addr; + int err; + + prog_aux = env->prog->aux; + tab = prog_aux->kfunc_tab; + if (!tab) { + if (!btf_vmlinux) { + verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!env->prog->jit_requested) { + verbose(env, "JIT is required for calling kernel function\n"); + return -ENOTSUPP; + } + + if (!bpf_jit_supports_kfunc_call()) { + verbose(env, "JIT does not support calling kernel function\n"); + return -ENOTSUPP; + } + + if (!env->prog->gpl_compatible) { + verbose(env, "cannot call kernel function from non-GPL compatible program\n"); + return -EINVAL; + } + + tab = kzalloc(sizeof(*tab), GFP_KERNEL); + if (!tab) + return -ENOMEM; + prog_aux->kfunc_tab = tab; + } + + if (find_kfunc_desc(env->prog, func_id)) + return 0; + + if (tab->nr_descs == MAX_KFUNC_DESCS) { + verbose(env, "too many different kernel function calls\n"); + return -E2BIG; + } + + func = btf_type_by_id(btf_vmlinux, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %u is not a function\n", + func_id); + return -EINVAL; + } + func_proto = btf_type_by_id(btf_vmlinux, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + addr = kallsyms_lookup_name(func_name); + if (!addr) { + verbose(env, "cannot find address for kernel function %s\n", + func_name); + return -EINVAL; + } + + desc = &tab->descs[tab->nr_descs++]; + desc->func_id = func_id; + desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base; + err = btf_distill_func_proto(&env->log, btf_vmlinux, + func_proto, func_name, + &desc->func_model); + if (!err) + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id, NULL); + return err; +} + +static int kfunc_desc_cmp_by_imm(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + if (d0->imm > d1->imm) + return 1; + else if (d0->imm < d1->imm) + return -1; return 0; } -static int check_subprogs(struct bpf_verifier_env *env) +static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) +{ + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + if (!tab) + return; + + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_imm, NULL); +} + +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return !!prog->aux->kfunc_tab; +} + +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc desc = { + .imm = insn->imm, + }; + const struct bpf_kfunc_desc *res; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + res = bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); + + return res ? &res->func_model : NULL; +} + +static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { - int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; + int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); - if (ret < 0) + if (ret) return ret; - /* determine subprog starts. The end is one before the next starts */ - for (i = 0; i < insn_cnt; i++) { - if (!bpf_pseudo_call(insn + i)) + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) && + !bpf_pseudo_kfunc_call(insn)) continue; + if (!env->bpf_capable) { - verbose(env, - "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } - ret = add_subprog(env, i + insn[i].imm + 1); + + if (bpf_pseudo_func(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + if (ret >= 0) + /* remember subprog */ + insn[1].imm = ret; + } else if (bpf_pseudo_call(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + } else { + ret = add_kfunc_call(env, insn->imm); + } + if (ret < 0) return ret; } @@ -1565,6 +1770,16 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + /* now check that all jumps are within the same subprog */ subprog_start = subprog[cur_subprog].start; subprog_end = subprog[cur_subprog + 1].start; @@ -1873,6 +2088,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, return i; } +static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) +{ + const struct btf_type *func; + + if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) + return NULL; + + func = btf_type_by_id(btf_vmlinux, insn->imm); + return btf_name_by_offset(btf_vmlinux, func->name_off); +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -1881,6 +2107,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u32 *reg_mask, u64 *stack_mask) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -2295,6 +2522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2899,6 +3128,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3304,6 +3537,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3405,7 +3641,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3842,7 +4078,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4258,6 +4506,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? BPF_WRITE : @@ -4474,6 +4725,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4505,6 +4757,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4517,6 +4770,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4529,6 +4783,9 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; +static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4557,6 +4814,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4738,6 +4998,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -4805,6 +5067,44 @@ skip_type_check: if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); + } else if (arg_type == ARG_PTR_TO_CONST_STR) { + struct bpf_map *map = reg->map_ptr; + int map_off; + u64 map_addr; + char *str_ptr; + + if (!bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } } return err; @@ -5258,13 +5558,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) +typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx); + +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; - int i, err, subprog, target_insn; + int err; bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { @@ -5273,14 +5579,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -E2BIG; } - target_insn = *insn_idx + insn->imm; - subprog = find_subprog(env, target_insn + 1); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn + 1); - return -EFAULT; - } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", @@ -5291,7 +5589,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_info_aux = env->prog->aux->func_info_aux; if (func_info_aux) is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, subprog, caller->regs); + err = btf_check_subprog_arg_match(env, subprog, caller->regs); if (err == -EFAULT) return err; if (is_global) { @@ -5335,11 +5633,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err) return err; - /* copy r1 - r5 args that callee can access. The copy includes parent - * pointers, which connects us up to the liveness chain - */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - callee->regs[i] = caller->regs[i]; + err = set_callee_state_cb(env, caller, callee, *insn_idx); + if (err) + return err; clear_caller_saved_regs(env, caller->regs); @@ -5347,7 +5643,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe++; /* and go analyze first insn of the callee */ - *insn_idx = target_insn; + *insn_idx = env->subprog_info[subprog].start - 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); @@ -5358,6 +5654,92 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee) +{ + /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, + * void *callback_ctx, u64 flags); + * callback_fn(struct bpf_map *map, void *key, void *value, + * void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* pointer to stack or null */ + callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx) +{ + int i; + + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + int subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn); + return -EFAULT; + } + + return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); +} + +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5380,8 +5762,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. */ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5409,6 +5805,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, if (ret_type != RET_INTEGER || (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_get_task_stack && func_id != BPF_FUNC_probe_read_str && func_id != BPF_FUNC_probe_read_kernel_str && func_id != BPF_FUNC_probe_read_user_str)) @@ -5436,7 +5833,9 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem && + func_id != BPF_FUNC_redirect_map) return 0; if (map == NULL) { @@ -5517,15 +5916,55 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) { + verbose(env, "verifier bug\n"); + return -EFAULT; + } + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5571,7 +6010,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - for (i = 0; i < 5; i++) { + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn); if (err) return err; @@ -5621,6 +6060,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5776,6 +6228,98 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return 0; } +/* mark_btf_func_reg_size() is used when the reg size is determined by + * the BTF func_proto's return value size and argument. + */ +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + struct bpf_reg_state *reg = &cur_regs(env)[regno]; + + if (regno == BPF_REG_0) { + /* Function return value */ + reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = reg_size == sizeof(u64) ? + DEF_NOT_SUBREG : env->insn_idx + 1; + } else { + /* Function argument */ + if (reg_size == sizeof(u64)) { + mark_insn_zext(env, reg); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + } else { + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); + } + } +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct btf_type *t, *func, *func_proto, *ptr_type; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + u32 i, nargs, func_id, ptr_type_id; + const struct btf_param *args; + int err; + + func_id = insn->imm; + func = btf_type_by_id(btf_vmlinux, func_id); + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + func_proto = btf_type_by_id(btf_vmlinux, func->type); + + if (!env->ops->check_kfunc_call || + !env->ops->check_kfunc_call(func_id)) { + verbose(env, "calling kernel function %s is not allowed\n", + func_name); + return -EACCES; + } + + /* Check the arguments */ + err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs); + if (err) + return err; + + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + /* Check return type */ + t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL); + if (btf_type_is_scalar(t)) { + mark_reg_unknown(env, regs, BPF_REG_0); + mark_btf_func_reg_size(env, BPF_REG_0, t->size); + } else if (btf_type_is_ptr(t)) { + ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type, + &ptr_type_id); + if (!btf_type_is_struct(ptr_type)) { + ptr_type_name = btf_name_by_offset(btf_vmlinux, + ptr_type->name_off); + verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", + func_name, btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; + mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + + nargs = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nargs; i++) { + u32 regno = i + 1; + + t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL); + if (btf_type_is_ptr(t)) + mark_btf_func_reg_size(env, regno, sizeof(void *)); + else + /* scalar. ensured by btf_check_kfunc_arg_match() */ + mark_btf_func_reg_size(env, regno, t->size); + } + + return 0; +} + static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -5920,7 +6464,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, aux->alu_limit != alu_limit)) return REASON_PATHS; - /* Corresponding fixup done in fixup_bpf_calls(). */ + /* Corresponding fixup done in do_misc_fixups(). */ aux->alu_state = alu_state; aux->alu_limit = alu_limit; return 0; @@ -5952,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; + bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); @@ -5982,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_limit = abs(tmp_aux->alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; } @@ -8345,6 +8891,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8573,17 +9137,7 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { - char tn_buf[48]; - - verbose(env, "At program exit the register R0 "); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); - } - tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been in %s\n", tn_buf); + verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); return -EINVAL; } @@ -8710,6 +9264,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } +static int visit_func_call_insn(int t, int insn_cnt, + struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) +{ + int ret; + + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (visit_callee) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -8720,6 +9295,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -8730,18 +9308,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret) - return ret; - - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - } - return ret; + return visit_func_call_insn(t, insn_cnt, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: if (BPF_SRC(insns[t].code) != BPF_K) @@ -9354,6 +9922,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10037,6 +10606,7 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -10184,7 +10754,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL) || + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); @@ -10199,11 +10770,12 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + err = check_kfunc_call(env, insn); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10632,6 +11204,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -10764,9 +11342,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11405,6 +11987,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11494,6 +12082,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->linfo = prog->aux->linfo; func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; @@ -11534,6 +12123,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11579,6 +12174,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11590,7 +12190,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; - bpf_prog_free_unused_jited_linfo(prog); + bpf_prog_jit_attempt_done(prog); return 0; out_free: for (i = 0; i < env->subprog_cnt; i++) { @@ -11613,7 +12213,7 @@ out_undo_insn: insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; } - bpf_prog_free_jited_linfo(prog); + bpf_prog_jit_attempt_done(prog); return err; } @@ -11622,6 +12222,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) #ifndef CONFIG_BPF_JIT_ALWAYS_ON struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; + bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); int i, depth; #endif int err = 0; @@ -11635,6 +12236,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (has_kfunc_call) { + verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); + return -EINVAL; + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. @@ -11643,6 +12248,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); @@ -11655,12 +12268,30 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -/* fixup insn->imm field of bpf_call instructions - * and inline eligible helpers as explicit sequence of BPF instructions - * - * this function is called after eBPF program passed verification +static int fixup_kfunc_call(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc *desc; + + /* insn->imm has the btf func_id. Replace it with + * an address (relative to __bpf_base_call). + */ + desc = find_kfunc_desc(env->prog, insn->imm); + if (!desc) { + verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", + insn->imm); + return -EFAULT; + } + + insn->imm = desc->imm; + + return 0; +} + +/* Do various post-verification rewrites in a single program pass. + * These rewrites simplify JIT and interpreter implementations. */ -static int fixup_bpf_calls(struct bpf_verifier_env *env) +static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); @@ -11675,6 +12306,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || insn->code == (BPF_ALU | BPF_MOD | BPF_X) || @@ -11715,6 +12347,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { @@ -11734,13 +12367,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Rewrite pointer arithmetic to mitigate speculation attacks. */ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; - struct bpf_insn insn_buf[16]; struct bpf_insn *patch = &insn_buf[0]; - bool issrc, isneg; + bool issrc, isneg, isimm; u32 off_reg; aux = &env->insn_aux_data[i + delta]; @@ -11751,28 +12384,29 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == BPF_ALU_SANITIZE_SRC; + isimm = aux->alu_state & BPF_ALU_IMMEDIATE; off_reg = issrc ? insn->src_reg : insn->dst_reg; - if (isneg) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); - *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - if (issrc) { - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, - off_reg); - insn->src_reg = BPF_REG_AX; + if (isimm) { + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); } else { - *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, - BPF_REG_AX); + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); } + if (!issrc) + *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); + insn->src_reg = BPF_REG_AX; if (isneg) insn->code = insn->code == code_add ? code_sub : code_add; *patch++ = *insn; - if (issrc && isneg) + if (issrc && isneg && !isimm) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); cnt = patch - insn_buf; @@ -11790,6 +12424,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; if (insn->src_reg == BPF_PSEUDO_CALL) continue; + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + ret = fixup_kfunc_call(env, insn); + if (ret) + return ret; + continue; + } if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -11882,7 +12522,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm == BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem)) { + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -11924,6 +12565,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: @@ -11950,11 +12594,16 @@ patch_map_ops_generic: insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - __bpf_call_base; continue; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CAST_CALL(ops->map_redirect) - + __bpf_call_base; + continue; } goto patch_call_imm; } + /* Implement bpf_jiffies64 inline. */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_jiffies64) { struct bpf_insn ld_jiffies_addr[2] = { @@ -12010,6 +12659,8 @@ patch_call_imm: } } + sort_kfunc_descs_by_imm(env->prog); + return 0; } @@ -12120,7 +12771,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); - ret = btf_check_func_arg_match(env, subprog, regs); + ret = btf_check_subprog_arg_match(env, subprog, regs); if (ret == -EFAULT) /* unlikely verifier bug. abort. * ret == 0 and ret < 0 are sadly acceptable for @@ -12720,6 +13371,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; + ret = add_subprog_and_kfunc(env); + if (ret < 0) + goto skip_full_check; + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -12770,7 +13425,7 @@ skip_full_check: ret = convert_ctx_accesses(env); if (ret == 0) - ret = fixup_bpf_calls(env); + ret = do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9153b20e5cc6..e049edd66776 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1339,6 +1339,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_unlock(&cgroup_mutex); + cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1751,6 +1752,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) &dcgrp->e_csets[ss->id]); spin_unlock_irq(&css_set_lock); + if (ss->css_rstat_flush) { + list_del_rcu(&css->rstat_css_node); + list_add_rcu(&css->rstat_css_node, + &dcgrp->rstat_css_list); + } + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -1971,10 +1978,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask); + ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto exit_stats; + ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); @@ -2006,6 +2017,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = 0; goto out; +exit_stats: + cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -4934,8 +4947,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4976,8 +4988,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_flush(cgrp); + cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5034,7 +5045,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); @@ -5159,11 +5170,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - if (cgroup_on_dfl(parent)) { - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - } + ret = cgroup_rstat_init(cgrp); + if (ret) + goto out_cancel_ref; /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); @@ -5250,8 +5259,7 @@ out_psi_free: out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: - if (cgroup_on_dfl(parent)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d51175cedfca..3a3fd2993a65 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -25,13 +25,8 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup *parent; unsigned long flags; - /* nothing to do for root */ - if (!cgroup_parent(cgrp)) - return; - /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. @@ -46,10 +41,10 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_lock_irqsave(cpu_lock, flags); /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { + while (true) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup @@ -58,8 +53,17 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (rstatc->updated_next) break; + /* Root has no parent to link it to, but mark it busy */ + if (!parent) { + rstatc->updated_next = cgrp; + break; + } + + prstatc = cgroup_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = cgrp; + + cgrp = parent; } raw_spin_unlock_irqrestore(cpu_lock, flags); @@ -113,23 +117,26 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, */ if (rstatc->updated_next) { struct cgroup *parent = cgroup_parent(pos); - struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); - struct cgroup_rstat_cpu *nrstatc; - struct cgroup **nextp; - - nextp = &prstatc->updated_children; - while (true) { - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; + + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; + + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (true) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + if (*nextp == pos) + break; + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - return pos; } @@ -285,8 +292,6 @@ void __init cgroup_rstat_boot(void) for_each_possible_cpu(cpu) raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); - - BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); } /* @@ -311,11 +316,15 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_base_stat cur, delta; unsigned seq; + /* Root-level stats are sourced from system-wide CPU stats */ + if (!parent) + return; + /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); @@ -328,8 +337,8 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); - /* propagate global delta to parent */ - if (parent) { + /* propagate global delta to parent (unless that's root) */ + if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index d3fd428f4b92..eb701b2ac72f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -1,5 +1,4 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config new file mode 100644 index 000000000000..2f0e6bf6db2c --- /dev/null +++ b/kernel/configs/tiny-base.config @@ -0,0 +1 @@ +CONFIG_EMBEDDED=y diff --git a/kernel/cred.c b/kernel/cred.c index 421b1149c651..e1d274cd741b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -33,7 +33,7 @@ do { \ static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; +static struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; /* * The initial credentials for the initial task diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 002268262c9a..f737e3347059 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -344,8 +344,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, sg->length, + dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, sg->length, @@ -370,8 +370,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, - SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, sg->length, + dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, sg->length); diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index b98615578737..50afc05b6f1d 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -57,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); @@ -74,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, } if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, size); @@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index e0e64f8b0739..9b9af1bd6be3 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -38,7 +38,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; struct map_benchmark_data { @@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data) void *buf; dma_addr_t dma_addr; struct map_benchmark_data *map = data; + int npages = map->bparam.granule; + u64 size = npages * PAGE_SIZE; int ret = 0; - buf = (void *)__get_free_page(GFP_KERNEL); + buf = alloc_pages_exact(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data) * 66 means evertything goes well! 66 is lucky. */ if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, PAGE_SIZE); + memset(buf, 0x66, size); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir); + dma_addr = dma_map_single(map->dev, buf, size, map->dir); if (unlikely(dma_mapping_error(map->dev, dma_addr))) { pr_err("dma_map_single failed on %s\n", dev_name(map->dev)); @@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); + dma_unmap_single(map->dev, dma_addr, size, map->dir); unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data) } out: - free_page((unsigned long)buf); + free_pages_exact(buf, size); return ret; } @@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, struct map_benchmark_data *map = file->private_data; void __user *argp = (void __user *)arg; u64 old_dma_mask; - int ret; if (copy_from_user(&map->bparam, argp, sizeof(map->bparam))) @@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } + if (map->bparam.granule < 1 || map->bparam.granule > 1024) { + pr_err("invalid granule size\n"); + return -EINVAL; + } + switch (map->bparam.dma_dir) { case DMA_MAP_BIDIRECTIONAL: map->dir = DMA_BIDIRECTIONAL; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b6a633679933..2b06a809d0b9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -477,11 +477,10 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -struct page *dma_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { const struct dma_map_ops *ops = get_dma_ops(dev); - struct page *page; if (WARN_ON_ONCE(!dev->coherent_dma_mask)) return NULL; @@ -490,33 +489,162 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) - page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - else if (ops->alloc_pages) - page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); - else + return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!ops->alloc_pages) return NULL; + return ops->alloc_pages(dev, size, dma_handle, dir, gfp); +} - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (page) + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); -void dma_free_pages(struct device *dev, size_t size, struct page *page, +static void __dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); size = PAGE_ALIGN(size); - debug_dma_unmap_page(dev, dma_handle, size, dir); - if (dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + debug_dma_unmap_page(dev, dma_handle, size, dir); + __dma_free_pages(dev, size, page, dma_handle, dir); +} EXPORT_SYMBOL_GPL(dma_free_pages); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(page) + vma->vm_pgoff, + vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot); +} +EXPORT_SYMBOL_GPL(dma_mmap_pages); + +static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp) +{ + struct sg_table *sgt; + struct page *page; + + sgt = kmalloc(sizeof(*sgt), gfp); + if (!sgt) + return NULL; + if (sg_alloc_table(sgt, 1, gfp)) + goto out_free_sgt; + page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp); + if (!page) + goto out_free_table; + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + sg_dma_len(sgt->sgl) = sgt->sgl->length; + return sgt; +out_free_table: + sg_free_table(sgt); +out_free_sgt: + kfree(sgt); + return NULL; +} + +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct sg_table *sgt; + + if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) + return NULL; + + if (ops && ops->alloc_noncontiguous) + sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); + else + sgt = alloc_single_sgt(dev, size, dir, gfp); + + if (sgt) { + sgt->nents = 1; + debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir); + } + return sgt; +} +EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous); + +static void free_single_sgt(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address, + dir); + sg_free_table(sgt); + kfree(sgt); +} + +void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); + if (ops && ops->free_noncontiguous) + ops->free_noncontiguous(dev, size, sgt, dir); + else + free_single_sgt(dev, size, sgt, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncontiguous); + +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (ops && ops->alloc_noncontiguous) + return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); + return page_address(sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); + +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) + vunmap(vaddr); +} +EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); + +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) { + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || + vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return vm_map_pages(vma, sgt_handle(sgt)->pages, count); + } + return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 905c3fa005f1..b4526668072e 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -66,6 +66,5 @@ void dma_common_free_remap(void *cpu_addr, size_t size) return; } - unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c10e855a03bc..8ca7d505d61c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -59,32 +59,11 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -enum swiotlb_force swiotlb_force; - -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -phys_addr_t io_tlb_start, io_tlb_end; - -/* - * The number of IO TLB blocks (in groups of 64) between io_tlb_start and - * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. - */ -static unsigned long io_tlb_nslabs; +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -/* - * The number of used IO TLB block - */ -static unsigned long io_tlb_used; +enum swiotlb_force swiotlb_force; -/* - * This is a free list describing the number of free entries available from - * each index - */ -static unsigned int *io_tlb_list; -static unsigned int io_tlb_index; +struct io_tlb_mem *io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -92,57 +71,30 @@ static unsigned int io_tlb_index; */ static unsigned int max_segment; -/* - * We need to save away the original address corresponding to a mapped entry - * for the sync operations. - */ -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -static phys_addr_t *io_tlb_orig_addr; - -/* - * The mapped buffer's size should be validated during a sync operation. - */ -static size_t *io_tlb_orig_size; - -/* - * Protect the above data structures in the map and unmap calls - */ -static DEFINE_SPINLOCK(io_tlb_lock); - -static int late_alloc; +static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static int __init setup_io_tlb_npages(char *str) { if (isdigit(*str)) { - io_tlb_nslabs = simple_strtoul(str, &str, 0); /* avoid tail segment of size < IO_TLB_SEGSIZE */ - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + default_nslabs = + ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; - if (!strcmp(str, "force")) { + if (!strcmp(str, "force")) swiotlb_force = SWIOTLB_FORCE; - } else if (!strcmp(str, "noforce")) { + else if (!strcmp(str, "noforce")) swiotlb_force = SWIOTLB_NO_FORCE; - io_tlb_nslabs = 1; - } return 0; } early_param("swiotlb", setup_io_tlb_npages); -static bool no_iotlb_memory; - -unsigned long swiotlb_nr_tbl(void) -{ - return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs; -} -EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); - unsigned int swiotlb_max_segment(void) { - return unlikely(no_iotlb_memory) ? 0 : max_segment; + return io_tlb_default_mem ? max_segment : 0; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -156,42 +108,34 @@ void swiotlb_set_max_segment(unsigned int val) unsigned long swiotlb_size_or_default(void) { - unsigned long size; - - size = io_tlb_nslabs << IO_TLB_SHIFT; - - return size ? size : (IO_TLB_DEFAULT_SIZE); + return default_nslabs << IO_TLB_SHIFT; } -void __init swiotlb_adjust_size(unsigned long new_size) +void __init swiotlb_adjust_size(unsigned long size) { - unsigned long size; - /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ - if (!io_tlb_nslabs) { - size = ALIGN(new_size, IO_TLB_SIZE); - io_tlb_nslabs = size >> IO_TLB_SHIFT; - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - - pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); - } + if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) + return; + size = ALIGN(size, IO_TLB_SIZE); + default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { - unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; + struct io_tlb_mem *mem = io_tlb_default_mem; - if (no_iotlb_memory) { + if (!mem) { pr_warn("No low mem\n"); return; } - pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, - bytes >> 20); + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, + (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) @@ -212,64 +156,51 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { + struct io_tlb_mem *mem = io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (no_iotlb_memory || late_alloc) + if (!mem || mem->late_alloc) return; - - vaddr = phys_to_virt(io_tlb_start); - bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + vaddr = phys_to_virt(mem->start); + bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); memset(vaddr, 0, bytes); } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - unsigned long i, bytes; + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + struct io_tlb_mem *mem; size_t alloc_size; - bytes = nslabs << IO_TLB_SHIFT; - - io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); - io_tlb_end = io_tlb_start + bytes; - - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); - io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_list) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); - io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_addr) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); + /* protect against double initialization */ + if (WARN_ON_ONCE(io_tlb_default_mem)) + return -ENOMEM; - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); - io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_size) + alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs)); + mem = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + mem->nslabs = nslabs; + mem->start = __pa(tlb); + mem->end = mem->start + bytes; + mem->index = 0; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } - io_tlb_index = 0; - no_iotlb_memory = false; + io_tlb_default_mem = mem; if (verbose) swiotlb_print_info(); - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } @@ -280,29 +211,24 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - size_t default_size = IO_TLB_DEFAULT_SIZE; - unsigned char *vstart; - unsigned long bytes; - - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); + void *tlb; - /* Get IO TLB memory from the low pages */ - vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + if (swiotlb_force == SWIOTLB_NO_FORCE) return; - if (io_tlb_start) { - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - io_tlb_start = 0; - } + /* Get IO TLB memory from the low pages */ + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + if (!tlb) + goto fail; + if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose)) + goto fail_free_mem; + return; + +fail_free_mem: + memblock_free_early(__pa(tlb), bytes); +fail: pr_warn("Cannot allocate buffer"); - no_iotlb_memory = true; } /* @@ -313,22 +239,22 @@ swiotlb_init(int verbose) int swiotlb_late_init_with_default_size(size_t default_size) { - unsigned long bytes, req_nslabs = io_tlb_nslabs; + unsigned long nslabs = + ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + unsigned long bytes; unsigned char *vstart = NULL; unsigned int order; int rc = 0; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; /* * Get IO TLB memory from the low pages */ - order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); - io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + order = get_order(nslabs << IO_TLB_SHIFT); + nslabs = SLABS_PER_PAGE << order; + bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, @@ -338,134 +264,99 @@ swiotlb_late_init_with_default_size(size_t default_size) order--; } - if (!vstart) { - io_tlb_nslabs = req_nslabs; + if (!vstart) return -ENOMEM; - } + if (order != get_order(bytes)) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); - io_tlb_nslabs = SLABS_PER_PAGE << order; + nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(vstart, nslabs); if (rc) free_pages((unsigned long)vstart, order); return rc; } -static void swiotlb_cleanup(void) -{ - io_tlb_end = 0; - io_tlb_start = 0; - io_tlb_nslabs = 0; - max_segment = 0; -} - int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - unsigned long i, bytes; + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + struct io_tlb_mem *mem; - bytes = nslabs << IO_TLB_SHIFT; + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; - io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); - io_tlb_end = io_tlb_start + bytes; + /* protect against double initialization */ + if (WARN_ON_ONCE(io_tlb_default_mem)) + return -ENOMEM; - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); + mem = (void *)__get_free_pages(GFP_KERNEL, + get_order(struct_size(mem, slots, nslabs))); + if (!mem) + return -ENOMEM; - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); - if (!io_tlb_list) - goto cleanup3; - - io_tlb_orig_addr = (phys_addr_t *) - __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - if (!io_tlb_orig_addr) - goto cleanup4; - - io_tlb_orig_size = (size_t *) - __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * - sizeof(size_t))); - if (!io_tlb_orig_size) - goto cleanup5; - - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + mem->nslabs = nslabs; + mem->start = virt_to_phys(tlb); + mem->end = mem->start + bytes; + mem->index = 0; + mem->late_alloc = 1; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } - io_tlb_index = 0; - no_iotlb_memory = false; - - swiotlb_print_info(); - late_alloc = 1; - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); + memset(tlb, 0, bytes); + io_tlb_default_mem = mem; + swiotlb_print_info(); + swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; - -cleanup5: - free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - -cleanup4: - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - io_tlb_list = NULL; -cleanup3: - swiotlb_cleanup(); - return -ENOMEM; } void __init swiotlb_exit(void) { - if (!io_tlb_orig_addr) + struct io_tlb_mem *mem = io_tlb_default_mem; + size_t size; + + if (!mem) return; - if (late_alloc) { - free_pages((unsigned long)io_tlb_orig_size, - get_order(io_tlb_nslabs * sizeof(size_t))); - free_pages((unsigned long)io_tlb_orig_addr, - get_order(io_tlb_nslabs * sizeof(phys_addr_t))); - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), - get_order(io_tlb_nslabs << IO_TLB_SHIFT)); - } else { - memblock_free_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_orig_size), - PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); - memblock_free_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - memblock_free_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - } - swiotlb_cleanup(); + size = struct_size(mem, slots, mem->nslabs); + if (mem->late_alloc) + free_pages((unsigned long)mem, get_order(size)); + else + memblock_free_late(__pa(mem), PAGE_ALIGN(size)); + io_tlb_default_mem = NULL; } /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, + enum dma_data_direction dir) { + struct io_tlb_mem *mem = io_tlb_default_mem; + int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = mem->slots[index].orig_addr; + size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + if (orig_addr == INVALID_PHYS_ADDR) + return; + + if (size > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", + alloc_size, size); + size = alloc_size; + } + if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; @@ -517,9 +408,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask) return nr_slots(boundary_mask + 1); } -static unsigned int wrap_index(unsigned int index) +static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) { - if (index >= io_tlb_nslabs) + if (index >= mem->nslabs) return 0; return index; } @@ -531,9 +422,10 @@ static unsigned int wrap_index(unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = - phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); @@ -552,15 +444,15 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (alloc_size >= PAGE_SIZE) stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); - spin_lock_irqsave(&io_tlb_lock, flags); - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + spin_lock_irqsave(&mem->lock, flags); + if (unlikely(nslots > mem->nslabs - mem->used)) goto not_found; - index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); + index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); do { if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { - index = wrap_index(index + 1); + index = wrap_index(mem, index + 1); continue; } @@ -572,34 +464,34 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (!iommu_is_span_boundary(index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (io_tlb_list[index] >= nslots) + if (mem->slots[index].list >= nslots) goto found; } - index = wrap_index(index + stride); + index = wrap_index(mem, index + stride); } while (index != wrap); not_found: - spin_unlock_irqrestore(&io_tlb_lock, flags); + spin_unlock_irqrestore(&mem->lock, flags); return -1; found: for (i = index; i < index + nslots; i++) - io_tlb_list[i] = 0; + mem->slots[i].list = 0; for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + mem->slots[i].list; i--) + mem->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ - if (index + nslots < io_tlb_nslabs) - io_tlb_index = index + nslots; + if (index + nslots < mem->nslabs) + mem->index = index + nslots; else - io_tlb_index = 0; - io_tlb_used += nslots; + mem->index = 0; + mem->used += nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + spin_unlock_irqrestore(&mem->lock, flags); return index; } @@ -607,11 +499,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); - unsigned int index, i; + unsigned int i; + int index; phys_addr_t tlb_addr; - if (no_iotlb_memory) + if (!mem) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (mem_encrypt_active()) @@ -628,7 +522,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, io_tlb_used); + alloc_size, mem->nslabs, mem->used); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -638,49 +532,37 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * needed. */ for (i = 0; i < nr_slots(alloc_size + offset); i++) { - io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); + mem->slots[index + i].alloc_size = + alloc_size - (i << IO_TLB_SHIFT); } - tlb_addr = slot_addr(io_tlb_start, index) + offset; + tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) -{ - if (*size > orig_size) { - /* Warn and truncate mapping_size */ - dev_WARN_ONCE(hwdev, 1, - "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", - orig_size, *size); - *size = orig_size; - } -} - /* * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) { + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); - int i, count, nslots = nr_slots(alloc_size + offset); - int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; + int nslots = nr_slots(mem->slots[index].alloc_size + offset); + int count, i; /* * First, sync the memory before unmapping the entry */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -688,9 +570,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ - spin_lock_irqsave(&io_tlb_lock, flags); + spin_lock_irqsave(&mem->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) - count = io_tlb_list[index + nslots]; + count = mem->slots[index + nslots].list; else count = 0; @@ -699,9 +581,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + mem->slots[i].list = ++count; + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } /* @@ -709,44 +591,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * available (non zero) */ for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) - io_tlb_list[i] = ++count; - io_tlb_used -= nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + mem->slots[i].list = ++count; + mem->used -= nslots; + spin_unlock_irqrestore(&mem->lock, flags); } -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t orig_size = io_tlb_orig_size[index]; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - if (orig_addr == INVALID_PHYS_ADDR) - return; + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); +} - validate_sync_size_and_truncate(hwdev, orig_size, &size); - - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); } /* @@ -770,7 +637,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", @@ -790,22 +657,21 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(void) { - /* - * When SWIOTLB is initialized, even if io_tlb_start points to physical - * address zero, io_tlb_end surely doesn't. - */ - return io_tlb_end != 0; + return io_tlb_default_mem != NULL; } +EXPORT_SYMBOL_GPL(is_swiotlb_active); #ifdef CONFIG_DEBUG_FS static int __init swiotlb_create_debugfs(void) { - struct dentry *root; + struct io_tlb_mem *mem = io_tlb_default_mem; - root = debugfs_create_dir("swiotlb", NULL); - debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); - debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); + if (!mem) + return 0; + mem->debugfs = debugfs_create_dir("swiotlb", NULL); + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); + debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 928b166d888e..2e947a485898 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -581,11 +581,6 @@ static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } -extern __weak const char *perf_pmu_name(void) -{ - return "pmu"; -} - static inline u64 perf_clock(void) { return local_clock(); diff --git a/kernel/exit.c b/kernel/exit.c index 0596526ed9ea..fd1c04193e18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent) TASK_INTERRUPTIBLE, p); } +static bool is_effectively_child(struct wait_opts *wo, bool ptrace, + struct task_struct *target) +{ + struct task_struct *parent = + !ptrace ? target->real_parent : target->parent; + + return current == parent || (!(wo->wo_flags & __WNOTHREAD) && + same_thread_group(current, parent)); +} + +/* + * Optimization for waiting on PIDTYPE_PID. No need to iterate through child + * and tracee lists to find the target task. + */ +static int do_wait_pid(struct wait_opts *wo) +{ + bool ptrace; + struct task_struct *target; + int retval; + + ptrace = false; + target = pid_task(wo->wo_pid, PIDTYPE_TGID); + if (target && is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + ptrace = true; + target = pid_task(wo->wo_pid, PIDTYPE_PID); + if (target && target->ptrace && + is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + return 0; +} + static long do_wait(struct wait_opts *wo) { - struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); @@ -1464,19 +1503,27 @@ repeat: set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); - if (retval) - goto end; - retval = ptrace_do_wait(wo, tsk); + if (wo->wo_type == PIDTYPE_PID) { + retval = do_wait_pid(wo); if (retval) goto end; + } else { + struct task_struct *tsk = current; + + do { + retval = do_wait_thread(wo, tsk); + if (retval) + goto end; - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); + retval = ptrace_do_wait(wo, tsk); + if (retval) + goto end; + + if (wo->wo_flags & __WNOTHREAD) + break; + } while_each_thread(current, tsk); + } read_unlock(&tasklist_lock); notask: diff --git a/kernel/fork.c b/kernel/fork.c index 0f1992d3f80b..dc06afd725cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> +#include <linux/bpf.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -379,14 +380,17 @@ static void account_kernel_stack(struct task_struct *tsk, int account) void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); + if (vm) { + int i; - /* All stack pages are in the same node. */ - if (vm) - mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); - else + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + account * (PAGE_SIZE / 1024)); + } else { + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); + } } static int memcg_charge_kernel_stack(struct task_struct *tsk) @@ -734,6 +738,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -1140,7 +1145,7 @@ void mmput_async(struct mm_struct *mm) * invocations: in mmput() nobody alive left, in execve task is single * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the * mm->exe_file, but does so without using set_mm_exe_file() in order - * to do avoid the need for any locks. + * to avoid the need for any locks. */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { @@ -1391,7 +1396,6 @@ fail_nomem: static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; - int retval; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; @@ -1418,21 +1422,15 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; - goto good_mm; + } else { + mm = dup_mm(tsk, current->mm); + if (!mm) + return -ENOMEM; } - retval = -ENOMEM; - mm = dup_mm(tsk, current->mm); - if (!mm) - goto fail_nomem; - -good_mm: tsk->mm = mm; tsk->active_mm = mm; return 0; - -fail_nomem: - return retval; } static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) @@ -1738,7 +1736,7 @@ static int pidfd_release(struct inode *inode, struct file *file) * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a - * different branch of the tree, i.e. where no ancestoral relation is + * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the @@ -2080,6 +2078,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(p->bpf_storage, NULL); +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); @@ -2727,8 +2728,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) return false; /* - * - make the CLONE_DETACHED bit reuseable for clone3 - * - make the CSIGNAL bits reuseable for clone3 + * - make the CLONE_DETACHED bit reusable for clone3 + * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index f62de2dea8a3..58f87a3092f3 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS + depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 select CONSTRUCTORS default n help diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 0ffe9f194080..073a3738c5e6 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -49,6 +49,55 @@ void gcov_enable_events(void) mutex_unlock(&gcov_lock); } +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + #ifdef CONFIG_MODULES /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index c466c7fbdece..cbb0bed958ab 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -48,9 +48,8 @@ #include <linux/list.h> #include <linux/printk.h> #include <linux/ratelimit.h> -#include <linux/seq_file.h> #include <linux/slab.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "gcov.h" typedef void (*llvm_gcov_callback)(void); @@ -70,16 +69,10 @@ struct gcov_fn_info { u32 ident; u32 checksum; -#if CONFIG_CLANG_VERSION < 110000 - u8 use_extra_checksum; -#endif u32 cfg_checksum; u32 num_counters; u64 *counters; -#if CONFIG_CLANG_VERSION < 110000 - const char *function_name; -#endif }; static struct gcov_info *current_info; @@ -109,16 +102,6 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) } EXPORT_SYMBOL(llvm_gcov_init); -#if CONFIG_CLANG_VERSION < 110000 -void llvm_gcda_start_file(const char *orig_filename, const char version[4], - u32 checksum) -{ - current_info->filename = orig_filename; - memcpy(¤t_info->version, version, sizeof(current_info->version)); - current_info->checksum = checksum; -} -EXPORT_SYMBOL(llvm_gcda_start_file); -#else void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) { current_info->filename = orig_filename; @@ -126,28 +109,7 @@ void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) current_info->checksum = checksum; } EXPORT_SYMBOL(llvm_gcda_start_file); -#endif - -#if CONFIG_CLANG_VERSION < 110000 -void llvm_gcda_emit_function(u32 ident, const char *function_name, - u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) -{ - struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); - - if (!info) - return; - - INIT_LIST_HEAD(&info->head); - info->ident = ident; - info->checksum = func_checksum; - info->use_extra_checksum = use_extra_checksum; - info->cfg_checksum = cfg_checksum; - if (function_name) - info->function_name = kstrdup(function_name, GFP_KERNEL); - list_add_tail(&info->head, ¤t_info->functions); -} -#else void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) { struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -161,7 +123,6 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) info->cfg_checksum = cfg_checksum; list_add_tail(&info->head, ¤t_info->functions); } -#endif EXPORT_SYMBOL(llvm_gcda_emit_function); void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) @@ -292,16 +253,8 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) !list_is_last(&fn_ptr2->head, &info2->functions)) { if (fn_ptr1->checksum != fn_ptr2->checksum) return false; -#if CONFIG_CLANG_VERSION < 110000 - if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) - return false; - if (fn_ptr1->use_extra_checksum && - fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) - return false; -#else if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) return false; -#endif fn_ptr1 = list_next_entry(fn_ptr1, head); fn_ptr2 = list_next_entry(fn_ptr2, head); } @@ -330,35 +283,6 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) } } -#if CONFIG_CLANG_VERSION < 110000 -static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) -{ - size_t cv_size; /* counter values size */ - struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), - GFP_KERNEL); - if (!fn_dup) - return NULL; - INIT_LIST_HEAD(&fn_dup->head); - - fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); - if (!fn_dup->function_name) - goto err_name; - - cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = vmalloc(cv_size); - if (!fn_dup->counters) - goto err_counters; - memcpy(fn_dup->counters, fn->counters, cv_size); - - return fn_dup; - -err_counters: - kfree(fn_dup->function_name); -err_name: - kfree(fn_dup); - return NULL; -} -#else static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) { size_t cv_size; /* counter values size */ @@ -369,7 +293,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) INIT_LIST_HEAD(&fn_dup->head); cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = vmalloc(cv_size); + fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL); if (!fn_dup->counters) { kfree(fn_dup); return NULL; @@ -379,7 +303,6 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) return fn_dup; } -#endif /** * gcov_info_dup - duplicate profiling data set @@ -420,99 +343,18 @@ err: * gcov_info_free - release memory for profiling data set duplicate * @info: profiling data set duplicate to free */ -#if CONFIG_CLANG_VERSION < 110000 -void gcov_info_free(struct gcov_info *info) -{ - struct gcov_fn_info *fn, *tmp; - - list_for_each_entry_safe(fn, tmp, &info->functions, head) { - kfree(fn->function_name); - vfree(fn->counters); - list_del(&fn->head); - kfree(fn); - } - kfree(info->filename); - kfree(info); -} -#else void gcov_info_free(struct gcov_info *info) { struct gcov_fn_info *fn, *tmp; list_for_each_entry_safe(fn, tmp, &info->functions, head) { - vfree(fn->counters); + kvfree(fn->counters); list_del(&fn->head); kfree(fn); } kfree(info->filename); kfree(info); } -#endif - -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. - */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} /** * convert_to_gcda - convert profiling data set to gcda file format @@ -521,7 +363,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v) * * Returns the number of bytes that were/would have been stored into the buffer. */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +size_t convert_to_gcda(char *buffer, struct gcov_info *info) { struct gcov_fn_info *fi_ptr; size_t pos = 0; @@ -535,21 +377,10 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) u32 i; pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); -#if CONFIG_CLANG_VERSION < 110000 - pos += store_gcov_u32(buffer, pos, - fi_ptr->use_extra_checksum ? 3 : 2); -#else pos += store_gcov_u32(buffer, pos, 3); -#endif pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); -#if CONFIG_CLANG_VERSION < 110000 - if (fi_ptr->use_extra_checksum) - pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); -#else pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); -#endif - pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); for (i = 0; i < fi_ptr->num_counters; i++) @@ -558,102 +389,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) return pos; } - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 82babf5aa077..5c3086cad8f9 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/seq_file.h> +#include <linux/mm.h> #include "gcov.h" /** @@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str) } __setup("gcov_persist=", gcov_persist_setup); +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + size_t size; + loff_t pos; + char buffer[]; +}; + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +static struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + size_t size; + + /* Dry-run to get the actual buffer size. */ + size = convert_to_gcda(NULL, info); + + iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL); + if (!iter) + return NULL; + + iter->info = info; + iter->size = size; + convert_to_gcda(iter->buffer, info); + + return iter; +} + + +/** + * gcov_iter_free - free iterator data + * @iter: file iterator + */ +static void gcov_iter_free(struct gcov_iterator *iter) +{ + kvfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +static void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +static int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} + /* * seq_file.start() implementation for gcov data files. Note that the * gcov_iterator interface is designed to be more restrictive than seq_file diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index c53408a00d0b..460c12b7dfea 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -15,8 +15,7 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> -#include <linux/seq_file.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "gcov.h" #if (__GNUC__ >= 10) @@ -310,7 +309,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) cv_size = sizeof(gcov_type) * sci_ptr->num; - dci_ptr->values = vmalloc(cv_size); + dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL); if (!dci_ptr->values) goto err_free; @@ -352,7 +351,7 @@ void gcov_info_free(struct gcov_info *info) ci_ptr = info->functions[fi_idx]->ctrs; for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) - vfree(ci_ptr->values); + kvfree(ci_ptr->values); kfree(info->functions[fi_idx]); } @@ -363,71 +362,6 @@ free_info: kfree(info); } -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. - */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} - /** * convert_to_gcda - convert profiling data set to gcda file format * @buffer: the buffer to store file data or %NULL if no data should be stored @@ -435,7 +369,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v) * * Returns the number of bytes that were/would have been stored into the buffer. */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +size_t convert_to_gcda(char *buffer, struct gcov_info *info) { struct gcov_fn_info *fi_ptr; struct gcov_ctr_info *ci_ptr; @@ -481,102 +415,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) return pos; } - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 6ab2c1808c9d..912b8ea01d33 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); bool gcov_info_within_module(struct gcov_info *info, struct module *mod); +size_t convert_to_gcda(char *buffer, struct gcov_info *info); /* Base interface. */ enum gcov_action { @@ -58,16 +59,9 @@ enum gcov_action { void gcov_event(enum gcov_action action, struct gcov_info *info); void gcov_enable_events(void); -/* Iterator control. */ -struct seq_file; -struct gcov_iterator; - -struct gcov_iterator *gcov_iter_new(struct gcov_info *info); -void gcov_iter_free(struct gcov_iterator *iter); -void gcov_iter_start(struct gcov_iterator *iter); -int gcov_iter_next(struct gcov_iterator *iter); -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq); -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter); +/* writing helpers */ +size_t store_gcov_u32(void *buffer, size_t off, u32 v); +size_t store_gcov_u64(void *buffer, size_t off, u64 v); /* gcov_info control. */ void gcov_info_reset(struct gcov_info *info); diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index c1510f0ab3ea..34a1dc2abc7d 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -36,7 +36,7 @@ all_dirs="$all_dirs $dir_list" # # When Kconfig regenerates include/generated/autoconf.h, its timestamp is # updated, but the contents might be still the same. When any CONFIG option is -# changed, Kconfig touches the corresponding timestamp file include/config/*.h. +# changed, Kconfig touches the corresponding timestamp file include/config/*. # Hence, the md5sum detects the configuration change anyway. We do not need to # check include/generated/autoconf.h explicitly. # diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index a23ac2bbf433..f8f23af6ab0d 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) irq_gc_unlock(gc); return 0; } +EXPORT_SYMBOL_GPL(irq_gc_set_wake); static u32 irq_readl_be(void __iomem *addr) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f42ef868efd3..6284443b87ec 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -295,8 +295,8 @@ void irq_domain_update_bus_token(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** - * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs - * @of_node: pointer to interrupt controller's device tree node. + * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs + * @fwnode: firmware node for the interrupt controller * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then @@ -312,15 +312,15 @@ EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); * irqs get mapped dynamically on the fly. However, if the controller requires * static virq assignments (non-DT boot) then it will set that up correctly. */ -struct irq_domain *irq_domain_add_simple(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; - domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); + domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data); if (!domain) return NULL; @@ -328,7 +328,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { /* attempt to allocated irq_descs */ int rc = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); + of_node_to_nid(to_of_node(fwnode))); if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); @@ -338,7 +338,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, return domain; } -EXPORT_SYMBOL_GPL(irq_domain_add_simple); +EXPORT_SYMBOL_GPL(irq_domain_create_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e8da1e71583a..23a7a0ba1388 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -19,7 +19,7 @@ #include <linux/notifier.h> #include <linux/smp.h> #include <asm/processor.h> - +#include <linux/kasan.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); @@ -70,6 +70,9 @@ bool irq_work_queue(struct irq_work *work) if (!irq_work_claim(work)) return false; + /*record irq_work call stack in order to print it in KASAN reports*/ + kasan_record_aux_stack(work); + /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); @@ -98,6 +101,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; + kasan_record_aux_stack(work); + preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index a0b6780740c8..f099baee3578 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -37,6 +37,7 @@ #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> +#include <linux/kmsg_dump.h> #include <asm/page.h> #include <asm/sections.h> @@ -1165,7 +1166,7 @@ int kernel_kexec(void) #endif { kexec_in_progress = true; - kernel_restart_prepare(NULL); + kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); /* @@ -1179,6 +1180,7 @@ int kernel_kexec(void) machine_shutdown(); } + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 5c3447cf7ad5..33400ff051a8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -740,8 +740,10 @@ static int kexec_calculate_store_digests(struct kimage *image) sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) + if (!sha_regions) { + ret = -ENOMEM; goto out_free_desc; + } desc->tfm = tfm; diff --git a/kernel/kmod.c b/kernel/kmod.c index 3cd075ce2a1e..b717134ebe17 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -58,7 +58,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. */ -char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH; static void free_modprobe_argv(struct subprocess_info *info) { diff --git a/kernel/module.c b/kernel/module.c index 20fb004e7d8d..b5dd92e35b02 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2807,7 +2807,11 @@ void * __weak module_alloc(unsigned long size) bool __weak module_init_section(const char *name) { +#ifndef CONFIG_MODULE_UNLOAD + return strstarts(name, ".init") || module_exit_section(name); +#else return strstarts(name, ".init"); +#endif } bool __weak module_exit_section(const char *name) @@ -3121,11 +3125,6 @@ static int rewrite_section_headers(struct load_info *info, int flags) */ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; -#ifndef CONFIG_MODULE_UNLOAD - /* Don't load .exit sections */ - if (module_exit_section(info->secstrings+shdr->sh_name)) - shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; -#endif } /* Track but don't keep modinfo and version sections. */ diff --git a/kernel/resource.c b/kernel/resource.c index 627e61b0c124..028a5ab18818 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -64,12 +64,8 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static struct resource *next_resource(struct resource *p, bool sibling_only) +static struct resource *next_resource(struct resource *p) { - /* Caller wants to traverse through siblings only */ - if (sibling_only) - return p->sibling; - if (p->child) return p->child; while (!p->sibling && p->parent) @@ -81,7 +77,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; (*pos)++; - return (void *)next_resource(p, false); + return (void *)next_resource(p); } #ifdef CONFIG_PROC_FS @@ -330,14 +326,10 @@ EXPORT_SYMBOL(release_resource); * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * - * This function walks the whole tree and not just first level children - * unless @first_lvl is true. - * * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have * @desc: descriptor the resource must have - * @first_lvl: walk only the first level children, if set * @res: return ptr, if resource found * * The caller must specify @start, @end, @flags, and @desc @@ -345,9 +337,8 @@ EXPORT_SYMBOL(release_resource); */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, struct resource *res) + struct resource *res) { - bool siblings_only = true; struct resource *p; if (!res) @@ -358,7 +349,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) { + for (p = iomem_resource.child; p; p = next_resource(p)) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -369,13 +360,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; - /* - * Now that we found a range that matches what we look for, - * check the flags and the descriptor. If we were not asked to - * use only the first level, start looking at children as well. - */ - siblings_only = first_lvl; - if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) @@ -402,14 +386,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, void *arg, + void *arg, int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { + !find_next_iomem_res(start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -431,7 +415,6 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @arg: function argument for the callback @func * @func: callback function that is called for each qualifying resource area * - * This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. * @@ -441,7 +424,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); + return __walk_iomem_res_desc(start, end, flags, desc, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); @@ -457,8 +440,8 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* @@ -470,17 +453,14 @@ int walk_mem_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. - * - * This will find System RAM ranges that are children of top-level resources - * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -495,8 +475,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && - !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - false, &res)) { + !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) { pfn = PFN_UP(res.start); end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) @@ -523,6 +502,34 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +static int __region_intersects(resource_size_t start, size_t size, + unsigned long flags, unsigned long desc) +{ + struct resource res; + int type = 0; int other = 0; + struct resource *p; + + res.start = start; + res.end = start + size - 1; + + for (p = iomem_resource.child; p ; p = p->sibling) { + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); + + if (resource_overlaps(p, &res)) + is_type ? type++ : other++; + } + + if (type == 0) + return REGION_DISJOINT; + + if (other == 0) + return REGION_INTERSECTS; + + return REGION_MIXED; +} + /** * region_intersects() - determine intersection of region with known resources * @start: region start address @@ -546,31 +553,13 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - struct resource res; - int type = 0; int other = 0; - struct resource *p; - - res.start = start; - res.end = start + size - 1; + int ret; read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = (((p->flags & flags) == flags) && - ((desc == IORES_DESC_NONE) || - (desc == p->desc))); - - if (resource_overlaps(p, &res)) - is_type ? type++ : other++; - } + ret = __region_intersects(start, size, flags, desc); read_unlock(&resource_lock); - if (type == 0) - return REGION_DISJOINT; - - if (other == 0) - return REGION_INTERSECTS; - - return REGION_MIXED; + return ret; } EXPORT_SYMBOL_GPL(region_intersects); @@ -1171,31 +1160,16 @@ struct address_space *iomem_get_mapping(void) return smp_load_acquire(&iomem_inode)->i_mapping; } -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - * @flags: IO resource flags - */ -struct resource * __request_region(struct resource *parent, +static int __request_region_locked(struct resource *res, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = alloc_resource(GFP_KERNEL); - struct resource *orig_parent = parent; - - if (!res) - return NULL; res->name = name; res->start = start; res->end = start + n - 1; - write_lock(&resource_lock); - for (;;) { struct resource *conflict; @@ -1231,13 +1205,40 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - free_resource(res); - res = NULL; - break; + return -EBUSY; } + + return 0; +} + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string + * @flags: IO resource flags + */ +struct resource *__request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name, int flags) +{ + struct resource *res = alloc_resource(GFP_KERNEL); + int ret; + + if (!res) + return NULL; + + write_lock(&resource_lock); + ret = __request_region_locked(res, parent, start, n, name, flags); write_unlock(&resource_lock); - if (res && orig_parent == &iomem_resource) + if (ret) { + free_resource(res); + return NULL; + } + + if (parent == &iomem_resource) revoke_iomem(res); return res; @@ -1779,25 +1780,56 @@ static struct resource *__request_free_mem_region(struct device *dev, { resource_size_t end, addr; struct resource *res; + struct region_devres *dr = NULL; size = ALIGN(size, 1UL << PA_SECTION_SHIFT); end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); addr = end - size + 1UL; + res = alloc_resource(GFP_KERNEL); + if (!res) + return ERR_PTR(-ENOMEM); + + if (dev) { + dr = devres_alloc(devm_region_release, + sizeof(struct region_devres), GFP_KERNEL); + if (!dr) { + free_resource(res); + return ERR_PTR(-ENOMEM); + } + } + + write_lock(&resource_lock); for (; addr > size && addr >= base->start; addr -= size) { - if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; - if (dev) - res = devm_request_mem_region(dev, addr, size, name); - else - res = request_mem_region(addr, size, name); - if (!res) - return ERR_PTR(-ENOMEM); + if (!__request_region_locked(res, &iomem_resource, addr, size, + name, 0)) + break; + + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + write_unlock(&resource_lock); + + /* + * A driver is claiming this region so revoke any mappings. + */ + revoke_iomem(res); return res; } + write_unlock(&resource_lock); + + free_resource(res); + if (dr) + devres_free(dr); return ERR_PTR(-ERANGE); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1e63db4dbd9a..6ecd3f3a52b5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -119,8 +119,11 @@ struct seccomp_kaddfd { int fd; unsigned int flags; - /* To only be set on reply */ - int ret; + union { + bool setfd; + /* To only be set on reply */ + int ret; + }; struct completion completion; struct list_head list; }; @@ -1069,7 +1072,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) * that it has been handled. */ list_del_init(&addfd->list); - addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + if (!addfd->setfd) + addfd->ret = receive_fd(addfd->file, addfd->flags); + else + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, + addfd->flags); complete(&addfd->completion); } @@ -1583,8 +1590,8 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, return -EBADF; kaddfd.flags = addfd.newfd_flags; - kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? - addfd.newfd : -1; + kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; + kaddfd.fd = addfd.newfd; init_completion(&kaddfd.completion); ret = mutex_lock_interruptible(&filter->notify_lock); diff --git a/kernel/sys.c b/kernel/sys.c index 3d62c9599dc0..3a583a29815f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1590,7 +1590,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not - * infite. In case of RLIM_INFINITY the posix CPU timer code + * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && @@ -2029,7 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_lock for + * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); @@ -2041,7 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups - * for VMAs when updating these memvers so anything wrong written + * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ @@ -2143,7 +2143,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; /* - * arg_lock protects concurent updates of arg boundaries, we need + * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ @@ -2210,7 +2210,7 @@ static int prctl_set_mm(int opt, unsigned long addr, * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup - * command line argumets and ENV_START/END + * command line arguments and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: @@ -2258,8 +2258,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* - * If task has has_child_subreaper - all its decendants - * already have these flag too and new decendants will + * If task has has_child_subreaper - all its descendants + * already have these flag too and new descendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d24431782414..0ea8128468c3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -267,6 +267,11 @@ COND_SYSCALL(request_key); COND_SYSCALL(keyctl); COND_SYSCALL_COMPAT(keyctl); +/* security/landlock/syscalls.c */ +COND_SYSCALL(landlock_create_ruleset); +COND_SYSCALL(landlock_add_rule); +COND_SYSCALL(landlock_restrict_self); + /* arch/example/kernel/sys_example.c */ /* mm/fadvise.c */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0151c8c206c4..14edf84cc571 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1026,6 +1026,65 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } +/** + * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success or an error on write when the range check fails. + */ +int proc_dou8vec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp; + unsigned int min = 0, max = 255U, val; + u8 *data = table->data; + struct do_proc_douintvec_minmax_conv_param param = { + .min = &min, + .max = &max, + }; + int res; + + /* Do not support arrays yet. */ + if (table->maxlen != sizeof(u8)) + return -EINVAL; + + if (table->extra1) { + min = *(unsigned int *) table->extra1; + if (min > 255U) + return -EINVAL; + } + if (table->extra2) { + max = *(unsigned int *) table->extra2; + if (max > 255U) + return -EINVAL; + } + + tmp = *table; + + tmp.maxlen = sizeof(val); + tmp.data = &val; + val = *data; + res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); + if (res) + return res; + if (write) + *data = val; + return 0; +} +EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); + static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) @@ -1574,6 +1633,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dou8vec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2765,7 +2830,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_COMPACTION { .procname = "compact_memory", - .data = &sysctl_compact_memory, + .data = NULL, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, diff --git a/kernel/task_work.c b/kernel/task_work.c index e9316198c64b..1698fbe6f0e1 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -34,6 +34,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, { struct callback_head *head; + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + do { head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1d1a61371b5a..2cd902592fc1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -920,6 +920,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); + if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) + cs->id = CSID_GENERIC; if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 81fe2a33b80c..8a364aa9881a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1048,6 +1048,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); + systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..d2d7cf6cfe83 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -372,188 +372,34 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } -static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, - size_t bufsz) -{ - void __user *user_ptr = (__force void __user *)unsafe_ptr; - - buf[0] = 0; - - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)unsafe_ptr < TASK_SIZE) { - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } - fallthrough; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); - break; - case 'u': - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } -} - static DEFINE_RAW_SPINLOCK(trace_printk_lock); -#define BPF_TRACE_PRINTK_SIZE 1024 +#define MAX_TRACE_PRINTK_VARARGS 3 +#define BPF_TRACE_PRINTK_SIZE 1024 -static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + u32 *bin_args; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; - va_list ap; int ret; + ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args, + MAX_TRACE_PRINTK_VARARGS); + if (ret < 0) + return ret; + raw_spin_lock_irqsave(&trace_printk_lock, flags); - va_start(ap, fmt); - ret = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - /* vsnprintf() will not append null for zero-length strings */ - if (ret == 0) - buf[0] = '\0'; + ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); + trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - return ret; -} - -/* - * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s - */ -BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, - u64, arg2, u64, arg3) -{ - int i, mod[3] = {}, fmt_cnt = 0; - char buf[64], fmt_ptype; - void *unsafe_ptr = NULL; - bool str_seen = false; - - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - return -EINVAL; - - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) - return -EINVAL; - - if (fmt[i] != '%') - continue; - - if (fmt_cnt >= 3) - return -EINVAL; - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } else if (fmt[i] == 'p') { - mod[fmt_cnt]++; - if ((fmt[i + 1] == 'k' || - fmt[i + 1] == 'u') && - fmt[i + 2] == 's') { - fmt_ptype = fmt[i + 1]; - i += 2; - goto fmt_str; - } - - if (fmt[i + 1] == 'B') { - i++; - goto fmt_next; - } - - /* disallow any further format extensions */ - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - goto fmt_next; - } else if (fmt[i] == 's') { - mod[fmt_cnt]++; - fmt_ptype = fmt[i]; -fmt_str: - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - switch (fmt_cnt) { - case 0: - unsafe_ptr = (void *)(long)arg1; - arg1 = (long)buf; - break; - case 1: - unsafe_ptr = (void *)(long)arg2; - arg2 = (long)buf; - break; - case 2: - unsafe_ptr = (void *)(long)arg3; - arg3 = (long)buf; - break; - } - - bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, - sizeof(buf)); - goto fmt_next; - } - - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') - return -EINVAL; -fmt_next: - fmt_cnt++; - } + bpf_bprintf_cleanup(); -/* Horrid workaround for getting va_list handling working with different - * argument type combinations generically for 32 and 64 bit archs. - */ -#define __BPF_TP_EMIT() __BPF_ARG3_TP() -#define __BPF_TP(...) \ - bpf_do_trace_printk(fmt, ##__VA_ARGS__) - -#define __BPF_ARG1_TP(...) \ - ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_TP(arg1, ##__VA_ARGS__) \ - : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ - : __BPF_TP((u32)arg1, ##__VA_ARGS__))) - -#define __BPF_ARG2_TP(...) \ - ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ - : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ - : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) - -#define __BPF_ARG3_TP(...) \ - ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ - : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ - : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) - - return __BPF_TP_EMIT(); + return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -581,184 +427,27 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) } #define MAX_SEQ_PRINTF_VARARGS 12 -#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 -#define MAX_SEQ_PRINTF_STR_LEN 128 - -struct bpf_seq_printf_buf { - char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; -}; -static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); -static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; - int i, buf_used, copy_size, num_args; - u64 params[MAX_SEQ_PRINTF_VARARGS]; - struct bpf_seq_printf_buf *bufs; - const u64 *args = data; - - buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); - if (WARN_ON_ONCE(buf_used > 1)) { - err = -EBUSY; - goto out; - } - - bufs = this_cpu_ptr(&bpf_seq_printf_buf); - - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - goto out; - - if (data_len & 7) - goto out; - - for (i = 0; i < fmt_size; i++) { - if (fmt[i] == '%') { - if (fmt[i + 1] == '%') - i++; - else if (!data || !data_len) - goto out; - } - } + int err, num_args; + u32 *bin_args; + if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; num_args = data_len / 8; - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - /* only printable ascii for now. */ - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { - err = -EINVAL; - goto out; - } - - if (fmt[i] != '%') - continue; - - if (fmt[i + 1] == '%') { - i++; - continue; - } - - if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { - err = -E2BIG; - goto out; - } - - if (fmt_cnt >= num_args) { - err = -EINVAL; - goto out; - } - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - - /* skip optional "[0 +-][num]" width formating field */ - while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || - fmt[i] == ' ') - i++; - if (fmt[i] >= '1' && fmt[i] <= '9') { - i++; - while (fmt[i] >= '0' && fmt[i] <= '9') - i++; - } - - if (fmt[i] == 's') { - void *unsafe_ptr; - - /* try our best to copy */ - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - unsafe_ptr = (void *)(long)args[fmt_cnt]; - err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], - unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); - if (err < 0) - bufs->buf[memcpy_cnt][0] = '\0'; - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'p') { - if (fmt[i + 1] == 0 || - fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || - fmt[i + 1] == 'B') { - /* just kernel pointers */ - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - continue; - } - - /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ - if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { - err = -EINVAL; - goto out; - } - if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { - err = -EINVAL; - goto out; - } - - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - - copy_size = (fmt[i + 2] == '4') ? 4 : 16; - - err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - copy_size); - if (err < 0) - memset(bufs->buf[memcpy_cnt], 0, copy_size); - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - i += 2; - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'l') { - i++; - if (fmt[i] == 'l') - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x' && - fmt[i] != 'X') { - err = -EINVAL; - goto out; - } + err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); + if (err < 0) + return err; - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - } + seq_bprintf(m, fmt, bin_args); - /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give - * all of them to seq_printf(). - */ - seq_printf(m, fmt, params[0], params[1], params[2], params[3], - params[4], params[5], params[6], params[7], params[8], - params[9], params[10], params[11]); + bpf_bprintf_cleanup(); - err = seq_has_overflowed(m) ? -EOVERFLOW : 0; -out: - this_cpu_dec(bpf_seq_printf_buf_used); - return err; + return seq_has_overflowed(m) ? -EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) @@ -1367,6 +1056,14 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 29a6ebeebc9e..b8a0d1d564fb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void) } /** - * ftrace_graph_stop - set to permanently disable function graph tracincg + * ftrace_graph_stop - set to permanently disable function graph tracing * * In case of an error int function graph tracing, this is called * to try to keep function graph tracing from causing any more harm. @@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, /* * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses is unpredicatable anymore. + * since call sequence and return addresses are unpredictable anyway. * Ex: BPF trampoline may call original function and may skip frame * depending on type of BPF programs attached. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ba52d4e1314..2e8a3fde7104 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = { }; /* - * Used by the stack undwinder to know about dynamic ftrace trampolines. + * Used by the stack unwinder to know about dynamic ftrace trampolines. */ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { @@ -1090,7 +1090,7 @@ struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; int index; - int size; + int order; }; #define ENTRY_SIZE sizeof(struct dyn_ftrace) @@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling - * synchornize_rcu_tasks() will wait for those tasks to + * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ if (IS_ENABLED(CONFIG_PREEMPTION)) @@ -3156,15 +3156,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) if (WARN_ON(!count)) return -EINVAL; + /* We want to fill as much as possible, with no empty pages */ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); - order = get_count_order(pages); - - /* - * We want to fill as much as possible. No more than a page - * may be empty. - */ - if (!is_power_of_2(pages)) - order--; + order = fls(pages) - 1; again: pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); @@ -3181,7 +3175,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) ftrace_number_of_groups++; cnt = (PAGE_SIZE << order) / ENTRY_SIZE; - pg->size = cnt; + pg->order = order; if (cnt > count) cnt = count; @@ -3194,7 +3188,6 @@ ftrace_allocate_pages(unsigned long num_to_init) { struct ftrace_page *start_pg; struct ftrace_page *pg; - int order; int cnt; if (!num_to_init) @@ -3230,13 +3223,13 @@ ftrace_allocate_pages(unsigned long num_to_init) free_pages: pg = start_pg; while (pg) { - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } start_pg = pg->next; kfree(pg); pg = start_pg; - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); @@ -5407,7 +5400,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); * @reset - non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled - * If @ip is NULL, it failes to update filter. + * If @ip is NULL, it fails to update filter. */ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) @@ -5631,7 +5624,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - ftrace_match_records(iter->hash, parser->buffer, parser->idx); + int enable = !(iter->flags & FTRACE_ITER_NOTRACE); + + ftrace_process_regex(iter, parser->buffer, + parser->idx, enable); } trace_parser_put(parser); @@ -6221,6 +6217,7 @@ static int ftrace_process_locs(struct module *mod, p = start; pg = start_pg; while (p < end) { + unsigned long end_offset; addr = ftrace_call_adjust(*p++); /* * Some architecture linkers will pad between @@ -6231,7 +6228,8 @@ static int ftrace_process_locs(struct module *mod, if (!addr) continue; - if (pg->index == pg->size) { + end_offset = (pg->index+1) * sizeof(pg->records[0]); + if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ if (WARN_ON(!pg->next)) break; @@ -6359,7 +6357,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) } } -/* Clear any records from hashs */ +/* Clear any records from hashes */ static void clear_mod_from_hashes(struct ftrace_page *pg) { struct trace_array *tr; @@ -6400,7 +6398,6 @@ void ftrace_release_mod(struct module *mod) struct ftrace_page **last_pg; struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; - int order; mutex_lock(&ftrace_lock); @@ -6451,12 +6448,12 @@ void ftrace_release_mod(struct module *mod) /* Needs to be called outside of ftrace_lock */ clear_mod_from_hashes(pg); - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } tmp_page = pg->next; kfree(pg); - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } } @@ -6774,7 +6771,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct ftrace_mod_map *mod_map = NULL; struct ftrace_init_func *func, *func_next; struct list_head clear_hash; - int order; INIT_LIST_HEAD(&clear_hash); @@ -6812,10 +6808,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); - ftrace_number_of_pages -= 1 << order; + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 68744c51517e..2c0ee6484990 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -287,17 +287,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/** - * ring_buffer_event_time_stamp - return the event's extended timestamp - * @event: the event to get the timestamp of - * - * Returns the extended timestamp associated with a data event. - * An extended time_stamp is a 64-bit timestamp represented - * internally in a special way that makes the best use of space - * contained within a ring buffer event. This function decodes - * it and maps it to a straight u64 value. - */ -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) +static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -487,6 +477,8 @@ struct rb_time_struct { #endif typedef struct rb_time_struct rb_time_t; +#define MAX_NEST 5 + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -524,6 +516,7 @@ struct ring_buffer_per_cpu { unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; + u64 event_stamp[MAX_NEST]; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -749,6 +742,99 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +/* + * Enable this to make sure that the event passed to + * ring_buffer_event_time_stamp() is not committed and also + * is on the buffer that it passed in. + */ +//#define RB_VERIFY_EVENT +#ifdef RB_VERIFY_EVENT +static struct list_head *rb_list_head(struct list_head *list); +static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ + struct buffer_page *page = cpu_buffer->commit_page; + struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); + struct list_head *next; + long commit, write; + unsigned long addr = (unsigned long)event; + bool done = false; + int stop = 0; + + /* Make sure the event exists and is not committed yet */ + do { + if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) + done = true; + commit = local_read(&page->page->commit); + write = local_read(&page->write); + if (addr >= (unsigned long)&page->page->data[commit] && + addr < (unsigned long)&page->page->data[write]) + return; + + next = rb_list_head(page->list.next); + page = list_entry(next, struct buffer_page, list); + } while (!done); + WARN_ON_ONCE(1); +} +#else +static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ +} +#endif + + +static inline u64 rb_time_stamp(struct trace_buffer *buffer); + +/** + * ring_buffer_event_time_stamp - return the event's current time stamp + * @buffer: The buffer that the event is on + * @event: the event to get the time stamp of + * + * Note, this must be called after @event is reserved, and before it is + * committed to the ring buffer. And must be called from the same + * context where the event was reserved (normal, softirq, irq, etc). + * + * Returns the time stamp associated with the current event. + * If the event has an extended time stamp, then that is used as + * the time stamp to return. + * In the highly unlikely case that the event was nested more than + * the max nesting, then the write_stamp of the buffer is returned, + * otherwise current time is returned, but that really neither of + * the last two cases should ever happen. + */ +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; + unsigned int nest; + u64 ts; + + /* If the event includes an absolute time, then just use that */ + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) + return rb_event_time_stamp(event); + + nest = local_read(&cpu_buffer->committing); + verify_event(cpu_buffer, event); + if (WARN_ON_ONCE(!nest)) + goto fail; + + /* Read the current saved nesting level time stamp */ + if (likely(--nest < MAX_NEST)) + return cpu_buffer->event_stamp[nest]; + + /* Shouldn't happen, warn if it does */ + WARN_ONCE(1, "nest (%d) greater than max", nest); + + fail: + /* Can only fail on 32 bit */ + if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) + /* Screw it, just read the current time */ + ts = rb_time_stamp(cpu_buffer->buffer); + + return ts; +} + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from @@ -994,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer) return ts << DEBUG_SHIFT; } -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) +u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; @@ -2710,6 +2796,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, { unsigned length = info->length; u64 delta = info->delta; + unsigned int nest = local_read(&cpu_buffer->committing) - 1; + + if (!WARN_ON_ONCE(nest >= MAX_NEST)) + cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we @@ -2766,7 +2856,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event) return 0; case RINGBUF_TYPE_TIME_EXTEND: - return ring_buffer_event_time_stamp(event); + return rb_event_time_stamp(event); case RINGBUF_TYPE_TIME_STAMP: return 0; @@ -3064,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code - * does the trace_recursive_unlock() it will clear the TRANSTION bit + * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will @@ -3212,13 +3302,13 @@ static void dump_buffer_page(struct buffer_data_page *bpage, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3289,12 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; break; @@ -3451,7 +3541,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - info->ts = ts; } else { /* * Interrupted between C and E: @@ -3463,6 +3552,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ info->delta = 0; } + info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } @@ -4256,12 +4346,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp = delta; return; @@ -4286,12 +4376,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp = delta; return; @@ -4544,7 +4634,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4635,7 +4725,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -5021,6 +5111,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); + memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); + cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index a4b4bbf8c3bf..0b15e975d2c2 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Test module for in-kernel sythetic event creation and generation. + * Test module for in-kernel synthetic event creation and generation. * * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org> */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 915fe8790f04..560e4c8d3825 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * - * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) @@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { /* - * If filterd_no_pids is not empty, and the task's pid is listed + * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids @@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) (*pos)++; - /* pid already is +1 of the actual prevous bit */ + /* pid already is +1 of the actual previous bit */ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); /* Return pid + 1 to allow zero to be represented */ @@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(buf->buffer, cpu); + ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; @@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock); * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page - * (not reader page) in ring buffer, and this page will be rewrited + * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. @@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) #undef C #define C(a, b) b -/* These must match the bit postions in trace_iterator_flags */ +/* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL @@ -2390,14 +2390,13 @@ static void tracing_stop_tr(struct trace_array *tr) static int trace_save_cmdline(struct task_struct *tsk) { - unsigned pid, idx; + unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; - if (unlikely(tsk->pid > PID_MAX_DEFAULT)) - return 0; + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get @@ -2408,26 +2407,15 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; - idx = savedcmd->map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - /* - * Check whether the cmdline buffer at idx has a pid - * mapped. We are going to overwrite that entry so we - * need to clear the map_pid_to_cmdline. Otherwise we - * would read the new comm for the old pid. - */ - pid = savedcmd->map_cmdline_to_pid[idx]; - if (pid != NO_CMDLINE_MAP) - savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - + savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); @@ -2438,6 +2426,7 @@ static int trace_save_cmdline(struct task_struct *tsk) static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; + int tpid; if (!pid) { strcpy(comm, "<idle>"); @@ -2449,16 +2438,16 @@ static void __trace_find_cmdline(int pid, char comm[]) return; } - if (pid > PID_MAX_DEFAULT) { - strcpy(comm, "<...>"); - return; + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } } - - map = savedcmd->map_pid_to_cmdline[pid]; - if (map != NO_CMDLINE_MAP) - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - else - strcpy(comm, "<...>"); + strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) @@ -2737,12 +2726,13 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, unsigned int trace_ctx) { struct ring_buffer_event *entry; + struct trace_array *tr = trace_file->tr; int val; - *current_rb = trace_file->tr->array_buffer.buffer; + *current_rb = tr->array_buffer.buffer; - if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & - (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + if (!tr->no_filter_buffering_ref && + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); @@ -3116,6 +3106,40 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif /* CONFIG_STACKTRACE */ +static inline void +func_repeats_set_delta_ts(struct func_repeats_entry *entry, + unsigned long long delta) +{ + entry->bottom_delta_ts = delta & U32_MAX; + entry->top_delta_ts = (delta >> 32); +} + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct func_repeats_entry *entry; + struct ring_buffer_event *event; + u64 delta; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, + sizeof(*entry), trace_ctx); + if (!event) + return; + + delta = ring_buffer_event_time_stamp(buffer, event) - + last_info->ts_last_call; + + entry = ring_buffer_event_data(event); + entry->ip = last_info->ip; + entry->parent_ip = last_info->parent_ip; + entry->count = last_info->count; + func_repeats_set_delta_ts(entry, delta); + + __buffer_unlock_commit(buffer, event); +} + /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; @@ -3368,7 +3392,7 @@ int trace_array_vprintk(struct trace_array *tr, * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not - * be ever encorporated in normal use. + * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. @@ -3562,6 +3586,204 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) return tmp; } +/* Returns true if the string is safe to dereference from an event */ +static bool trace_safe_str(struct trace_iterator *iter, const char *str) +{ + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) + return true; + + /* OK if part of the temp seq buffer */ + if ((addr >= (unsigned long)iter->tmp_seq.buffer) && + (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + return true; + + /* Core rodata can not be freed */ + if (is_kernel_rodata(addr)) + return true; + + if (trace_is_tracepoint_string(str)) + return true; + + /* + * Now this could be a module event, referencing core module + * data, which is OK. + */ + if (!iter->ent) + return false; + + trace_event = ftrace_find_event(iter->ent->type); + if (!trace_event) + return false; + + event = container_of(trace_event, struct trace_event_call, event); + if (!event->mod) + return false; + + /* Would rather have rodata, but this will suffice */ + if (within_module_core(addr, event->mod)) + return true; + + return false; +} + +static const char *show_buffer(struct trace_seq *s) +{ + struct seq_buf *seq = &s->seq; + + seq_buf_terminate(seq); + + return seq->buffer; +} + +static DEFINE_STATIC_KEY_FALSE(trace_no_verify); + +static int test_can_verify_check(const char *fmt, ...) +{ + char buf[16]; + va_list ap; + int ret; + + /* + * The verifier is dependent on vsnprintf() modifies the va_list + * passed to it, where it is sent as a reference. Some architectures + * (like x86_32) passes it by value, which means that vsnprintf() + * does not modify the va_list passed to it, and the verifier + * would then need to be able to understand all the values that + * vsnprintf can use. If it is passed by value, then the verifier + * is disabled. + */ + va_start(ap, fmt); + vsnprintf(buf, 16, "%d", ap); + ret = va_arg(ap, int); + va_end(ap); + + return ret; +} + +static void test_can_verify(void) +{ + if (!test_can_verify_check("%d %d", 0, 1)) { + pr_info("trace event string verifier disabled\n"); + static_branch_inc(&trace_no_verify); + } +} + +/** + * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed + * @fmt: The format used to print the event + * @ap: The va_list holding the data to print from @fmt. + * + * This writes the data into the @iter->seq buffer using the data from + * @fmt and @ap. If the format has a %s, then the source of the string + * is examined to make sure it is safe to print, otherwise it will + * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string + * pointer. + */ +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap) +{ + const char *p = fmt; + const char *str; + int i, j; + + if (WARN_ON_ONCE(!fmt)) + return; + + if (static_branch_unlikely(&trace_no_verify)) + goto print; + + /* Don't bother checking when doing a ftrace_dump() */ + if (iter->fmt == static_fmt_buf) + goto print; + + while (*p) { + j = 0; + + /* We only care about %s and variants */ + for (i = 0; p[i]; i++) { + if (i + 1 >= iter->fmt_size) { + /* + * If we can't expand the copy buffer, + * just print it. + */ + if (!trace_iter_expand_format(iter)) + goto print; + } + + if (p[i] == '\\' && p[i+1]) { + i++; + continue; + } + if (p[i] == '%') { + /* Need to test cases like %08.*s */ + for (j = 1; p[i+j]; j++) { + if (isdigit(p[i+j]) || + p[i+j] == '*' || + p[i+j] == '.') + continue; + break; + } + if (p[i+j] == 's') + break; + } + j = 0; + } + /* If no %s found then just print normally */ + if (!p[i]) + break; + + /* Copy up to the %s, and print that */ + strncpy(iter->fmt, p, i); + iter->fmt[i] = '\0'; + trace_seq_vprintf(&iter->seq, iter->fmt, ap); + + /* The ap now points to the string data of the %s */ + str = va_arg(ap, const char *); + + /* + * If you hit this warning, it is likely that the + * trace event in question used %s on a string that + * was saved at the time of the event, but may not be + * around when the trace is read. Use __string(), + * __assign_str() and __get_str() helpers in the TRACE_EVENT() + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ + if (WARN_ONCE(!trace_safe_str(iter, str), + "fmt: '%s' current_buffer: '%s'", + fmt, show_buffer(&iter->seq))) { + int ret; + + /* Try to safely read the string */ + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", str); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + str, iter->fmt); + str = "[UNSAFE-MEMORY]"; + strcpy(iter->fmt, "%s"); + } else { + strncpy(iter->fmt, p + i, j + 1); + iter->fmt[j+1] = '\0'; + } + trace_seq_printf(&iter->seq, iter->fmt, str); + + p += i + j + 1; + } + print: + if (*p) + trace_seq_vprintf(&iter->seq, p, ap); +} + const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; @@ -6768,7 +6990,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; - tt = event_triggers_call(tr->trace_marker_file, entry, event); + tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); } if (entry->buf[cnt - 1] != '\n') { @@ -6976,31 +7198,34 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) return ret; } -int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) +{ + if (rbe == this_cpu_read(trace_buffered_event)) + return ring_buffer_time_stamp(buffer); + + return ring_buffer_event_time_stamp(buffer, rbe); +} + +/* + * Set or disable using the per CPU trace_buffer_event when possible. + */ +int tracing_set_filter_buffering(struct trace_array *tr, bool set) { int ret = 0; mutex_lock(&trace_types_lock); - if (abs && tr->time_stamp_abs_ref++) + if (set && tr->no_filter_buffering_ref++) goto out; - if (!abs) { - if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { ret = -EINVAL; goto out; } - if (--tr->time_stamp_abs_ref) - goto out; + --tr->no_filter_buffering_ref; } - - ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs); - -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); -#endif out: mutex_unlock(&trace_types_lock); @@ -7336,11 +7561,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) * @cmd: The tracing command that caused the error * @str: The string to position the caret at within @cmd * - * Finds the position of the first occurence of @str within @cmd. The + * Finds the position of the first occurrence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * - * Returns the index within @cmd of the first occurence of @str or 0 + * Returns the index within @cmd of the first occurrence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) @@ -7890,7 +8115,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { @@ -7899,7 +8124,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(trace_buf->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); @@ -8906,6 +9131,7 @@ static int __remove_instance(struct trace_array *tr) ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); + free_percpu(tr->last_func_repeats); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { @@ -9123,7 +9349,7 @@ int tracing_init_dentry(void) * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still - * work with the newer kerenl. + * work with the newer kernel. */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); @@ -9676,6 +9902,8 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); + test_can_verify(); + return 0; out_free_savedcmd: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a6446c03cfbc..cd80d046c7a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,6 +45,7 @@ enum trace_type { TRACE_BPUTS, TRACE_HWLAT, TRACE_RAW_DATA, + TRACE_FUNC_REPEATS, __TRACE_LAST_TYPE, }; @@ -262,6 +263,17 @@ struct cond_snapshot { }; /* + * struct trace_func_repeats - used to keep track of the consecutive + * (on the same CPU) calls of a single function. + */ +struct trace_func_repeats { + unsigned long ip; + unsigned long parent_ip; + unsigned long count; + u64 ts_last_call; +}; + +/* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: @@ -352,11 +364,12 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif - int time_stamp_abs_ref; + int no_filter_buffering_ref; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; #endif + struct trace_func_repeats __percpu *last_func_repeats; }; enum { @@ -372,7 +385,8 @@ extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); -extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); +extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); @@ -441,6 +455,8 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct func_repeats_entry, \ + TRACE_FUNC_REPEATS); \ __ftrace_bad_type(); \ } while (0) @@ -581,7 +597,10 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); +bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap); int trace_empty(struct trace_iterator *iter); @@ -676,6 +695,10 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } #endif /* CONFIG_STACKTRACE */ +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx); + extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); @@ -1329,7 +1352,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry, event); + *tt = event_triggers_call(file, buffer, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1343,7 +1366,7 @@ __event_trigger_test_discard(struct trace_event_file *file, /** * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself @@ -1370,7 +1393,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, /** * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself @@ -1626,7 +1649,7 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aaf6793ededa..c1637f90c8a3 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; - u64 now; + u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = sched_clock_cpu(this_cpu); + /* - * If in an NMI context then dont risk lockups and return the - * cpu_clock() time: + * The global clock "guarantees" that the events are ordered + * between CPUs. But if two events on two different CPUS call + * trace_clock_global at roughly the same time, it really does + * not matter which one gets the earlier time. Just make sure + * that the same CPU will always show a monotonic clock. + * + * Use a read memory barrier to get the latest written + * time that was recorded. */ - if (unlikely(in_nmi())) - goto out; + smp_rmb(); + prev_time = READ_ONCE(trace_clock_struct.prev_time); + now = sched_clock_cpu(this_cpu); - arch_spin_lock(&trace_clock_struct.lock); + /* Make sure that now is always greater than prev_time */ + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; /* - * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_time+1, to make sure - * we start ticking with the local clock from now on? + * If in an NMI context then dont risk lockups and simply return + * the current time. */ - if ((s64)(now - trace_clock_struct.prev_time) < 0) - now = trace_clock_struct.prev_time + 1; + if (unlikely(in_nmi())) + goto out; - trace_clock_struct.prev_time = now; + /* Tracing can cause strange recursion, always use a try lock */ + if (arch_spin_trylock(&trace_clock_struct.lock)) { + /* Reread prev_time in case it was already updated */ + prev_time = READ_ONCE(trace_clock_struct.prev_time); + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; - arch_spin_unlock(&trace_clock_struct.lock); + trace_clock_struct.prev_time = now; + /* The unlock acts as the wmb for the above rmb */ + arch_spin_unlock(&trace_clock_struct.lock); + } out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4547ac59da61..251c819cf0c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -338,3 +338,25 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->nmi_total_ts, __entry->nmi_count) ); + +#define FUNC_REPEATS_GET_DELTA_TS(entry) \ + (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \ + +FTRACE_ENTRY(func_repeats, func_repeats_entry, + + TRACE_FUNC_REPEATS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( u16 , count ) + __field( u16 , top_delta_ts ) + __field( u32 , bottom_delta_ts ) + ), + + F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)", + (void *)__entry->ip, + (void *)__entry->parent_ip, + __entry->count, + FUNC_REPEATS_GET_DELTA_TS(__entry)) +); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 288ad2c274fb..03be4435d103 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses - * suprises + * surprises */ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) perf_trace_t; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3563afd412d..80e96989770e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -217,6 +217,214 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field = call->class->fields_array; + const char *array_descriptor; + const char *p = fmt; + int len; + + if (!(len = str_has_prefix(fmt, "REC->"))) + return false; + fmt += len; + for (p = fmt; *p; p++) { + if (!isalnum(*p) && *p != '_') + break; + } + len = p - fmt; + + for (; field->type; field++) { + if (strncmp(field->name, fmt, len) || + field->name[len]) + continue; + array_descriptor = strchr(field->type, '['); + /* This is an array and is OK to dereference. */ + return array_descriptor != NULL; + } + return false; +} + +/* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and + * much later referenced after the pointer was freed. Dereferencing + * pointers are OK, if it is dereferenced into the event itself. + */ +static void test_event_printk(struct trace_event_call *call) +{ + u64 dereference_flags = 0; + bool first = true; + const char *fmt, *c, *r, *a; + int parens = 0; + char in_quote = 0; + int start_arg = 0; + int arg = 0; + int i; + + fmt = call->print_fmt; + + if (!fmt) + return; + + for (i = 0; fmt[i]; i++) { + switch (fmt[i]) { + case '\\': + i++; + if (!fmt[i]) + return; + continue; + case '"': + case '\'': + /* + * The print fmt starts with a string that + * is processed first to find %p* usage, + * then after the first string, the print fmt + * contains arguments that are used to check + * if the dereferenced %p* usage is safe. + */ + if (first) { + if (fmt[i] == '\'') + continue; + if (in_quote) { + arg = 0; + first = false; + /* + * If there was no %p* uses + * the fmt is OK. + */ + if (!dereference_flags) + return; + } + } + if (in_quote) { + if (in_quote == fmt[i]) + in_quote = 0; + } else { + in_quote = fmt[i]; + } + continue; + case '%': + if (!first || !in_quote) + continue; + i++; + if (!fmt[i]) + return; + switch (fmt[i]) { + case '%': + continue; + case 'p': + /* Find dereferencing fields */ + switch (fmt[i + 1]) { + case 'B': case 'R': case 'r': + case 'b': case 'M': case 'm': + case 'I': case 'i': case 'E': + case 'U': case 'V': case 'N': + case 'a': case 'd': case 'D': + case 'g': case 't': case 'C': + case 'O': case 'f': + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + } + break; + default: + { + bool star = false; + int j; + + /* Increment arg if %*s exists. */ + for (j = 0; fmt[i + j]; j++) { + if (isdigit(fmt[i + j]) || + fmt[i + j] == '.') + continue; + if (fmt[i + j] == '*') { + star = true; + continue; + } + if ((fmt[i + j] == 's') && star) + arg++; + break; + } + break; + } /* default */ + + } /* switch */ + arg++; + continue; + case '(': + if (in_quote) + continue; + parens++; + continue; + case ')': + if (in_quote) + continue; + parens--; + if (WARN_ONCE(parens < 0, + "Paren mismatch for event: %s\narg='%s'\n%*s", + trace_event_name(call), + fmt + start_arg, + (i - start_arg) + 5, "^")) + return; + continue; + case ',': + if (in_quote || parens) + continue; + i++; + while (isspace(fmt[i])) + i++; + start_arg = i; + if (!(dereference_flags & (1ULL << arg))) + goto next_arg; + + /* Find the REC-> in the argument */ + c = strchr(fmt + i, ','); + r = strstr(fmt + i, "REC->"); + if (r && (!c || r < c)) { + /* + * Addresses of events on the buffer, + * or an array on the buffer is + * OK to dereference. + * There's ways to fool this, but + * this is to catch common mistakes, + * not malicious code. + */ + a = strchr(fmt + i, '&'); + if ((a && (a < r)) || test_field(r, call)) + dereference_flags &= ~(1ULL << arg); + } + next_arg: + i--; + arg++; + } + } + + /* + * If you triggered the below warning, the trace event reported + * uses an unsafe dereference pointer %p*. As the data stored + * at the trace event time may no longer exist when the trace + * event is printed, dereferencing to the original source is + * unsafe. The source of the dereference must be copied into the + * event itself, and the dereference must access the copy instead. + */ + if (WARN_ON_ONCE(dereference_flags)) { + arg = 1; + while (!(dereference_flags & 1)) { + dereference_flags >>= 1; + arg++; + } + pr_warn("event %s has unsafe dereference of argument %d\n", + trace_event_name(call), arg); + pr_warn("print_fmt: %s\n", fmt); + } +} + int trace_event_raw_init(struct trace_event_call *call) { int id; @@ -225,6 +433,8 @@ int trace_event_raw_init(struct trace_event_call *call) if (!id) return -ENODEV; + test_event_printk(call); + return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); @@ -2436,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } /* - * Since calls are grouped by systems, the likelyhood that the + * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call @@ -2496,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) } /* - * Just create a decriptor for early init. A descriptor is required + * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e91259f6a722..c9124038b140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -256,7 +256,7 @@ enum { * is "&&" we don't call update_preds(). Instead continue to "c". As the * next token after "c" is not "&&" but the end of input, we first process the * "&&" by calling update_preds() for the "&&" then we process the "||" by - * callin updates_preds() with the values for processing "||". + * calling updates_preds() with the values for processing "||". * * What does that mean? What update_preds() does is to first save the "target" * of the program entry indexed by the current program entry's "target" @@ -296,7 +296,7 @@ enum { * and "FALSE" the program entry after that, we are now done with the first * pass. * - * Making the above "a || b && c" have a progam of: + * Making the above "a || b && c" have a program of: * prog[0] = { "a", 1, 2 } * prog[1] = { "b", 0, 2 } * prog[2] = { "c", 0, 3 } @@ -390,7 +390,7 @@ enum { * F: return FALSE * * As "r = a; if (!r) goto n5;" is obviously the same as - * "if (!a) goto n5;" without doing anything we can interperate the + * "if (!a) goto n5;" without doing anything we can interpret the * program as: * n1: if (!a) goto n5; * n2: if (!b) goto n5; @@ -1693,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe) /** * create_filter - create a filter for a trace_event_call + * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter @@ -1741,8 +1742,8 @@ int create_event_filter(struct trace_array *tr, } /** - * create_system_filter - create a filter for an event_subsystem - * @system: event_subsystem to create a filter for + * create_system_filter - create a filter for an event subsystem + * @dir: the descriptor for the subsystem directory * @filter_str: filter string * @filterp: out param for created filter (always updated on return) * @@ -1750,7 +1751,6 @@ int create_event_filter(struct trace_array *tr, * and always remembers @filter_str. */ static int create_system_filter(struct trace_subsystem_dir *dir, - struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; @@ -1758,13 +1758,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir, err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { - err = process_system_preds(dir, tr, pe, filter_str); + err = process_system_preds(dir, dir->tr, pe, filter_str); if (!err) { /* System filters just show a default message */ kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(tr, pe, *filterp); + append_filter_err(dir->tr, pe, *filterp); } } create_filter_finish(pe); @@ -1852,7 +1852,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, goto out_unlock; } - err = create_system_filter(dir, tr, filter_string, &filter); + err = create_system_filter(dir, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 39ebe1826fc3..c1abd63f1d6c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -81,6 +81,7 @@ struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event); @@ -153,6 +154,7 @@ struct hist_field { static u64 hist_field_none(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field, static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field, static u64 hist_field_string(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field, static u64 hist_field_dynstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field, static u64 hist_field_log2(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, rbe, event); + u64 val = operand->fn(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 + val2; } static u64 hist_field_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 - val2; } static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, rbe, event); + s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ struct tracing_map_elt *elt, \ + struct trace_buffer *buffer, \ struct ring_buffer_event *rbe, \ void *event) \ { \ @@ -380,7 +391,8 @@ struct hist_trigger_data { struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); @@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, } static void action_trace(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -624,13 +637,14 @@ struct hist_var_data { static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_trigger_data *hist_data = hist_field->hist_data; struct trace_array *tr = hist_data->event_file->tr; - u64 ts = ring_buffer_event_time_stamp(rbe); + u64 ts = ring_buffer_event_time_stamp(buffer, rbe); if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) ts = ns2usecs(ts); @@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, static u64 hist_field_cpu(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data, } static inline void __update_field_vars(struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec, struct field_var **field_vars, @@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, rbe, rec); + var_val = val->fn(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, static void update_field_vars(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec) { - __update_field_vars(elt, rbe, rec, hist_data->field_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars, hist_data->n_field_vars, 0); } static void save_track_data_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, rbe, rec, hist_data->save_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars, hist_data->n_save_vars, hist_data->n_field_var_str); } @@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data, } static void save_track_data(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { if (data->track_data.save_data) - data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->track_data.save_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } static bool check_track_val(struct tracing_map_elt *elt, @@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) @@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) return false; } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) {} @@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m, } static void ontrack_action(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data, if (check_track_val(elt, data, var_val)) { save_track_val(hist_data, elt, data, var_val); - save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + save_track_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } } @@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, u64 *var_ref_vals) { @@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } } - update_field_vars(hist_data, elt, rbe, rec); + update_field_vars(hist_data, elt, buffer, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key, static void hist_trigger_actions(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, u64 *var_ref_vals) { @@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals); } } -static void event_hist_trigger(struct event_trigger_data *data, void *rec, +static void event_hist_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; @@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, rbe, rec); + field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, if (!elt) return; - hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5456,7 +5484,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - tracing_set_time_stamp_abs(file->tr, true); + tracing_set_filter_buffering(file->tr, true); } if (named_data) @@ -5564,7 +5592,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->enable_timestamps) { if (!hist_data->remove || unregistered) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); } } @@ -5611,7 +5639,7 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } @@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec, +hist_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, } static void -hist_enable_count_trigger(struct event_trigger_data *data, void *rec, +hist_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec, event); + hist_enable_trigger(data, buffer, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8d71e6c83f10..2ac75eb6aa86 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se) /** * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event + * @event_name: The name of the new synthetic event * * Delete a synthetic event that was created with synth_event_create(). * diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f725802160c0..b8bfa8505b7b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. */ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, if (data->paused) continue; if (!rec) { - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); } return tt; } @@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL); + data->ops->func(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data) } static void -traceon_trigger(struct event_trigger_data *data, void *rec, +traceon_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec, } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec, +traceon_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_trigger(struct event_trigger_data *data, void *rec, +traceoff_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec, +traceoff_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec, +snapshot_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; @@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec, } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec, +snapshot_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec, event); + snapshot_trigger(data, buffer, rec, event); } static int @@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec, +stacktrace_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec, +stacktrace_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec, event); + stacktrace_trigger(data, buffer, rec, event); } static int @@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec, +event_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec, } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec, +event_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec, event); + event_enable_trigger(data, buffer, rec, event); } int event_enable_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f93723ca66bc..1f0e63f5d1f9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -27,13 +27,28 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ enum { - TRACE_FUNC_OPT_STACK = 0x1, + + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_NO_REPEATS = 0x2, + + /* Update this to next highest bit. */ + TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 }; +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) + int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; @@ -86,6 +101,34 @@ void ftrace_destroy_function_files(struct trace_array *tr) ftrace_free_ftrace_ops(tr); } +static ftrace_func_t select_trace_function(u32 flags_val) +{ + switch (flags_val & TRACE_FUNC_OPT_MASK) { + case TRACE_FUNC_NO_OPTS: + return function_trace_call; + case TRACE_FUNC_OPT_STACK: + return function_stack_trace_call; + case TRACE_FUNC_OPT_NO_REPEATS: + return function_no_repeats_trace_call; + case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS: + return function_stack_no_repeats_trace_call; + default: + return NULL; + } +} + +static bool handle_func_repeats(struct trace_array *tr, u32 flags_val) +{ + if (!tr->last_func_repeats && + (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) { + tr->last_func_repeats = alloc_percpu(struct trace_func_repeats); + if (!tr->last_func_repeats) + return false; + } + + return true; +} + static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; @@ -97,12 +140,12 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - /* Currently only the global instance can do stack tracing */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && - func_flags.val & TRACE_FUNC_OPT_STACK) - func = function_stack_trace_call; - else - func = function_trace_call; + func = select_trace_function(func_flags.val); + if (!func) + return -EINVAL; + + if (!handle_func_repeats(tr, func_flags.val)) + return -ENOMEM; ftrace_init_array_ops(tr, func); @@ -205,15 +248,137 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } +static inline bool is_repeat_check(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned long ip, unsigned long parent_ip) +{ + if (last_info->ip == ip && + last_info->parent_ip == parent_ip && + last_info->count < U16_MAX) { + last_info->ts_last_call = + ring_buffer_time_stamp(tr->array_buffer.buffer); + last_info->count++; + return true; + } + + return false; +} + +static inline void process_repeats(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + if (last_info->count) { + trace_last_func_repeats(tr, last_info, trace_ctx); + last_info->count = 0; + } + + last_info->ip = ip; + last_info->parent_ip = parent_ip; +} + +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + unsigned long flags; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (atomic_read(&data->disabled)) + goto out; + + /* + * An interrupt may happen at any place here. But as far as I can see, + * the only damage that this can cause is to mess up the repetition + * counter without valuable data being lost. + * TODO: think about a solution that is better than just hoping to be + * lucky. + */ + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + local_save_flags(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + +out: + ftrace_test_recursion_unlock(bit); + preempt_enable_notrace(); +} + +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif + { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, { } /* Always set a last empty entry */ }; static struct tracer_flags func_flags = { - .val = 0, /* By default: all flags disabled */ + .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */ .opts = func_opts }; @@ -235,30 +400,32 @@ static struct tracer function_trace; static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - switch (bit) { - case TRACE_FUNC_OPT_STACK: - /* do nothing if already set */ - if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - break; - - /* We can change this flag when not running. */ - if (tr->current_trace != &function_trace) - break; + ftrace_func_t func; + u32 new_flags; - unregister_ftrace_function(tr->ops); + /* Do nothing if already set. */ + if (!!set == !!(func_flags.val & bit)) + return 0; - if (set) { - tr->ops->func = function_stack_trace_call; - register_ftrace_function(tr->ops); - } else { - tr->ops->func = function_trace_call; - register_ftrace_function(tr->ops); - } + /* We can change this flag only when not running. */ + if (tr->current_trace != &function_trace) + return 0; - break; - default: + new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + func = select_trace_function(new_flags); + if (!func) return -EINVAL; - } + + /* Check if there's anything to change. */ + if (tr->ops->func == func) + return 0; + + if (!handle_func_repeats(tr, new_flags)) + return -ENOMEM; + + unregister_ftrace_function(tr->ops); + tr->ops->func = func; + register_ftrace_function(tr->ops); return 0; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0aa6e6faa943..0de6837722da 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * - we are inside irq code * - we just entered irq code * - * retunns 0 if + * returns 0 if * - funcgraph-interrupts option is set * - we are not inside irq code */ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 34dc1a712dcb..632ef88131a9 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -83,7 +83,7 @@ struct hwlat_sample { u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec64 timestamp; /* wall time */ int nmi_count; /* # NMIs during this sample */ - int count; /* # of iteratons over threash */ + int count; /* # of iterations over thresh */ }; /* keep the global state somewhere. */ @@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr) } /** - * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop * * This kicks the running hardware latency sampling/detector kernel thread and * tells it to stop sampling now. Use this on unload and at system shutdown. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6fe770d86dc3..ea6178cb5e33 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) kretprobe_perf_func(tk, ri, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return 0; /* We don't tweak kernel, so just return 0 */ } NOKPROBE_SYMBOL(kretprobe_dispatcher); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 61255bad7e01..d0368a569bfa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) va_list ap; va_start(ap, fmt); - trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap); + trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); @@ -587,13 +587,26 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) return !trace_seq_has_overflowed(s); } +static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter, + unsigned long long ts) +{ + unsigned long secs, usec_rem; + unsigned long long t; + + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem); + } else + trace_seq_printf(s, " %12llu", ts); +} + int trace_print_context(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t; - unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); @@ -614,13 +627,8 @@ int trace_print_context(struct trace_iterator *iter) if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); - if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { - t = ns2usecs(iter->ts); - usec_rem = do_div(t, USEC_PER_SEC); - secs = (unsigned long)t; - trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); - } else - trace_seq_printf(s, " %12llu: ", iter->ts); + trace_print_time(s, iter, iter->ts); + trace_seq_puts(s, ": "); return !trace_seq_has_overflowed(s); } @@ -837,6 +845,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, return trace_handle_return(&iter->seq); } +static void print_fn_trace(struct trace_seq *s, unsigned long ip, + unsigned long parent_ip, int flags) +{ + seq_print_ip_sym(s, ip, flags); + + if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, parent_ip, flags); + } +} + /* TRACE_FN */ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -846,13 +865,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); - - if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, field->parent_ip, flags); - } - + print_fn_trace(s, field->ip, field->parent_ip, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1373,6 +1386,51 @@ static struct trace_event trace_raw_data_event = { .funcs = &trace_raw_data_funcs, }; +static enum print_line_t +trace_func_repeats_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lu %lu %u %llu\n", + field->ip, + field->parent_ip, + field->count, + FUNC_REPEATS_GET_DELTA_TS(field)); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_func_repeats_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + print_fn_trace(s, field->ip, field->parent_ip, flags); + trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); + trace_print_time(s, iter, + iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); + trace_seq_puts(s, ")\n"); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_func_repeats_funcs = { + .trace = trace_func_repeats_print, + .raw = trace_func_repeats_raw, +}; + +static struct trace_event trace_func_repeats_event = { + .type = TRACE_FUNC_REPEATS, + .funcs = &trace_func_repeats_funcs, +}; static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1385,6 +1443,7 @@ static struct trace_event *events[] __initdata = { &trace_print_event, &trace_hwlat_event, &trace_raw_data_event, + &trace_func_repeats_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ff32476df072..4b320fe7df70 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +bool trace_is_tracepoint_string(const char *str) +{ + const char **ptr = __start___tracepoint_str; + + for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) { + if (str == *ptr) + return true; + } + return false; +} + static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ec589a4612df..15413ad7cef2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type) if (!trace_probe_log.argv) return; - /* Recalcurate the length and allocate buffer */ + /* Recalculate the length and allocate buffer */ for (i = 0; i < trace_probe_log.argc; i++) { if (i == trace_probe_log.index) pos = len; @@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type) /** * Set the error position is next to the last arg + space. * Note that len includes the terminal null and the cursor - * appaers at pos + 1. + * appears at pos + 1. */ pos = len; offset = 0; @@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } /* - * Since $comm and immediate string can not be dereferred, + * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. */ if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7ce4027089ee..227d518e5ba5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -134,7 +134,7 @@ struct fetch_type { size_t size; /* Byte size of type */ int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ + const char *fmt; /* Format string */ const char *fmttype; /* Name in format file */ }; diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index e5282828f4a6..f003c5d02a3a 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -167,7 +167,7 @@ array: return code->op == FETCH_OP_END ? ret : -EILSEQ; } -/* Sum up total data length for dynamic arraies (strings) */ +/* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 73ef12092250..adf7ef194005 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes @@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1d84fcc78e3e..9c90b3a7dce2 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -16,7 +16,7 @@ * The buffer size is currently PAGE_SIZE, although it may become dynamic * in the future. * - * A write to the buffer will either succed or fail. That is, unlike + * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails @@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf() is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); * @c: simple character to record * * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple charater + * copy to user routines. This function records a simple character * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. */ @@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path) EXPORT_SYMBOL_GPL(trace_seq_path); /** - * trace_seq_to_user - copy the squence buffer to user space + * trace_seq_to_user - copy the sequence buffer to user space * @s: trace sequence descriptor * @ubuf: The userspace memory location to copy to * @cnt: The amount to copy @@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the - * sequenc (@s->len == @s->readpos). + * sequence (@s->len == @s->readpos). * * Returns -EFAULT if the copy to userspace fails. */ diff --git a/kernel/umh.c b/kernel/umh.c index 3f646613a9d3..36c123360ab8 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -27,6 +27,7 @@ #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> +#include <linux/initrd.h> #include <trace/events/module.h> @@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); + wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); @@ -336,8 +338,8 @@ static void helper_unlock(void) * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function * @init: an init function + * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info @@ -348,7 +350,7 @@ static void helper_unlock(void) * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * - * The cleanup function is just before ethe subprocess_info is about to + * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. @@ -384,7 +386,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa + * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call diff --git a/kernel/up.c b/kernel/up.c index c7321307923a..a38b8b095251 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -38,7 +38,7 @@ EXPORT_SYMBOL(smp_call_function_single_async); /* * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. + * same conditions in UP and SMP. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9a4b980d695b..8d62863721b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -85,7 +85,7 @@ int create_user_ns(struct cred *new) /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, - * by verifing that the root directory is at the root of the + * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; @@ -1014,7 +1014,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; ret = -EINVAL; } - /* Be very certaint the new map actually exists */ + /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; @@ -1169,7 +1169,7 @@ static bool new_idmap_permitted(const struct file *file, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. - * And the opener of the id file also had the approprpiate capability. + * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 107bc38b1945..7c397907d0e9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -154,7 +154,11 @@ static void lockup_detector_update_enable(void) #ifdef CONFIG_SOFTLOCKUP_DETECTOR -#define SOFTLOCKUP_RESET ULONG_MAX +/* + * Delay the soflockup report when running a known slow code. + * It does _not_ affect the timestamp of the last successdul reschedule. + */ +#define SOFTLOCKUP_DELAY_REPORT ULONG_MAX #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; @@ -169,10 +173,12 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +/* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +/* Timestamp of the last softlockup report. */ +static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -235,10 +241,16 @@ static void set_sample_period(void) watchdog_update_hrtimer_threshold(sample_period); } +static void update_report_ts(void) +{ + __this_cpu_write(watchdog_report_ts, get_timestamp()); +} + /* Commands for resetting the watchdog */ -static void __touch_watchdog(void) +static void update_touch_ts(void) { __this_cpu_write(watchdog_touch_ts, get_timestamp()); + update_report_ts(); } /** @@ -252,10 +264,10 @@ static void __touch_watchdog(void) notrace void touch_softlockup_watchdog_sched(void) { /* - * Preemption can be enabled. It doesn't matter which CPU's timestamp - * gets zeroed here, so use the raw_ operation. + * Preemption can be enabled. It doesn't matter which CPU's watchdog + * report period gets restarted here, so use the raw_ operation. */ - raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); + raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } notrace void touch_softlockup_watchdog(void) @@ -279,7 +291,7 @@ void touch_all_softlockup_watchdogs(void) * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) { - per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET; + per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT; wq_watchdog_touch(cpu); } } @@ -287,16 +299,16 @@ void touch_all_softlockup_watchdogs(void) void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); - __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); + __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } -static int is_softlockup(unsigned long touch_ts) +static int is_softlockup(unsigned long touch_ts, unsigned long period_ts) { unsigned long now = get_timestamp(); if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ - if (time_after(now, touch_ts + get_softlockup_thresh())) + if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; } return 0; @@ -332,7 +344,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __touch_watchdog(); + update_touch_ts(); complete(this_cpu_ptr(&softlockup_completion)); return 0; @@ -342,6 +354,7 @@ static int softlockup_fn(void *data) static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); + unsigned long period_ts = __this_cpu_read(watchdog_report_ts); struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; @@ -363,7 +376,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - if (touch_ts == SOFTLOCKUP_RESET) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a soft lockup. Check to see if the host + * stopped the vm before we process the timestamps. + */ + kvm_check_and_clear_guest_paused(); + + /* Reset the interval when touched by known problematic code. */ + if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically @@ -373,9 +394,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) sched_clock_tick(); } - /* Clear the guest paused flag on watchdog reset */ - kvm_check_and_clear_guest_paused(); - __touch_watchdog(); + update_report_ts(); return HRTIMER_RESTART; } @@ -385,31 +404,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) * indicate it is getting cpu time. If it hasn't then * this is a good indication some task is hogging the cpu */ - duration = is_softlockup(touch_ts); + duration = is_softlockup(touch_ts, period_ts); if (unlikely(duration)) { /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like a soft lockup, check to see if the host - * stopped the vm before we issue the warning + * Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping all cpu back traces. */ - if (kvm_check_and_clear_guest_paused()) - return HRTIMER_RESTART; - - /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) - return HRTIMER_RESTART; - if (softlockup_all_cpu_backtrace) { - /* Prevent multiple soft-lockup reports if one cpu is already - * engaged in dumping cpu back traces - */ - if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { - /* Someone else will report us. Let's give up */ - __this_cpu_write(soft_watchdog_warn, true); + if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn)) return HRTIMER_RESTART; - } } + /* Start period for the next softlockup warning. */ + update_report_ts(); + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -421,22 +429,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) dump_stack(); if (softlockup_all_cpu_backtrace) { - /* Avoid generating two back traces for current - * given that one is already made above - */ trigger_allbutself_cpu_backtrace(); - - clear_bit(0, &soft_lockup_nmi_warn); - /* Barrier to sync with other cpus */ - smp_mb__after_atomic(); + clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); - __this_cpu_write(soft_watchdog_warn, true); - } else - __this_cpu_write(soft_watchdog_warn, false); + } return HRTIMER_RESTART; } @@ -461,7 +461,7 @@ static void watchdog_enable(unsigned int cpu) HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ - __touch_watchdog(); + update_touch_ts(); /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); |