diff options
Diffstat (limited to 'kernel')
67 files changed, 3176 insertions, 994 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 486c968214d9..da8dc0db5bd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2261,6 +2261,33 @@ out: } /** + * audit_signal_info - record signal info for shutting down audit subsystem + * @sig: signal value + * @t: task being signaled + * + * If the audit subsystem is being terminated, record the task (pid) + * and uid that is doing that. + */ +int audit_signal_info(int sig, struct task_struct *t) +{ + kuid_t uid = current_uid(), auid; + + if (auditd_test_task(t) && + (sig == SIGTERM || sig == SIGHUP || + sig == SIGUSR1 || sig == SIGUSR2)) { + audit_sig_pid = task_tgid_nr(current); + auid = audit_get_loginuid(current); + if (uid_valid(auid)) + audit_sig_uid = auid; + else + audit_sig_uid = uid; + security_task_getsecid(current, &audit_sig_sid); + } + + return audit_signal_info_syscall(t); +} + +/** * audit_log_end - end one audit record * @ab: the audit_buffer * diff --git a/kernel/audit.h b/kernel/audit.h index 6c076d4982da..6fb7160412d4 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -286,7 +286,7 @@ extern const char *audit_tree_path(struct audit_tree *tree); extern void audit_put_tree(struct audit_tree *tree); extern void audit_kill_trees(struct audit_context *context); -extern int audit_signal_info(int sig, struct task_struct *t); +extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); @@ -317,7 +317,11 @@ extern struct list_head *audit_killed_trees(void); #define audit_tree_path(rule) "" /* never called */ #define audit_kill_trees(context) BUG() -#define audit_signal_info(s, t) AUDIT_DISABLED +static inline int audit_signal_info_syscall(struct task_struct *t) +{ + return 0; +} + #define audit_filter_inodes(t, c) AUDIT_DISABLED #endif /* CONFIG_AUDITSYSCALL */ diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9f8e190e3bea..b0126e9c0743 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -322,7 +322,7 @@ static u32 audit_to_op(u32 op) /* check if an audit field is valid */ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { - switch(f->type) { + switch (f->type) { case AUDIT_MSGTYPE: if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) @@ -334,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) break; } - switch(entry->rule.listnr) { + switch (entry->rule.listnr) { case AUDIT_FILTER_FS: switch(f->type) { case AUDIT_FSTYPE: @@ -345,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) } } - switch(f->type) { - default: - return -EINVAL; + /* Check for valid field type and op */ + switch (f->type) { + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + case AUDIT_PERS: /* <uapi/linux/personality.h> */ + case AUDIT_DEVMINOR: + /* all ops are valid */ + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -360,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_FSGID: case AUDIT_OBJ_GID: case AUDIT_PID: - case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: case AUDIT_SESSIONID: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + case AUDIT_SADDR_FAM: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: case AUDIT_WATCH: case AUDIT_DIR: case AUDIT_FILTERKEY: - break; case AUDIT_LOGINUID_SET: - if ((f->val != 0) && (f->val != 1)) - return -EINVAL; - /* FALL THROUGH */ case AUDIT_ARCH: case AUDIT_FSTYPE: + case AUDIT_PERM: + case AUDIT_FILETYPE: + case AUDIT_FIELD_COMPARE: + case AUDIT_EXE: + /* only equal and not equal valid ops */ if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; + default: + /* field not recognized */ + return -EINVAL; + } + + /* Check for select valid field values */ + switch (f->type) { + case AUDIT_LOGINUID_SET: + if ((f->val != 0) && (f->val != 1)) + return -EINVAL; + break; case AUDIT_PERM: if (f->val & ~15) return -EINVAL; @@ -412,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; - case AUDIT_EXE: - if (f->op != Audit_not_equal && f->op != Audit_equal) + case AUDIT_SADDR_FAM: + if (f->val >= AF_MAX) return -EINVAL; break; + default: + break; } + return 0; } @@ -1190,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right) case Audit_bittest: return ((left & right) == right); default: - BUG(); return 0; } } @@ -1213,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) case Audit_bitmask: case Audit_bittest: default: - BUG(); return 0; } } @@ -1236,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) case Audit_bitmask: case Audit_bittest: default: - BUG(); return 0; } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 95ae27edd417..4effe01ebbe2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name) - result = audit_watch_compare(rule->watch, name->ino, name->dev); + if (name) { + result = audit_watch_compare(rule->watch, + name->ino, + name->dev); + if (f->op == Audit_not_equal) + result = !result; + } break; case AUDIT_DIR: - if (ctx) + if (ctx) { result = match_tree_refs(ctx, rule->tree); + if (f->op == Audit_not_equal) + result = !result; + } break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(tsk), @@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; + case AUDIT_SADDR_FAM: + if (ctx->sockaddr) + result = audit_comparator(ctx->sockaddr->ss_family, + f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -684,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_PERM: result = audit_match_perm(ctx, f->val); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_FIELD_COMPARE: result = audit_field_compare(tsk, cred, f, ctx, name); @@ -2360,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t) } /** - * audit_signal_info - record signal info for shutting down audit subsystem - * @sig: signal value + * audit_signal_info_syscall - record signal info for syscalls * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ -int audit_signal_info(int sig, struct task_struct *t) +int audit_signal_info_syscall(struct task_struct *t) { struct audit_aux_data_pids *axp; struct audit_context *ctx = audit_context(); - kuid_t uid = current_uid(), auid, t_uid = task_uid(t); - - if (auditd_test_task(t) && - (sig == SIGTERM || sig == SIGHUP || - sig == SIGUSR1 || sig == SIGUSR2)) { - audit_sig_pid = task_tgid_nr(current); - auid = audit_get_loginuid(current); - if (uid_valid(auid)) - audit_sig_uid = auid; - else - audit_sig_uid = uid; - security_task_getsecid(current, &audit_sig_sid); - } + kuid_t t_uid = task_uid(t); if (!audit_signals || audit_dummy_context()) return 0; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4c2fa3ac56f6..29d781061cd5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o +CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 262a321f58a6..1c65ce0098a9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -75,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; + struct bpf_map_memory mem; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -108,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* make sure there is no u32 overflow later in round_up() */ cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - if (percpu) { + if (percpu) cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - } - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cad09858a5f2..546ebee39e2a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_resolve_source_only(index_type) || - btf_type_nosize_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type) || + btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_resolve_source_only(elem_type) || - btf_type_nosize_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type) || + btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_resolve_source_only(member_type) || - btf_type_nosize_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type) || + btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 92a7d0cf8d13..0a00eaca6fae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,19 +15,34 @@ #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> +#include <net/bpf_sk_storage.h> + +#include "../cgroup/cgroup-internal.h" DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +void cgroup_bpf_offline(struct cgroup *cgrp) +{ + cgroup_get(cgrp); + percpu_ref_kill(&cgrp->bpf.refcnt); +} + /** - * cgroup_bpf_put() - put references of all bpf programs - * @cgrp: the cgroup to modify + * cgroup_bpf_release() - put references of all bpf programs and + * release all cgroup bpf data + * @work: work structure embedded into the cgroup to modify */ -void cgroup_bpf_put(struct cgroup *cgrp) +static void cgroup_bpf_release(struct work_struct *work) { + struct cgroup *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; + struct bpf_prog_array *old_array; unsigned int type; + mutex_lock(&cgroup_mutex); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog_list *pl, *tmp; @@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp) kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_prog_array_free(cgrp->bpf.effective[type]); + old_array = rcu_dereference_protected( + cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + bpf_prog_array_free(old_array); } + + mutex_unlock(&cgroup_mutex); + + percpu_ref_exit(&cgrp->bpf.refcnt); + cgroup_put(cgrp); +} + +/** + * cgroup_bpf_release_fn() - callback used to schedule releasing + * of bpf cgroup data + * @ref: percpu ref counter structure + */ +static void cgroup_bpf_release_fn(struct percpu_ref *ref) +{ + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); + + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); + queue_work(system_wq, &cgrp->bpf.release_work); } /* count number of elements in the list. @@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, */ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu **array) + struct bpf_prog_array **array) { enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; @@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp, } } while ((p = cgroup_parent(p))); - rcu_assign_pointer(*array, progs); + *array = progs; return 0; } static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu *array) + struct bpf_prog_array *old_array) { - struct bpf_prog_array __rcu *old_array; - - old_array = xchg(&cgrp->bpf.effective[type], array); + rcu_swap_protected(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ @@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) * that array below is variable length */ #define NR ARRAY_SIZE(cgrp->bpf.effective) - struct bpf_prog_array __rcu *arrays[NR] = {}; - int i; + struct bpf_prog_array *arrays[NR] = {}; + int ret, i; + + ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, + GFP_KERNEL); + if (ret) + return ret; for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); @@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + + percpu_ref_exit(&cgrp->bpf.refcnt); + return -ENOMEM; } @@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + err = compute_effective_progs(desc, type, &desc->bpf.inactive); if (err) goto cleanup; @@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) { + if (unlikely(desc->bpf.inactive)) { + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + continue; + } + activate_effective_progs(desc, type, desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -441,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog_array *effective; int cnt, ret = 0, i; + effective = rcu_dereference_protected(cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + cnt = bpf_prog_array_length(effective); else cnt = prog_list_length(progs); @@ -461,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], - prog_ids, cnt); + return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); } else { struct bpf_prog_list *pl; u32 id; @@ -545,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -572,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -867,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +#ifdef CONFIG_NET +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. + */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); +#endif + static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { @@ -1127,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#ifdef CONFIG_NET + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e12ac382a90..7e98f36a14e2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1364,10 +1364,10 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> SRC); CONT; ALU_ARSH_K: - DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; @@ -1790,38 +1790,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) return &empty_prog_array.hdr; } -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || - progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + if (!progs || progs == &empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *array) +int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; - rcu_read_lock(); - item = rcu_dereference(array)->items; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; - rcu_read_unlock(); return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; + + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} -static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, +static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference_check(array, 1)->items; - for (; item->prog; item++) { + for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; @@ -1834,7 +1838,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; @@ -1845,18 +1849,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); - * so below kcalloc doesn't need extra cnt > 0 check, but - * bpf_prog_array_length() releases rcu lock and - * prog array could have been swapped with empty or larger array, - * so always copy 'cnt' prog_ids to the user. - * In a rare race the user will see zero prog_ids + * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; - rcu_read_lock(); nospc = bpf_prog_array_copy_core(array, ids, cnt); - rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) @@ -1866,19 +1864,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, +void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { - struct bpf_prog_array_item *item = array->items; + struct bpf_prog_array_item *item; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array) @@ -1942,7 +1940,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { @@ -2085,6 +2083,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func) return false; } +/* Return TRUE if the JIT backend wants verifier to enable sub-register usage + * analysis code and wants explicit zero extension inserted by verifier. + * Otherwise, return FALSE. + */ +bool __weak bpf_jit_needs_zext(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ @@ -2102,3 +2109,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8ebd0fa826f8..ef49e17ae47c 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_cmap; - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.pages); + ret = bpf_map_charge_init(&cmap->map.memory, cost); if (ret) { err = ret; goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) - goto free_cmap; + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) + goto free_charge; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * @@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); +free_charge: + bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); @@ -209,6 +210,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); @@ -332,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -347,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -403,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -486,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -512,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -536,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -588,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -619,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -629,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -644,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -663,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index cd8297b3bdb9..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -17,9 +17,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the @@ -48,9 +47,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -65,22 +68,17 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; - int err = -EINVAL; + int err, cpu; u64 cost; if (!capable(CAP_NET_ADMIN)) @@ -91,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); @@ -99,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_dtab; - - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + cost += sizeof(struct list_head) * num_possible_cpus(); - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) goto free_dtab; err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) - goto free_dtab; + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) + goto free_charge; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_dtab; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + free_percpu(dtab->flush_list); +free_charge: + bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -160,14 +163,14 @@ static void dev_map_free(struct bpf_map *map) rcu_barrier(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -183,7 +186,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -205,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -243,6 +238,7 @@ out: trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -265,31 +261,18 @@ error: * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!dev)) - continue; - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - - __clear_bit(bit, bitmap); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); rcu_read_unlock(); } @@ -316,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -329,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -379,17 +367,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } rcu_read_unlock(); } @@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 583df5cb302d..22066a62c8c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -352,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else cost += (u64) htab->elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - /* make sure page count doesn't overflow */ - goto free_htab; - - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&htab->map.memory, cost); if (err) goto free_htab; @@ -368,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_charge; if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; @@ -401,6 +395,8 @@ free_prealloc: prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); +free_charge: + bpf_map_charge_finish(&htab->map.memory); free_htab: kfree(htab); return ERR_PTR(err); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 980e8f1f6cb5..addd6fdceec8 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + struct bpf_map_memory mem; + int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); @@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); + ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); + if (ret < 0) + return ERR_PTR(ret); + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); - if (!map) + if (!map) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), - PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 57b59cca4db7..56e6c75d354d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -570,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; cost += (u64) attr->max_entries * cost_per_node; - if (cost >= U32_MAX - PAGE_SIZE) { - ret = -E2BIG; - goto out_err; - } - - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(trie->map.pages); + ret = bpf_map_charge_init(&trie->map.memory, cost); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0b140d236889..f697647ceb54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem = {0}; struct bpf_queue_stack *qs; u64 size, queue_size, cost; size = (u64) attr->max_entries + 1; cost = queue_size = sizeof(*qs) + size * attr->value_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) + if (!qs) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - qs->map.pages = cost; + bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 18e225de80ff..50c083ba978c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - u64 cost, array_size; + struct bpf_map_memory mem; + u64 array_size; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(cost); + err = bpf_map_charge_init(&mem, array_size); if (err) return ERR_PTR(err); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d38e49f943a1..052580c33d26 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -86,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; + struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -113,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + err = bpf_map_charge_init(&mem, cost); + if (err) + return ERR_PTR(err); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) + if (!smap) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); - - err = -E2BIG; - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_smap; + } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(smap->map.pages); - if (err) - goto free_smap; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_smap; + goto free_charge; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; + bpf_map_charge_move(&smap->map.memory, &mem); + return &smap->map; put_buffers: put_callchain_buffers(); -free_smap: +free_charge: + bpf_map_charge_finish(&mem); bpf_map_area_free(smap); return ERR_PTR(err); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5b30f8baaf02..5d141f16f6fa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -180,19 +180,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -int bpf_map_precharge_memlock(u32 pages) -{ - struct user_struct *user = get_current_user(); - unsigned long memlock_limit, cur; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - cur = atomic_long_read(&user->locked_vm); - free_uid(user); - if (cur + pages > memlock_limit) - return -EPERM; - return 0; -} - static int bpf_charge_memlock(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -206,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages) static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) { - atomic_long_sub(pages, &user->locked_vm); + if (user) + atomic_long_sub(pages, &user->locked_vm); } -static int bpf_map_init_memlock(struct bpf_map *map) +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) { - struct user_struct *user = get_current_user(); + u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; + struct user_struct *user; int ret; - ret = bpf_charge_memlock(user, map->pages); + if (size >= U32_MAX - PAGE_SIZE) + return -E2BIG; + + user = get_current_user(); + ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); return ret; } - map->user = user; - return ret; + + mem->pages = pages; + mem->user = user; + + return 0; } -static void bpf_map_release_memlock(struct bpf_map *map) +void bpf_map_charge_finish(struct bpf_map_memory *mem) { - struct user_struct *user = map->user; - bpf_uncharge_memlock(user, map->pages); - free_uid(user); + bpf_uncharge_memlock(mem->user, mem->pages); + free_uid(mem->user); +} + +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src) +{ + *dst = *src; + + /* Make sure src will not be used for the redundant uncharging. */ + memset(src, 0, sizeof(struct bpf_map_memory)); } int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) { int ret; - ret = bpf_charge_memlock(map->user, pages); + ret = bpf_charge_memlock(map->memory.user, pages); if (ret) return ret; - map->pages += pages; + map->memory.pages += pages; return ret; } void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) { - bpf_uncharge_memlock(map->user, pages); - map->pages -= pages; + bpf_uncharge_memlock(map->memory.user, pages); + map->memory.pages -= pages; } static int bpf_map_alloc_id(struct bpf_map *map) @@ -295,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct bpf_map_memory mem; - bpf_map_release_memlock(map); + bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); + bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -387,7 +393,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT, + map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); @@ -541,6 +547,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -565,7 +572,7 @@ static int map_create(union bpf_attr *attr) err = bpf_obj_name_cpy(map->name, attr->map_name); if (err) - goto free_map_nouncharge; + goto free_map; atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -575,20 +582,20 @@ static int map_create(union bpf_attr *attr) if (!attr->btf_value_type_id) { err = -EINVAL; - goto free_map_nouncharge; + goto free_map; } btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); - goto free_map_nouncharge; + goto free_map; } err = map_check_btf(map, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) { btf_put(btf); - goto free_map_nouncharge; + goto free_map; } map->btf = btf; @@ -600,15 +607,11 @@ static int map_create(union bpf_attr *attr) err = security_bpf_map_alloc(map); if (err) - goto free_map_nouncharge; - - err = bpf_map_init_memlock(map); - if (err) - goto free_map_sec; + goto free_map; err = bpf_map_alloc_id(map); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -624,13 +627,13 @@ static int map_create(union bpf_attr *attr) return err; -free_map: - bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); -free_map_nouncharge: +free_map: btf_put(map->btf); + bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); + bpf_map_charge_finish(&mem); return err; } @@ -1579,6 +1582,22 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1598,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | + BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -1827,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? + -EINVAL : 0; default: return 0; } @@ -1895,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1978,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2014,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a5c369e60343..a2e763703c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,7 +168,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL @@ -326,7 +326,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK; + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -398,6 +399,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", }; static char slot_type_char[] = { @@ -445,12 +447,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); - if (t == PTR_TO_STACK) - verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -512,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, "=%s", - reg_type_str[state->stack[i].spilled_ptr.type]); - else + if (state->stack[i].slot_type[0] == STACK_SPILL) { + reg = &state->stack[i].spilled_ptr; + t = reg->type; + verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { verbose(env, "=%s", types_buf); + } } if (state->acquired_refs && state->refs[0].id) { verbose(env, " refs=%d", state->refs[0].id); @@ -665,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -674,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } + clear_jmp_history(state); if (free_self) kfree(state); } @@ -701,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; + u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; int i, err; + if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { + kfree(dst_state->jmp_history); + dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + } + memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); @@ -711,6 +737,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; dst_state->active_spin_lock = src->active_spin_lock; + dst_state->branches = src->branches; + dst_state->parent = src->parent; + dst_state->first_insn_idx = src->first_insn_idx; + dst_state->last_insn_idx = src->last_insn_idx; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -726,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + while (st) { + u32 br = --st->branches; + + /* WARN_ON(br > 1) technically makes sense here, + * but see comment in push_stack(), hence: + */ + WARN_ONCE((int)br < 0, + "BUG update_branch_counts:branches_to_explore=%d\n", + br); + if (br) + break; + st = st->parent; + } +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -774,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; elem->st.speculative |= speculative; - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, "The sequence of %d jumps is too complex.\n", + env->stack_size); goto err; } + if (elem->st.parent) { + ++elem->st.parent->branches; + /* WARN_ON(branches > 2) technically makes sense here, + * but + * 1. speculative states will bump 'branches' for non-branch + * instructions + * 2. is_state_visited() heuristics may decide not to create + * a new state for a sequence of branches and all such current + * and cloned states will be pointing to a single parent state + * which might have large 'branches' count. + */ + } return &elem->st; err: free_verifier_state(env->cur_state, true); @@ -925,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; + + /* constant backtracking is enabled for root only for now */ + reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -973,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); } +#define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { @@ -983,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env, mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; regs[i].parent = NULL; + regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ @@ -1128,7 +1193,7 @@ next: */ static int mark_reg_read(struct bpf_verifier_env *env, const struct bpf_reg_state *state, - struct bpf_reg_state *parent) + struct bpf_reg_state *parent, u8 flag) { bool writes = parent == state->parent; /* Observe write marks */ int cnt = 0; @@ -1143,17 +1208,26 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } - if (parent->live & REG_LIVE_READ) + /* The first condition is more likely to be true than the + * second, checked it first. + */ + if ((parent->live & REG_LIVE_READ) == flag || + parent->live & REG_LIVE_READ64) /* The parentage chain never changes and * this parent was already marked as LIVE_READ. * There is no need to keep walking the chain again and * keep re-marking all parents as LIVE_READ. * This case happens when the same register is read * multiple times without writes into it in-between. + * Also, if parent has the stronger REG_LIVE_READ64 set, + * then no need to set the weak REG_LIVE_READ32. */ break; /* ... then we depend on parent's value */ - parent->live |= REG_LIVE_READ; + parent->live |= flag; + /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ + if (flag == REG_LIVE_READ64) + parent->live &= ~REG_LIVE_READ32; state = parent; parent = state->parent; writes = true; @@ -1165,12 +1239,129 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +/* This function is supposed to be used by the following 32-bit optimization + * code only. It returns TRUE if the source or destination register operates + * on 64-bit, otherwise return FALSE. + */ +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +{ + u8 code, class, op; + + code = insn->code; + class = BPF_CLASS(code); + op = BPF_OP(code); + if (class == BPF_JMP) { + /* BPF_EXIT for "main" will reach here. Return TRUE + * conservatively. + */ + if (op == BPF_EXIT) + return true; + if (op == BPF_CALL) { + /* BPF to BPF call will reach here because of marking + * caller saved clobber with DST_OP_NO_MARK for which we + * don't care the register def because they are anyway + * marked as NOT_INIT already. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) + return false; + /* Helper call will reach here because of arg type + * check, conservatively return TRUE. + */ + if (t == SRC_OP) + return true; + + return false; + } + } + + if (class == BPF_ALU64 || class == BPF_JMP || + /* BPF_END always use BPF_ALU class. */ + (class == BPF_ALU && op == BPF_END && insn->imm == 64)) + return true; + + if (class == BPF_ALU || class == BPF_JMP32) + return false; + + if (class == BPF_LDX) { + if (t != SRC_OP) + return BPF_SIZE(code) == BPF_DW; + /* LDX source must be ptr. */ + return true; + } + + if (class == BPF_STX) { + if (reg->type != SCALAR_VALUE) + return true; + return BPF_SIZE(code) == BPF_DW; + } + + if (class == BPF_LD) { + u8 mode = BPF_MODE(code); + + /* LD_IMM64 */ + if (mode == BPF_IMM) + return true; + + /* Both LD_IND and LD_ABS return 32-bit data. */ + if (t != SRC_OP) + return false; + + /* Implicit ctx ptr. */ + if (regno == BPF_REG_6) + return true; + + /* Explicit source could be any width. */ + return true; + } + + if (class == BPF_ST) + /* The only source register for BPF_ST is a ptr. */ + return true; + + /* Conservatively return true at default. */ + return true; +} + +/* Return TRUE if INSN doesn't have explicit value define. */ +static bool insn_no_def(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + return (class == BPF_JMP || class == BPF_JMP32 || + class == BPF_STX || class == BPF_ST); +} + +/* Return TRUE if INSN has defined any 32-bit value explicitly. */ +static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + if (insn_no_def(insn)) + return false; + + return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +} + +static void mark_insn_zext(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + s32 def_idx = reg->subreg_def; + + if (def_idx == DEF_NOT_SUBREG) + return; + + env->insn_aux_data[def_idx - 1].zext_dst = true; + /* The dst will be zero extended, so won't be sub-register anymore. */ + reg->subreg_def = DEF_NOT_SUBREG; +} + static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg, *regs = state->regs; + bool rw64; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -1178,6 +1369,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, } reg = ®s[regno]; + rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -1188,7 +1380,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, if (regno == BPF_REG_FP) return 0; - return mark_reg_read(env, reg, reg->parent); + if (rw64) + mark_insn_zext(env, reg); + + return mark_reg_read(env, reg, reg->parent, + rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1196,12 +1392,441 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } return 0; } +/* for any branch, call, exit record the history of jmps in the given state */ +static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur) +{ + u32 cnt = cur->jmp_history_cnt; + struct bpf_idx_pair *p; + + cnt++; + p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + if (!p) + return -ENOMEM; + p[cnt - 1].idx = env->insn_idx; + p[cnt - 1].prev_idx = env->prev_insn_idx; + cur->jmp_history = p; + cur->jmp_history_cnt = cnt; + return 0; +} + +/* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. + */ +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) +{ + u32 cnt = *history; + + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; + } else { + i--; + } + return i; +} + +/* For given verifier state backtrack_insn() is called from the last insn to + * the first insn. Its purpose is to compute a bitmask of registers and + * stack slots that needs precision in the parent verifier state. + */ +static int backtrack_insn(struct bpf_verifier_env *env, int idx, + u32 *reg_mask, u64 *stack_mask) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + .private_data = env, + }; + struct bpf_insn *insn = env->prog->insnsi + idx; + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u32 dreg = 1u << insn->dst_reg; + u32 sreg = 1u << insn->src_reg; + u32 spi; + + if (insn->code == 0) + return 0; + if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + verbose(env, "%d: ", idx); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + if (!(*reg_mask & dreg)) + return 0; + if (opcode == BPF_MOV) { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg = sreg + * dreg needs precision after this insn + * sreg needs precision before this insn + */ + *reg_mask &= ~dreg; + *reg_mask |= sreg; + } else { + /* dreg = K + * dreg needs precision after this insn. + * Corresponding register is already marked + * as precise=true in this verifier state. + * No further markings in parent are necessary + */ + *reg_mask &= ~dreg; + } + } else { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg += sreg + * both dreg and sreg need precision + * before this insn + */ + *reg_mask |= sreg; + } /* else dreg += K + * dreg still needs precision before this insn + */ + } + } else if (class == BPF_LDX) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + + /* scalars can only be spilled into stack w/o losing precision. + * Load from any other memory can be zero extended. + * The desire to keep that precision is already indicated + * by 'precise' mark in corresponding register of this state. + * No further tracking necessary. + */ + if (insn->src_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + + /* dreg = *(u64 *)[fp - off] was a fill from the stack. + * that [fp - off] slot contains scalar that needs to be + * tracked with precision + */ + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + *stack_mask |= 1ull << spi; + } else if (class == BPF_STX) { + if (*reg_mask & dreg) + /* stx shouldn't be using _scalar_ dst_reg + * to access memory. It means backtracking + * encountered a case of pointer subtraction. + */ + return -ENOTSUPP; + /* scalars can only be spilled into stack */ + if (insn->dst_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (!(*stack_mask & (1ull << spi))) + return 0; + *stack_mask &= ~(1ull << spi); + *reg_mask |= sreg; + } else if (class == BPF_JMP || class == BPF_JMP32) { + if (opcode == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + return -ENOTSUPP; + /* regular helper call sets R0 */ + *reg_mask &= ~1; + if (*reg_mask & 0x3f) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", *reg_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } else if (opcode == BPF_EXIT) { + return -ENOTSUPP; + } + } else if (class == BPF_LD) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + /* It's ld_imm64 or ld_abs or ld_ind. + * For ld_imm64 no further tracking of precision + * into parent is necessary + */ + if (mode == BPF_IND || mode == BPF_ABS) + /* to be analyzed */ + return -ENOTSUPP; + } else if (class == BPF_ST) { + if (*reg_mask & dreg) + /* likely pointer subtraction */ + return -ENOTSUPP; + } + return 0; +} + +/* the scalar precision tracking algorithm: + * . at the start all registers have precise=false. + * . scalar ranges are tracked as normal through alu and jmp insns. + * . once precise value of the scalar register is used in: + * . ptr + scalar alu + * . if (scalar cond K|scalar) + * . helper_call(.., scalar, ...) where ARG_CONST is expected + * backtrack through the verifier states and mark all registers and + * stack slots with spilled constants that these scalar regisers + * should be precise. + * . during state pruning two registers (or spilled stack slots) + * are equivalent if both are not precise. + * + * Note the verifier cannot simply walk register parentage chain, + * since many different registers and stack slots could have been + * used to compute single precise scalar. + * + * The approach of starting with precise=true for all registers and then + * backtrack to mark a register as not precise when the verifier detects + * that program doesn't care about specific value (e.g., when helper + * takes register as ARG_ANYTHING parameter) is not safe. + * + * It's ok to walk single parentage chain of the verifier states. + * It's possible that this backtracking will go all the way till 1st insn. + * All other branches will be explored for needing precision later. + * + * The backtracking needs to deal with cases like: + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) + * r9 -= r8 + * r5 = r9 + * if r5 > 0x79f goto pc+7 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) + * r5 += 1 + * ... + * call bpf_perf_event_output#25 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO + * + * and this case: + * r6 = 1 + * call foo // uses callee's r6 inside to compute r0 + * r0 += r6 + * if r0 == 0 goto + * + * to track above reg_mask/stack_mask needs to be independent for each frame. + * + * Also if parent's curframe > frame where backtracking started, + * the verifier need to mark registers in both frames, otherwise callees + * may incorrectly prune callers. This is similar to + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") + * + * For now backtracking falls back into conservative marking. + */ +static void mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. + */ + for (; st; st = st->parent) + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (func->stack[j].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + } +} + +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, + int spi) +{ + struct bpf_verifier_state *st = env->cur_state; + int first_idx = st->first_insn_idx; + int last_idx = env->insn_idx; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + u32 reg_mask = regno >= 0 ? 1u << regno : 0; + u64 stack_mask = spi >= 0 ? 1ull << spi : 0; + bool skip_first = true; + bool new_marks = false; + int i, err; + + if (!env->allow_ptr_leaks) + /* backtracking is root only for now */ + return 0; + + func = st->frame[st->curframe]; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (!reg->precise) + new_marks = true; + else + reg_mask = 0; + reg->precise = true; + } + + while (spi >= 0) { + if (func->stack[spi].slot_type[0] != STACK_SPILL) { + stack_mask = 0; + break; + } + reg = &func->stack[spi].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask = 0; + break; + } + if (!reg->precise) + new_marks = true; + else + stack_mask = 0; + reg->precise = true; + break; + } + + if (!new_marks) + return 0; + if (!reg_mask && !stack_mask) + return 0; + for (;;) { + DECLARE_BITMAP(mask, 64); + u32 history = st->jmp_history_cnt; + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + for (i = last_idx;;) { + if (skip_first) { + err = 0; + skip_first = false; + } else { + err = backtrack_insn(env, i, ®_mask, &stack_mask); + } + if (err == -ENOTSUPP) { + mark_all_scalars_precise(env, st); + return 0; + } else if (err) { + return err; + } + if (!reg_mask && !stack_mask) + /* Found assignment(s) into tracked register in this state. + * Since this state is already marked, just return. + * Nothing to be tracked further in the parent state. + */ + return 0; + if (i == first_idx) + break; + i = get_prev_insn_idx(st, i, &history); + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask + * to backtrack. + * It means the backtracking missed the spot where + * particular register was initialized with a constant. + */ + verbose(env, "BUG backtracking idx %d\n", i); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } + st = st->parent; + if (!st) + break; + + new_marks = false; + func = st->frame[st->curframe]; + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); + continue; + } + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* This can happen if backtracking + * is propagating stack precision where + * caller has larger stack frame + * than callee, but backtrack_insn() should + * have returned -ENOTSUPP. + */ + verbose(env, "BUG spi %d stack_size %d\n", + i, func->allocated_stack); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + if (func->stack[i].slot_type[0] != STACK_SPILL) { + stack_mask &= ~(1ull << i); + continue; + } + reg = &func->stack[i].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask &= ~(1ull << i); + continue; + } + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + if (env->log.level & BPF_LOG_LEVEL) { + print_verifier_state(env, func); + verbose(env, "parent %s regs=%x stack=%llx marks\n", + new_marks ? "didn't have" : "already had", + reg_mask, stack_mask); + } + + if (!reg_mask && !stack_mask) + break; + if (!new_marks) + break; + + last_idx = st->last_insn_idx; + first_idx = st->first_insn_idx; + } + return 0; +} + +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + return __mark_chain_precision(env, regno, -1); +} + +static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +{ + return __mark_chain_precision(env, -1, spi); +} + static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -1220,6 +1845,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return true; default: return false; @@ -1232,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } +static bool register_is_const(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); +} + +static void save_register_state(struct bpf_func_state *state, + int spi, struct bpf_reg_state *reg) +{ + int i; + + state->stack[spi].spilled_ptr = *reg; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_SPILL; +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -1241,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - enum bpf_reg_type type; + u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; + struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), state->acquired_refs, true); @@ -1258,27 +1902,48 @@ static int check_stack_write(struct bpf_verifier_env *env, } cur = env->cur_state->frame[env->cur_state->curframe]; - if (value_regno >= 0 && - is_spillable_regtype((type = cur->regs[value_regno].type))) { - + if (value_regno >= 0) + reg = &cur->regs[value_regno]; + + if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + !register_is_null(reg) && env->allow_ptr_leaks) { + if (dst_reg != BPF_REG_FP) { + /* The backtracking logic can only recognize explicit + * stack slot address like [fp - 8]. Other spill of + * scalar via different register has to be conervative. + * Backtrack from here and mark all registers as precise + * that contributed into 'reg' being a constant. + */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + save_register_state(state, spi, reg); + } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { + verbose_linfo(env, insn_idx, "; "); verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && type == PTR_TO_STACK) { + if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - /* save register state */ - state->stack[spi].spilled_ptr = cur->regs[value_regno]; - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + if (!env->allow_ptr_leaks) { + bool sanitize = false; - for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_MISC && - !env->allow_ptr_leaks) { + if (state->stack[spi].slot_type[0] == STACK_SPILL && + register_is_const(&state->stack[spi].spilled_ptr)) + sanitize = true; + for (i = 0; i < BPF_REG_SIZE; i++) + if (state->stack[spi].slot_type[i] == STACK_MISC) { + sanitize = true; + break; + } + if (sanitize) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -1301,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack[spi].slot_type[i] = STACK_SPILL; } + save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -1325,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (value_regno >= 0 && - register_is_null(&cur->regs[value_regno])) + if (reg && register_is_null(reg)) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; type = STACK_ZERO; + } /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) @@ -1344,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + struct bpf_reg_state *reg; u8 *stype; if (reg_state->allocated_stack <= slot) { @@ -1352,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } stype = reg_state->stack[spi].slot_type; + reg = ®_state->stack[spi].spilled_ptr; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); - return -EACCES; + if (reg->type != SCALAR_VALUE) { + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "invalid size of register fill\n"); + return -EACCES; + } + if (value_regno >= 0) { + mark_reg_unknown(env, state->regs, value_regno); + state->regs[value_regno].live |= REG_LIVE_WRITTEN; + } + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + return 0; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { @@ -1367,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + state->regs[value_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); - return 0; + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { int zeros = 0; @@ -1391,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, * so the whole register == const_zero */ __mark_reg_const_zero(&state->regs[value_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[value_regno].precise = true; } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, value_regno); } state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - return 0; } + return 0; } static int check_stack_access(struct bpf_verifier_env *env, @@ -1572,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -1698,6 +2393,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_TCP_SOCK: valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_XDP_SOCK: + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -1862,6 +2560,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_TCP_SOCK: pointer_desc = "tcp_sock "; break; + case PTR_TO_XDP_SOCK: + pointer_desc = "xdp_sock "; + break; default: break; } @@ -2101,6 +2802,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); if (reg_type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; + /* A load of ctx field could have different + * actual load size with the one encoded in the + * insn. When the dst is PTR, it is for sure not + * a sub-register. + */ + regs[value_regno].subreg_def = DEF_NOT_SUBREG; } regs[value_regno].type = reg_type; } @@ -2255,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, slot, spi; + int err, min_off, max_off, i, j, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2343,6 +3050,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + __mark_reg_unknown(&state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + goto mark; + } + err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", @@ -2360,7 +3075,8 @@ mark: * the whole slot to be marked as 'read' */ mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent); + state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); } return update_stack_depth(env, state, min_off); } @@ -2693,6 +3409,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -2741,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases * appear. */ case BPF_MAP_TYPE_CPUMAP: - case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; + case BPF_MAP_TYPE_XSKMAP: + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -3324,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + /* helper call returns 64-bit value. */ + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ @@ -3644,6 +4366,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4121,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); + int err; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -4147,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * This is legal, but we have to reverse our * src/dest handling in computing the range */ + err = mark_chain_precision(env, insn->dst_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); } @@ -4255,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -4265,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -4881,6 +5613,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; + } else if (reg->map_ptr->map_type == + BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -5052,9 +5787,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *this_branch = env->cur_state; struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; - struct bpf_reg_state *dst_reg, *other_branch_regs; + struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; u8 opcode = BPF_OP(insn->code); bool is_jmp32; + int pred = -1; int err; /* Only conditional jumps are expected to reach here. */ @@ -5079,6 +5815,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + src_reg = ®s[insn->src_reg]; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -5094,20 +5831,29 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - if (BPF_SRC(insn->code) == BPF_K) { - int pred = is_branch_taken(dst_reg, insn->imm, opcode, - is_jmp32); - - if (pred == 1) { - /* only follow the goto, ignore fall-through */ - *insn_idx += insn->off; - return 0; - } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go - */ - return 0; - } + if (BPF_SRC(insn->code) == BPF_K) + pred = is_branch_taken(dst_reg, insn->imm, + opcode, is_jmp32); + else if (src_reg->type == SCALAR_VALUE && + tnum_is_const(src_reg->var_off)) + pred = is_branch_taken(dst_reg, src_reg->var_off.value, + opcode, is_jmp32); + if (pred >= 0) { + err = mark_chain_precision(env, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X && !err) + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; + } + if (pred == 1) { + /* only follow the goto, ignore fall-through */ + *insn_idx += insn->off; + return 0; + } else if (pred == 0) { + /* only follow fall-through branch, since + * that's where the program will go + */ + return 0; } other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, @@ -5344,11 +6090,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * Already marked as written above. */ mark_reg_unknown(env, regs, BPF_REG_0); + /* ld_abs load up to 32-bit skb data. */ + regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0; } static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); @@ -5358,10 +6107,15 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; @@ -5388,6 +6142,10 @@ static int check_return_code(struct bpf_verifier_env *env) verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } @@ -5431,14 +6189,33 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + +static struct bpf_verifier_state_list **explored_state( + struct bpf_verifier_env *env, + int idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; +} + +static void init_explored_state(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].prune_point = true; +} /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + bool loop_ok) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -5457,7 +6234,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) if (e == BRANCH) /* mark branch target for state pruning */ - env->explored_states[w] = STATE_LIST_MARK; + init_explored_state(env, w); if (insn_state[w] == 0) { /* tree-edge */ @@ -5468,6 +6245,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + if (loop_ok && env->allow_ptr_leaks) + return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -5519,16 +6298,17 @@ peek_stack: if (opcode == BPF_EXIT) { goto mark_explored; } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5541,26 +6321,31 @@ peek_stack: } /* unconditional jump with single edge */ ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env); + FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). + */ + init_explored_state(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); } else { /* conditional jump with two edges */ - env->explored_states[t] = STATE_LIST_MARK; - ret = push_insn(t, t + 1, FALLTHROUGH, env); + init_explored_state(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5570,7 +6355,7 @@ peek_stack: /* all other non-branch instructions with single * fall-through edge */ - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -6001,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state_list *sl; int i; - sl = env->explored_states[insn]; - if (!sl) - return; - - while (sl != STATE_LIST_MARK) { - if (sl->state.curframe != cur->curframe) + sl = *explored_state(env, insn); + while (sl) { + if (sl->state.branches) + goto next; + if (sl->state.insn_idx != insn || + sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) @@ -6046,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, switch (rold->type) { case SCALAR_VALUE: if (rcur->type == SCALAR_VALUE) { + if (!rold->precise && !rcur->precise) + return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); @@ -6118,6 +6905,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6288,20 +7076,33 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +/* Return 0 if no propagation happened. Return negative error code if error + * happened. Otherwise, return the propagated bit. + */ static int propagate_liveness_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct bpf_reg_state *parent_reg) { + u8 parent_flag = parent_reg->live & REG_LIVE_READ; + u8 flag = reg->live & REG_LIVE_READ; int err; - if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + /* When comes here, read flags of PARENT_REG or REG could be any of + * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need + * of propagation if PARENT_REG has strongest REG_LIVE_READ64. + */ + if (parent_flag == REG_LIVE_READ64 || + /* Or if there is no read flag from REG. */ + !flag || + /* Or if the read flag from REG is the same as PARENT_REG. */ + parent_flag == flag) return 0; - err = mark_reg_read(env, reg, parent_reg); + err = mark_reg_read(env, reg, parent_reg, flag); if (err) return err; - return 0; + return flag; } /* A write screens off any subsequent reads; but write marks come from the @@ -6335,8 +7136,10 @@ static int propagate_liveness(struct bpf_verifier_env *env, for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { err = propagate_liveness_reg(env, &state_reg[i], &parent_reg[i]); - if (err) + if (err < 0) return err; + if (err == REG_LIVE_READ64) + mark_insn_zext(env, &parent_reg[i]); } /* Propagate stack slots. */ @@ -6346,32 +7149,132 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = &state->stack[i].spilled_ptr; err = propagate_liveness_reg(env, state_reg, parent_reg); - if (err) + if (err < 0) return err; } } - return err; + return 0; +} + +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0; + + state = old->frame[old->curframe]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating r%d\n", i); + err = mark_chain_precision(env, i); + if (err < 0) + return err; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE); + err = mark_chain_precision_stack(env, i); + if (err < 0) + return err; + } + return 0; } +static bool states_maybe_looping(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr = cur->curframe; + + if (old->curframe != fr) + return false; + + fold = old->frame[fr]; + fcur = cur->frame[fr]; + for (i = 0; i < MAX_BPF_REG; i++) + if (memcmp(&fold->regs[i], &fcur->regs[i], + offsetof(struct bpf_reg_state, parent))) + return false; + return true; +} + + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; + bool add_new_state = false; - pprev = &env->explored_states[insn_idx]; - sl = *pprev; - - if (!sl) + cur->last_insn_idx = env->prev_insn_idx; + if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ return 0; + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 + * Do not add new state for future pruning if the verifier hasn't seen + * at least 2 jumps and at least 8 instructions. + * This heuristics helps decrease 'total_states' and 'peak_states' metric. + * In tests that amounts to up to 50% reduction into total verifier + * memory consumption and 20% verifier time speedup. + */ + if (env->jmps_processed - env->prev_jmps_processed >= 2 && + env->insn_processed - env->prev_insn_processed >= 8) + add_new_state = true; + + pprev = explored_state(env, insn_idx); + sl = *pprev; + clean_live_states(env, insn_idx, cur); - while (sl != STATE_LIST_MARK) { + while (sl) { + states_cnt++; + if (sl->state.insn_idx != insn_idx) + goto next; + if (sl->state.branches) { + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { + verbose_linfo(env, insn_idx, "; "); + verbose(env, "infinite loop detected at insn %d\n", insn_idx); + return -EINVAL; + } + /* if the verifier is processing a loop, avoid adding new state + * too often, since different loop iterations have distinct + * states and may not help future pruning. + * This threshold shouldn't be too low to make sure that + * a loop with large bound will be rejected quickly. + * The most abusive loop will be: + * r1 += 1 + * if r1 < 1000000 goto pc-2 + * 1M insn_procssed limit / 100 == 10k peak states. + * This threshold shouldn't be too high either, since states + * at the end of the loop are likely to be useful in pruning. + */ + if (env->jmps_processed - env->prev_jmps_processed < 20 && + env->insn_processed - env->prev_insn_processed < 100) + add_new_state = false; + goto miss; + } if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6385,12 +7288,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * this state and will pop a new one. */ err = propagate_liveness(env, &sl->state, cur); + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precsion marks) + * the precision needs to be propagated back in + * the current state. + */ + err = err ? : push_jmp_history(env, cur); + err = err ? : propagate_precision(env, &sl->state); if (err) return err; return 1; } - states_cnt++; - sl->miss_cnt++; +miss: + /* when new state is not going to be added do not increase miss count. + * Otherwise several loop iterations will remove the state + * recorded earlier. The goal of these heuristics is to have + * states from some iterations of the loop (some in the beginning + * and some at the end) to help pruning. + */ + if (add_new_state) + sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, @@ -6402,6 +7320,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ *pprev = sl->next; if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + u32 br = sl->state.branches; + + WARN_ONCE(br, + "BUG live_done but branches_to_explore %d\n", + br); free_verifier_state(&sl->state, false); kfree(sl); env->peak_states--; @@ -6416,6 +7339,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) sl = *pprev; continue; } +next: pprev = &sl->next; sl = *pprev; } @@ -6424,20 +7348,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->max_states_per_insn = states_cnt; if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; + return push_jmp_history(env, cur); + + if (!add_new_state) + return push_jmp_history(env, cur); - /* there were no equivalent states, remember current one. - * technically the current state is not proven to be safe yet, + /* There were no equivalent states, remember the current one. + * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. Since there are no loops, we won't be + * or it will be rejected. When there are no loops the verifier won't be * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit + * again on the way to bpf_exit. + * When looping the sl->state.branches will be > 0 and this state + * will not be considered for equivalence until branches == 0. */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; env->total_states++; env->peak_states++; + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6447,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } - new_sl->next = env->explored_states[insn_idx]; - env->explored_states[insn_idx] = new_sl; + new->insn_idx = insn_idx; + WARN_ONCE(new->branches != 1, + "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + + cur->parent = new; + cur->first_insn_idx = insn_idx; + clear_jmp_history(cur); + new_sl->next = *explored_state(env, insn_idx); + *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just @@ -6456,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * the state of the call instruction (with WRITTEN set), and r0 comes * from callee with its full parentage chain, anyway. */ - for (j = 0; j <= cur->curframe; j++) - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark * their parent and current state never has children yet. Only * explored_states can get read marks.) */ - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; + for (j = 0; j <= cur->curframe; j++) { + for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; + for (i = 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].live = REG_LIVE_NONE; + } /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { @@ -6493,6 +7432,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return false; default: return true; @@ -6524,6 +7464,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; + int prev_insn_idx = -1; env->prev_linfo = NULL; @@ -6532,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env) return -ENOMEM; state->curframe = 0; state->speculative = false; + state->branches = 1; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -6548,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); @@ -6620,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); env->insn_aux_data[env->insn_idx].seen = true; + prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -6738,6 +7682,7 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); + env->jmps_processed++; if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || @@ -6792,7 +7737,6 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - env->prev_insn_idx = env->insn_idx; err = prepare_func_exit(env, &env->insn_idx); if (err) return err; @@ -6823,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &env->prev_insn_idx, + update_branch_counts(env, env->cur_state); + err = pop_stack(env, &prev_insn_idx, &env->insn_idx); if (err < 0) { if (err != -ENOENT) @@ -7126,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - u32 off, u32 cnt) +static int adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_prog *new_prog, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn *insn = new_prog->insnsi; + u32 prog_len; int i; + /* aux info at OFF always needs adjustment, no matter fast path + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the + * original insn at old prog. + */ + old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + if (cnt == 1) return 0; + prog_len = new_prog->len; new_data = vzalloc(array_size(prog_len, sizeof(struct bpf_insn_aux_data))); if (!new_data) @@ -7141,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - for (i = off; i < off + cnt - 1; i++) + for (i = off; i < off + cnt - 1; i++) { new_data[i].seen = true; + new_data[i].zext_dst = insn_has_def32(env, insn + i); + } env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -7175,7 +8131,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of env->insn_aux_data[off].orig_idx); return NULL; } - if (adjust_insn_aux_data(env, new_prog->len, off, len)) + if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); return new_prog; @@ -7439,6 +8395,84 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } +static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, + const union bpf_attr *attr) +{ + struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i, patch_len, delta = 0, len = env->prog->len; + struct bpf_insn *insns = env->prog->insnsi; + struct bpf_prog *new_prog; + bool rnd_hi32; + + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; + zext_patch[1] = BPF_ZEXT_REG(0); + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); + for (i = 0; i < len; i++) { + int adj_idx = i + delta; + struct bpf_insn insn; + + insn = insns[adj_idx]; + if (!aux[adj_idx].zext_dst) { + u8 code, class; + u32 imm_rnd; + + if (!rnd_hi32) + continue; + + code = insn.code; + class = BPF_CLASS(code); + if (insn_no_def(&insn)) + continue; + + /* NOTE: arg "reg" (the fourth one) is only used for + * BPF_STX which has been ruled out in above + * check, it is safe to pass NULL here. + */ + if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (class == BPF_LD && + BPF_MODE(code) == BPF_IMM) + i++; + continue; + } + + /* ctx load could be transformed into wider load. */ + if (class == BPF_LDX && + aux[adj_idx].ptr_type == PTR_TO_CTX) + continue; + + imm_rnd = get_random_int(); + rnd_hi32_patch[0] = insn; + rnd_hi32_patch[1].imm = imm_rnd; + rnd_hi32_patch[3].dst_reg = insn.dst_reg; + patch = rnd_hi32_patch; + patch_len = 4; + goto apply_patch_buffer; + } + + if (!bpf_jit_needs_zext()) + continue; + + zext_patch[0] = insn; + zext_patch[1].dst_reg = insn.dst_reg; + zext_patch[1].src_reg = insn.dst_reg; + patch = zext_patch; + patch_len = 2; +apply_patch_buffer: + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + insns = new_prog->insnsi; + aux = env->insn_aux_data; + delta += patch_len - 1; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -7537,6 +8571,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_TCP_SOCK: convert_ctx_access = bpf_tcp_sock_convert_ctx_access; break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; default: continue; } @@ -8126,16 +9163,15 @@ static void free_states(struct bpf_verifier_env *env) if (!env->explored_states) return; - for (i = 0; i < env->prog->len; i++) { + for (i = 0; i < state_htab_size(env); i++) { sl = env->explored_states[i]; - if (sl) - while (sl != STATE_LIST_MARK) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } } kvfree(env->explored_states); @@ -8235,7 +9271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kvcalloc(env->prog->len, + env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -8290,6 +9326,15 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + /* do 32-bit optimization after insn patching has done so those patched + * insns could be handled correctly. + */ + if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret + : false; + } + if (ret == 0) ret = fixup_call_args(env); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 686d244e798d..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -17,8 +17,8 @@ struct xsk_map { static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - int cpu, err = -EINVAL; struct xsk_map *m; + int cpu, err; u64 cost; if (!capable(CAP_NET_ADMIN)) @@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_m; - - m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.pages); + err = bpf_map_charge_init(&m->map.memory, cost); if (err) goto free_m; @@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) m->flush_list = alloc_percpu(struct list_head); if (!m->flush_list) - goto free_m; + goto free_charge; for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); @@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) free_percpu: free_percpu(m->flush_list); +free_charge: + bpf_map_charge_finish(&m->map.memory); free_m: kfree(m); return ERR_PTR(err); @@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ return ERR_PTR(-EOPNOTSUPP); } @@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cdbeff87fa99..300b0c416341 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4226,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -5005,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - - cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5532,6 +5531,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); + cgroup_bpf_offline(cgrp); + /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6240,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd) } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); +static u64 power_of_ten(int power) +{ + u64 v = 1; + while (power--) + v *= 10; + return v; +} + +/** + * cgroup_parse_float - parse a floating number + * @input: input string + * @dec_shift: number of decimal digits to shift + * @v: output + * + * Parse a decimal floating point number in @input and store the result in + * @v with decimal point right shifted @dec_shift times. For example, if + * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. + * Returns 0 on success, -errno otherwise. + * + * There's nothing cgroup specific about this function except that it's + * currently the only user. + */ +int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) +{ + s64 whole, frac = 0; + int fstart = 0, fend = 0, flen; + + if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) + return -EINVAL; + if (frac < 0) + return -EINVAL; + + flen = fend > fstart ? fend - fstart : 0; + if (flen < dec_shift) + frac *= power_of_ten(dec_shift - flen); + else + frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); + + *v = whole * power_of_ten(dec_shift) + frac; + return 0; +} + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. @@ -6278,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) * Don't use cgroup_get_live(). */ cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); return; } @@ -6289,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { skcd->val = (unsigned long)cset->dfl_cgrp; + cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); @@ -6299,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) void cgroup_sk_free(struct sock_cgroup_data *skcd) { - cgroup_put(sock_cgroup_ptr(skcd)); + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + cgroup_bpf_put(cgrp); + cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ @@ -6402,4 +6450,5 @@ static int __init cgroup_sysfs_init(void) return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); + #endif /* CONFIG_SYSFS */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a1590e244f5f..863e434a6020 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this diff --git a/kernel/cred.c b/kernel/cred.c index c73a87a4df13..f9a0ce66c9c3 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -170,6 +170,11 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); + +#ifdef CONFIG_KEYS_REQUEST_CACHE + key_put(current->cached_requested_key); + current->cached_requested_key = NULL; +#endif } /** @@ -323,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; +#ifdef CONFIG_KEYS_REQUEST_CACHE + p->cached_requested_key = NULL; +#endif + if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && @@ -460,9 +469,9 @@ int commit_creds(struct cred *new) /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) - key_fsuid_changed(task); + key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) - key_fsgid_changed(task); + key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index b2a87905846d..bfc0c17f2a3d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +/** + * dma_alloc_contiguous() - allocate contiguous pages + * @dev: Pointer to device for which the allocation is performed. + * @size: Requested allocation size. + * @gfp: Allocation flags. + * + * This function allocates contiguous memory buffer for specified device. It + * first tries to use device specific contiguous memory area if available or + * the default global one, then tries a fallback allocation of normal pages. + * + * Note that it byapss one-page size of allocations from the global area as + * the addresses within one page are always contiguous, so there is no need + * to waste CMA pages for that kind; it also helps reduce fragmentations. + */ +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) +{ + int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; + size_t align = get_order(PAGE_ALIGN(size)); + struct page *page = NULL; + struct cma *cma = NULL; + + if (dev && dev->cma_area) + cma = dev->cma_area; + else if (count > 1) + cma = dma_contiguous_default_area; + + /* CMA can be used only in the context which permits sleeping */ + if (cma && gfpflags_allow_blocking(gfp)) { + align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); + page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); + } + + /* Fallback allocation of normal pages */ + if (!page) + page = alloc_pages_node(node, gfp, align); + return page; +} + +/** + * dma_free_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @page: Pointer to the allocated pages. + * @size: Size of allocated pages. + * + * This function releases memory allocated by dma_alloc_contiguous(). As the + * cma_release returns false when provided pages do not belong to contiguous + * area and true otherwise, this function then does a fallback __free_pages() + * upon a false-return. + */ +void dma_free_contiguous(struct device *dev, struct page *page, size_t size) +{ + if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) + __free_pages(page, get_order(size)); +} + /* * Support for reserved memory regions defined in device tree */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 2c2772e9702a..b90e1aede743 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int page_order = get_order(size); struct page *page = NULL; u64 phys_mask; @@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); again: - /* CMA can be used only in the context which permits sleeping */ - if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, - gfp & __GFP_NOWARN); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_release_from_contiguous(dev, page, count); - page = NULL; - } - } - if (!page) - page = alloc_pages_node(dev_to_node(dev), gfp, page_order); - + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __free_pages(page, page_order); + dma_free_contiguous(dev, page, size); page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && @@ -151,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* remove any dirty cache lines on the kernel alias */ + if (!PageHighMem(page)) + arch_dma_prep_coherent(page, size); + /* return the page pointer as the opaque cookie */ + return page; + } + if (PageHighMem(page)) { /* * Depending on the cma= arguments and per-arch setup - * dma_alloc_from_contiguous could return highmem pages. + * dma_alloc_contiguous could return highmem pages. * Without remapping there is no way to return them here, * so log an error and fail. */ @@ -171,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, *dma_handle = phys_to_dma(dev, page_to_phys(page)); } memset(ret, 0, size); + + if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) { + arch_dma_prep_coherent(page, size); + ret = uncached_kernel_address(ret); + } + return ret; } void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - if (!dma_release_from_contiguous(dev, page, count)) - __free_pages(page, get_order(size)); + dma_free_contiguous(dev, page, size); } void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, @@ -187,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* cpu_addr is a struct page cookie, not a kernel address */ + __dma_direct_free_pages(dev, size, cpu_addr); + return; + } + if (force_dma_unencrypted()) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + + if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) + cpu_addr = cached_kernel_address(cpu_addr); __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) + if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); } @@ -203,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) + if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f7afdadb6770..1f628e7ac709 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -317,6 +317,12 @@ void arch_dma_set_mask(struct device *dev, u64 mask); int dma_set_mask(struct device *dev, u64 mask) { + /* + * Truncate the mask to the actually supported dma_addr_t width to + * avoid generating unsupportable addresses. + */ + mask = (dma_addr_t)mask; + if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; @@ -330,6 +336,12 @@ EXPORT_SYMBOL(dma_set_mask); #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { + /* + * Truncate the mask to the actually supported dma_addr_t width to + * avoid generating unsupportable addresses. + */ + mask = (dma_addr_t)mask; + if (!dma_supported(dev, mask)) return -EIO; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 7a723194ecbe..a594aec07882 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -158,6 +158,9 @@ out: bool dma_in_atomic_pool(void *start, size_t size) { + if (unlikely(!atomic_pool)) + return false; + return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); } @@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, size = PAGE_ALIGN(size); - if (!gfpflags_allow_blocking(flags) && - !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) { + if (!gfpflags_allow_blocking(flags)) { ret = dma_alloc_from_pool(size, &page, flags); if (!ret) return NULL; @@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - ret = page; /* opaque cookie */ - goto done; - } - /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), @@ -237,10 +234,7 @@ done: void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - /* vaddr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, vaddr); - } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { + if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { phys_addr_t phys = dma_to_phys(dev, dma_handle); struct page *page = pfn_to_page(__phys_to_pfn(phys)); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 13f0cb080a4d..62fa5a82a065 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -696,29 +696,12 @@ bool is_swiotlb_active(void) static int __init swiotlb_create_debugfs(void) { - struct dentry *d_swiotlb_usage; - struct dentry *ent; - - d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL); - - if (!d_swiotlb_usage) - return -ENOMEM; - - ent = debugfs_create_ulong("io_tlb_nslabs", 0400, - d_swiotlb_usage, &io_tlb_nslabs); - if (!ent) - goto fail; - - ent = debugfs_create_ulong("io_tlb_used", 0400, - d_swiotlb_usage, &io_tlb_used); - if (!ent) - goto fail; + struct dentry *root; + root = debugfs_create_dir("swiotlb", NULL); + debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); + debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); return 0; - -fail: - debugfs_remove_recursive(d_swiotlb_usage); - return -ENOMEM; } late_initcall(swiotlb_create_debugfs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 29e5f7880a4b..eea9d52b010c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2553,6 +2553,9 @@ unlock: return ret; } +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx); + /* * Attach a performance event to a context. * @@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); + WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); + if (event->cpu != -1) event->cpu = cpu; @@ -2952,6 +2957,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; + /* + * If we had been multiplexing, no rotations are necessary, now no events + * are active. + */ + ctx->rotate_necessary = 0; + perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) @@ -3319,10 +3330,13 @@ static int flexible_sched_in(struct perf_event *event, void *data) return 0; if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) - list_add_tail(&event->active_list, &sid->ctx->flexible_active); - else + int ret = group_sched_in(event, sid->cpuctx, sid->ctx); + if (ret) { sid->can_add_hw = 0; + sid->ctx->rotate_necessary = 1; + return 0; + } + list_add_tail(&event->active_list, &sid->ctx->flexible_active); } return 0; @@ -3690,24 +3704,17 @@ ctx_first_active(struct perf_event_context *ctx) static bool perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event *cpu_event = NULL, *task_event = NULL; - bool cpu_rotate = false, task_rotate = false; - struct perf_event_context *ctx = NULL; + struct perf_event_context *task_ctx = NULL; + int cpu_rotate, task_rotate; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - if (cpuctx->ctx.nr_events) { - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - cpu_rotate = true; - } - - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - if (ctx->nr_events != ctx->nr_active) - task_rotate = true; - } + cpu_rotate = cpuctx->ctx.rotate_necessary; + task_ctx = cpuctx->task_ctx; + task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; @@ -3716,7 +3723,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(ctx); + task_event = ctx_first_active(task_ctx); if (cpu_rotate) cpu_event = ctx_first_active(&cpuctx->ctx); @@ -3724,17 +3731,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ - if (task_event || (ctx && cpu_event)) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_ctx && cpu_event)) + ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); if (cpu_event) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (task_event) - rotate_ctx(ctx, task_event); + rotate_ctx(task_ctx, task_event); if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, ctx, current); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4358,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return 0; /* @@ -4389,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ @@ -4409,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) return false; } -/* Called under the same ctx::mutex as perf_install_in_context() */ static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + lockdep_assert_held(&ctx->mutex); + + if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { @@ -4463,12 +4471,20 @@ static void _free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - if (event->ctx) - put_ctx(event->ctx); - + /* + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. + */ if (event->hw.target) put_task_struct(event->hw.target); + /* + * perf_event_free_task() relies on put_ctx() being 'last', in particular + * all task references must be cleaned up. + */ + if (event->ctx) + put_ctx(event->ctx); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -4648,8 +4664,17 @@ again: mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { + void *var = &child->ctx->refcount; + list_del(&child->child_list); free_event(child); + + /* + * Wake any perf_event_free_task() waiting for this event to be + * freed. + */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); } no_ctx: @@ -8535,9 +8560,9 @@ static int perf_tp_event_match(struct perf_event *event, if (event->hw.state & PERF_HES_STOPPED) return 0; /* - * All tracepoints are from kernel-space. + * If exclude_kernel, only trace user-space tracepoints (uprobes) */ - if (event->attr.exclude_kernel) + if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) @@ -9877,6 +9902,12 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto del_dev; + if (pmu->attr_update) + ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); + + if (ret) + goto del_dev; + out: return ret; @@ -10922,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } - if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { - err = -EBUSY; - goto err_context; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -11014,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open, move_group = 0; } } + + /* + * Failure to create exclusive events returns -EBUSY. + */ + err = -EBUSY; + if (!exclusive_event_installable(group_leader, ctx)) + goto err_locked; + + for_each_sibling_event(sibling, group_leader) { + if (!exclusive_event_installable(sibling, ctx)) + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } @@ -11050,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open, * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { - /* exclusive and group stuff are assumed mutually exclusive */ - WARN_ON_ONCE(move_group); - err = -EBUSY; goto err_locked; } @@ -11519,11 +11554,11 @@ static void perf_free_event(struct perf_event *event, } /* - * Free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. + * Free a context as created by inheritance by perf_event_init_task() below, + * used by fork() in case of fail. * - * Not all locks are strictly required, but take them anyway to be nice and - * help out with the lockdep assertions. + * Even though the task has never lived, the context and events have been + * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { @@ -11553,7 +11588,23 @@ void perf_event_free_task(struct task_struct *task) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); - put_ctx(ctx); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 97c367f0a9aa..84fa00497c49 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2112,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs) sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); - force_sig(SIGILL, current); + force_sig(SIGILL); } @@ -2228,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); - force_sig(SIGILL, current); + force_sig(SIGILL); } } diff --git a/kernel/fail_function.c b/kernel/fail_function.c index feb80712b913..63b349168da7 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, "%llx\n"); -static int fei_debugfs_add_attr(struct fei_attr *attr) +static void fei_debugfs_add_attr(struct fei_attr *attr) { struct dentry *dir; dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); - if (!dir) - return -ENOMEM; - - if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { - debugfs_remove_recursive(dir); - return -ENOMEM; - } - return 0; + debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops); } static void fei_debugfs_remove_attr(struct fei_attr *attr) @@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = register_kprobe(&attr->kp); if (!ret) - ret = fei_debugfs_add_attr(attr); + fei_debugfs_add_attr(attr); if (ret < 0) fei_attr_remove(attr); else { @@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void) return PTR_ERR(dir); /* injectable attribute is just a symlink of error_inject/list */ - if (!debugfs_create_symlink("injectable", dir, - "../error_injection/list")) - goto error; + debugfs_create_symlink("injectable", dir, "../error_injection/list"); - if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) - goto error; + debugfs_create_file("inject", 0600, dir, NULL, &fei_ops); fei_debugfs_dir = dir; return 0; -error: - debugfs_remove_recursive(dir); - return -ENOMEM; } late_initcall(fei_debugfs_init); diff --git a/kernel/fork.c b/kernel/fork.c index 847dd147b068..d8ae0f1b4148 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); @@ -1711,8 +1710,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) } #endif +/* + * Poll support for process exit notification. + */ +static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct task_struct *task; + struct pid *pid = file->private_data; + int poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + /* + * Inform pollers only when the whole thread group exits. + * If the thread group leader exits before all other threads in the + * group, then poll(2) should block, similar to the wait(2) family. + */ + if (!task || (task->exit_state && thread_group_empty(task))) + poll_flags = POLLIN | POLLRDNORM; + rcu_read_unlock(); + + return poll_flags; +} + const struct file_operations pidfd_fops = { .release = pidfd_release, + .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif @@ -1742,20 +1767,16 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) * flags). The actual kick-off is left to the caller. */ static __latent_entropy struct task_struct *copy_process( - unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls, - int node) + int node, + struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; + u64 clone_flags = args->flags; /* * Don't allow sharing the root directory with processes in a different @@ -1805,14 +1826,11 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { /* - * - CLONE_PARENT_SETTID is useless for pidfds and also - * parent_tidptr is used to return pidfds. * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. * - CLONE_THREAD is blocked until someone really needs it. */ - if (clone_flags & - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) return ERR_PTR(-EINVAL); } @@ -1845,11 +1863,11 @@ static __latent_entropy struct task_struct *copy_process( * p->set_child_tid which is (ab)used as a kthread's data pointer for * kernel threads (PF_KTHREAD). */ - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); @@ -2005,7 +2023,8 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); + retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, + args->tls); if (retval) goto bad_fork_cleanup_io; @@ -2040,7 +2059,7 @@ static __latent_entropy struct task_struct *copy_process( } get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, parent_tidptr); + retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } @@ -2083,7 +2102,7 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PARENT) p->exit_signal = current->group_leader->exit_signal; else - p->exit_signal = (clone_flags & CSIGNAL); + p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2296,8 +2315,11 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, - cpu_to_node(cpu)); + struct kernel_clone_args args = { + .flags = CLONE_VM, + }; + + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); @@ -2317,13 +2339,9 @@ struct mm_struct *copy_init_mm(void) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long _do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - unsigned long tls) +long _do_fork(struct kernel_clone_args *args) { + u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; @@ -2339,7 +2357,7 @@ long _do_fork(unsigned long clone_flags, if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; - else if ((clone_flags & CSIGNAL) != SIGCHLD) + else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; @@ -2348,8 +2366,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) @@ -2365,7 +2382,7 @@ long _do_fork(unsigned long clone_flags, nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); + put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; @@ -2388,6 +2405,16 @@ long _do_fork(unsigned long clone_flags, return nr; } +bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) +{ + /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ + if ((kargs->flags & CLONE_PIDFD) && + (kargs->flags & CLONE_PARENT_SETTID)) + return false; + + return true; +} + #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. */ @@ -2397,8 +2424,20 @@ long do_fork(unsigned long clone_flags, int __user *parent_tidptr, int __user *child_tidptr) { - return _do_fork(clone_flags, stack_start, stack_size, - parent_tidptr, child_tidptr, 0); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = stack_start, + .stack_size = stack_size, + }; + + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + + return _do_fork(&args); } #endif @@ -2407,15 +2446,25 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return _do_fork(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2426,8 +2475,12 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = CLONE_VFORK | CLONE_VM, + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); } #endif @@ -2455,7 +2508,111 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, unsigned long, tls) #endif { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls, + }; + + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + + return _do_fork(&args); +} +#endif + +#ifdef __ARCH_WANT_SYS_CLONE3 +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, + struct clone_args __user *uargs, + size_t size) +{ + struct clone_args args; + + if (unlikely(size > PAGE_SIZE)) + return -E2BIG; + + if (unlikely(size < sizeof(struct clone_args))) + return -EINVAL; + + if (unlikely(!access_ok(uargs, size))) + return -EFAULT; + + if (size > sizeof(struct clone_args)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uargs + sizeof(struct clone_args); + end = (void __user *)uargs + size; + + for (; addr < end; addr++) { + if (get_user(val, addr)) + return -EFAULT; + if (val) + return -E2BIG; + } + + size = sizeof(struct clone_args); + } + + if (copy_from_user(&args, uargs, size)) + return -EFAULT; + + *kargs = (struct kernel_clone_args){ + .flags = args.flags, + .pidfd = u64_to_user_ptr(args.pidfd), + .child_tid = u64_to_user_ptr(args.child_tid), + .parent_tid = u64_to_user_ptr(args.parent_tid), + .exit_signal = args.exit_signal, + .stack = args.stack, + .stack_size = args.stack_size, + .tls = args.tls, + }; + + return 0; +} + +static bool clone3_args_valid(const struct kernel_clone_args *kargs) +{ + /* + * All lower bits of the flag word are taken. + * Verify that no other unknown flags are passed along. + */ + if (kargs->flags & ~CLONE_LEGACY_FLAGS) + return false; + + /* + * - make the CLONE_DETACHED bit reuseable for clone3 + * - make the CSIGNAL bits reuseable for clone3 + */ + if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) + return false; + + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && + kargs->exit_signal) + return false; + + return true; +} + +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) +{ + int err; + + struct kernel_clone_args kargs; + + err = copy_clone_args_from_user(&kargs, uargs, size); + if (err) + return err; + + if (!clone3_args_valid(&kargs)) + return -EINVAL; + + return _do_fork(&kargs); } #endif diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 6e40ff6be083..e5eb5ea7ea59 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -64,7 +64,6 @@ struct gcov_node { static const char objtree[] = OBJTREE; static const char srctree[] = SRCTREE; static struct gcov_node root_node; -static struct dentry *reset_dentry; static LIST_HEAD(all_head); static DEFINE_MUTEX(node_lock); @@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent) goto out_err; node->links[i] = debugfs_create_symlink(deskew(basename), parent, target); - if (!node->links[i]) - goto out_err; kfree(target); } @@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent, parent->dentry, node, &gcov_data_fops); } else node->dentry = debugfs_create_dir(node->name, parent->dentry); - if (!node->dentry) { - pr_warn("could not create file\n"); - kfree(node); - return NULL; - } if (info) add_links(node, parent->dentry); list_add(&node->list, &parent->children); @@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) /* Create debugfs entries. */ static __init int gcov_fs_init(void) { - int rc = -EIO; - init_node(&root_node, NULL, NULL, NULL); /* * /sys/kernel/debug/gcov will be parent for the reset control file * and all profiling files. */ root_node.dentry = debugfs_create_dir("gcov", NULL); - if (!root_node.dentry) - goto err_remove; /* * Create reset file which resets all profiling counts when written * to. */ - reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, - NULL, &gcov_reset_fops); - if (!reset_dentry) - goto err_remove; + debugfs_create_file("reset", 0600, root_node.dentry, NULL, + &gcov_reset_fops); /* Replay previous events to get our fs hierarchy up-to-date. */ gcov_enable_events(); return 0; - -err_remove: - pr_err("init failed\n"); - debugfs_remove(root_node.dentry); - - return rc; } device_initcall(gcov_fs_init); diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 9a34e1d9bd7f..9ff449888d9c 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -4,24 +4,12 @@ # This script generates an archive consisting of kernel headers # for CONFIG_IKHEADERS. set -e -spath="$(dirname "$(readlink -f "$0")")" -kroot="$spath/.." +sfile="$(readlink -f "$0")" outdir="$(pwd)" tarfile=$1 cpio_dir=$outdir/$tarfile.tmp -# Script filename relative to the kernel source root -# We add it to the archive because it is small and any changes -# to this script will also cause a rebuild of the archive. -sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" - -src_file_list=" -include/ -arch/$SRCARCH/include/ -$sfile -" - -obj_file_list=" +dir_list=" include/ arch/$SRCARCH/include/ " @@ -33,33 +21,29 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter -# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter +# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter +# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us # ignore them for md5 calculation. -pushd $kroot > /dev/null -src_files_md5="$(find $src_file_list -type f | +pushd $srctree > /dev/null +src_files_md5="$(find $dir_list -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | - grep -v "include/config/auto.conf" | - grep -v "include/config/auto.conf.cmd" | - grep -v "include/config/tristate.conf" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" + xargs ls -l | md5sum | cut -d ' ' -f1)" popd > /dev/null -obj_files_md5="$(find $obj_file_list -type f | +obj_files_md5="$(find $dir_list -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | - grep -v "include/config/auto.conf" | - grep -v "include/config/auto.conf.cmd" | - grep -v "include/config/tristate.conf" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" - + xargs ls -l | md5sum | cut -d ' ' -f1)" +# Any changes to this script will also cause a rebuild of the archive. +this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then exit fi @@ -71,16 +55,16 @@ fi rm -rf $cpio_dir mkdir $cpio_dir -pushd $kroot > /dev/null -for f in $src_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; +pushd $srctree > /dev/null +for f in $dir_list; + do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir popd > /dev/null # The second CPIO can complain if files already exist which can # happen with out of tree builds. Just silence CPIO for now. -for f in $obj_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; +for f in $dir_list; + do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 # Remove comments except SDPX lines @@ -91,6 +75,7 @@ tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 rm -rf $cpio_dir diff --git a/kernel/iomem.c b/kernel/iomem.c index 93c264444510..62c92e43aa0d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap); void memunmap(void *addr) { - if (is_vmalloc_addr(addr)) + if (is_ioremap_addr(addr)) iounmap((void __iomem *) addr); } EXPORT_SYMBOL(memunmap); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ef7b951a8087..b8cc032d5620 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -196,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, return ret; image->kernel_buf_len = size; - /* IMA needs to pass the measurement list to the next kernel. */ - ima_add_kexec_buffer(image); - /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); @@ -239,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = -EINVAL; goto out; } + + ima_kexec_cmdline(image->cmdline_buf, + image->cmdline_buf_len - 1); } + /* IMA needs to pass the measurement list to the next kernel. */ + ima_add_kexec_buffer(image); + /* Call arch image load handlers */ ldata = arch_kexec_kernel_image_load(image); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 445337c107e0..9f5433a52488 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2570,33 +2570,20 @@ static const struct file_operations fops_kp = { static int __init debugfs_kprobe_init(void) { - struct dentry *dir, *file; + struct dentry *dir; unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); - if (!dir) - return -ENOMEM; - file = debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); - if (!file) - goto error; + debugfs_create_file("list", 0400, dir, NULL, + &debugfs_kprobes_operations); - file = debugfs_create_file("enabled", 0600, dir, - &value, &fops_kp); - if (!file) - goto error; + debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); - file = debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); - if (!file) - goto error; + debugfs_create_file("blacklist", 0400, dir, NULL, + &debugfs_kprobe_blacklist_ops); return 0; - -error: - debugfs_remove(dir); - return -ENOMEM; } late_initcall(debugfs_kprobe_init); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index abb2a4a2cbb2..cdf318d86dd6 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -247,7 +247,6 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) int ret, nr_entries; ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); - WARN_ON_ONCE(ret == -ENOSYS); if (ret < 0) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d has an unreliable stack\n", @@ -281,11 +280,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) */ static bool klp_try_switch_task(struct task_struct *task) { + static char err_buf[STACK_ERR_BUF_SIZE]; struct rq *rq; struct rq_flags flags; int ret; bool success = false; - char err_buf[STACK_ERR_BUF_SIZE]; err_buf[0] = '\0'; @@ -294,6 +293,13 @@ static bool klp_try_switch_task(struct task_struct *task) return true; /* + * For arches which don't have reliable stack traces, we have to rely + * on other methods (e.g., switching tasks at kernel exit). + */ + if (!klp_have_reliable_stack()) + return false; + + /* * Now try to check the stack for any to-be-patched or to-be-unpatched * functions. If all goes well, switch the task to the target patch * state. @@ -328,7 +334,6 @@ done: pr_debug("%s", err_buf); return success; - } /* diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 9c49ec645d8b..65b6a1600c8f 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -210,6 +210,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, sum_forward_deps = 0; +#ifdef CONFIG_PROVE_LOCKING list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) @@ -241,13 +242,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; -#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); -#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif + +#endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0c601ae072b3..edd1c082dbf5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -16,7 +16,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/locking/mutex-design.txt. + * Also see Documentation/locking/mutex-design.rst. */ #include <linux/mutex.h> #include <linux/ww_mutex.h> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 38fbf9fa7f1b..fa83d36e30c6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -9,7 +9,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/locking/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.rst for details. */ #include <linux/spinlock.h> #include <linux/export.h> diff --git a/kernel/memremap.c b/kernel/memremap.c index 6e1970719dc2..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,41 +11,39 @@ #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/xarray.h> -#include <linux/hmm.h> static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); +static atomic_t devmap_managed_enable; + +static void devmap_managed_enable_put(void *data) { - struct page *page = device_private_entry_to_page(entry); - struct hmm_devmem *devmem; + if (atomic_dec_and_test(&devmap_managed_enable)) + static_branch_disable(&devmap_managed_key); +} - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + if (!pgmap->ops || !pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return -EINVAL; + } - /* - * The page_fault() callback must migrate page back to system memory - * so that CPU can access it. This might fail for various reasons - * (device issue, device was unsafely unplugged, ...). When such - * error conditions happen, the callback must return VM_FAULT_SIGBUS. - * - * Note that because memory cgroup charges are accounted to the device - * memory, this should never fail because of memory restrictions (but - * allocation of regular system page might still fail because we are - * out of memory). - * - * There is a more in-depth description of what that callback can and - * cannot do, in include/linux/memremap.h - */ - return devmem->page_fault(vma, addr, page, flags, pmdp); + if (atomic_inc_return(&devmap_managed_enable) == 1) + static_branch_enable(&devmap_managed_key); + return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); } -#endif /* CONFIG_DEVICE_PRIVATE */ +#else +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + return -EINVAL; +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ static void pgmap_array_delete(struct resource *res) { @@ -56,14 +54,8 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; - struct vmem_altmap *altmap = &pgmap->altmap; - unsigned long pfn; - - pfn = res->start >> PAGE_SHIFT; - if (pgmap->altmap_valid) - pfn += vmem_altmap_offset(altmap); - return pfn; + return (pgmap->res.start >> PAGE_SHIFT) + + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) @@ -83,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -92,10 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->cleanup(pgmap->ref); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -111,7 +121,7 @@ static void devm_memremap_pages_release(void *data) align_size >> PAGE_SHIFT, NULL); } else { arch_remove_memory(nid, align_start, align_size, - pgmap->altmap_valid ? &pgmap->altmap : NULL); + pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(align_start), align_size); } mem_hotplug_done(); @@ -122,20 +132,29 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * 1/ At a minimum the res and type members of @pgmap must be initialized * by the caller before passing it to this function * - * 2/ The altmap field may optionally be initialized, in which case altmap_valid - * must be set to true + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * - * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped - * at devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -144,22 +163,66 @@ static void devm_memremap_pages_release(void *data) void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; - struct vmem_altmap *altmap = pgmap->altmap_valid ? - &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap */ - .altmap = altmap, + .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + bool need_devmap_managed = true; + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_FS_DAX: + if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || + IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + WARN(1, "File system DAX not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_PCI_P2PDMA: + need_devmap_managed = false; + break; + default: + WARN(1, "Invalid pgmap type %d\n", pgmap->type); + break; + } + + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } + } - if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); + if (need_devmap_managed) { + error = devmap_managed_enable_get(dev, pgmap); + if (error) + return ERR_PTR(error); } align_start = res->start & ~(SECTION_SIZE - 1); @@ -241,7 +304,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); + align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); } mem_hotplug_done(); @@ -271,9 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); - + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -287,7 +349,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ - return altmap->reserve + altmap->free; + if (altmap) + return altmap->reserve + altmap->free; + return 0; } void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) @@ -329,28 +393,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_enable; - -/* - * Toggle the static key for ->page_free() callbacks when dev_pagemap - * pages go idle. - */ -void dev_pagemap_get_ops(void) -{ - if (atomic_inc_return(&devmap_enable) == 1) - static_branch_enable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); - -void dev_pagemap_put_ops(void) -{ - if (atomic_dec_and_test(&devmap_enable)) - static_branch_disable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); - void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -366,7 +408,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page); } else if (!count) __put_page(page); } diff --git a/kernel/panic.c b/kernel/panic.c index 4d9f55bf7d38..057540b6eee9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -372,7 +372,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { /** * print_tainted - return a string to represent the kernel taint state. * - * For individual taint flag meanings, see Documentation/sysctl/kernel.txt + * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst * * The string is overwritten by the next call to print_tainted(), * but is always NULL terminated. diff --git a/kernel/pid.c b/kernel/pid.c index e5cad0c7d5dd..0a9f2e437217 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -37,12 +37,14 @@ #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> -#include <linux/proc_fs.h> +#include <linux/refcount.h> +#include <linux/anon_inodes.h> +#include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> struct pid init_struct_pid = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, @@ -106,8 +108,7 @@ void put_pid(struct pid *pid) return; ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -210,10 +211,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) } get_pid_ns(ns); - atomic_set(&pid->count, 1); + refcount_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) @@ -451,6 +454,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + +/** + * pidfd_open() - Open new pid file descriptor. + * + * @pid: pid for which to retrieve a pidfd + * @flags: flags to pass + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set for + * the process identified by @pid. Currently, the process identified by + * @pid must be a thread-group leader. This restriction currently exists + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot + * be used with CLONE_THREAD) and pidfd polling (only supports thread group + * leaders). + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) +{ + int fd, ret; + struct pid *p; + + if (flags) + return -EINVAL; + + if (pid <= 0) + return -EINVAL; + + p = find_get_pid(pid); + if (!p) + return -ESRCH; + + ret = 0; + rcu_read_lock(); + if (!pid_task(p, PIDTYPE_TGID)) + ret = -EINVAL; + rcu_read_unlock(); + + fd = ret ?: pidfd_create(p); + put_pid(p); + return fd; +} + void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f54bc7cb6c2d..6d726cef241c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -326,7 +326,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) } read_lock(&tasklist_lock); - force_sig(SIGKILL, pid_ns->child_reaper); + send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ff8592ddedee..d3667b4075c1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -66,7 +66,7 @@ config HIBERNATION need to run mkswap against the swap partition used for the suspend. It also works with swap files to a limited extent (for details see - <file:Documentation/power/swsusp-and-swap-files.txt>). + <file:Documentation/power/swsusp-and-swap-files.rst>). Right now you may boot without resuming and resume later but in the meantime you cannot use the swap partition(s)/file(s) involved in @@ -75,7 +75,7 @@ config HIBERNATION MOUNT any journaled filesystems mounted before the suspend or they will get corrupted in a nasty way. - For more information take a look at <file:Documentation/power/swsusp.txt>. + For more information take a look at <file:Documentation/power/swsusp.rst>. config ARCH_SAVE_PAGE_KEYS bool @@ -256,7 +256,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/apm-acpi.txt> + and more information, read <file:Documentation/power/apm-acpi.rst> and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. diff --git a/kernel/power/power.h b/kernel/power/power.h index 9e58bdc8a562..44bee462ff57 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} #endif /* !CONFIG_HIBERNATION */ -extern int pfn_is_nosave(unsigned long); - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 096211299c07..c874a7026e24 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -62,16 +62,16 @@ enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); /** - * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend. + * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend. * * Return 'true' if suspend-to-idle has been selected as the default system * suspend method. */ -bool pm_suspend_via_s2idle(void) +bool pm_suspend_default_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; } -EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); +EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e1912ad13bdc..ca0fcb5ced71 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -974,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle, last = handle->maps = NULL; offset = swsusp_header->image; while (offset) { - tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); if (!tmp) { release_swap_reader(handle); return -ENOMEM; } - memset(tmp, 0, sizeof(*tmp)); if (!handle->maps) handle->maps = tmp; if (last) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83a531cea2f3..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,8 @@ #include <linux/compat.h> #include <linux/sched/signal.h> +#include <asm/syscall.h> /* for syscall_get_* */ + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. + */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/kernel/resource.c b/kernel/resource.c index 158f04ec1d4f..d22423e85cf8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); +#ifdef CONFIG_DEVICE_PRIVATE +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + resource_size_t end, addr; + struct resource *res; + + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); + addr = end - size + 1UL; + + for (; addr > size && addr >= base->start; addr -= size) { + if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) + continue; + + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (!res) + return ERR_PTR(-ENOMEM); + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + return res; + } + + return ERR_PTR(-ERANGE); +} +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); +#endif /* CONFIG_DEVICE_PRIVATE */ + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) diff --git a/kernel/rseq.c b/kernel/rseq.c index 9424ee90589e..27c48eb7de40 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -277,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) error: sig = ksig ? ksig->sig : 0; - force_sigsegv(sig, t); + force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ @@ -296,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs) return; if (!access_ok(t->rseq, sizeof(*t->rseq)) || rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV, t); + force_sig(SIGSEGV); } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa43ce3962e7..2b037f195473 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ if (!(p->state & state)) - return false; + goto out; success = 1; cpu = task_cpu(p); @@ -2526,6 +2527,7 @@ unlock: out: if (success) ttwu_stat(p, cpu, wake_flags); + preempt_enable(); return success; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b5bb2ac16e2..ef5b9f6b1d42 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -726,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * refill the runtime and set the deadline a period in the future, * because keeping the current (absolute) deadline of the task would * result in breaking guarantees promised to other tasks (refer to - * Documentation/scheduler/sched-deadline.txt for more information). + * Documentation/scheduler/sched-deadline.rst for more information). * * This function returns true if: * diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 811b4a86cdf6..dba52a7db5e8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -609,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason) { struct kernel_siginfo info; seccomp_init_siginfo(&info, syscall, reason); - force_sig_info(SIGSYS, &info, current); + force_sig_info(&info); } #endif /* CONFIG_SECCOMP_FILTER */ diff --git a/kernel/signal.c b/kernel/signal.c index edf8915ddd54..91b789dd6e72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -45,6 +45,7 @@ #include <linux/posix-timers.h> #include <linux/livepatch.h> #include <linux/cgroup.h> +#include <linux/audit.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -54,7 +55,6 @@ #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> -#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. @@ -1057,29 +1057,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -#ifdef CONFIG_USER_NS -static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) -{ - if (current_user_ns() == task_cred_xxx(t, user_ns)) - return; - - if (SI_FROMKERNEL(info)) - return; - - rcu_read_lock(); - info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), - make_kuid(current_user_ns(), info->si_uid)); - rcu_read_unlock(); -} -#else -static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) -{ - return; -} -#endif - static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, int from_ancestor_ns) + enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; @@ -1089,8 +1068,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, - from_ancestor_ns || (info == SEND_SIG_PRIV))) + if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; @@ -1135,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); - q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + rcu_read_lock(); + q->info.si_uid = + from_kuid_munged(task_cred_xxx(t, user_ns), + current_uid()); + rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); @@ -1147,30 +1129,24 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc break; default: copy_siginfo(&q->info, info); - if (from_ancestor_ns) - q->info.si_pid = 0; break; } - - userns_fixup_signal_uid(&q->info, t); - - } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) { - /* - * Queue overflow, abort. We may abort if the - * signal was rt and sent by user using something - * other than kill(). - */ - result = TRACE_SIGNAL_OVERFLOW_FAIL; - ret = -EAGAIN; - goto ret; - } else { - /* - * This is a silent loss of information. We still - * send the signal, but the *info bits are lost. - */ - result = TRACE_SIGNAL_LOSE_INFO; - } + } else if (!is_si_special(info) && + sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. + */ + result = TRACE_SIGNAL_LOSE_INFO; } out_set: @@ -1197,17 +1173,62 @@ ret: return ret; } +static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) +{ + bool ret = false; + switch (siginfo_layout(info->si_signo, info->si_code)) { + case SIL_KILL: + case SIL_CHLD: + case SIL_RT: + ret = true; + break; + case SIL_TIMER: + case SIL_POLL: + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + case SIL_SYS: + ret = false; + break; + } + return ret; +} + static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { - int from_ancestor_ns = 0; + /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ + bool force = false; -#ifdef CONFIG_PID_NS - from_ancestor_ns = si_fromuser(info) && - !task_pid_nr_ns(current, task_active_pid_ns(t)); -#endif + if (info == SEND_SIG_NOINFO) { + /* Force if sent from an ancestor pid namespace */ + force = !task_pid_nr_ns(current, task_active_pid_ns(t)); + } else if (info == SEND_SIG_PRIV) { + /* Don't ignore kernel generated signals */ + force = true; + } else if (has_si_pid_and_uid(info)) { + /* SIGKILL and SIGSTOP is special or has ids */ + struct user_namespace *t_user_ns; - return __send_signal(sig, info, t, type, from_ancestor_ns); + rcu_read_lock(); + t_user_ns = task_cred_xxx(t, user_ns); + if (current_user_ns() != t_user_ns) { + kuid_t uid = make_kuid(current_user_ns(), info->si_uid); + info->si_uid = from_kuid_munged(t_user_ns, uid); + } + rcu_read_unlock(); + + /* A kernel generated signal? */ + force = (info->si_code == SI_KERNEL); + + /* From an ancestor pid namespace? */ + if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { + info->si_pid = 0; + force = true; + } + } + return __send_signal(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1274,12 +1295,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -int -force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) +static int +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; + int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; @@ -1304,6 +1326,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) return ret; } +int force_sig_info(struct kernel_siginfo *info) +{ + return force_sig_info_to_task(info, current); +} + /* * Nuke all other threads in the group. */ @@ -1440,13 +1467,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred, uid_eq(cred->uid, pcred->uid); } -/* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, - const struct cred *cred) +/* + * The usb asyncio usage of siginfo is wrong. The glibc support + * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. + * AKA after the generic fields: + * kernel_pid_t si_pid; + * kernel_uid32_t si_uid; + * sigval_t si_value; + * + * Unfortunately when usb generates SI_ASYNCIO it assumes the layout + * after the generic fields is: + * void __user *si_addr; + * + * This is a practical problem when there is a 64bit big endian kernel + * and a 32bit userspace. As the 32bit address will encoded in the low + * 32bits of the pointer. Those low 32bits will be stored at higher + * address than appear in a 32 bit pointer. So userspace will not + * see the address it was expecting for it's completions. + * + * There is nothing in the encoding that can allow + * copy_siginfo_to_user32 to detect this confusion of formats, so + * handle this by requiring the caller of kill_pid_usb_asyncio to + * notice when this situration takes place and to store the 32bit + * pointer in sival_int, instead of sival_addr of the sigval_t addr + * parameter. + */ +int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, + struct pid *pid, const struct cred *cred) { - int ret = -EINVAL; + struct kernel_siginfo info; struct task_struct *p; unsigned long flags; + int ret = -EINVAL; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = errno; + info.si_code = SI_ASYNCIO; + *((sigval_t *)&info.si_pid) = addr; if (!valid_signal(sig)) return ret; @@ -1457,17 +1515,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { + if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } - ret = security_task_kill(p, info, sig, cred); + ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); + ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1476,7 +1534,7 @@ out_unlock: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); +EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1552,9 +1610,17 @@ send_sig(int sig, struct task_struct *p, int priv) } EXPORT_SYMBOL(send_sig); -void force_sig(int sig, struct task_struct *p) +void force_sig(int sig) { - force_sig_info(sig, SEND_SIG_PRIV, p); + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + force_sig_info(&info); } EXPORT_SYMBOL(force_sig); @@ -1564,18 +1630,20 @@ EXPORT_SYMBOL(force_sig); * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ -void force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig) { + struct task_struct *p = current; + if (sig == SIGSEGV) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; spin_unlock_irqrestore(&p->sighand->siglock, flags); } - force_sig(SIGSEGV, p); + force_sig(SIGSEGV); } -int force_sig_fault(int sig, int code, void __user *addr +int force_sig_fault_to_task(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) @@ -1595,7 +1663,16 @@ int force_sig_fault(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info(info.si_signo, &info, t); + return force_sig_info_to_task(&info, t); +} + +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) +{ + return force_sig_fault_to_task(sig, code, addr + ___ARCH_SI_TRAPNO(trapno) + ___ARCH_SI_IA64(imm, flags, isr), current); } int send_sig_fault(int sig, int code, void __user *addr @@ -1621,7 +1698,7 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } -int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; @@ -1632,7 +1709,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; - return force_sig_info(info.si_signo, &info, t); + return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) @@ -1661,7 +1738,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } #ifdef SEGV_PKUERR @@ -1675,7 +1752,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } #endif @@ -1691,7 +1768,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr) info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } int kill_pgrp(struct pid *pid, int sig, int priv) @@ -1804,6 +1881,14 @@ ret: return ret; } +static void do_notify_pidfd(struct task_struct *task) +{ + struct pid *pid; + + pid = task_pid(task); + wake_up_all(&pid->wait_pidfd); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1827,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + /* Wake up all pidfd waiters */ + do_notify_pidfd(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. @@ -2676,7 +2764,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping) void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } @@ -2863,80 +2951,49 @@ EXPORT_SYMBOL(sigprocmask); * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. + * + * Note that it does set_restore_sigmask() in advance, so it must be always + * paired with restore_saved_sigmask_unless() before return from syscall. */ -int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, - sigset_t *oldset, size_t sigsetsize) +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(set, usigmask, sizeof(sigset_t))) + if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_user_sigmask); #ifdef CONFIG_COMPAT -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, - sigset_t *set, sigset_t *oldset, +int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (get_compat_sigset(set, usigmask)) + if (get_compat_sigset(&kmask, umask)) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_compat_user_sigmask); #endif -/* - * restore_user_sigmask: - * usigmask: sigmask passed in from userland. - * sigsaved: saved sigmask when the syscall started and changed the sigmask to - * usigmask. - * - * This is useful for syscalls such as ppoll, pselect, io_pgetevents and - * epoll_pwait where a new sigmask is passed in from userland for the syscalls. - */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, - bool interrupted) -{ - - if (!usigmask) - return; - /* - * When signals are pending, do not restore them here. - * Restoring sigmask here can lead to delivering signals that the above - * syscalls are intended to block because of the sigmask passed in. - */ - if (interrupted) { - current->saved_sigmask = *sigsaved; - set_restore_sigmask(); - return; - } - - /* - * This is needed because the fast syscall return path does not restore - * saved_sigmask when signals are not pending. - */ - set_current_blocked(sigsaved); -} -EXPORT_SYMBOL(restore_user_sigmask); - /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals @@ -4477,6 +4534,28 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET + + /* usb asyncio */ + BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != + offsetof(struct siginfo, si_addr)); + if (sizeof(int) == sizeof(void __user *)) { + BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != + sizeof(void __user *)); + } else { + BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + + sizeof_field(struct siginfo, si_uid)) != + sizeof(void __user *)); + BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != + offsetof(struct siginfo, si_uid)); + } +#ifdef CONFIG_COMPAT + BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != + offsetof(struct compat_siginfo, si_addr)); + BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != + sizeof(compat_uptr_t)); + BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != + sizeof_field(struct siginfo, si_pid)); +#endif } void __init signals_init(void) diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 36139de0a3c4..f5440abb7532 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -226,12 +226,17 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) .store = store, .size = size, }; + mm_segment_t fs; /* Trace user stack if not a kernel thread */ - if (!current->mm) + if (current->flags & PF_KTHREAD) return 0; + fs = get_fs(); + set_fs(USER_DS); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + set_fs(fs); + return c.len; } #endif @@ -255,14 +260,6 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); } -__weak int -save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); - return -ENOSYS; -} - /** * stack_trace_save - Save a stack trace into a storage array * @store: Pointer to storage array diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d9ae5ea6caf..34b76895b81e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -137,6 +137,8 @@ COND_SYSCALL(capset); /* kernel/exit.c */ /* kernel/fork.c */ +/* __ARCH_WANT_SYS_CLONE3 */ +COND_SYSCALL(clone3); /* kernel/futex.c */ COND_SYSCALL(futex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c1ad1e14f21..43186ccfa139 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -188,17 +188,17 @@ extern int no_unaligned_warning; * enum sysctl_writes_mode - supported sysctl write modes * * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value - * to be written, and multiple writes on the same sysctl file descriptor - * will rewrite the sysctl value, regardless of file position. No warning - * is issued when the initial position is not 0. + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is - * not 0. + * not 0. * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at - * file position 0 and the value must be fully contained in the buffer - * sent to the write syscall. If dealing with strings respect the file - * position, but restrict this to the max length of the buffer, anything - * passed the max lenght will be ignored. Multiple writes will append - * to the buffer. + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max length will be ignored. Multiple writes will append + * to the buffer. * * These write modes control how current file position affects the behavior of * updating sysctl values through the proc interface on each write. diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index a80893180826..8cf3596a4ce6 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -104,11 +104,7 @@ void update_vsyscall(struct timekeeper *tk) vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; - while (nsec >= NSEC_PER_SEC) { - nsec = nsec - NSEC_PER_SEC; - vdso_ts->sec++; - } - vdso_ts->nsec = nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); if (__arch_use_vsyscall(vdata)) update_vdso_data(vdata, tk); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e1c6d79fb4cc..2d6e93ab0478 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); - if (!dir) - goto err; bt->dev = dev; atomic_set(&bt->dropped, 0); @@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); - if (!bt->dropped_file) - goto err; bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1c9a4745e596..ca1255d14576 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -19,6 +19,9 @@ #include "trace_probe.h" #include "trace.h" +#define bpf_event_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -591,6 +594,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; +struct send_signal_irq_work { + struct irq_work irq_work; + struct task_struct *task; + u32 sig; +}; + +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); + +static void do_bpf_send_signal(struct irq_work *entry) +{ + struct send_signal_irq_work *work; + + work = container_of(entry, struct send_signal_irq_work, irq_work); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + struct send_signal_irq_work *work = NULL; + + /* Similar to bpf_probe_write_user, task needs to be + * in a sound condition and kernel memory access be + * permitted in order to send signal to the current + * task. + */ + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(uaccess_kernel())) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + + if (in_nmi()) { + /* Do an early check on signal validity. Otherwise, + * the error is lost in deferred irq_work. + */ + if (unlikely(!valid_signal(sig))) + return -EINVAL; + + work = this_cpu_ptr(&send_signal_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + return -EBUSY; + + /* Add the current task, which is the target of sending signal, + * to the irq_work. The current task may change when queued + * irq works get executed. + */ + work->task = current; + work->sig = sig; + irq_work_queue(&work->irq_work); + return 0; + } + + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); +} + +static const struct bpf_func_proto bpf_send_signal_proto = { + .func = bpf_send_signal, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -641,6 +707,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; #endif + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; default: return NULL; } @@ -1102,7 +1170,7 @@ static DEFINE_MUTEX(bpf_event_mutex); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; @@ -1120,7 +1188,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, if (event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; @@ -1143,7 +1211,7 @@ unlock: void perf_event_detach_bpf_prog(struct perf_event *event) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret; @@ -1152,7 +1220,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) if (!event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret == -ENOENT) goto unlock; @@ -1174,6 +1242,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; @@ -1198,10 +1267,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) */ mutex_lock(&bpf_event_mutex); - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - ids, - ids_len, - &prog_cnt); + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || @@ -1364,6 +1431,20 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + +subsys_initcall(send_signal_irq_work_init); + #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c3aabb576fe5..c90c687cf950 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8618,10 +8618,6 @@ struct dentry *tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); - if (!tr->dir) { - pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return ERR_PTR(-ENOMEM); - } return NULL; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b55906c77ce0..7860e3f59fad 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1336,7 +1336,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu, call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; - call->flags = TRACE_EVENT_FL_UPROBE; + call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; call->data = tu; } diff --git a/kernel/user.c b/kernel/user.c index 78b17e36e705..5235d7f49982 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -63,9 +63,9 @@ struct user_namespace init_user_ns = { .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, -#ifdef CONFIG_PERSISTENT_KEYRINGS - .persistent_keyring_register_sem = - __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), +#ifdef CONFIG_KEYS + .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), + .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -141,8 +141,6 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0eff45ce7703..8eadadc478f9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -128,8 +128,9 @@ int create_user_ns(struct cred *new) ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); -#ifdef CONFIG_PERSISTENT_KEYRINGS - init_rwsem(&ns->persistent_keyring_register_sem); +#ifdef CONFIG_KEYS + INIT_LIST_HEAD(&ns->keyring_name_list); + init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) @@ -191,9 +192,7 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); -#ifdef CONFIG_PERSISTENT_KEYRINGS - key_put(ns->persistent_keyring_register); -#endif + key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 95aea04ff722..601d61150b65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); * * Undo alloc_workqueue_attrs(). */ -void free_workqueue_attrs(struct workqueue_attrs *attrs) +static void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); @@ -3339,21 +3339,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) /** * alloc_workqueue_attrs - allocate a workqueue_attrs - * @gfp_mask: allocation mask to use * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static struct workqueue_attrs *alloc_workqueue_attrs(void) { struct workqueue_attrs *attrs; - attrs = kzalloc(sizeof(*attrs), gfp_mask); + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (!attrs) goto fail; - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); @@ -3431,7 +3430,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->refcnt = 1; /* shouldn't fail above this point */ - pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; return 0; @@ -3896,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); - new_attrs = alloc_workqueue_attrs(GFP_KERNEL); - tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + new_attrs = alloc_workqueue_attrs(); + tmp_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs || !tmp_attrs) goto out_free; @@ -4033,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Return: 0 on success and -errno on failure. */ -int apply_workqueue_attrs(struct workqueue_struct *wq, +static int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; @@ -4044,7 +4043,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } -EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug @@ -4242,7 +4240,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return NULL; if (flags & WQ_UNBOUND) { - wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + wq->unbound_attrs = alloc_workqueue_attrs(); if (!wq->unbound_attrs) goto err_free_wq; } @@ -5395,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) lockdep_assert_held(&wq_pool_mutex); - attrs = alloc_workqueue_attrs(GFP_KERNEL); + attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; @@ -5817,7 +5815,7 @@ static void __init wq_numa_init(void) return; } - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); /* @@ -5892,7 +5890,7 @@ int __init workqueue_init_early(void) for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; @@ -5901,7 +5899,7 @@ int __init workqueue_init_early(void) * guaranteed by max_active which is enforced by pwqs. * Turn off NUMA so that dfl_pwq is used for all nodes. */ - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->no_numa = true; ordered_wq_attrs[i] = attrs; |