diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 09:18:14 +0100 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 09:18:14 +0100 |
| commit | 9c87e61e3c5797277407ba5eae4eac8a52be3fa3 (patch) | |
| tree | e3f902cb5363b5b90ab74a4b7e26fafbc15aaeaf /include/linux | |
| parent | b85966adbf5de0668a815c6e3527f87e0c387fb4 (diff) | |
| parent | e4287bf34f97a88c7d9322f5bde828724c073a6b (diff) | |
Merge tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:
"Major changes:
- Recover from BPF arena page faults using a scratch page and add
ptep_try_set() for lockless empty-slot installs on x86 and arm64.
This allows BPF kfuncs to access arena pointers directly.
The 'arena_direct_access' stable branch was created for this work
and was pulled into sched-ext and bpf-next trees (Tejun Heo, Kumar
Kartikeya Dwivedi)
- Lift old restriction and support 6+ arguments in BPF programs and
kfuncs on x86 and arm64 (Yonghong Song, Puranjay Mohan)
Other features and fixes:
- Add 24-bit BTF vlen and reclaim unused bits in the BTF UAPI to ease
addition of new BTF kinds (Alan Maguire)
- Raise the maximum BPF call chain depth from 8 to 16 frames (Alexei
Starovoitov)
- Refactor object relationship tracking in the verifier and fix a
dynptr use-after-free bug (Amery Hung)
- Harden the signed program loader and reject exclusive maps as inner
maps (Daniel Borkmann)
- Replace the verifier min/max bounds fields with a circular number
(cnum) representation and improve 32->64 bit range refinements
(Eduard Zingerman)
- Introduce the arena library and runtime (libarena) with a buddy
allocator, rbtree and SPMC queue data structures, ASAN support and
a parallel test harness. Allow subprograms to return arena pointers
and switch to a BTF type-tag based __arena annotation (Emil
Tsalapatis)
- Cache build IDs in the sleepable stackmap path and avoid faultable
build ID reads under mm locks (Ihor Solodrai)
- Introduce the tracing_multi link to attach a single BPF program to
many kernel functions at once. Allow specifying the uprobe_multi
target via FD (Jiri Olsa)
- Extend the bpf_list family of kfuncs with bpf_list_add/del(), and
bpf_list_is_first/is_last/empty() (Kaitao Cheng)
- Extend the BPF syscall with common attributes support for
prog_load, btf_load and map_create (Leon Hwang)
- Wrap rhashtable as BPF map (Mykyta Yatsenko, Herbert Xu)
- Add sleepable support for tracepoint programs and fix deadlocks in
LRU map due to NMI reentry (Mykyta Yatsenko)
- Fix OOB access in bpf_flow_keys, fix nullness analysis of inner
arrays, enforce write checks for global subprograms (Nuoqi Gui)
- Report the maximum combined stack depth and print a breakdown of
instructions processed per subprogram (Paul Chaignon)
- Add an XDP load-balancer benchmark and arm64 JIT support for stack
arguments (Puranjay Mohan)
- Add kfuncs to traverse over wakeup_sources (Samuel Wu)
- Allow sleepable BPF programs to use LPM trie maps directly (Vlad
Poenaru)
- Many more fixes and cleanups across the verifier, BTF, sockmap,
devmap, bpffs, security hooks, s390/riscv/loongarch JITs,
rqspinlock, libbpf, bpftool, selftests"
* tag 'bpf-next-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (336 commits)
selftests/bpf: Work around llvm stack overflow in crypto progs
selftests/bpf: add test for bpf_msg_pop_data() overflow
bpf, sockmap: fix integer overflow in bpf_msg_pop_data() bounds check
sockmap: Fix use-after-free in udp_bpf_recvmsg()
bpf, sockmap: keep sk_msg copy state in sync
bpf, sockmap: Fix wrong rsge offset in bpf_msg_push_data()
bpf, sockmap: reject overflowing copy + len in bpf_msg_push_data()
selftests/bpf: Retry map update on helper_fill_hashmap()
selftests/bpf: Add test for sleepable lsm_cgroup rejection
selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper
bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
selftests/bpf: Avoid static LLVM linking for cross builds
selftests/bpf: Use common CFLAGS for urandom_read
selftests/bpf: Initialize operation name before use
tools/bpf: build: Append extra cflags
libbpf: Initialize CFLAGS before including Makefile.include
bpftool: Append extra host flags
bpftool: Avoid adding EXTRA_CFLAGS to HOST_CFLAGS
bpftool: Pass host flags to bootstrap libbpf
selftests/bpf: correct CONFIG_PPC64 macro name in comment
...
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/bpf-cgroup.h | 5 | ||||
| -rw-r--r-- | include/linux/bpf.h | 282 | ||||
| -rw-r--r-- | include/linux/bpf_defs.h | 19 | ||||
| -rw-r--r-- | include/linux/bpf_lsm.h | 6 | ||||
| -rw-r--r-- | include/linux/bpf_types.h | 2 | ||||
| -rw-r--r-- | include/linux/bpf_verifier.h | 301 | ||||
| -rw-r--r-- | include/linux/btf.h | 7 | ||||
| -rw-r--r-- | include/linux/btf_ids.h | 1 | ||||
| -rw-r--r-- | include/linux/cnum.h | 82 | ||||
| -rw-r--r-- | include/linux/filter.h | 27 | ||||
| -rw-r--r-- | include/linux/ftrace.h | 9 | ||||
| -rw-r--r-- | include/linux/pgtable.h | 43 | ||||
| -rw-r--r-- | include/linux/rhashtable.h | 4 | ||||
| -rw-r--r-- | include/linux/syscalls.h | 3 | ||||
| -rw-r--r-- | include/linux/trace_events.h | 12 |
15 files changed, 656 insertions, 147 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b2e79c2b41d5..4d0cc65976a1 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -421,7 +421,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); + union bpf_attr __user *uattr, u32 uattr_size); const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -452,7 +452,8 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, + u32 uattr_size) { return -EINVAL; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 64efc3fdb716..7719f6528445 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -6,6 +6,7 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/filter.h> +#include <linux/bpf_defs.h> #include <crypto/sha2.h> #include <linux/workqueue.h> @@ -32,6 +33,8 @@ #include <linux/memcontrol.h> #include <linux/cfi.h> #include <linux/xattr.h> +#include <linux/key.h> +#include <linux/ftrace.h> #include <asm/rqspinlock.h> struct bpf_verifier_env; @@ -111,7 +114,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); - int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); + int (*map_get_hash)(struct bpf_map *map); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -296,6 +299,7 @@ struct bpf_map_owner { struct bpf_map { u8 sha[SHA256_DIGEST_SIZE]; + u32 excl; const struct bpf_map_ops *ops; struct bpf_map 
*inner_map_meta; #ifdef CONFIG_SECURITY @@ -489,6 +493,35 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } +static inline bool btf_field_is_nmi_safe(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: + case BPF_TIMER: + case BPF_WORKQUEUE: + case BPF_TASK_WORK: + case BPF_KPTR_UNREF: + case BPF_REFCOUNT: + return true; + default: + return false; + } +} + +static inline bool btf_record_has_nmi_unsafe_fields(const struct btf_record *rec) +{ + int i; + + if (IS_ERR_OR_NULL(rec)) + return false; + for (i = 0; i < rec->cnt; i++) { + if (!btf_field_is_nmi_safe(rec->fields[i].type)) + return true; + } + return false; +} + static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; @@ -618,6 +651,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map); +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -679,6 +714,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags); void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); #else static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags) @@ -689,6 +726,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) { } + +static 
inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} #endif extern const struct bpf_map_ops bpf_map_offload_ops; @@ -1052,7 +1095,7 @@ struct bpf_insn_access_aux { struct { struct btf *btf; u32 btf_id; - u32 ref_obj_id; + u32 ref_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ @@ -1152,6 +1195,11 @@ struct bpf_prog_offload { /* The longest tracepoint has 12 args. * See include/trace/bpf_probe.h + * + * Also reuse this macro for maximum number of arguments a BPF function + * or a kfunc can have. Args 1-5 are passed in registers, args 6-12 via + * stack arg slots. The JIT may map some stack arg slots to registers based + * on the native calling convention (e.g., arg 6 to R9 on x86-64). */ #define MAX_BPF_FUNC_ARGS 12 @@ -1234,9 +1282,9 @@ enum { #define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 -struct bpf_tramp_links { - struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; - int nr_links; +struct bpf_tramp_nodes { + struct bpf_tramp_node *nodes[BPF_MAX_TRAMP_LINKS]; + int nr_nodes; }; struct bpf_tramp_run_ctx; @@ -1264,13 +1312,13 @@ struct bpf_tramp_run_ctx; struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr); + struct bpf_tramp_nodes *tnodes, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); @@ -1336,8 +1384,6 @@ struct bpf_trampoline { /* hlist for trampoline_ip_table */ 
struct hlist_node hlist_ip; struct ftrace_ops *fops; - /* serializes access to fields of this trampoline */ - struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; @@ -1358,6 +1404,11 @@ struct bpf_trampoline { int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; + /* Used as temporary old image storage for multi_attach */ + struct { + struct bpf_tramp_image *old_image; + u32 old_flags; + } multi_attach; }; struct bpf_attach_target_info { @@ -1455,11 +1506,13 @@ static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u6 return 0; } +struct bpf_tracing_multi_link; + #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); struct bpf_trampoline *bpf_trampoline_get(u64 key, @@ -1467,6 +1520,11 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link); +int bpf_trampoline_multi_detach(struct bpf_prog *prog, + struct bpf_tracing_multi_link *link); + /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. 
If the architecture does @@ -1544,14 +1602,15 @@ void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, int insn_idx); +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { @@ -1578,6 +1637,16 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } +static inline int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + return -ENOTSUPP; +} +static inline int bpf_trampoline_multi_detach(struct bpf_prog *prog, + struct bpf_tracing_multi_link *link) +{ + return -ENOTSUPP; +} #endif struct bpf_func_info_aux { @@ -1615,7 +1684,7 @@ struct bpf_ctx_arg_aux { enum bpf_reg_type reg_type; struct btf *btf; u32 btf_id; - u32 ref_obj_id; + u32 ref_id; bool refcounted; }; @@ -1657,6 +1726,19 @@ struct bpf_stream_stage { int len; }; +enum bpf_sig_verdict { + BPF_SIG_UNSIGNED = 0, + BPF_SIG_VERIFIED, +}; + +enum bpf_sig_keyring { + BPF_SIG_KEYRING_NONE = 0, + BPF_SIG_KEYRING_BUILTIN, + BPF_SIG_KEYRING_SECONDARY, + BPF_SIG_KEYRING_PLATFORM, + BPF_SIG_KEYRING_USER, +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1699,6 +1781,11 @@ struct bpf_prog_aux { bool changes_pkt_data; bool might_sleep; bool kprobe_write_ctx; + struct { + s32 keyring_serial; + u8 keyring_type; + u8 verdict; + } sig; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ 
struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; @@ -1731,6 +1818,7 @@ struct bpf_prog_aux { struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); + u16 stack_arg_sp_adjust; #ifdef CONFIG_SECURITY void *security; #endif @@ -1874,12 +1962,17 @@ struct bpf_link_ops { __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); }; -struct bpf_tramp_link { - struct bpf_link link; +struct bpf_tramp_node { + struct bpf_link *link; struct hlist_node tramp_hlist; u64 cookie; }; +struct bpf_tramp_link { + struct bpf_link link; + struct bpf_tramp_node node; +}; + struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; @@ -1887,13 +1980,31 @@ struct bpf_shim_tramp_link { struct bpf_tracing_link { struct bpf_tramp_link link; + struct bpf_tramp_node fexit; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; -struct bpf_fsession_link { - struct bpf_tracing_link link; - struct bpf_tramp_link fexit; +struct bpf_tracing_multi_node { + struct bpf_tramp_node node; + struct bpf_trampoline *trampoline; + struct ftrace_func_entry entry; +}; + +struct bpf_tracing_multi_data { + struct ftrace_hash *unreg; + struct ftrace_hash *modify; + struct ftrace_hash *reg; + struct ftrace_func_entry *entry; +}; + +struct bpf_tracing_multi_link { + struct bpf_link link; + struct bpf_tracing_multi_data data; + u64 *cookies; + struct bpf_tramp_node *fexits; + int nodes_cnt; + struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; struct bpf_raw_tp_link { @@ -2079,6 +2190,12 @@ static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) #endif } +static inline bool is_tracing_multi(enum bpf_attach_type type) +{ + return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI || + type == BPF_TRACE_FSESSION_MULTI; +} + #if defined(CONFIG_BPF_JIT) && 
defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register @@ -2099,8 +2216,8 @@ void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **image, u32 *image_off, @@ -2125,6 +2242,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -2192,31 +2312,33 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif -static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +static inline int bpf_fsession_cnt(struct bpf_tramp_nodes *nodes) { - struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY]; int cnt = 0; - for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { + if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) cnt++; } return cnt; } -static inline bool bpf_prog_calls_session_cookie(struct 
bpf_tramp_link *link) +static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_node *node) { - return link->link.prog->call_session_cookie; + return node->link->prog->call_session_cookie; } -static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_nodes *nodes) { - struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY]; int cnt = 0; - for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentries.links[i])) + for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentries.nodes[i])) cnt++; } @@ -2598,6 +2720,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); @@ -2764,6 +2887,9 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable); +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -2917,7 +3043,9 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); 
/* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); +struct bpf_log_attr; +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log); #ifndef CONFIG_BPF_JIT_ALWAYS_ON int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); @@ -3088,6 +3216,56 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); +static __always_inline u32 +bpf_prog_run_array_sleepable(const struct bpf_prog_array *array, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + struct bpf_prog *prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + u32 ret = 1; + + if (unlikely(!array)) + return ret; + + migrate_disable(); + + run_ctx.is_uprobe = false; + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + /* Skip dummy_bpf_prog placeholder (len == 0) */ + if (unlikely(!prog->len)) { + item++; + continue; + } + + if (unlikely(!bpf_prog_get_recursion_context(prog))) { + bpf_prog_inc_misses_counter(prog); + bpf_prog_put_recursion_context(prog); + item++; + continue; + } + + run_ctx.bpf_cookie = item->bpf_cookie; + + if (!prog->sleepable) { + guard(rcu)(); + ret &= run_prog(prog, ctx); + } else { + ret &= run_prog(prog, ctx); + } + + bpf_prog_put_recursion_context(prog); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + migrate_enable(); + return ret; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -3135,6 +3313,12 @@ static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_ { } +static inline void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct 
bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ +} + static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { @@ -3626,15 +3810,25 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ -#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); struct bpf_key *bpf_lookup_system_key(u64 id); void bpf_key_put(struct bpf_key *bkey); -int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring); +static inline s32 bpf_key_serial(const struct bpf_key *key) +{ + return key->has_ref ? 
key->key->serial : 0; +} #else static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) { @@ -3650,12 +3844,17 @@ static inline void bpf_key_put(struct bpf_key *bkey) { } -static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +static inline int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { return -EOPNOTSUPP; } + +static inline s32 bpf_key_serial(const struct bpf_key *key) +{ + return 0; +} #endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ @@ -3931,15 +4130,6 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ -struct key; - -#ifdef CONFIG_KEYS -struct bpf_key { - struct key *key; - bool has_ref; -}; -#endif /* CONFIG_KEYS */ - static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h new file mode 100644 index 000000000000..2185cd3966d4 --- /dev/null +++ b/include/linux/bpf_defs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Subset of bpf.h declarations, split out so files that need only these + * declarations can avoid bpf.h's full include cost. 
+ */ +#ifndef _LINUX_BPF_DEFS_H +#define _LINUX_BPF_DEFS_H + +#ifdef CONFIG_BPF_SYSCALL +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip); +#else +static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} +#endif + +#endif /* _LINUX_BPF_DEFS_H */ diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 643809cc78c3..143775a27a2a 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -52,6 +52,7 @@ int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags); int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str); bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog); +bool bpf_lsm_hook_returns_errno(u32 btf_id); #else /* !CONFIG_BPF_LSM */ @@ -104,6 +105,11 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) { return false; } + +static inline bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + return true; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b13de31e163f..e5906829aa6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -134,6 +134,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_RHASH, rhtab_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) @@ -155,3 +156,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING_MULTI, 
tracing_multi) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 185b2aa43a42..39a851e690ec 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -8,6 +8,7 @@ #include <linux/btf.h> /* for struct btf and btf_id() */ #include <linux/filter.h> /* for MAX_BPF_STACK */ #include <linux/tnum.h> +#include <linux/cnum.h> /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit @@ -65,7 +66,6 @@ struct bpf_reg_state { struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; - u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ @@ -120,14 +120,8 @@ struct bpf_reg_state { * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ - s64 smin_value; /* minimum possible (s64)value */ - s64 smax_value; /* maximum possible (s64)value */ - u64 umin_value; /* minimum possible (u64)value */ - u64 umax_value; /* maximum possible (u64)value */ - s32 s32_min_value; /* minimum possible (s32)value */ - s32 s32_max_value; /* maximum possible (s32)value */ - u32 u32_min_value; /* minimum possible (u32)value */ - u32 u32_max_value; /* maximum possible (u32)value */ + struct cnum64 r64; /* 64-bit range as circular number */ + struct cnum32 r32; /* 32-bit range as circular number */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -153,46 +147,14 @@ struct bpf_reg_state { #define BPF_ADD_CONST32 (1U << 30) #define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; - /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned - * from a pointer-cast helper, bpf_sk_fullsock() and - * bpf_tcp_sock(). 
- * - * Consider the following where "sk" is a reference counted - * pointer returned from "sk = bpf_sk_lookup_tcp();": - * - * 1: sk = bpf_sk_lookup_tcp(); - * 2: if (!sk) { return 0; } - * 3: fullsock = bpf_sk_fullsock(sk); - * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } - * 5: tp = bpf_tcp_sock(fullsock); - * 6: if (!tp) { bpf_sk_release(sk); return 0; } - * 7: bpf_sk_release(sk); - * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain - * - * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and - * "tp" ptr should be invalidated also. In order to do that, - * the reg holding "fullsock" and "sk" need to remember - * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id - * such that the verifier can reset all regs which have - * ref_obj_id matching the sk_reg->id. - * - * sk_reg->ref_obj_id is set to sk_reg->id at line 1. - * sk_reg->id will stay as NULL-marking purpose only. - * After NULL-marking is done, sk_reg->id can be reset to 0. - * - * After "fullsock = bpf_sk_fullsock(sk);" at line 3, - * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. - * - * After "tp = bpf_tcp_sock(fullsock);" at line 5, - * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id - * which is the same as sk_reg->ref_obj_id. - * - * From the verifier perspective, if sk, fullsock and tp - * are not NULL, they are the same ptr with different - * reg->type. In particular, bpf_sk_release(tp) is also - * allowed and has the same effect as bpf_sk_release(sk). + /* + * Tracks the parent object this register was derived from. + * Used for cascading invalidation: when the parent object is + * released or invalidated, all registers with matching parent_id + * are also invalidated. For example, a slice from bpf_dynptr_data() + * gets parent_id set to the dynptr's id. 
*/ - u32 ref_obj_id; + u32 parent_id; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -209,6 +171,66 @@ struct bpf_reg_state { bool precise; }; +static inline s64 reg_smin(const struct bpf_reg_state *reg) +{ + return cnum64_smin(reg->r64); +} + +static inline s64 reg_smax(const struct bpf_reg_state *reg) +{ + return cnum64_smax(reg->r64); +} + +static inline u64 reg_umin(const struct bpf_reg_state *reg) +{ + return cnum64_umin(reg->r64); +} + +static inline u64 reg_umax(const struct bpf_reg_state *reg) +{ + return cnum64_umax(reg->r64); +} + +static inline s32 reg_s32_min(const struct bpf_reg_state *reg) +{ + return cnum32_smin(reg->r32); +} + +static inline s32 reg_s32_max(const struct bpf_reg_state *reg) +{ + return cnum32_smax(reg->r32); +} + +static inline u32 reg_u32_min(const struct bpf_reg_state *reg) +{ + return cnum32_umin(reg->r32); +} + +static inline u32 reg_u32_max(const struct bpf_reg_state *reg) +{ + return cnum32_umax(reg->r32); +} + +static inline void reg_set_srange32(struct bpf_reg_state *reg, s32 smin, s32 smax) +{ + reg->r32 = cnum32_from_srange(smin, smax); +} + +static inline void reg_set_urange32(struct bpf_reg_state *reg, u32 umin, u32 umax) +{ + reg->r32 = cnum32_from_urange(umin, umax); +} + +static inline void reg_set_srange64(struct bpf_reg_state *reg, s64 smin, s64 smax) +{ + reg->r64 = cnum64_from_srange(smin, smax); +} + +static inline void reg_set_urange64(struct bpf_reg_state *reg, u64 umin, u64 umax) +{ + reg->r64 = cnum64_from_urange(umin, umax); +} + enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ @@ -309,10 +331,14 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. 
*/ int insn_idx; - /* Use to keep track of the source object of a lock, to ensure - * it matches on unlock. - */ - void *ptr; + union { + /* For REF_TYPE_PTR */ + int parent_id; + /* Use to keep track of the source object of a lock, to ensure + * it matches on unlock. + */ + void *ptr; + }; }; struct bpf_retval_range { @@ -347,6 +373,7 @@ struct bpf_func_state { bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; + bool no_stack_arg_load; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. @@ -372,46 +399,49 @@ struct bpf_func_state { * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. */ int allocated_stack; + + u16 out_stack_arg_cnt; /* Number of outgoing on-stack argument slots */ + struct bpf_reg_state *stack_arg_regs; /* Outgoing on-stack arguments */ }; -#define MAX_CALL_FRAMES 8 +#define MAX_CALL_FRAMES 16 -/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +/* instruction history flags, used in bpf_jmp_history_entry.flags field. + * Frame number and SPI are stored in dedicated fields of bpf_jmp_history_entry. + */ enum { - /* instruction references stack slot through PTR_TO_STACK register; - * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) - * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, - * 8 bytes per slot, so slot index (spi) is [0, 63]) - */ - INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ - - INSN_F_SPI_MASK = 0x3f, /* 6 bits */ - INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + INSN_F_STACK_ACCESS = BIT(0), - INSN_F_STACK_ACCESS = BIT(9), + INSN_F_DST_REG_STACK = BIT(1), /* dst_reg is PTR_TO_STACK */ + INSN_F_SRC_REG_STACK = BIT(2), /* src_reg is PTR_TO_STACK */ - INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ - INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ - /* total 12 bits are used now. 
*/ + INSN_F_STACK_ARG_ACCESS = BIT(3), }; -static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); -static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); - struct bpf_jmp_history_entry { - u32 idx; /* insn idx can't be bigger than 1 million */ + u32 idx : 20; + u32 frame : 4; /* stack access frame number */ + u32 spi : 6; /* stack slot index (0..63) */ + u32 : 2; u32 prev_idx : 20; /* special INSN_F_xxx flags */ - u32 flags : 12; - /* additional registers that need precision tracking when this - * jump is backtracked, vector of six 10-bit records + u32 flags : 4; + u32 : 8; + /* + * additional registers that need precision tracking when this + * jump is backtracked, vector of five 11-bit records */ u64 linked_regs; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) +static_assert(MAX_CALL_FRAMES <= (1 << 4)); +static_assert(MAX_BPF_STACK / 8 <= (1 << 6)); + +/* Maximum number of bpf_reg_state objects that can exist at once */ +#define MAX_STACK_ARG_SLOTS (MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS) +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE + \ + MAX_STACK_ARG_SLOTS) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; @@ -497,10 +527,23 @@ struct bpf_verifier_state { u32 may_goto_depth; }; -#define bpf_get_spilled_reg(slot, frame, mask) \ - (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ - ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \ - ? 
&frame->stack[slot].spilled_ptr : NULL) +static inline struct bpf_reg_state * +bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask) +{ + if (slot < frame->allocated_stack / BPF_REG_SIZE && + (1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & mask) + return &frame->stack[slot].spilled_ptr; + return NULL; +} + +static inline struct bpf_reg_state * +bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) +{ + if (slot < frame->out_stack_arg_cnt && + frame->stack_arg_regs[slot].type != NOT_INIT) + return &frame->stack_arg_regs[slot]; + return NULL; +} /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ @@ -508,7 +551,13 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) -#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ +/* Iterate over 'frame', setting 'reg' to either NULL or a spilled stack arg. 
*/ +#define bpf_for_each_spilled_stack_arg(iter, frame, reg) \ + for (iter = 0, reg = bpf_get_spilled_stack_arg(iter, frame); \ + iter < frame->out_stack_arg_cnt; \ + iter++, reg = bpf_get_spilled_stack_arg(iter, frame)) + +#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __stack, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ @@ -516,6 +565,7 @@ struct bpf_verifier_state { struct bpf_reg_state *___regs; \ __state = ___vstate->frame[___i]; \ ___regs = __state->regs; \ + __stack = NULL; \ for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ __reg = &___regs[___j]; \ (void)(__expr); \ @@ -523,14 +573,27 @@ struct bpf_verifier_state { bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ + __stack = &__state->stack[___j]; \ (void)(__expr); \ } \ + __stack = NULL; \ + bpf_for_each_spilled_stack_arg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ } \ + (void)__stack; \ }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ -#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ - bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr) +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_stack_state * ___stack; \ + (void)___stack; \ + bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, ___stack,\ + 1 << STACK_SPILL, __expr); \ + }) /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { @@ -700,6 +763,22 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) return log && log->level; } +struct bpf_log_attr { + char __user *ubuf; + u32 size; + u32 level; + u32 offsetof_true_size; + bpfptr_t uattr; +}; + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 
size_common); +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size); +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { @@ -724,6 +803,7 @@ struct bpf_subprog_info { u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; + u32 insn_processed; /* offsets in range [stack_depth .. fastcall_stack_off) * are used for bpf_fastcall spills and fills. */ @@ -740,12 +820,21 @@ struct bpf_subprog_info { bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; - u8 arg_cnt:3; + u8 arg_cnt:4; enum priv_stack_mode priv_stack_mode; - struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; + struct bpf_subprog_arg_info args[MAX_BPF_FUNC_ARGS]; + u16 stack_arg_cnt; /* incoming + max outgoing */ + u16 max_out_stack_arg_cnt; }; +static inline u16 bpf_in_stack_arg_cnt(const struct bpf_subprog_info *sub) +{ + if (sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) + return sub->arg_cnt - MAX_BPF_FUNC_REG_ARGS; + return 0; +} + struct bpf_verifier_env; struct backtrack_state { @@ -753,6 +842,7 @@ struct backtrack_state { u32 frame; u32 reg_masks[MAX_CALL_FRAMES]; u64 stack_masks[MAX_CALL_FRAMES]; + u8 stack_arg_masks[MAX_CALL_FRAMES]; }; struct bpf_id_pair { @@ -881,6 +971,8 @@ struct bpf_verifier_env { u32 prev_insn_processed, insn_processed; /* number of jmps, calls, exits analyzed so far */ u32 prev_jmps_processed, jmps_processed; + /* maximum combined stack depth */ + u32 max_stack_depth; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ @@ -914,6 +1006,7 @@ struct bpf_verifier_env { * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; + char tmp_arg_name[32]; struct bpf_insn 
insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; @@ -1087,7 +1180,7 @@ struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx); void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self); void bpf_free_backedges(struct bpf_scc_visit *visit); int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs); + int insn_flags, int spi, int frame, u64 linked_regs); void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist); void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); @@ -1150,6 +1243,11 @@ static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame, bt->stack_masks[frame] |= 1ull << slot; } +static inline void bt_set_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] |= 1 << slot; +} + static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) { return bt->reg_masks[frame] & (1 << reg); @@ -1321,6 +1419,25 @@ struct bpf_map_desc { int uid; }; +/* The last initialized dynptr; Populated by process_dynptr_func() */ +struct bpf_dynptr_desc { + enum bpf_dynptr_type type; + u32 id; + u32 parent_id; +}; + +/* + * The last seen referenced object; Updated by update_ref_obj() when a register refers to a + * referenced object. Used when the helper or kfunc is casting a referenced object, returning + * allocated memory derived from referenced object or creating a dynptr with a referenced + * object as parent. 
+ */ +struct ref_obj_desc { + u32 id; + u32 parent_id; + u8 cnt; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -1329,7 +1446,6 @@ struct bpf_kfunc_call_arg_meta { const struct btf_type *func_proto; const char *func_name; /* Out parameters */ - u32 ref_obj_id; u8 release_regno; bool r0_rdonly; u32 ret_btf_id; @@ -1362,15 +1478,12 @@ struct bpf_kfunc_call_arg_meta { struct btf_field *field; } arg_rbtree_root; struct { - enum bpf_dynptr_type type; - u32 id; - u32 ref_obj_id; - } initialized_dynptr; - struct { u8 spi; u8 frameno; } iter; struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; u64 mem_size; }; @@ -1479,6 +1592,10 @@ int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset); int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt); +/* Functions exported from verifier.c, used by trampoline.c */ +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info); + /* Functions in fixups.c, called from bpf_check() */ int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env); int bpf_optimize_bpf_loop(struct bpf_verifier_env *env); diff --git a/include/linux/btf.h b/include/linux/btf.h index 48108471c5b1..240401d9b25b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,7 +145,8 @@ const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); const struct btf_header *btf_header(const struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); +struct bpf_log_attr; +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, @@ -415,12 +416,12 @@ static inline bool btf_type_is_array(const struct 
btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static inline u16 btf_type_vlen(const struct btf_type *t) +static inline u32 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } -static inline u16 btf_vlen(const struct btf_type *t) +static inline u32 btf_vlen(const struct btf_type *t) { return btf_type_vlen(t); } diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index af011db39ab3..8b5a9ee92513 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -284,5 +284,6 @@ extern u32 bpf_cgroup_btf_id[]; extern u32 bpf_local_storage_map_btf_id[]; extern u32 btf_bpf_map_id[]; extern u32 bpf_kmem_cache_btf_id[]; +extern u32 bpf_multi_func_btf_id[]; #endif diff --git a/include/linux/cnum.h b/include/linux/cnum.h new file mode 100644 index 000000000000..49b7d0c7645d --- /dev/null +++ b/include/linux/cnum.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef _LINUX_CNUM_H +#define _LINUX_CNUM_H + +#include <linux/types.h> + +/* + * cnum32: a circular number. + * A unified representation for signed and unsigned ranges. + * + * Assume that a 32-bit range is a circle, with 0 being in the 12 o'clock + * position, numbers placed sequentially in clockwise order and U32_MAX + * in the 11 o'clock position. Signed values map onto the same circle: + * S32_MAX sits at 5 o'clock, S32_MIN sits at 6 o'clock (opposite 0), + * negative values occupy the left half and positive values the right half. + * + * @cnum32 represents an arc on this circle drawn clockwise. + * @base corresponds to the first value of the range. + * @size corresponds to the number of integers in the range excluding @base. + * (The @base is excluded to avoid integer overflow when representing the full + * 0..U32_MAX range, which corresponds to 2^32, which can't be stored in u32). 
+ * + * For example: {U32_MAX, 1} corresponds to signed range [-1, 0], + * {S32_MAX, 1} corresponds to unsigned range [S32_MAX, (u32)S32_MIN]. + */ +struct cnum32 { + u32 base; + u32 size; +}; + +#define CNUM32_UNBOUNDED ((struct cnum32){ .base = 0, .size = U32_MAX }) +#define CNUM32_EMPTY ((struct cnum32){ .base = U32_MAX, .size = U32_MAX }) + +struct cnum32 cnum32_from_urange(u32 min, u32 max); +struct cnum32 cnum32_from_srange(s32 min, s32 max); +u32 cnum32_umin(struct cnum32 cnum); +u32 cnum32_umax(struct cnum32 cnum); +s32 cnum32_smin(struct cnum32 cnum); +s32 cnum32_smax(struct cnum32 cnum); +struct cnum32 cnum32_intersect(struct cnum32 a, struct cnum32 b); +void cnum32_intersect_with(struct cnum32 *dst, struct cnum32 src); +void cnum32_intersect_with_urange(struct cnum32 *dst, u32 min, u32 max); +void cnum32_intersect_with_srange(struct cnum32 *dst, s32 min, s32 max); +bool cnum32_contains(struct cnum32 cnum, u32 v); +bool cnum32_is_const(struct cnum32 cnum); +bool cnum32_is_empty(struct cnum32 cnum); +struct cnum32 cnum32_add(struct cnum32 a, struct cnum32 b); +struct cnum32 cnum32_negate(struct cnum32 a); +bool cnum32_is_subset(struct cnum32 outer, struct cnum32 inner); + +/* Same as cnum32 but for 64-bit ranges */ +struct cnum64 { + u64 base; + u64 size; +}; + +#define CNUM64_UNBOUNDED ((struct cnum64){ .base = 0, .size = U64_MAX }) +#define CNUM64_EMPTY ((struct cnum64){ .base = U64_MAX, .size = U64_MAX }) + +struct cnum64 cnum64_from_urange(u64 min, u64 max); +struct cnum64 cnum64_from_srange(s64 min, s64 max); +u64 cnum64_umin(struct cnum64 cnum); +u64 cnum64_umax(struct cnum64 cnum); +s64 cnum64_smin(struct cnum64 cnum); +s64 cnum64_smax(struct cnum64 cnum); +struct cnum64 cnum64_intersect(struct cnum64 a, struct cnum64 b); +void cnum64_intersect_with(struct cnum64 *dst, struct cnum64 src); +void cnum64_intersect_with_urange(struct cnum64 *dst, u64 min, u64 max); +void cnum64_intersect_with_srange(struct cnum64 *dst, s64 min, s64 max); +bool 
cnum64_contains(struct cnum64 cnum, u64 v); +bool cnum64_is_const(struct cnum64 cnum); +bool cnum64_is_empty(struct cnum64 cnum); +struct cnum64 cnum64_add(struct cnum64 a, struct cnum64 b); +struct cnum64 cnum64_negate(struct cnum64 a); +bool cnum64_is_subset(struct cnum64 outer, struct cnum64 inner); + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum); +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b); + +#endif /* _LINUX_CNUM_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 88a241aac36a..67d337ede91b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -58,8 +58,9 @@ struct ctl_table_header; #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ -#define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define BPF_REG_PARAMS MAX_BPF_REG +#define BPF_REG_AX (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 2) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ @@ -748,6 +749,27 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, return ret; } +static inline bool is_stack_arg_ldx(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LDX | BPF_MEM | BPF_DW) && + insn->src_reg == BPF_REG_PARAMS && + insn->off > 0 && insn->off % 8 == 0; +} + +static inline bool is_stack_arg_st(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ST | BPF_MEM | BPF_DW) && + insn->dst_reg == BPF_REG_PARAMS && + insn->off < 0 && insn->off % 8 == 0; +} + +static inline bool is_stack_arg_stx(const struct bpf_insn *insn) +{ + return insn->code == (BPF_STX | BPF_MEM | BPF_DW) && + insn->dst_reg == BPF_REG_PARAMS && + insn->off < 0 && insn->off % 8 == 0; +} + #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { @@ -1159,6 +1181,7 @@ bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool 
bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); +bool bpf_jit_supports_stack_args(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 28b30c6f1031..02bc5027523a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -415,6 +415,8 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits); void free_ftrace_hash(struct ftrace_hash *hash); struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct); +void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry); +void ftrace_hash_remove(struct ftrace_hash *hash); /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { @@ -551,6 +553,8 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b void ftrace_stub_direct_tramp(void); +unsigned long ftrace_hash_count(struct ftrace_hash *hash); + #else struct ftrace_ops; static inline unsigned long ftrace_find_rec_direct(unsigned long ip) @@ -590,6 +594,11 @@ static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline unsigned long ftrace_hash_count(struct ftrace_hash *hash) +{ + return 0; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index eca14547f9c1..2981e386da7b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1070,6 +1070,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef ptep_try_set +/** + * ptep_try_set - atomically set an empty kernel PTE + * @ptep: page table entry + * @new_pte: value to install + * + * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). 
Return true on + * success, false if the slot was already populated or the arch has no + * implementation. + * + * For special kernel page tables only - never user page tables. The caller must + * prevent concurrent teardown of @ptep and must accept that other writers may + * race. Concurrent clearers must use ptep_get_and_clear() so racing accesses + * agree on the outcome. + * + * Architectures opt in by providing a cmpxchg-based override and defining + * ptep_try_set as an identity macro. The generic stub returns false, which is + * correct for callers that fall through to oops on failure. + */ +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) +{ + return false; +} +#endif + +#ifndef flush_tlb_before_set +/** + * flush_tlb_before_set - flush a kernel PTE's TLB entry before re-setting it + * @addr: kernel virtual address whose PTE was just cleared + * + * Some architectures (e.g. arm64) do not allow a live page-table entry to be + * repointed at a different page in one step. The old entry must first be made + * invalid and its translation flushed from every TLB, and only then may the new + * entry be written. + * + * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()). + * It must be callable with interrupts disabled. 
+ */ +static inline void flush_tlb_before_set(unsigned long addr) +{ +} +#endif + #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index ef5230cece36..79f83b6eec27 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -263,6 +263,8 @@ struct rhash_lock_head __rcu **__rht_bucket_nested( struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); +void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key); + #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -1117,7 +1119,7 @@ unlocked: atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); err = 0; } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4fb7291f54b6..874d9067a43b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -940,7 +940,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); -asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size); +asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size, + struct bpf_common_attr __user *attr_common, unsigned int size_common); asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 40a43a4c7caf..308c76b57d13 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -770,6 +770,7 @@ trace_trigger_soft_disabled(struct 
trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); @@ -786,12 +787,18 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, unsigned long *missed); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } +static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + return 1; +} + static inline int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { @@ -838,6 +845,11 @@ bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } +static inline int +bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} #endif enum { |
