diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/bpf_inode_storage.c | 9 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 68 | ||||
| -rw-r--r-- | kernel/bpf/disasm.c | 5 | ||||
| -rw-r--r-- | kernel/bpf/dispatcher.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 34 | ||||
| -rw-r--r-- | kernel/events/core.c | 17 | ||||
| -rw-r--r-- | kernel/fork.c | 9 | ||||
| -rw-r--r-- | kernel/futex/requeue.c | 6 | ||||
| -rw-r--r-- | kernel/trace/fprobe.c | 10 | ||||
| -rw-r--r-- | kernel/trace/trace_eprobe.c | 2 | ||||
| -rw-r--r-- | kernel/trace/trace_probe.c | 15 | ||||
| -rw-r--r-- | kernel/trace/trace_probe.h | 2 |
12 files changed, 134 insertions, 45 deletions
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 0da8d923e39d..f9e81060c1f4 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -178,6 +178,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { + /* + * Do not allow allocation of BPF_MAP_TYPE_INODE_STORAGE if the BPF LSM + * was not initialized by the LSM framework at boot. Without proper + * initialization, the BPF inode security blob offset remains unprepared, + * causing bpf_inode() to calculate an invalid memory offset and corrupt + * inode->i_security. + */ + if (!bpf_lsm_initialized) + return ERR_PTR(-EOPNOTSUPP); return bpf_local_storage_map_alloc(attr, &inode_cache); } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 649cce41e13f..6e19a030da6f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -20,6 +20,7 @@ #include <uapi/linux/btf.h> #include <linux/filter.h> #include <linux/skbuff.h> +#include <linux/static_call.h> #include <linux/vmalloc.h> #include <linux/prandom.h> #include <linux/bpf.h> @@ -875,6 +876,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_prog_pack { struct list_head list; void *ptr; + bool arch_flush_needed; unsigned long bitmap[]; }; @@ -883,6 +885,15 @@ void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) memset(area, 0, size); } +DEFINE_STATIC_CALL_NULL(bpf_arch_pred_flush, bpf_arch_pred_flush); + +/* + * Enabled once bpf_arch_pred_flush points at a real flush routine. Lets the + * pack allocator test "is a predictor flush wired up at all" with a cheap + * static branch instead of repeatedly querying the static call target. + */ +DEFINE_STATIC_KEY_FALSE(bpf_pred_flush_enabled); + #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); @@ -918,6 +929,8 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); + if (static_branch_unlikely(&bpf_pred_flush_enabled)) + pack->arch_flush_needed = true; set_vm_flush_reset_perms(pack->ptr); err = set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); @@ -932,15 +945,23 @@ out: return NULL; } -void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns, bool was_classic) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); - struct bpf_prog_pack *pack; - unsigned long pos; + struct bpf_prog_pack *pack, *fallback_pack = NULL; + unsigned long pos, fallback_pos = 0; void *ptr = NULL; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { + /* + * Allocations larger than a pack get their own pages, and + * predictors are not flushed for such allocation. This is only + * safe because cBPF programs (the unprivileged attack surface) + * are bounded well below a pack size. + */ + if (was_classic && static_branch_unlikely(&bpf_pred_flush_enabled)) + pr_warn_once("BPF: Predictors not flushed for allocations greater than BPF_PROG_PACK_SIZE\n"); size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { @@ -960,8 +981,29 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) list_for_each_entry(pack, &pack_list, list) { pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); - if (pos < BPF_PROG_CHUNK_COUNT) + if (pos >= BPF_PROG_CHUNK_COUNT) + continue; + /* Flush not enabled, use any pack */ + if (!static_branch_unlikely(&bpf_pred_flush_enabled)) goto found_free_area; + /* + * cBPF reuse of a dirty pack triggers a flush, so prefer a + * clean pack for cBPF. eBPF never flushes, so steer it to a + * dirty pack and keep clean packs free for cBPF. + */ + if (was_classic ^ pack->arch_flush_needed) + goto found_free_area; + if (!fallback_pack) { + fallback_pack = pack; + fallback_pos = pos; + } + } + + /* No preferred pack found */ + if (fallback_pack) { + pack = fallback_pack; + pos = fallback_pos; + goto found_free_area; } pack = alloc_new_pack(bpf_fill_ill_insns); @@ -971,6 +1013,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) pos = 0; found_free_area: + /* Flush only for cBPF as it may contain a crafted gadget */ + if (static_branch_unlikely(&bpf_pred_flush_enabled) && + pack->arch_flush_needed && + was_classic) { + struct bpf_prog_pack *p; + + static_call_cond(bpf_arch_pred_flush)(); + list_for_each_entry(p, &pack_list, list) + p->arch_flush_needed = false; + } bitmap_set(pack->bitmap, pos, nbits); ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); @@ -1008,6 +1060,9 @@ void bpf_prog_pack_free(void *ptr, u32 size) "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); + + if (static_branch_unlikely(&bpf_pred_flush_enabled)) + pack->arch_flush_needed = true; if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); @@ -1130,7 +1185,8 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, struct bpf_binary_header **rw_header, u8 **rw_image, - bpf_jit_fill_hole_t bpf_fill_ill_insns) + bpf_jit_fill_hole_t bpf_fill_ill_insns, + bool was_classic) { struct bpf_binary_header *ro_header; u32 size, hole, start; @@ -1143,7 +1199,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(size)) return NULL; - ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); + ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns, was_classic); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index f8a3c7eb451e..0391b3bc0073 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -323,7 +323,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || - insn->src_reg == BPF_PSEUDO_MAP_VALUE; + insn->src_reg == BPF_PSEUDO_MAP_VALUE || + insn->src_reg == BPF_PSEUDO_MAP_IDX || + insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE || + insn->src_reg == BPF_PSEUDO_BTF_ID; char tmp[64]; if (is_ptr && !allow_ptr_leaks) diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index b77db7413f8c..ea2d60dc1fee 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -145,7 +145,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero); + d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero, false); if (!d->image) goto out; d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 21a365d436a5..6515d4d3c003 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7996,9 +7996,10 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) return field; } -static int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, argno_t argno, - enum bpf_arg_type arg_type) +static int __check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, argno_t argno, + enum bpf_arg_type arg_type, + bool btf_id_fixed_off_ok) { u32 type = reg->type; @@ -8055,12 +8056,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset - * can be non-zero. This was already checked above. So pass - * fixed_off_ok as true to allow fixed offset for all other - * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we - * still need to do checks instead of returning. + * can be non-zero unless the caller requires otherwise. + * var_off always must be 0 for PTR_TO_BTF_ID, hence we still + * need to do checks instead of returning. */ - return __check_ptr_off_reg(env, reg, argno, true); + return __check_ptr_off_reg(env, reg, argno, btf_id_fixed_off_ok); case PTR_TO_CTX: /* * Allow fixed and variable offsets for syscall context, but @@ -8076,6 +8076,13 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, } } +static int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, argno_t argno, + enum bpf_arg_type arg_type) +{ + return __check_func_arg_reg_off(env, reg, argno, arg_type, true); +} + static int check_arg_const_str(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno) { @@ -11947,6 +11954,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; argno_t argno = argno_from_arg(i + 1); int regno = reg_from_argno(argno); + bool btf_id_fixed_off_ok = true; u32 ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -12120,7 +12128,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: - case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_TIMER: @@ -12134,6 +12141,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CTX: arg_type = ARG_PTR_TO_CTX; break; + case KF_ARG_PTR_TO_REFCOUNTED_KPTR: + arg_type = ARG_PTR_TO_BTF_ID; + btf_id_fixed_off_ok = false; + break; default: verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT; @@ -12141,7 +12152,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (regno == meta->release_regno) arg_type |= OBJ_RELEASE; - ret = check_func_arg_reg_off(env, reg, argno, arg_type); + ret = __check_func_arg_reg_off(env, reg, argno, arg_type, + btf_id_fixed_off_ok); if (ret < 0) return ret; @@ -19994,13 +20006,13 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); bpf_clear_insn_aux_data(env, 0, env->prog->len); - vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); kvfree(env->succ); kvfree(env->gotox_tmp_buf); + vfree(env->insn_aux_data); kvfree(env); return ret; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 954c36e28101..d7f3e2c2ecb1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4729,7 +4729,7 @@ static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx, struct task_struct *task, - bool revoke); + unsigned long detach_flags); /* * Removes all events from the current task that have been marked @@ -4756,7 +4756,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx, ctx->task, false); + perf_event_exit_event(event, ctx, ctx->task, DETACH_GROUP); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -12937,7 +12937,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, /* * De-schedule the event and mark it REVOKED. */ - perf_event_exit_event(event, ctx, ctx->task, true); + perf_event_exit_event(event, ctx, ctx->task, DETACH_REVOKE); /* * All _free_event() bits that rely on event->pmu: @@ -14525,12 +14525,13 @@ static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx, struct task_struct *task, - bool revoke) + unsigned long detach_flags) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = DETACH_EXIT; unsigned int attach_state; + detach_flags |= DETACH_EXIT; + if (parent_event) { /* * Do not destroy the 'original' grouping; because of the @@ -14553,8 +14554,8 @@ perf_event_exit_event(struct perf_event *event, sync_child_event(event, task); } - if (revoke) - detach_flags |= DETACH_GROUP | DETACH_REVOKE; + if (detach_flags & DETACH_REVOKE) + detach_flags |= DETACH_GROUP; perf_remove_from_context(event, detach_flags); /* @@ -14642,7 +14643,7 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit) perf_event_task(task, ctx, 0); list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) - perf_event_exit_event(child_event, ctx, exit ? task : NULL, false); + perf_event_exit_event(child_event, ctx, exit ? task : NULL, 0); mutex_unlock(&ctx->mutex); diff --git a/kernel/fork.c b/kernel/fork.c index 13e38e89a1f3..f0e2e131a9a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1009,6 +1009,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->mm_cid.active = 0; INIT_HLIST_NODE(&tsk->mm_cid.node); #endif + +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(tsk->bpf_storage, NULL); + tsk->bpf_ctx = NULL; +#endif return tsk; free_stack: @@ -2247,10 +2252,6 @@ __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif -#ifdef CONFIG_BPF_SYSCALL - RCU_INIT_POINTER(p->bpf_storage, NULL); - p->bpf_ctx = NULL; -#endif unwind_task_init(p); diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index 7384672916fb..79823ad13683 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -645,12 +645,6 @@ retry_private: continue; } - /* Self-deadlock: non-top waiter already owns the PI futex. */ - if (rt_mutex_owner(&pi_state->pi_mutex) == this->task) { - ret = -EDEADLK; - break; - } - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index f378613ad120..f215990b9061 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -613,6 +613,16 @@ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops continue; data_size = fp->entry_data_size; + /* + * The list may have grown since it was sized, so this node + * may not fit. Skip it as missed rather than overrun the + * reservation. + */ + if (fp->exit_handler && + used + FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(data_size) > reserved_words) { + fp->nmissed++; + continue; + } if (data_size && fp->exit_handler) data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG; else diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b66d6196338d..50518b071414 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -315,7 +315,7 @@ get_event_field(struct fetch_insn *code, void *rec) val = (unsigned long)addr; break; case FILTER_PTR_STRING: - val = (unsigned long)(*(char *)addr); + val = *(unsigned long *)addr; break; default: WARN_ON_ONCE(1); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fd1caa1f9723..d17cfee77d9c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -342,10 +342,6 @@ static int parse_trace_event(char *arg, struct fetch_insn *code, ret = parse_trace_event_arg(arg, code, ctx); if (!ret) return 0; - if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { - code->op = FETCH_OP_COMM; - return 0; - } return -EINVAL; } @@ -678,7 +674,7 @@ static int parse_btf_arg(char *varname, int i, is_ptr, ret; u32 tid; - if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT))) + if (!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT)) return -EINVAL; is_ptr = split_next_field(varname, &field, ctx); @@ -1068,8 +1064,14 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, int len; if (ctx->flags & TPARG_FL_TEVENT) { - if (parse_trace_event(arg, code, ctx) < 0) + if (parse_trace_event(arg, code, ctx) < 0) { + /* 'comm' should be checked after field parsing. */ + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } goto inval; + } return 0; } @@ -1241,6 +1243,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_FOFFS; code->immediate = (unsigned long)offset; // imm64? + offset = 0; } else { /* uprobes don't support symbols */ if (!(ctx->flags & TPARG_FL_KERNEL)) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 15758cc11fc6..0f09f7aaf93f 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -511,7 +511,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_RETVAL, "This function returns 'void' type"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ - C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_VAR, "Invalid $-variable specified"), \ C(BAD_REG_NAME, "Invalid register name"), \ C(BAD_MEM_ADDR, "Invalid memory address"), \ C(BAD_IMM, "Invalid immediate value"), \ |
