diff options
| author | Christian Brauner <brauner@kernel.org> | 2026-05-21 08:46:22 +0200 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2026-05-26 11:02:02 +0200 |
| commit | 4425cd76b5e73ce92bea9dc61a0027ef3d55c9f0 (patch) | |
| tree | 4a0cefcb5798c8b96c2481271e3edd2ea9d08558 /kernel/exec_state.c | |
| parent | 5200f5f493f79f14bbdc349e402a40dfb32f23c8 (diff) | |
| parent | 6b1c66c9cca99bf00386481c7b2aa7394c26d8b8 (diff) | |
Merge patch series "exec: introduce task_exec_state for exec-time metadata"
Christian Brauner (Amutable) <brauner@kernel.org> says:
This series relocates the dumpable mode and the user_namespace
captured at execve() from mm_struct onto a new per-task
task_exec_state structure that stays attached to the task for its
full lifetime.
__ptrace_may_access() and several /proc owner / visibility checks
need to consult two pieces of state for any observable task,
including zombies that have already gone through exit_mm(): the
dumpable mode and the user namespace captured at execve(). Both
live on mm_struct today, which exit_mm() clears from the task long
before the task is reaped.
A reader that races with do_exit() observes task->mm == NULL and
either fails the check or falls back to init_user_ns - which denies
legitimate access to non-dumpable zombies that were running in a
nested user namespace.
mm_struct loses ->user_ns and the dumpability bits in ->flags.
MMF_DUMPABLE_BITS is reserved so MMF_DUMP_FILTER_* layout exposed via
/proc/<pid>/coredump_filter stays stable. task->user_dumpable and its
exit_mm() snapshot are removed.
task_exec_state is the privilege domain established by an execve()
[1]. Within a thread group it is shared via refcount; across thread
groups each task has its own:
- CLONE_VM siblings (thread-group members, io_uring workers)
refcount-share the parent's exec_state.
- Non-CLONE_VM clones (fork(), vfork() without CLONE_VM)
allocate a fresh exec_state inheriting the parent's dumpable
mode and user_ns.
- execve() in the child allocates a fresh instance and installs
it under task_lock + exec_update_lock via
task_exec_state_replace().
- Credential changes (setresuid, capset, ...) and
prctl(PR_SET_DUMPABLE) update dumpability on the current
task's exec_state, i.e. on the thread group's shared instance.
Behavioral change:
Kernel threads that briefly use a user mm via kthread_use_mm() no
longer inherit dumpability from the borrowed mm. Kthreads are not
ptraceable (PF_KTHREAD short-circuits __ptrace_may_access), so this
is observable only via /proc surfaces that a sufficiently privileged
reader can reach.
[1] https://lore.kernel.org/r/CAHk-=wj+NgoDH3GSicJ140SV8OoDd71pLmL3fgFEsTcgoMC6Og@mail.gmail.com
* patches from https://patch.msgid.link/20260520-work-task_exec_state-v3-0-69f895bc1385@kernel.org:
exec_state: relocate dumpable information
ptrace: add ptracer_access_allowed()
exec: introduce struct task_exec_state
sched/coredump: introduce enum task_dumpable
Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-0-69f895bc1385@kernel.org
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
Diffstat (limited to 'kernel/exec_state.c')
| -rw-r--r-- | kernel/exec_state.c | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/kernel/exec_state.c b/kernel/exec_state.c new file mode 100644 index 000000000000..6034f4b4808f --- /dev/null +++ b/kernel/exec_state.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/refcount.h> +#include <linux/sched.h> +#include <linux/sched/coredump.h> +#include <linux/sched/exec_state.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +static struct kmem_cache *task_exec_state_cachep; + +static void __free_task_exec_state(struct rcu_head *rcu) +{ + struct task_exec_state *exec_state = container_of(rcu, struct task_exec_state, rcu); + + put_user_ns(exec_state->user_ns); + kmem_cache_free(task_exec_state_cachep, exec_state); +} + +void put_task_exec_state(struct task_exec_state *exec_state) +{ + if (exec_state && refcount_dec_and_test(&exec_state->count)) + call_rcu(&exec_state->rcu, __free_task_exec_state); +} + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns) +{ + struct task_exec_state *exec_state; + + exec_state = kmem_cache_alloc(task_exec_state_cachep, GFP_KERNEL); + if (!exec_state) + return NULL; + refcount_set(&exec_state->count, 1); + exec_state->dumpable = TASK_DUMPABLE_OFF; + exec_state->user_ns = get_user_ns(user_ns); + return exec_state; +} + +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk) +{ + struct task_exec_state *exec_state; + + exec_state = rcu_dereference_check(tsk->exec_state, + lockdep_is_held(&tsk->alloc_lock)); + WARN_ON_ONCE(!exec_state); + return exec_state; +} + +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state) +{ + /* + * Updates must hold both locks so callers needing a consistent + * snapshot of mm + dumpability are covered. + */ + lockdep_assert_held(&tsk->alloc_lock); + lockdep_assert_held_write(&tsk->signal->exec_update_lock); + + return rcu_replace_pointer(tsk->exec_state, exec_state, true); +} + +/* + * The non-CLONE_VM clone path: allocate a fresh exec_state and + * inherit the parent's dumpable mode and user_ns reference. CLONE_VM + * siblings refcount-share via copy_exec_state() in fork.c; only this + * path and execve() ever allocate. + */ +int task_exec_state_copy(struct task_struct *tsk) +{ + struct task_exec_state *src, *dst; + + src = rcu_dereference_protected(current->exec_state, true); + dst = alloc_task_exec_state(src->user_ns); + if (!dst) + return -ENOMEM; + dst->dumpable = READ_ONCE(src->dumpable); + rcu_assign_pointer(tsk->exec_state, dst); + return 0; +} + +/* + * Store TASK_DUMPABLE_* on current->exec_state. All callers + * (commit_creds, begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the + * running task, which guarantees ->exec_state is allocated and cannot + * be replaced under us. + */ +void task_exec_state_set_dumpable(enum task_dumpable value) +{ + struct task_exec_state *exec_state; + + if (WARN_ON_ONCE(value > TASK_DUMPABLE_ROOT)) + value = TASK_DUMPABLE_OFF; + + exec_state = rcu_dereference_protected(current->exec_state, true); + /* mm-less tasks share init_task's exec_state; never mutate it */ + if (WARN_ON_ONCE(exec_state == &init_task_exec_state)) + return; + WRITE_ONCE(exec_state->dumpable, value); +} + +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task) +{ + struct task_exec_state *exec_state; + + guard(rcu)(); + exec_state = rcu_dereference(task->exec_state); + return READ_ONCE(exec_state->dumpable); +} + +void __init exec_state_init(void) +{ + task_exec_state_cachep = kmem_cache_create("task_exec_state", + sizeof(struct task_exec_state), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); +} |
