diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 03:00:58 +0530 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 03:00:58 +0530 |
| commit | 9c9e6bd4cca02f2d183eb260451fb6018f9ee67e (patch) | |
| tree | 5c351e1db3b13ffce9ce02c95173e58d35975d1f /kernel | |
| parent | 5d15ab717d503ff10b585a144870648b9a88c616 (diff) | |
| parent | 38205ecbe6b6dc47968ad4e9c978e2117720969e (diff) | |
Merge tag 'kernel-7.2-rc1.task_exec_state' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull task_exec_state updates from Christian Brauner:
"This introduces a new per-task task_exec_state structure and relocates
the dumpable mode and the user namespace captured at execve() from
mm_struct onto it. It stays attached to the task for its full
lifetime.
__ptrace_may_access() and several /proc owner and visibility checks
need to consult two pieces of state for any observable task, including
zombies that have already gone through exit_mm(): the dumpable mode
and the user namespace captured at execve(). Both live on mm_struct
today, which exit_mm() clears from the task long before the task is
reaped. A reader that races with do_exit() observes task->mm == NULL
and either fails the check or falls back to init_user_ns - which
denies legitimate access to non-dumpable zombies that were running in
a nested user namespace.
mm_struct loses ->user_ns and the dumpability bits in ->flags.
MMF_DUMPABLE_BITS is reserved so the MMF_DUMP_FILTER_* layout exposed
via /proc/<pid>/coredump_filter stays stable. task->user_dumpable and
its exit_mm() snapshot are removed.
task_exec_state is the privilege domain established by an execve().
Within a thread group it is shared via refcount; across thread groups
each task has its own:
- CLONE_VM siblings (thread-group members, io_uring workers)
refcount-share the parent's exec_state.
- Non-CLONE_VM clones (fork(), vfork() without CLONE_VM) allocate a
fresh exec_state inheriting the parent's dumpable mode and user_ns.
- execve() in the child allocates a fresh instance and installs it
under task_lock + exec_update_lock via task_exec_state_replace().
- Credential changes (setresuid, capset, ...) and
prctl(PR_SET_DUMPABLE) update dumpability on the current task's
exec_state, i.e., on the thread group's shared instance.
On top of this exec_mmap() no longer tears down the old mm while
holding exec_update_lock for writing and cred_guard_mutex. Neither
lock is needed for that: exec_update_lock only exists to make the mm
swap atomic with the later commit_creds() and all its readers operate
on the new mm; none looks at the detached old mm.
The cost was real: __mmput() runs exit_mmap() over the entire old
address space and can block in exit_aio() waiting for in-flight AIO,
so execve() of a large process blocked ptrace_attach() and every
exec_update_lock reader for the duration of the teardown.
The old mm is now stashed in bprm->old_mm and released from
setup_new_exec() after both locks are dropped, with a backstop in
free_bprm() for the error paths"
* tag 'kernel-7.2-rc1.task_exec_state' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
exec: free the old mm outside the exec locks
exec_state: relocate dumpable information
ptrace: add ptracer_access_allowed()
exec: introduce struct task_exec_state
sched/coredump: introduce enum task_dumpable
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Makefile | 2 | ||||
| -rw-r--r-- | kernel/cred.c | 3 | ||||
| -rw-r--r-- | kernel/exec_state.c | 119 | ||||
| -rw-r--r-- | kernel/exit.c | 1 | ||||
| -rw-r--r-- | kernel/fork.c | 33 | ||||
| -rw-r--r-- | kernel/kthread.c | 1 | ||||
| -rw-r--r-- | kernel/ptrace.c | 51 | ||||
| -rw-r--r-- | kernel/sys.c | 6 |
8 files changed, 185 insertions, 31 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6785982013dc..1e1a31673577 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -obj-y = fork.o exec_domain.o panic.o \ +obj-y = fork.o exec_domain.o exec_state.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ diff --git a/kernel/cred.c b/kernel/cred.c index 12a7b1ce5131..3df4e15bd67f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -384,8 +384,9 @@ int commit_creds(struct cred *new) !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { + /* mm-less tasks share init_task's exec_state */ if (task->mm) - set_dumpable(task->mm, suid_dumpable); + task_exec_state_set_dumpable(suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, diff --git a/kernel/exec_state.c b/kernel/exec_state.c new file mode 100644 index 000000000000..6034f4b4808f --- /dev/null +++ b/kernel/exec_state.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/refcount.h> +#include <linux/sched.h> +#include <linux/sched/coredump.h> +#include <linux/sched/exec_state.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +static struct kmem_cache *task_exec_state_cachep; + +static void __free_task_exec_state(struct rcu_head *rcu) +{ + struct task_exec_state *exec_state = container_of(rcu, struct task_exec_state, rcu); + + put_user_ns(exec_state->user_ns); + kmem_cache_free(task_exec_state_cachep, exec_state); +} + +void put_task_exec_state(struct task_exec_state *exec_state) +{ + if (exec_state && refcount_dec_and_test(&exec_state->count)) + call_rcu(&exec_state->rcu, __free_task_exec_state); +} + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns) +{ + struct task_exec_state *exec_state; + + exec_state = kmem_cache_alloc(task_exec_state_cachep, GFP_KERNEL); + if (!exec_state) + return NULL; + refcount_set(&exec_state->count, 1); + exec_state->dumpable = TASK_DUMPABLE_OFF; + exec_state->user_ns = get_user_ns(user_ns); + return exec_state; +} + +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk) +{ + struct task_exec_state *exec_state; + + exec_state = rcu_dereference_check(tsk->exec_state, + lockdep_is_held(&tsk->alloc_lock)); + WARN_ON_ONCE(!exec_state); + return exec_state; +} + +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state) +{ + /* + * Updates must hold both locks so callers needing a consistent + * snapshot of mm + dumpability are covered. + */ + lockdep_assert_held(&tsk->alloc_lock); + lockdep_assert_held_write(&tsk->signal->exec_update_lock); + + return rcu_replace_pointer(tsk->exec_state, exec_state, true); +} + +/* + * The non-CLONE_VM clone path: allocate a fresh exec_state and + * inherit the parent's dumpable mode and user_ns reference. CLONE_VM + * siblings refcount-share via copy_exec_state() in fork.c; only this + * path and execve() ever allocate. + */ +int task_exec_state_copy(struct task_struct *tsk) +{ + struct task_exec_state *src, *dst; + + src = rcu_dereference_protected(current->exec_state, true); + dst = alloc_task_exec_state(src->user_ns); + if (!dst) + return -ENOMEM; + dst->dumpable = READ_ONCE(src->dumpable); + rcu_assign_pointer(tsk->exec_state, dst); + return 0; +} + +/* + * Store TASK_DUMPABLE_* on current->exec_state. All callers + * (commit_creds, begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the + * running task, which guarantees ->exec_state is allocated and cannot + * be replaced under us. + */ +void task_exec_state_set_dumpable(enum task_dumpable value) +{ + struct task_exec_state *exec_state; + + if (WARN_ON_ONCE(value > TASK_DUMPABLE_ROOT)) + value = TASK_DUMPABLE_OFF; + + exec_state = rcu_dereference_protected(current->exec_state, true); + /* mm-less tasks share init_task's exec_state; never mutate it */ + if (WARN_ON_ONCE(exec_state == &init_task_exec_state)) + return; + WRITE_ONCE(exec_state->dumpable, value); +} + +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task) +{ + struct task_exec_state *exec_state; + + guard(rcu)(); + exec_state = rcu_dereference(task->exec_state); + return READ_ONCE(exec_state->dumpable); +} + +void __init exec_state_init(void) +{ + task_exec_state_cachep = kmem_cache_create("task_exec_state", + sizeof(struct task_exec_state), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); +} diff --git a/kernel/exit.c b/kernel/exit.c index f50d73c272d6..9a909993ab1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,7 +571,6 @@ static void exit_mm(void) */ smp_mb__after_spinlock(); local_irq_disable(); - current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); diff --git a/kernel/fork.c b/kernel/fork.c index 8ac38beae360..ba6b03d4a85c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sched/ext.h> +#include <linux/sched/exec_state.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> @@ -555,6 +556,7 @@ void free_task(struct task_struct *tsk) if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); + put_task_exec_state(rcu_access_pointer(tsk->exec_state)); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -731,7 +733,6 @@ void __mmdrop(struct mm_struct *mm) destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); - put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); @@ -946,6 +947,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif + RCU_INIT_POINTER(tsk->exec_state, NULL); + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -1072,8 +1075,7 @@ static void mmap_init_lock(struct mm_struct *mm) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, - struct user_namespace *user_ns) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); @@ -1132,7 +1134,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, NR_MM_COUNTERS)) goto fail_pcpu; - mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; @@ -1163,7 +1164,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current, current_user_ns()); + return mm_init(mm, current); } EXPORT_SYMBOL_IF_KUNIT(mm_alloc); @@ -1527,7 +1528,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk, mm->user_ns)) + if (!mm_init(mm, tsk)) goto fail_nomem; uprobe_start_dup_mmap(); @@ -1593,6 +1594,22 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk) return 0; } +static int copy_exec_state(u64 clone_flags, struct task_struct *tsk) +{ + struct task_exec_state *exec_state; + + /* CLONE_VM siblings refcount-share the parent's exec_state. */ + if (clone_flags & CLONE_VM) { + exec_state = rcu_dereference_protected(current->exec_state, true); + refcount_inc(&exec_state->count); + rcu_assign_pointer(tsk->exec_state, exec_state); + return 0; + } + + /* Everyone else inherits a fresh copy. */ + return task_exec_state_copy(tsk); +} + static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; @@ -2090,6 +2107,9 @@ __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; + retval = copy_exec_state(clone_flags, p); + if (retval) + goto bad_fork_free; p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; @@ -3097,6 +3117,7 @@ void __init proc_caches_init(void) sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); + exec_state_init(); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, diff --git a/kernel/kthread.c b/kernel/kthread.c index 791210daf8b4..63beb59b7a3d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1619,7 +1619,6 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); - WARN_ON_ONCE(!mm->user_ns); /* * It is possible for mm to be the same as tsk->active_mm, but diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 130043bfc209..d041645d9d17 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> +#include <linux/sched/exec_state.h> #include <linux/sched/task.h> #include <linux/errno.h> #include <linux/mm.h> @@ -36,6 +37,30 @@ #include <asm/syscall.h> /* for syscall_get_* */ +/** + * ptracer_access_allowed - may current peek/poke @tsk's address space? + * @tsk: tracee + * + * Per-access check used by ptrace_access_vm() and architecture-specific + * tag/register accessors. Returns true iff current is the registered + * ptracer of @tsk and either @tsk is owner-dumpable or current holds + * CAP_SYS_PTRACE in @tsk's exec namespace. Lighter than + * __ptrace_may_access(): it re-validates only dumpability and + * capability on every access, without re-running LSM hooks or + * cred_cap_issubset() checks performed at attach time. + */ +bool ptracer_access_allowed(struct task_struct *tsk) +{ + const struct task_exec_state *es; + + guard(rcu)(); + if (ptrace_parent(tsk) != current) + return false; + es = task_exec_state_rcu(tsk); + return READ_ONCE(es->dumpable) == TASK_DUMPABLE_OWNER || + ptracer_capable(tsk, es->user_ns); +} + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -45,21 +70,14 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; - int ret; + int ret = 0; mm = get_task_mm(tsk); if (!mm) return 0; - if (!tsk->ptrace || - (current != tsk->parent) || - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptracer_capable(tsk, mm->user_ns))) { - mmput(mm); - return 0; - } - - ret = access_remote_vm(mm, addr, buf, len, gup_flags); + if (ptracer_access_allowed(tsk)) + ret = access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; @@ -274,16 +292,13 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static bool task_still_dumpable(struct task_struct *task, unsigned int mode) { - struct mm_struct *mm = task->mm; - if (mm) { - if (get_dumpable(mm) == SUID_DUMP_USER) - return true; - return ptrace_has_cap(mm->user_ns, mode); - } + const struct task_exec_state *exec_state; - if (task->user_dumpable) + guard(rcu)(); + exec_state = task_exec_state_rcu(task); + if (READ_ONCE(exec_state->dumpable) == TASK_DUMPABLE_OWNER) return true; - return ptrace_has_cap(&init_user_ns, mode); + return ptrace_has_cap(exec_state->user_ns, mode); } /* Returns 0 on success, -errno on denial. */ diff --git a/kernel/sys.c b/kernel/sys.c index 62e842055cc9..df69bd71de03 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2565,14 +2565,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); + error = task_exec_state_get_dumpable(me); break; case PR_SET_DUMPABLE: - if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { + if (arg2 != TASK_DUMPABLE_OFF && arg2 != TASK_DUMPABLE_OWNER) { error = -EINVAL; break; } - set_dumpable(me->mm, arg2); + task_exec_state_set_dumpable(arg2); break; case PR_SET_UNALIGN: |
