From 4f365e7a5d448dab7e0bb56ed32ff2bfddd134bd Mon Sep 17 00:00:00 2001 From: "Christian Brauner (Amutable)" Date: Wed, 20 May 2026 23:48:52 +0200 Subject: sched/coredump: introduce enum task_dumpable Replace the SUID_DUMP_DISABLE/USER/ROOT preprocessor constants with enum task_dumpable. Numeric values are preserved (kernel.suid_dumpable sysctl and prctl(PR_SET_DUMPABLE) ABI), so this is a pure rename with no behavioral change. Subsequent commits relocate dumpability onto a per-task structure where the enum type will allow stronger type-checking on the new API. Reviewed-by: Jann Horn Reviewed-by: David Hildenbrand (arm) Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-1-69f895bc1385@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/mm_types.h | 2 +- include/linux/sched/coredump.h | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a308e2c23b82..51ea37b2a0aa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1908,7 +1908,7 @@ enum { /* * The first two bits represent core dump modes for set-user-ID, - * the modes are SUID_DUMP_* defined in linux/sched/coredump.h + * the modes are TASK_DUMPABLE_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 624fda17a785..ed6547692b61 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -4,9 +4,16 @@ #include -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ -#define SUID_DUMP_USER 1 /* Dump as user of process */ -#define SUID_DUMP_ROOT 2 /* Dump as root */ +/* + * Task dumpability mode. Gates core dump production and ptrace_attach() + * authorization. The numeric values are stable ABI (suid_dumpable + * sysctl, prctl(PR_SET_DUMPABLE)); do not renumber. + */ +enum task_dumpable { + TASK_DUMPABLE_OFF = 0, /* no dump; ptrace needs CAP_SYS_PTRACE */ + TASK_DUMPABLE_OWNER = 1, /* default; dump and ptrace by uid match */ + TASK_DUMPABLE_ROOT = 2, /* dump as root; ptrace needs CAP_SYS_PTRACE */ +}; static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) { @@ -26,7 +33,7 @@ extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean + * test against TASK_DUMPABLE_OWNER rather than treating it as a boolean * value. */ static inline int __get_dumpable(unsigned long mm_flags) -- cgit v1.2.3 From b092062cb6d799fa3504c5975cbb1b05c8b67d6d Mon Sep 17 00:00:00 2001 From: "Christian Brauner (Amutable)" Date: Wed, 20 May 2026 23:48:53 +0200 Subject: exec: introduce struct task_exec_state Introduce struct task_exec_state, a per-task RCU-protected structure that holds the dumpable mode and the user namespace and stays attached to the task for its full lifetime. task_exec_state_rcu() is the canonical reader: asserts RCU or task_lock is held, WARNs on a NULL state, returns the rcu_dereference()'d pointer. Reviewed-by: Jann Horn Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-2-69f895bc1385@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/sched.h | 2 ++ include/linux/sched/exec_state.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 include/linux/sched/exec_state.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ee06cba5c6f5..6674dbf960b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -962,6 +962,8 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; + struct task_exec_state __rcu *exec_state; + int exit_state; int exit_code; int exit_signal; diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h new file mode 100644 index 000000000000..dc5a795cbfe2 --- /dev/null +++ b/include/linux/sched/exec_state.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner */ +#ifndef _LINUX_SCHED_EXEC_STATE_H +#define _LINUX_SCHED_EXEC_STATE_H + +#include +#include +#include +#include +#include + +struct task_exec_state { + refcount_t count; + enum task_dumpable dumpable; + struct user_namespace *user_ns; + struct rcu_head rcu; +}; + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns); +void put_task_exec_state(struct task_exec_state *exec_state); +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk); +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state); +void task_exec_state_set_dumpable(enum task_dumpable value); +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task); +int task_exec_state_copy(struct task_struct *tsk); +void __init exec_state_init(void); + +DEFINE_FREE(put_task_exec_state, struct task_exec_state *, put_task_exec_state(_T)) + +#endif /* _LINUX_SCHED_EXEC_STATE_H */ -- cgit v1.2.3 From 92f829f6b2faad37a5582df73df01c93d4429821 Mon Sep 17 00:00:00 2001 From: "Christian Brauner (Amutable)" Date: Wed, 20 May 2026 23:48:54 +0200 Subject: ptrace: add ptracer_access_allowed() Add a helper that encapsulates all of the logic for checking ptrace access and remove open-coded versions in follow-up patches. Reviewed-by: Jann Horn Reviewed-by: David Hildenbrand (arm) Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-3-69f895bc1385@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/ptrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 90507d4afcd6..ef314f7a9ecc 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -17,6 +17,7 @@ struct syscall_info { struct seccomp_data data; }; +bool ptracer_access_allowed(struct task_struct *tsk); extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); -- cgit v1.2.3 From 6b1c66c9cca99bf00386481c7b2aa7394c26d8b8 Mon Sep 17 00:00:00 2001 From: "Christian Brauner (Amutable)" Date: Wed, 20 May 2026 23:48:55 +0200 Subject: exec_state: relocate dumpable information The dumpable flag captured at execve() is consulted by __ptrace_may_access() and several /proc owner / visibility checks. It lives on mm_struct today, which exit_mm() clears from the task long before the task itself is reaped. exec_state is anchored to the execve() that established the current privilege domain. CLONE_VM siblings refcount-share the parent's exec_state via copy_exec_state(); non-CLONE_VM clones allocate a fresh exec_state inheriting the parent's dumpable mode and user_ns reference via task_exec_state_copy(). execve() allocates a fresh instance (via alloc_task_exec_state() in begin_new_exec()) and installs it under task_lock + exec_update_lock with task_exec_state_replace(). init_task uses a static instance. The dumpable mode now lives on task->exec_state->dumpable. task->mm->flags no longer carries dumpability; MMF_DUMPABLE_MASK is removed, but MMF_DUMPABLE_BITS is reserved so MMF_DUMP_FILTER_* bit positions remain stable for the /proc//coredump_filter ABI. The task->user_dumpable cache bit and its assignment in exit_mm() are removed; readers go through get_dumpable(task) directly. coredump_params gains a snapshot field cprm.dumpable, populated from get_dumpable(current) at vfs_coredump() entry, replacing the previous __get_dumpable(cprm->mm_flags) consumers in fs/coredump.c and fs/pidfs.c. The user namespace recorded at execve() is consulted by __ptrace_may_access() and by /proc/PID/* owner derivation. Move the captured user_ns onto task_exec_state, which stays attached to the task past exit_mm() and across exit_files(). bprm grows a user_ns field staged in bprm_mm_init() with the caller's user_ns, narrowed by would_dump() to the closest privileged ancestor, and consumed by exec_mmap() via alloc_task_exec_state(bprm->user_ns). free_bprm() releases the staging reference. mm_struct loses ->user_ns entirely. Initializers in init-mm, efi_mm, and the implicit one in mm_init()/dup_mm()/mm_alloc() are removed; __mmdrop() drops the matching put_user_ns(). The kthread_use_mm() WARN_ON_ONCE(!mm->user_ns) is no longer meaningful and goes too. Reviewed-by: Jann Horn Link: https://patch.msgid.link/20260520-work-task_exec_state-v3-4-69f895bc1385@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/binfmts.h | 2 ++ include/linux/coredump.h | 4 ++++ include/linux/mm_types.h | 9 ++++----- include/linux/sched.h | 4 +--- include/linux/sched/coredump.h | 36 ++---------------------------------- include/linux/sched/exec_state.h | 4 ++-- 6 files changed, 15 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 65abd5ab8836..a8379f4eee61 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,6 +25,8 @@ struct linux_binprm { struct page *page[MAX_ARG_PAGES]; #endif struct mm_struct *mm; + /* user_ns published to task->exec_state at execve, narrowed by would_dump(). */ + struct user_namespace *user_ns; unsigned long p; /* current top of mem */ unsigned int /* Should an execfd be passed to userspace? */ diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 68861da4cf7c..7b38ee2e7913 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #ifdef CONFIG_COREDUMP @@ -20,7 +21,10 @@ struct coredump_params { const kernel_siginfo_t *siginfo; struct file *file; unsigned long limit; + /* MMF_DUMP_FILTER_* bits, snapshot of mm->flags at dump start. */ unsigned long mm_flags; + /* Snapshot of dumpable at dump start. */ + enum task_dumpable dumpable; int cpu; loff_t written; loff_t pos; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 51ea37b2a0aa..9588ce3b16df 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1342,7 +1342,6 @@ struct mm_struct { */ struct task_struct __rcu *owner; #endif - struct user_namespace *user_ns; /* store ref to file /proc//exe symlink points to */ struct file __rcu *exe_file; @@ -1907,11 +1906,11 @@ enum { /* mm flags */ /* - * The first two bits represent core dump modes for set-user-ID, - * the modes are TASK_DUMPABLE_* defined in linux/sched/coredump.h + * Bits 0 and 1 were dumpability; that moved to task->exec_state. Reserve + * the bits so MMF_DUMP_FILTER_* positions stay stable for the + * /proc//coredump_filter ABI. */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1972,7 +1971,7 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6674dbf960b5..258cb075478d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -85,6 +85,7 @@ struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; +struct task_exec_state; struct task_group; struct task_struct; struct timespec64; @@ -1004,9 +1005,6 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif - /* Save user-dumpable when mm goes away */ - unsigned user_dumpable:1; - /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ed6547692b61..20957ccde3b5 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -2,8 +2,6 @@ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H -#include - /* * Task dumpability mode. Gates core dump production and ptrace_attach() * authorization. The numeric values are stable ABI (suid_dumpable @@ -15,37 +13,7 @@ enum task_dumpable { TASK_DUMPABLE_ROOT = 2, /* dump as root; ptrace needs CAP_SYS_PTRACE */ }; -static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) -{ - /* - * By convention, dumpable bits are contained in first 32 bits of the - * bitmap, so we can simply access this first unsigned long directly. - */ - return __mm_flags_get_word(mm); -} - -static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) -{ - __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); -} - -extern void set_dumpable(struct mm_struct *mm, int value); -/* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against TASK_DUMPABLE_OWNER rather than treating it as a boolean - * value. - */ -static inline int __get_dumpable(unsigned long mm_flags) -{ - return mm_flags & MMF_DUMPABLE_MASK; -} - -static inline int get_dumpable(struct mm_struct *mm) -{ - unsigned long flags = __mm_flags_get_dumpable(mm); - - return __get_dumpable(flags); -} +void task_exec_state_set_dumpable(enum task_dumpable value); +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task); #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h index dc5a795cbfe2..9b61782510b8 100644 --- a/include/linux/sched/exec_state.h +++ b/include/linux/sched/exec_state.h @@ -16,13 +16,13 @@ struct task_exec_state { struct rcu_head rcu; }; +extern struct task_exec_state init_task_exec_state; + struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns); void put_task_exec_state(struct task_exec_state *exec_state); struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk); struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, struct task_exec_state *exec_state); -void task_exec_state_set_dumpable(enum task_dumpable value); -enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task); int task_exec_state_copy(struct task_struct *tsk); void __init exec_state_init(void); -- cgit v1.2.3 From 38205ecbe6b6dc47968ad4e9c978e2117720969e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 22 May 2026 16:03:30 +0200 Subject: exec: free the old mm outside the exec locks exec_mmap() installs the new mm and then tears the old one down while still holding exec_update_lock for writing -- and with cred_guard_mutex held all the way to setup_new_exec(): setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); Neither lock is needed for this. exec_update_lock only exists to make the mm swap atomic with the later commit_creds(), so that permission-checking readers (proc, ptrace, the futex robust list, perf, kcmp, mm_access()) never observe the new mm together with the old credentials. Those readers all operate on task->mm, i.e. the new mm after the swap; none looks at the detached old mm, its ->owner or signal->maxrss. cred_guard_mutex guards credential calculation and is equally irrelevant here. The cost is real: __mmput() runs exit_mmap() over the entire old address space and can block in exit_aio() waiting for in-flight AIO, all while holding exec_update_lock for writing and cred_guard_mutex. For execve() of a large process this blocks ptrace_attach() and every exec_update_lock reader for the duration of the teardown. Stash the old mm in bprm->old_mm and release it from setup_new_exec() after both locks are dropped. setup_new_exec() still runs before setup_arg_pages() and the segment mappings, so the old address space is freed before the new one is populated and peak memory is unchanged. The ordering constraints are kept: old_mm's mmap_lock is still dropped in exec_mmap() before mm_update_next_owner() (required since commit 31a78f23bac0 ("mm owner: fix race between swapoff and exit")), and mm_update_next_owner() still precedes mmput(); both run in the execing task's context, as mm_update_next_owner() requires. If exec swaps the mm but fails before setup_new_exec() runs the old mm would leak, so add a backstop in free_bprm(). The lazy-tlb case (old_mm == NULL, e.g. kernel_execve()) has no address space to free and is left in exec_mmap(). Link: https://patch.msgid.link/20260522-work-exit_mm-v1-1-bd32d5a560bb@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/binfmts.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index a8379f4eee61..2c77e383e737 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,6 +25,7 @@ struct linux_binprm { struct page *page[MAX_ARG_PAGES]; #endif struct mm_struct *mm; + struct mm_struct *old_mm; /* replaced address space, freed by setup_new_exec() */ /* user_ns published to task->exec_state at execve, narrowed by would_dump(). */ struct user_namespace *user_ns; unsigned long p; /* current top of mem */ -- cgit v1.2.3