diff options
Diffstat (limited to 'kernel/fork.c')
| -rw-r--r-- | kernel/fork.c | 138 |
1 files changed, 105 insertions, 33 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index bc2bf58b93b6..8ac38beae360 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -46,6 +46,7 @@ #include <linux/mm_inline.h> #include <linux/memblock.h> #include <linux/nsproxy.h> +#include <linux/ns/ns_common_types.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cgroup.h> @@ -95,6 +96,7 @@ #include <linux/thread_info.h> #include <linux/kstack_erase.h> #include <linux/kasan.h> +#include <linux/randomize_kstack.h> #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> @@ -345,7 +347,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ - memset(stack, 0, THREAD_SIZE); + clear_pages(vm_area->addr, vm_area->nr_pages); tsk->stack_vm_area = vm_area; tsk->stack = stack; @@ -1014,13 +1016,14 @@ free_tsk: __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; +static unsigned long coredump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { - default_dump_filter = - (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & - MMF_DUMP_FILTER_MASK; + if (kstrtoul(s, 0, &coredump_filter)) + return 0; + coredump_filter <<= MMF_DUMP_FILTER_SHIFT; + coredump_filter &= MMF_DUMP_FILTER_MASK; return 1; } @@ -1106,7 +1109,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - __mm_flags_overwrite_word(mm, default_dump_filter); + __mm_flags_overwrite_word(mm, coredump_filter); mm->def_flags = 0; } @@ -1948,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p) static bool need_futex_hash_allocate_default(u64 clone_flags) { - if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) - return false; - return true; + /* + * Allocate a default futex hash for any sibling that will + * share the parent's mm, except vfork. + */ + return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM; } /* @@ -2028,6 +2033,41 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_AUTOREAP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_PARENT) + return ERR_PTR(-EINVAL); + if (args->exit_signal) + return ERR_PTR(-EINVAL); + } + + if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) + return ERR_PTR(-EINVAL); + + if (clone_flags & CLONE_NNP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + } + + if (clone_flags & CLONE_PIDFD_AUTOKILL) { + if (!(clone_flags & CLONE_PIDFD)) + return ERR_PTR(-EINVAL); + if (!(clone_flags & CLONE_AUTOREAP)) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + /* + * Without CLONE_NNP the child could escalate privileges + * after being spawned, so require CAP_SYS_ADMIN. + * With CLONE_NNP the child can't gain new privileges, + * so allow unprivileged usage. + */ + if (!(clone_flags & CLONE_NNP) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2076,6 +2116,7 @@ __latent_entropy struct task_struct *copy_process( ftrace_graph_init_task(p); rt_mutex_init_task(p); + raw_spin_lock_init(&p->blocked_lock); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING @@ -2250,13 +2291,18 @@ __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + unsigned flags = PIDFD_STALE; + + if (clone_flags & CLONE_THREAD) + flags |= PIDFD_THREAD; + if (clone_flags & CLONE_PIDFD_AUTOKILL) + flags |= PIDFD_AUTOKILL; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); + retval = pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2336,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cancel_cgroup; - /* - * Allocate a default futex hash for the user process once the first - * thread spawns. - */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) @@ -2392,7 +2434,11 @@ __latent_entropy struct task_struct *copy_process( rseq_fork(p, clone_flags); - /* Don't start children in a dying pid namespace */ + /* + * If zap_pid_ns_processes() was called after alloc_pid(), the new + * child missed SIGKILL. If current is not in the same namespace, + * we can't rely on fatal_signal_pending() below. + */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_core_free; @@ -2412,6 +2458,9 @@ __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + if (clone_flags & CLONE_NNP) + task_set_no_new_privs(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2423,7 +2472,10 @@ __latent_entropy struct task_struct *copy_process( init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { - ns_of_pid(pid)->child_reaper = p; + struct pid_namespace *ns = ns_of_pid(pid); + + ASSERT_EXCLUSIVE_WRITER(ns->child_reaper); + WRITE_ONCE(ns->child_reaper, p); p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; @@ -2435,6 +2487,8 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; + if (clone_flags & CLONE_AUTOREAP) + p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); @@ -2463,8 +2517,12 @@ __latent_entropy struct task_struct *copy_process( fd_install(pidfd, pidfile); proc_fork_connector(p); - sched_post_fork(p); + /* + * sched_ext needs @p to be associated with its cgroup in its post_fork + * hook. cgroup_post_fork() should come before sched_post_fork(). + */ cgroup_post_fork(p, args); + sched_post_fork(p); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2606,8 +2664,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. - * - * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { @@ -2619,6 +2675,16 @@ pid_t kernel_clone(struct kernel_clone_args *args) pid_t nr; /* + * Creating an empty mount namespace implies creating a new mount + * namespace. Set this before copy_process() so that the + * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. + */ + if (clone_flags & CLONE_EMPTY_MNTNS) { + clone_flags |= CLONE_NEWNS; + args->flags = clone_flags; + } + + /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate @@ -2632,6 +2698,9 @@ pid_t kernel_clone(struct kernel_clone_args *args) (args->pidfd == args->parent_tid)) return -EINVAL; + if (!valid_signal(args->exit_signal)) + return -EINVAL; + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly @@ -2830,11 +2899,9 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, return -EINVAL; /* - * Verify that higher 32bits of exit_signal are unset and that - * it is a valid signal + * Verify that higher 32bits of exit_signal are unset */ - if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || - !valid_signal(args.exit_signal))) + if (unlikely(args.exit_signal & ~((u64)CSIGNAL))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && @@ -2896,7 +2963,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | + CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP | + CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS)) return false; /* @@ -3045,11 +3114,9 @@ void __init proc_caches_init(void) */ static int check_unshare_flags(unsigned long unshare_flags) { - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| - CLONE_NEWTIME)) + CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing @@ -3148,6 +3215,8 @@ int ksys_unshare(unsigned long unshare_flags) /* * If unsharing namespace, must also unshare filesystem information. */ + if (unshare_flags & UNSHARE_EMPTY_MNTNS) + unshare_flags |= CLONE_NEWNS; if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -3174,11 +3243,10 @@ int ksys_unshare(unsigned long unshare_flags) new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; - if (new_cred) { err = set_cred_ucounts(new_cred); if (err) - goto bad_unshare_cleanup_cred; + goto bad_unshare_cleanup_nsproxy; } if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { @@ -3194,8 +3262,10 @@ int ksys_unshare(unsigned long unshare_flags) shm_init_task(current); } - if (new_nsproxy) + if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); + new_nsproxy = NULL; + } task_lock(current); @@ -3224,13 +3294,15 @@ int ksys_unshare(unsigned long unshare_flags) perf_event_namespaces(current); +bad_unshare_cleanup_nsproxy: + if (new_nsproxy) + put_nsproxy(new_nsproxy); bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); |
