summary | refs | log | tree | commit | diff
path: root/kernel/fork.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fork.c')
-rw-r--r-- kernel/fork.c 138
1 files changed, 105 insertions, 33 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index bc2bf58b93b6..8ac38beae360 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -46,6 +46,7 @@
#include <linux/mm_inline.h>
#include <linux/memblock.h>
#include <linux/nsproxy.h>
+#include <linux/ns/ns_common_types.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
@@ -95,6 +96,7 @@
#include <linux/thread_info.h>
#include <linux/kstack_erase.h>
#include <linux/kasan.h>
+#include <linux/randomize_kstack.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
@@ -345,7 +347,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
stack = kasan_reset_tag(vm_area->addr);
/* Clear stale pointers from reused stack. */
- memset(stack, 0, THREAD_SIZE);
+ clear_pages(vm_area->addr, vm_area->nr_pages);
tsk->stack_vm_area = vm_area;
tsk->stack = stack;
@@ -1014,13 +1016,14 @@ free_tsk:
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
-static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+static unsigned long coredump_filter = MMF_DUMP_FILTER_DEFAULT;
static int __init coredump_filter_setup(char *s)
{
- default_dump_filter =
- (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
- MMF_DUMP_FILTER_MASK;
+ if (kstrtoul(s, 0, &coredump_filter))
+ return 0;
+ coredump_filter <<= MMF_DUMP_FILTER_SHIFT;
+ coredump_filter &= MMF_DUMP_FILTER_MASK;
return 1;
}
@@ -1106,7 +1109,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
__mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- __mm_flags_overwrite_word(mm, default_dump_filter);
+ __mm_flags_overwrite_word(mm, coredump_filter);
mm->def_flags = 0;
}
@@ -1948,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p)
static bool need_futex_hash_allocate_default(u64 clone_flags)
{
- if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
- return false;
- return true;
+ /*
+ * Allocate a default futex hash for any sibling that will
+ * share the parent's mm, except vfork.
+ */
+ return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}
/*
@@ -2028,6 +2033,41 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_AUTOREAP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_PARENT)
+ return ERR_PTR(-EINVAL);
+ if (args->exit_signal)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if ((clone_flags & CLONE_PARENT) && current->signal->autoreap)
+ return ERR_PTR(-EINVAL);
+
+ if (clone_flags & CLONE_NNP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ /*
+ * Without CLONE_NNP the child could escalate privileges
+ * after being spawned, so require CAP_SYS_ADMIN.
+ * With CLONE_NNP the child can't gain new privileges,
+ * so allow unprivileged usage.
+ */
+ if (!(clone_flags & CLONE_NNP) &&
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2076,6 +2116,7 @@ __latent_entropy struct task_struct *copy_process(
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
+ raw_spin_lock_init(&p->blocked_lock);
lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
@@ -2250,13 +2291,18 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+ unsigned flags = PIDFD_STALE;
+
+ if (clone_flags & CLONE_THREAD)
+ flags |= PIDFD_THREAD;
+ if (clone_flags & CLONE_PIDFD_AUTOKILL)
+ flags |= PIDFD_AUTOKILL;
/*
* Note that no task has been attached to @pid yet; indicate
* that via PIDFD_STALE.
*/
- retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+ retval = pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2336,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cancel_cgroup;
- /*
- * Allocate a default futex hash for the user process once the first
- * thread spawns.
- */
if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default();
if (retval)
@@ -2392,7 +2434,11 @@ __latent_entropy struct task_struct *copy_process(
rseq_fork(p, clone_flags);
- /* Don't start children in a dying pid namespace */
+ /*
+ * If zap_pid_ns_processes() was called after alloc_pid(), the new
+ * child missed SIGKILL. If current is not in the same namespace,
+ * we can't rely on fatal_signal_pending() below.
+ */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_core_free;
@@ -2412,6 +2458,9 @@ __latent_entropy struct task_struct *copy_process(
*/
copy_seccomp(p);
+ if (clone_flags & CLONE_NNP)
+ task_set_no_new_privs(p);
+
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2423,7 +2472,10 @@ __latent_entropy struct task_struct *copy_process(
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
- ns_of_pid(pid)->child_reaper = p;
+ struct pid_namespace *ns = ns_of_pid(pid);
+
+ ASSERT_EXCLUSIVE_WRITER(ns->child_reaper);
+ WRITE_ONCE(ns->child_reaper, p);
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;
@@ -2435,6 +2487,8 @@ __latent_entropy struct task_struct *copy_process(
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
+ if (clone_flags & CLONE_AUTOREAP)
+ p->signal->autoreap = 1;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
@@ -2463,8 +2517,12 @@ __latent_entropy struct task_struct *copy_process(
fd_install(pidfd, pidfile);
proc_fork_connector(p);
- sched_post_fork(p);
+ /*
+ * sched_ext needs @p to be associated with its cgroup in its post_fork
+ * hook. cgroup_post_fork() should come before sched_post_fork().
+ */
cgroup_post_fork(p, args);
+ sched_post_fork(p);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -2606,8 +2664,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
- *
- * args->exit_signal is expected to be checked for sanity by the caller.
*/
pid_t kernel_clone(struct kernel_clone_args *args)
{
@@ -2619,6 +2675,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
pid_t nr;
/*
+ * Creating an empty mount namespace implies creating a new mount
+ * namespace. Set this before copy_process() so that the
+ * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+ */
+ if (clone_flags & CLONE_EMPTY_MNTNS) {
+ clone_flags |= CLONE_NEWNS;
+ args->flags = clone_flags;
+ }
+
+ /*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
@@ -2632,6 +2698,9 @@ pid_t kernel_clone(struct kernel_clone_args *args)
(args->pidfd == args->parent_tid))
return -EINVAL;
+ if (!valid_signal(args->exit_signal))
+ return -EINVAL;
+
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
@@ -2830,11 +2899,9 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
return -EINVAL;
/*
- * Verify that higher 32bits of exit_signal are unset and that
- * it is a valid signal
+ * Verify that the higher 32 bits of exit_signal are unset
*/
- if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
- !valid_signal(args.exit_signal)))
+ if (unlikely(args.exit_signal & ~((u64)CSIGNAL)))
return -EINVAL;
if ((args.flags & CLONE_INTO_CGROUP) &&
@@ -2896,7 +2963,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+ CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP |
+ CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS))
return false;
/*
@@ -3045,11 +3114,9 @@ void __init proc_caches_init(void)
*/
static int check_unshare_flags(unsigned long unshare_flags)
{
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
- CLONE_NEWTIME))
+ CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
@@ -3148,6 +3215,8 @@ int ksys_unshare(unsigned long unshare_flags)
/*
* If unsharing namespace, must also unshare filesystem information.
*/
+ if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+ unshare_flags |= CLONE_NEWNS;
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
@@ -3174,11 +3243,10 @@ int ksys_unshare(unsigned long unshare_flags)
new_cred, new_fs);
if (err)
goto bad_unshare_cleanup_cred;
-
if (new_cred) {
err = set_cred_ucounts(new_cred);
if (err)
- goto bad_unshare_cleanup_cred;
+ goto bad_unshare_cleanup_nsproxy;
}
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
@@ -3194,8 +3262,10 @@ int ksys_unshare(unsigned long unshare_flags)
shm_init_task(current);
}
- if (new_nsproxy)
+ if (new_nsproxy) {
switch_task_namespaces(current, new_nsproxy);
+ new_nsproxy = NULL;
+ }
task_lock(current);
@@ -3224,13 +3294,15 @@ int ksys_unshare(unsigned long unshare_flags)
perf_event_namespaces(current);
+bad_unshare_cleanup_nsproxy:
+ if (new_nsproxy)
+ put_nsproxy(new_nsproxy);
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);