summaryrefslogtreecommitdiff
path: root/kernel/cgroup
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2019-05-10 11:43:46 -0700
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2019-05-10 11:43:46 -0700
commit2a267e7c41aa88215de2b542de797d03d16ecdfd (patch)
treeb949270835e304c8b60a40cde1b2c2e19c13b33a /kernel/cgroup
parent0981949da8f7498b96c6e0ae60680865ca283bf1 (diff)
parente93c9c99a629c61837d5a7fc2120cd2b6c70dbdd (diff)
Merge tag 'v5.1' into next
Sync up with mainline to bring in the latest APIs.
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-internal.h53
-rw-r--r--kernel/cgroup/cgroup-v1.c434
-rw-r--r--kernel/cgroup/cgroup.c377
-rw-r--r--kernel/cgroup/cpuset.c1017
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/pids.c4
-rw-r--r--kernel/cgroup/rdma.c5
-rw-r--r--kernel/cgroup/rstat.c10
8 files changed, 1426 insertions, 478 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 75568fcf2180..30e39f3932ad 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -7,10 +7,13 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
+#include <linux/fs_context.h>
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+extern bool cgroup_debug;
+extern void __init enable_debug_cgroup(void);
/*
* cgroup_path() takes a spin lock. It is good practice not to take
@@ -35,6 +38,31 @@ extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
} while (0)
/*
+ * The cgroup filesystem superblock creation/mount context.
+ */
+struct cgroup_fs_context {
+ struct kernfs_fs_context kfc;
+ struct cgroup_root *root;
+ struct cgroup_namespace *ns;
+ unsigned int flags; /* CGRP_ROOT_* flags */
+
+ /* cgroup1 bits */
+ bool cpuset_clone_children;
+ bool none; /* User explicitly requested empty subsystem */
+ bool all_ss; /* Seen 'all' option */
+ u16 subsys_mask; /* Selected subsystems */
+ char *name; /* Hierarchy name */
+ char *release_agent; /* Path for release notifications */
+};
+
+static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct cgroup_fs_context, kfc);
+}
+
+/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
* direction, a css_set is naturally associated with multiple cgroups.
@@ -115,16 +143,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-struct cgroup_sb_opts {
- u16 subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
@@ -195,12 +213,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
void cgroup_free_root(struct cgroup_root *root);
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+void init_cgroup_root(struct cgroup_fs_context *ctx);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns);
+int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
@@ -244,14 +260,15 @@ extern const struct proc_ns_operations cgroupns_operations;
*/
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+extern const struct fs_parameter_description cgroup1_fs_parameters;
int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
- void *data, unsigned long magic,
- struct cgroup_namespace *ns);
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
+int cgroup1_get_tree(struct fs_context *fc);
+int cgroup1_reconfigure(struct fs_context *ctx);
#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 51063e7a93c2..c126b34fd4ff 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -13,9 +13,12 @@
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
+#include <linux/fs_parser.h>
#include <trace/events/cgroup.h>
+#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
+
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -27,6 +30,9 @@
/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;
+/* disable named v1 mounts */
+static bool cgroup_no_v1_named;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -903,168 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
return 0;
}
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- u16 mask = U16_MAX;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
+enum cgroup1_param {
+ Opt_all,
+ Opt_clone_children,
+ Opt_cpuset_v2_mode,
+ Opt_name,
+ Opt_none,
+ Opt_noprefix,
+ Opt_release_agent,
+ Opt_xattr,
+};
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
+static const struct fs_parameter_spec cgroup1_param_specs[] = {
+ fsparam_flag ("all", Opt_all),
+ fsparam_flag ("clone_children", Opt_clone_children),
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ fsparam_string("name", Opt_name),
+ fsparam_flag ("none", Opt_none),
+ fsparam_flag ("noprefix", Opt_noprefix),
+ fsparam_string("release_agent", Opt_release_agent),
+ fsparam_flag ("xattr", Opt_xattr),
+ {}
+};
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "cpuset_v2_mode")) {
- opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
+const struct fs_parameter_description cgroup1_fs_parameters = {
+ .name = "cgroup1",
+ .specs = cgroup1_param_specs,
+};
- continue;
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct cgroup_subsys *ss;
+ struct fs_parse_result result;
+ int opt, i;
+
+ opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
+ if (opt == -ENOPARAM) {
+ if (strcmp(param->key, "source") == 0) {
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
}
-
for_each_subsys(ss, i) {
- if (strcmp(token, ss->legacy_name))
+ if (strcmp(param->key, ss->legacy_name))
continue;
- if (!cgroup_ssid_enabled(i))
+ ctx->subsys_mask |= (1 << i);
+ return 0;
+ }
+ return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
+ }
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_none:
+ /* Explicitly have no subsystems */
+ ctx->none = true;
+ break;
+ case Opt_all:
+ ctx->all_ss = true;
+ break;
+ case Opt_noprefix:
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ break;
+ case Opt_clone_children:
+ ctx->cpuset_clone_children = true;
+ break;
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ break;
+ case Opt_xattr:
+ ctx->flags |= CGRP_ROOT_XATTR;
+ break;
+ case Opt_release_agent:
+ /* Specifying two release agents is forbidden */
+ if (ctx->release_agent)
+ return cg_invalf(fc, "cgroup1: release_agent respecified");
+ ctx->release_agent = param->string;
+ param->string = NULL;
+ break;
+ case Opt_name:
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
+ /* Can't specify an empty name */
+ if (!param->size)
+ return cg_invalf(fc, "cgroup1: Empty name");
+ if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
+ return cg_invalf(fc, "cgroup1: Name too long");
+ /* Must match [\w.-]+ */
+ for (i = 0; i < param->size; i++) {
+ char c = param->string[i];
+ if (isalnum(c))
continue;
- if (cgroup1_ssid_disabled(i))
+ if ((c == '.') || (c == '-') || (c == '_'))
continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
+ return cg_invalf(fc, "cgroup1: Invalid name");
}
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+ /* Specifying two names is forbidden */
+ if (ctx->name)
+ return cg_invalf(fc, "cgroup1: name respecified");
+ ctx->name = param->string;
+ param->string = NULL;
+ break;
}
+ return 0;
+}
+
+static int check_cgroupfs_options(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ u16 mask = U16_MAX;
+ u16 enabled = 0;
+ struct cgroup_subsys *ss;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ enabled |= 1 << i;
+
+ ctx->subsys_mask &= enabled;
/*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
+ * In absense of 'none', 'name=' or subsystem name options,
+ * let's default to 'all'.
*/
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
- opts->subsys_mask |= (1 << i);
+ if (!ctx->subsys_mask && !ctx->none && !ctx->name)
+ ctx->all_ss = true;
+
+ if (ctx->all_ss) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (ctx->subsys_mask)
+ return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
+ /* 'all' => select all the subsystems */
+ ctx->subsys_mask = enabled;
+ }
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ if (!ctx->subsys_mask && !ctx->name)
+ return cg_invalf(fc, "cgroup1: Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
+ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
+ return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
+ if (ctx->subsys_mask && ctx->none)
+ return cg_invalf(fc, "cgroup1: none used incorrectly");
return 0;
}
-static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+int cgroup1_reconfigure(struct fs_context *fc)
{
- int ret = 0;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
+ int ret = 0;
u16 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
+ ret = check_cgroupfs_options(fc);
if (ret)
goto out_unlock;
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
+ added_mask = ctx->subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~ctx->subsys_mask;
/* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
+ if ((ctx->flags ^ root->flags) ||
+ (ctx->name && strcmp(ctx->name, root->name))) {
+ cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
+ ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1081,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
- if (opts.release_agent) {
+ if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
+ strcpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
trace_cgroup_remount(root);
out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -1099,30 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
.rename = cgroup1_rename,
.show_options = cgroup1_show_options,
- .remount_fs = cgroup1_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
};
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
- void *data, unsigned long magic,
- struct cgroup_namespace *ns)
+/*
+ * The guts of cgroup1 mount - find or create cgroup_root to use.
+ * Called with cgroup_mutex held; returns 0 on success, -E... on
+ * error and positive - in case when the candidate is busy dying.
+ * On success it stashes a reference to cgroup_root into given
+ * cgroup_fs_context; that reference is *NOT* counting towards the
+ * cgroup_root refcount.
+ */
+static int cgroup1_root_to_use(struct fs_context *fc)
{
- struct super_block *pinned_sb = NULL;
- struct cgroup_sb_opts opts;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct dentry *dentry;
int i, ret;
- bool new_root = false;
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
+ ret = check_cgroupfs_options(fc);
if (ret)
- goto out_unlock;
+ return ret;
/*
* Destruction of cgroup root is asynchronous, so subsystems may
@@ -1132,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
+ if (!(ctx->subsys_mask & (1 << i)) ||
ss->root == &cgrp_dfl_root)
continue;
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
+ return 1; /* restart */
cgroup_put(&ss->root->cgrp);
}
@@ -1156,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* name matches but sybsys_mask doesn't, we should fail.
* Remember whether name matched.
*/
- if (opts.name) {
- if (strcmp(opts.name, root->name))
+ if (ctx->name) {
+ if (strcmp(ctx->name, root->name))
continue;
name_match = true;
}
@@ -1166,42 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
*/
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
+ if ((ctx->subsys_mask || ctx->none) &&
+ (ctx->subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
- ret = -EBUSY;
- goto out_unlock;
+ return -EBUSY;
}
- if (root->flags ^ opts.flags)
+ if (root->flags ^ ctx->flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
+ ctx->root = root;
+ return 0;
}
/*
@@ -1209,62 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
*/
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (!ctx->subsys_mask && !ctx->none)
+ return cg_invalf(fc, "cgroup1: No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
- if (ns != &init_cgroup_ns) {
- ret = -EPERM;
- goto out_unlock;
- }
+ if (ctx->ns != &init_cgroup_ns)
+ return -EPERM;
root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
- new_root = true;
+ if (!root)
+ return -ENOMEM;
- init_cgroup_root(root, &opts);
+ ctx->root = root;
+ init_cgroup_root(ctx);
- ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
+ ret = cgroup_setup_root(root, ctx->subsys_mask);
if (ret)
cgroup_free_root(root);
+ return ret;
+}
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
+int cgroup1_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- if (ret)
- return ERR_PTR(ret);
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
- dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
- CGROUP_SUPER_MAGIC, ns);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
- /*
- * There's a race window after we release cgroup_mutex and before
- * allocating a superblock. Make sure a concurrent process won't
- * be able to re-use the root during this window by delaying the
- * initialization of root refcnt.
- */
- if (new_root) {
- mutex_lock(&cgroup_mutex);
- percpu_ref_reinit(&root->cgrp.self.refcnt);
- mutex_unlock(&cgroup_mutex);
- }
+ ret = cgroup1_root_to_use(fc);
+ if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
+ ret = 1; /* restart */
- /*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
- */
- if (pinned_sb)
- deactivate_super(pinned_sb);
+ mutex_unlock(&cgroup_mutex);
+
+ if (!ret)
+ ret = cgroup_do_get_tree(fc);
- return dentry;
+ if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
+ deactivate_locked_super(sb);
+ ret = 1;
+ }
+
+ if (unlikely(ret > 0)) {
+ msleep(10);
+ return restart_syscall();
+ }
+ return ret;
}
static int __init cgroup1_wq_init(void)
@@ -1292,7 +1291,12 @@ static int __init cgroup_no_v1(char *str)
if (!strcmp(token, "all")) {
cgroup_no_v1_mask = U16_MAX;
- break;
+ continue;
+ }
+
+ if (!strcmp(token, "named")) {
+ cgroup_no_v1_named = true;
+ continue;
}
for_each_subsys(ss, i) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..3f2b4bde0f9c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -86,6 +87,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -196,7 +198,7 @@ static u64 css_serial_nr_next = 1;
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
@@ -493,7 +495,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
}
/**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
@@ -502,8 +504,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
@@ -524,6 +526,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
}
/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsiblity to tryget a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css)
+ return css;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ return init_css_set.subsys[ss->id];
+}
+
+/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
@@ -605,10 +636,11 @@ EXPORT_SYMBOL_GPL(of_css);
*
* Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
- ; \
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css_by_mask(cgrp, \
+ cgroup_subsys[(ssid)]))) \
+ ; \
else
/**
@@ -1007,7 +1039,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
- template[i] = cgroup_e_css(cgrp, ss);
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else {
/*
* @ss is not in this hierarchy, so we don't want
@@ -1399,12 +1431,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
- else
+ } else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ }
return buf;
}
@@ -1738,26 +1773,37 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
-{
- char *token;
+enum cgroup2_param {
+ Opt_nsdelegate,
+ nr__cgroup2_params
+};
- *root_flags = 0;
+static const struct fs_parameter_spec cgroup2_param_specs[] = {
+ fsparam_flag ("nsdelegate", Opt_nsdelegate),
+ {}
+};
- if (!data)
- return 0;
+static const struct fs_parameter_description cgroup2_fs_parameters = {
+ .name = "cgroup2",
+ .specs = cgroup2_param_specs,
+};
- while ((token = strsep(&data, ",")) != NULL) {
- if (!strcmp(token, "nsdelegate")) {
- *root_flags |= CGRP_ROOT_NS_DELEGATE;
- continue;
- }
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
- pr_err("cgroup2: unknown option \"%s\"\n", token);
- return -EINVAL;
- }
+ opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- return 0;
+ switch (opt) {
+ case Opt_nsdelegate:
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
+ return 0;
+ }
+ return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
@@ -1777,16 +1823,11 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
return 0;
}
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
+static int cgroup_reconfigure(struct fs_context *fc)
{
- unsigned int root_flags;
- int ret;
-
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret)
- return ret;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- apply_cgroup_root_flags(root_flags);
+ apply_cgroup_root_flags(ctx->flags);
return 0;
}
@@ -1874,8 +1915,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_fs_context *ctx)
{
+ struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
@@ -1884,16 +1926,16 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
- root->flags = opts->flags;
- if (opts->release_agent)
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
- if (opts->name)
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
- if (opts->cpuset_clone_children)
+ root->flags = ctx->flags;
+ if (ctx->release_agent)
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
+ if (ctx->name)
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
+ if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1910,7 +1952,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
- ref_flags, GFP_KERNEL);
+ 0, GFP_KERNEL);
if (ret)
goto out;
@@ -1994,57 +2036,104 @@ out:
return ret;
}
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns)
+int cgroup_do_get_tree(struct fs_context *fc)
{
- struct dentry *dentry;
- bool new_sb;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
+ ctx->kfc.root = ctx->root->kf_root;
+ if (fc->fs_type == &cgroup2_fs_type)
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
+ else
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
+ ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+ if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
+ struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
- dput(dentry);
- dentry = nsdentry;
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
+ dput(fc->root);
+ fc->root = nsdentry;
+ if (IS_ERR(nsdentry)) {
+ ret = PTR_ERR(nsdentry);
+ deactivate_locked_super(sb);
+ }
}
- if (IS_ERR(dentry) || !new_sb)
- cgroup_put(&root->cgrp);
+ if (!ctx->kfc.new_sb_created)
+ cgroup_put(&ctx->root->cgrp);
- return dentry;
+ return ret;
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+/*
+ * Destroy a cgroup filesystem context.
+ */
+static void cgroup_fs_context_free(struct fs_context *fc)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct dentry *dentry;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+
+ kfree(ctx->name);
+ kfree(ctx->release_agent);
+ put_cgroup_ns(ctx->ns);
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static int cgroup_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- get_cgroup_ns(ns);
+ cgrp_dfl_visible = true;
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
+ ctx->root = &cgrp_dfl_root;
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
+ ret = cgroup_do_get_tree(fc);
+ if (!ret)
+ apply_cgroup_root_flags(ctx->flags);
+ return ret;
+}
+
+static const struct fs_context_operations cgroup_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup2_parse_param,
+ .get_tree = cgroup_get_tree,
+ .reconfigure = cgroup_reconfigure,
+};
+
+static const struct fs_context_operations cgroup1_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup1_parse_param,
+ .get_tree = cgroup1_get_tree,
+ .reconfigure = cgroup1_reconfigure,
+};
+
+/*
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
+ * we select the namespace we're going to use.
+ */
+static int cgroup_init_fs_context(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -2053,29 +2142,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
- if (fs_type == &cgroup2_fs_type) {
- unsigned int root_flags;
-
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
-
- cgrp_dfl_visible = true;
- cgroup_get_live(&cgrp_dfl_root.cgrp);
-
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
- CGROUP2_SUPER_MAGIC, ns);
- if (!IS_ERR(dentry))
- apply_cgroup_root_flags(root_flags);
- } else {
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
- CGROUP_SUPER_MAGIC, ns);
- }
-
- put_cgroup_ns(ns);
- return dentry;
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ fc->fs_private = &ctx->kfc;
+ if (fc->fs_type == &cgroup2_fs_type)
+ fc->ops = &cgroup_fs_context_ops;
+ else
+ fc->ops = &cgroup1_fs_context_ops;
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
+ fc->global = true;
+ return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
@@ -2084,33 +2162,33 @@ static void cgroup_kill_sb(struct super_block *sb)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
- * If @root doesn't have any mounts or children, start killing it.
+ * If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
- if (!list_empty(&root->cgrp.self.children) ||
- root == &cgrp_dfl_root)
- cgroup_put(&root->cgrp);
- else
+ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
-
+ cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = &cgroup1_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
- .name = "cgroup2",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup2",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = &cgroup2_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
@@ -3024,7 +3102,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
return ret;
/*
- * At this point, cgroup_e_css() results reflect the new csses
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
@@ -3499,6 +3577,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
@@ -3537,6 +3625,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
@@ -3545,6 +3634,7 @@ static struct kernfs_ops cgroup_kf_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
@@ -3639,7 +3729,8 @@ restart:
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
-
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+ continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
@@ -4202,20 +4293,25 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
repeat:
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- next = it->task_pos->next;
+ if (it->task_pos) {
+ /*
+ * Advance iterator to find next entry. cset->tasks is
+ * consumed first and then ->mg_tasks. After ->mg_tasks,
+ * we move onto the next cset.
+ */
+ next = it->task_pos->next;
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (next == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = next;
+ } else {
+ /* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
+ }
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4255,7 +4351,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->cset_head = it->cset_pos;
- css_task_iter_advance_css_set(it);
+ css_task_iter_advance(it);
spin_unlock_irq(&css_set_lock);
}
@@ -5227,7 +5323,6 @@ int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
@@ -5273,7 +5368,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
- have_free_callback |= (bool)ss->free << ss->id;
+ have_release_callback |= (bool)ss->release << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
@@ -5294,11 +5389,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts;
+ static struct cgroup_fs_context __initdata ctx;
struct cgroup_subsys *ss;
int i;
- init_cgroup_root(&cgrp_dfl_root, &opts);
+ ctx.root = &cgrp_dfl_root;
+ init_cgroup_root(&ctx);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
@@ -5343,7 +5439,7 @@ int __init cgroup_init(void)
cgroup_rstat_boot();
/*
- * The latency of the synchronize_sched() is too high for cgroups,
+ * The latency of the synchronize_rcu() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
*/
rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
@@ -5359,7 +5455,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
@@ -5709,16 +5805,19 @@ void cgroup_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
{
- struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
- do_each_subsys_mask(ss, ssid, have_free_callback) {
- ss->free(task);
+ do_each_subsys_mask(ss, ssid, have_release_callback) {
+ ss->release(task);
} while_each_subsys_mask();
+}
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
@@ -5743,6 +5842,16 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+ cgroup_debug = true;
+ enable_debug_cgroup();
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5946,7 +6055,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_detach(cgrp, prog, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -5978,10 +6087,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
- if (unlikely(ret >= size)) {
- WARN_ON(1);
+ if (WARN_ON(ret >= size))
break;
- }
}
return ret;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 266f10cb7222..4834c4214e9c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -39,6 +39,7 @@
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
@@ -110,6 +111,16 @@ struct cpuset {
nodemask_t effective_mems;
/*
+ * CPUs allocated to child sub-partitions (default hierarchy only)
+ * - CPUs granted by the parent = effective_cpus U subparts_cpus
+ * - effective_cpus and subparts_cpus are mutually exclusive.
+ *
+ * effective_cpus contains only onlined CPUs, but subparts_cpus
+ * may have offlined ones.
+ */
+ cpumask_var_t subparts_cpus;
+
+ /*
* This is old Memory Nodes tasks took on.
*
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
@@ -134,6 +145,47 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
+
+ /* number of CPUs in subparts_cpus */
+ int nr_subparts_cpus;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * Default hierarchy only:
+ * use_parent_ecpus - set if using parent's effective_cpus
+ * child_ecpus_count - # of children with use_parent_ecpus set
+ */
+ int use_parent_ecpus;
+ int child_ecpus_count;
+};
+
+/*
+ * Partition root states:
+ *
+ * 0 - not a partition root
+ *
+ * 1 - partition root
+ *
+ * -1 - invalid partition root
+ * None of the cpus in cpus_allowed can be put into the parent's
+ * subparts_cpus. In this case, the cpuset is not a real partition
+ * root anymore. However, the CPU_EXCLUSIVE bit will still be set
+ * and the cpuset can be restored back to a partition root if the
+ * parent cpuset can give more CPUs back to this child cpuset.
+ */
+#define PRS_DISABLED 0
+#define PRS_ENABLED 1
+#define PRS_ERROR -1
+
+/*
+ * Temporary cpumasks for working with partitions that are passed among
+ * functions to avoid memory allocation in inner functions.
+ */
+struct tmpmasks {
+ cpumask_var_t addmask, delmask; /* For partition root */
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -152,19 +204,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
-#ifdef CONFIG_NUMA
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return task->mempolicy;
-}
-#else
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return false;
-}
-#endif
-
-
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -218,9 +257,15 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+static inline int is_partition_root(const struct cpuset *cs)
+{
+ return cs->partition_root_state > 0;
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
+ .partition_root_state = PRS_ENABLED,
};
/**
@@ -315,25 +360,52 @@ static inline bool is_in_v2_mode(void)
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static struct dentry *cpuset_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name, void *data)
-{
- struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- struct dentry *ret = ERR_PTR(-ENODEV);
- if (cgroup_fs) {
- char mountopts[] =
- "cpuset,noprefix,"
- "release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->mount(cgroup_fs, flags,
- unused_dev_name, mountopts);
- put_filesystem(cgroup_fs);
+static int cpuset_get_tree(struct fs_context *fc)
+{
+ struct file_system_type *cgroup_fs;
+ struct fs_context *new_fc;
+ int ret;
+
+ cgroup_fs = get_fs_type("cgroup");
+ if (!cgroup_fs)
+ return -ENODEV;
+
+ new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags);
+ if (IS_ERR(new_fc)) {
+ ret = PTR_ERR(new_fc);
+ } else {
+ static const char agent_path[] = "/sbin/cpuset_release_agent";
+ ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0);
+ if (!ret)
+ ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0);
+ if (!ret)
+ ret = vfs_parse_fs_string(new_fc, "release_agent",
+ agent_path, sizeof(agent_path) - 1);
+ if (!ret)
+ ret = vfs_get_tree(new_fc);
+ if (!ret) { /* steal the result */
+ fc->root = new_fc->root;
+ new_fc->root = NULL;
+ }
+ put_fs_context(new_fc);
}
+ put_filesystem(cgroup_fs);
return ret;
}
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cpuset_get_tree,
+};
+
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &cpuset_fs_context_ops;
+ return 0;
+}
+
static struct file_system_type cpuset_fs_type = {
- .name = "cpuset",
- .mount = cpuset_mount,
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
};
/*
@@ -419,6 +491,65 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
}
/**
+ * alloc_cpumasks - allocate three cpumasks for cpuset
+ * @cs: the cpuset that have cpumasks to be allocated.
+ * @tmp: the tmpmasks structure pointer
+ * Return: 0 if successful, -ENOMEM otherwise.
+ *
+ * Only one of the two input arguments should be non-NULL.
+ */
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ cpumask_var_t *pmask1, *pmask2, *pmask3;
+
+ if (cs) {
+ pmask1 = &cs->cpus_allowed;
+ pmask2 = &cs->effective_cpus;
+ pmask3 = &cs->subparts_cpus;
+ } else {
+ pmask1 = &tmp->new_cpus;
+ pmask2 = &tmp->addmask;
+ pmask3 = &tmp->delmask;
+ }
+
+ if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
+ goto free_one;
+
+ if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
+ goto free_two;
+
+ return 0;
+
+free_two:
+ free_cpumask_var(*pmask2);
+free_one:
+ free_cpumask_var(*pmask1);
+ return -ENOMEM;
+}
+
+/**
+ * free_cpumasks - free cpumasks in a tmpmasks structure
+ * @cs: the cpuset that have cpumasks to be free.
+ * @tmp: the tmpmasks structure pointer
+ */
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ if (cs) {
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->subparts_cpus);
+ }
+ if (tmp) {
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
+ }
+}
+
+/**
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
@@ -430,31 +561,24 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ if (alloc_cpumasks(trial, NULL)) {
+ kfree(trial);
+ return NULL;
+ }
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
-
-free_cpus:
- free_cpumask_var(trial->cpus_allowed);
-free_cs:
- kfree(trial);
- return NULL;
}
/**
- * free_trial_cpuset - free the trial cpuset
- * @trial: the trial cpuset to be freed
+ * free_cpuset - free the cpuset
+ * @cs: the cpuset to be freed
*/
-static void free_trial_cpuset(struct cpuset *trial)
+static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumask_var(trial->effective_cpus);
- free_cpumask_var(trial->cpus_allowed);
- kfree(trial);
+ free_cpumasks(cs, NULL);
+ kfree(cs);
}
/*
@@ -660,13 +784,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
+ if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -689,6 +814,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
csn = 0;
rcu_read_lock();
+ if (root_load_balance)
+ csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
@@ -699,6 +826,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
+ *
+ * If root is load-balancing, we can skip @cp if it
+ * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -706,11 +836,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
+ if (root_load_balance &&
+ cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+ continue;
+
if (is_sched_load_balance(cp))
csa[csn++] = cp;
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
+ /* skip @cp's subtree if not a partition root */
+ if (!is_partition_root(cp))
+ pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -838,7 +973,12 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ if (!top_cpuset.nr_subparts_cpus &&
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ goto out;
+
+ if (top_cpuset.nr_subparts_cpus &&
+ !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -881,10 +1021,248 @@ static void update_tasks_cpumask(struct cpuset *cs)
css_task_iter_end(&it);
}
+/**
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ * @cs: the cpuset the need to recompute the new effective_cpus mask
+ * @parent: the parent cpuset
+ *
+ * If the parent has subpartition CPUs, include them in the list of
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
+ */
+static void compute_effective_cpumask(struct cpumask *new_cpus,
+ struct cpuset *cs, struct cpuset *parent)
+{
+ if (parent->nr_subparts_cpus) {
+ cpumask_or(new_cpus, parent->effective_cpus,
+ parent->subparts_cpus);
+ cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+ cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+ } else {
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+ }
+}
+
+/*
+ * Commands for update_parent_subparts_cpumask
+ */
+enum subparts_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's subparts_cpus */
+};
+
+/**
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * @cpuset: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0, 1 or an error code
+ *
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
+ * be put into parent's subparts_cpus and taken away from parent's
+ * effective_cpus. The function will return 0 if all the CPUs listed in
+ * cpus_allowed can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * parent's subparts_cpus will be taken away from that cpumask and put back
+ * into parent's effective_cpus. 0 should always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
+ * cpus_allowed is assumed to remain the same. The cpuset should either
+ * be a partition root or an invalid partition root. The partition root
+ * state may change if newmask is NULL and none of the requested CPUs can
+ * be granted by the parent. The function will return 1 if changes to
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
+ * Error code should only be returned when newmask is non-NULL.
+ *
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). The partcmd_update command is used by
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
+ * newmask set.
+ *
+ * The checking is more strict when enabling partition root than the
+ * other two commands.
+ *
+ * Because of the implicit cpu exclusive nature of a partition root,
+ * cpumask changes that violates the cpu exclusivity rule will not be
+ * permitted when checked by validate_change(). The validate_change()
+ * function will also prevent any changes to the cpu list if it is not
+ * a superset of children's cpu lists.
+ */
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cpuset);
+ int adding; /* Moving cpus from effective_cpus to subparts_cpus */
+ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ bool part_error = false; /* Partition error? */
+
+ lockdep_assert_held(&cpuset_mutex);
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_root(parent) ||
+ (newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cpuset->cpus_allowed)))
+ return -EINVAL;
+
+ /*
+ * Enabling/disabling partition root is not allowed if there are
+ * online children.
+ */
+ if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
+ return -EBUSY;
+
+ /*
+ * Enabling partition root is not allowed if not all the CPUs
+ * can be granted from parent's effective_cpus or at least one
+ * CPU will be left after that.
+ */
+ if ((cmd == partcmd_enable) &&
+ (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
+ cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
+ return -EINVAL;
+
+ /*
+ * A cpumask update cannot make parent's effective_cpus become empty.
+ */
+ adding = deleting = false;
+ if (cmd == partcmd_enable) {
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ adding = true;
+ } else if (cmd == partcmd_disable) {
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (newmask) {
+ /*
+ * partcmd_update with newmask:
+ *
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+ * addmask = newmask & parent->effective_cpus
+ * & ~parent->subparts_cpus
+ */
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->subparts_cpus);
+
+ cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+ /*
+ * Return error if the new effective_cpus could become empty.
+ */
+ if (adding &&
+ cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+ if (!deleting)
+ return -EINVAL;
+ /*
+ * As some of the CPUs in subparts_cpus might have
+ * been offlined, we need to compute the real delmask
+ * to confirm that.
+ */
+ if (!cpumask_and(tmp->addmask, tmp->delmask,
+ cpu_active_mask))
+ return -EINVAL;
+ cpumask_copy(tmp->addmask, parent->effective_cpus);
+ }
+ } else {
+ /*
+ * partcmd_update w/o newmask:
+ *
+ * addmask = cpus_allowed & parent->effectiveb_cpus
+ *
+ * Note that parent's subparts_cpus may have been
+ * pre-shrunk in case there is a change in the cpu list.
+ * So no deletion is needed.
+ */
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+ parent->effective_cpus);
+ part_error = cpumask_equal(tmp->addmask,
+ parent->effective_cpus);
+ }
+
+ if (cmd == partcmd_update) {
+ int prev_prs = cpuset->partition_root_state;
+
+ /*
+ * Check for possible transition between PRS_ENABLED
+ * and PRS_ERROR.
+ */
+ switch (cpuset->partition_root_state) {
+ case PRS_ENABLED:
+ if (part_error)
+ cpuset->partition_root_state = PRS_ERROR;
+ break;
+ case PRS_ERROR:
+ if (!part_error)
+ cpuset->partition_root_state = PRS_ENABLED;
+ break;
+ }
+ /*
+ * Set part_error if previously in invalid state.
+ */
+ part_error = (prev_prs == PRS_ERROR);
+ }
+
+ if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+ return 0; /* Nothing need to be done */
+
+ if (cpuset->partition_root_state == PRS_ERROR) {
+ /*
+ * Remove all its cpus from parent's subparts_cpus.
+ */
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ }
+
+ if (!adding && !deleting)
+ return 0;
+
+ /*
+ * Change the parent's subparts_cpus.
+ * Newly added CPUs will be removed from effective_cpus and
+ * newly deleted ones will be added back to effective_cpus.
+ */
+ spin_lock_irq(&callback_lock);
+ if (adding) {
+ cpumask_or(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->addmask);
+ cpumask_andnot(parent->effective_cpus,
+ parent->effective_cpus, tmp->addmask);
+ }
+ if (deleting) {
+ cpumask_andnot(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->delmask);
+ /*
+ * Some of the CPUs in subparts_cpus might have been offlined.
+ */
+ cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
+ cpumask_or(parent->effective_cpus,
+ parent->effective_cpus, tmp->delmask);
+ }
+
+ parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ return cmd == partcmd_update;
+}
+
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_cpus: temp variable for calculating new effective_cpus
+ * @cs: the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
*
* When congifured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
@@ -893,7 +1271,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
*
* Called with cpuset_mutex held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -903,27 +1281,115 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (is_in_v2_mode() && cpumask_empty(new_cpus))
- cpumask_copy(new_cpus, parent->effective_cpus);
+ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+ if (!cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
+ }
+ } else if (cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = false;
+ WARN_ON_ONCE(!parent->child_ecpus_count);
+ parent->child_ecpus_count--;
+ }
- /* Skip the whole subtree if the cpumask remains the same. */
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ /*
+ * Skip the whole subtree if the cpumask remains the same
+ * and has no partition root state.
+ */
+ if (!cp->partition_root_state &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+ /*
+ * update_parent_subparts_cpumask() should have been called
+ * for cs already in update_cpumask(). We should also call
+ * update_tasks_cpumask() again for tasks in the parent
+ * cpuset if the parent's subparts_cpus changes.
+ */
+ if ((cp != cs) && cp->partition_root_state) {
+ switch (parent->partition_root_state) {
+ case PRS_DISABLED:
+ /*
+ * If parent is not a partition root or an
+ * invalid partition root, clear the state
+ * state and the CS_CPU_EXCLUSIVE flag.
+ */
+ WARN_ON_ONCE(cp->partition_root_state
+ != PRS_ERROR);
+ cp->partition_root_state = 0;
+
+ /*
+ * clear_bit() is an atomic operation and
+ * readers aren't interested in the state
+ * of CS_CPU_EXCLUSIVE anyway. So we can
+ * just update the flag without holding
+ * the callback_lock.
+ */
+ clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+ break;
+
+ case PRS_ENABLED:
+ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+ update_tasks_cpumask(parent);
+ break;
+
+ case PRS_ERROR:
+ /*
+ * When parent is invalid, it has to be too.
+ */
+ cp->partition_root_state = PRS_ERROR;
+ if (cp->nr_subparts_cpus) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ }
+ break;
+ }
+ }
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, new_cpus);
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus &&
+ (cp->partition_root_state != PRS_ENABLED)) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ } else if (cp->nr_subparts_cpus) {
+ /*
+ * Make sure that effective_cpus & subparts_cpus
+ * are mutually exclusive.
+ *
+ * In the unlikely event that effective_cpus
+ * becomes empty. we clear cp->nr_subparts_cpus and
+ * let its child partition roots to compete for
+ * CPUs again.
+ */
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+ cp->subparts_cpus);
+ if (cpumask_empty(cp->effective_cpus)) {
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cpumask_clear(cp->subparts_cpus);
+ cp->nr_subparts_cpus = 0;
+ } else if (!cpumask_subset(cp->subparts_cpus,
+ tmp->new_cpus)) {
+ cpumask_andnot(cp->subparts_cpus,
+ cp->subparts_cpus, tmp->new_cpus);
+ cp->nr_subparts_cpus
+ = cpumask_weight(cp->subparts_cpus);
+ }
+ }
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
@@ -932,11 +1398,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
update_tasks_cpumask(cp);
/*
- * If the effective cpumask of any non-empty cpuset is changed,
- * we need to rebuild sched domains.
+ * On legacy hierarchy, if the effective cpumask of any non-
+ * empty cpuset is changed, we need to rebuild sched domains.
+ * On default hierarchy, the cpuset needs to be a partition
+ * root as well.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- is_sched_load_balance(cp))
+ is_sched_load_balance(cp) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ is_partition_root(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -949,6 +1419,35 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
}
/**
+ * update_sibling_cpumasks - Update siblings cpumasks
+ * @parent: Parent cpuset
+ * @cs: Current cpuset
+ * @tmp: Temp variables
+ */
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *sibling;
+ struct cgroup_subsys_state *pos_css;
+
+ /*
+ * Check all its siblings and call update_cpumasks_hier()
+ * if their use_parent_ecpus flag is set in order for them
+ * to use the right effective_cpus value.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, pos_css, parent) {
+ if (sibling == cs)
+ continue;
+ if (!sibling->use_parent_ecpus)
+ continue;
+
+ update_cpumasks_hier(sibling, tmp);
+ }
+ rcu_read_unlock();
+}
+
+/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
@@ -958,6 +1457,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
+ struct tmpmasks tmp;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -989,12 +1489,50 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /*
+ * Use the cpumasks in trialcs for tmpmasks when they are pointers
+ * to allocated cpumasks.
+ */
+ tmp.addmask = trialcs->subparts_cpus;
+ tmp.delmask = trialcs->effective_cpus;
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
+ if (cs->partition_root_state) {
+ /* Cpumask of a partition root cannot be empty */
+ if (cpumask_empty(trialcs->cpus_allowed))
+ return -EINVAL;
+ if (update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp) < 0)
+ return -EINVAL;
+ }
+
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+
+ /*
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
+ */
+ if (cs->nr_subparts_cpus) {
+ cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
spin_unlock_irq(&callback_lock);
- /* use trialcs->cpus_allowed as a temp variable */
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
+ update_cpumasks_hier(cs, &tmp);
+
+ if (cs->partition_root_state) {
+ struct cpuset *parent = parent_cs(cs);
+
+ /*
+ * For partition root, update the cpumasks of sibling
+ * cpusets if they use parent's effective_cpus.
+ */
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+ }
return 0;
}
@@ -1348,7 +1886,95 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (spread_flag_changed)
update_tasks_flags(cs);
out:
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
+ return err;
+}
+
+/*
+ * update_prstate - update partititon_root_state
+ * cs: the cpuset to update
+ * val: 0 - disabled, 1 - enabled
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_prstate(struct cpuset *cs, int val)
+{
+ int err;
+ struct cpuset *parent = parent_cs(cs);
+ struct tmpmasks tmp;
+
+ if ((val != 0) && (val != 1))
+ return -EINVAL;
+ if (val == cs->partition_root_state)
+ return 0;
+
+ /*
+ * Cannot force a partial or invalid partition root to a full
+ * partition root.
+ */
+ if (val && cs->partition_root_state)
+ return -EINVAL;
+
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
+ err = -EINVAL;
+ if (!cs->partition_root_state) {
+ /*
+ * Turning on partition root requires setting the
+ * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
+ * cannot be NULL.
+ */
+ if (cpumask_empty(cs->cpus_allowed))
+ goto out;
+
+ err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
+ if (err)
+ goto out;
+
+ err = update_parent_subparts_cpumask(cs, partcmd_enable,
+ NULL, &tmp);
+ if (err) {
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ goto out;
+ }
+ cs->partition_root_state = PRS_ENABLED;
+ } else {
+ /*
+ * Turning off partition root will clear the
+ * CS_CPU_EXCLUSIVE bit.
+ */
+ if (cs->partition_root_state == PRS_ERROR) {
+ cs->partition_root_state = 0;
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ err = 0;
+ goto out;
+ }
+
+ err = update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, &tmp);
+ if (err)
+ goto out;
+
+ cs->partition_root_state = 0;
+
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+
+ /*
+ * Update cpumask of parent's tasks except when it is the top
+ * cpuset as some system daemons cannot be mapped to other CPUs.
+ */
+ if (parent != &top_cpuset)
+ update_tasks_cpumask(parent);
+
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+
+ rebuild_sched_domains_locked();
+out:
+ free_cpumasks(NULL, &tmp);
return err;
}
@@ -1498,10 +2124,8 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
cgroup_taskset_first(tset, &css);
- cs = css_cs(css);
mutex_lock(&cpuset_mutex);
css_cs(css)->attach_in_progress--;
@@ -1593,10 +2217,12 @@ typedef enum {
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
FILE_SCHED_RELAX_DOMAIN_LEVEL,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
@@ -1732,7 +2358,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
break;
}
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
@@ -1770,6 +2396,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_SUBPARTS_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ break;
default:
ret = -EINVAL;
}
@@ -1824,12 +2453,60 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
return 0;
}
+static int sched_partition_show(struct seq_file *seq, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(seq));
+
+ switch (cs->partition_root_state) {
+ case PRS_ENABLED:
+ seq_puts(seq, "root\n");
+ break;
+ case PRS_DISABLED:
+ seq_puts(seq, "member\n");
+ break;
+ case PRS_ERROR:
+ seq_puts(seq, "root invalid\n");
+ break;
+ }
+ return 0;
+}
+
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ int val;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ /*
+ * Convert "root" to ENABLED, and convert "member" to DISABLED.
+ */
+ if (!strcmp(buf, "root"))
+ val = PRS_ENABLED;
+ else if (!strcmp(buf, "member"))
+ val = PRS_DISABLED;
+ else
+ return -EINVAL;
+
+ css_get(&cs->css);
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ retval = update_prstate(cs, val);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ css_put(&cs->css);
+ return retval ?: nbytes;
+}
/*
* for the common functions, 'private' gives the type of file
*/
-static struct cftype files[] = {
+static struct cftype legacy_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
@@ -1932,6 +2609,60 @@ static struct cftype files[] = {
};
/*
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
+ */
+static struct cftype dfl_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "mems.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpus.partition",
+ .seq_show = sched_partition_show,
+ .write = sched_partition_write,
+ .private = FILE_PARTITION_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.subpartitions",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_SUBPARTS_CPULIST,
+ .flags = CFTYPE_DEBUG,
+ },
+
+ { } /* terminate */
+};
+
+
+/*
* cpuset_css_alloc - allocate a cpuset css
* cgrp: control group that the new cpuset will be part of
*/
@@ -1947,26 +2678,19 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+
+ if (alloc_cpumasks(cs, NULL)) {
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+ }
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
-
-free_cpus:
- free_cpumask_var(cs->cpus_allowed);
-free_cs:
- kfree(cs);
- return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1993,6 +2717,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
+ cs->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
@@ -2035,7 +2761,12 @@ out_unlock:
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_locked(). That is not needed
+ * in the default hierarchy where only changes in partition
+ * will cause repartitioning.
+ *
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
+ * turning 'sched.partition" off.
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
@@ -2044,9 +2775,20 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&cpuset_mutex);
- if (is_sched_load_balance(cs))
+ if (is_partition_root(cs))
+ update_prstate(cs, 0);
+
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if (cs->use_parent_ecpus) {
+ struct cpuset *parent = parent_cs(cs);
+
+ cs->use_parent_ecpus = false;
+ parent->child_ecpus_count--;
+ }
+
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
@@ -2057,9 +2799,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->cpus_allowed);
- kfree(cs);
+ free_cpuset(cs);
}
static void cpuset_bind(struct cgroup_subsys_state *root_css)
@@ -2105,8 +2845,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.fork = cpuset_fork,
- .legacy_cftypes = files,
+ .legacy_cftypes = legacy_files,
+ .dfl_cftypes = dfl_files,
.early_init = true,
+ .threaded = true,
};
/**
@@ -2121,6 +2863,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2227,20 +2970,29 @@ hotplug_update_tasks(struct cpuset *cs,
update_tasks_nodemask(cs);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
*
* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* all its tasks are moved to the nearest ancestor with both resources.
*/
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2255,9 +3007,60 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+ parent = parent_cs(cs);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+ if (cs->nr_subparts_cpus)
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus.
+ */
+ cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+ if (!tmp || !cs->partition_root_state)
+ goto update_tasks;
+
+ /*
+ * In the unlikely event that a partition root has empty
+ * effective_cpus or its parent becomes erroneous, we have to
+ * transition it to the erroneous state.
+ */
+ if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+ (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * If the effective_cpus is empty because the child
+ * partitions take away all the CPUs, we can keep
+ * the current partition and let the child partitions
+ * fight for available CPUs.
+ */
+ if ((parent->partition_root_state == PRS_ERROR) ||
+ cpumask_empty(&new_cpus)) {
+ update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, tmp);
+ cs->partition_root_state = PRS_ERROR;
+ }
+ cpuset_force_rebuild();
+ }
+
+ /*
+ * On the other hand, an erroneous partition root may be transitioned
+ * back to a regular one or a partition root with no CPU allocated
+ * from the parent may change to erroneous.
+ */
+ if (is_partition_root(parent) &&
+ ((cs->partition_root_state == PRS_ERROR) ||
+ !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+ cpuset_force_rebuild();
+update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
@@ -2271,13 +3074,6 @@ retry:
mutex_unlock(&cpuset_mutex);
}
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
- force_rebuild = true;
-}
-
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2300,6 +3096,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = is_in_v2_mode();
+ struct tmpmasks tmp, *ptmp = NULL;
+
+ if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ ptmp = &tmp;
mutex_lock(&cpuset_mutex);
@@ -2307,6 +3107,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
+ /*
+ * If subparts_cpus is populated, it is likely that the check below
+ * will produce a false positive on cpus_updated when the cpu list
+ * isn't changed. It is extra work, but it is better to be safe.
+ */
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
@@ -2315,6 +3120,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus. If no CPU is left,
+ * we clear the subparts_cpus & let the child partitions
+ * fight for the CPUs again.
+ */
+ if (top_cpuset.nr_subparts_cpus) {
+ if (cpumask_subset(&new_cpus,
+ top_cpuset.subparts_cpus)) {
+ top_cpuset.nr_subparts_cpus = 0;
+ cpumask_clear(top_cpuset.subparts_cpus);
+ } else {
+ cpumask_andnot(&new_cpus, &new_cpus,
+ top_cpuset.subparts_cpus);
+ }
+ }
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2343,7 +3164,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
continue;
rcu_read_unlock();
- cpuset_hotplug_update_tasks(cs);
+ cpuset_hotplug_update_tasks(cs, ptmp);
rcu_read_lock();
css_put(&cs->css);
@@ -2356,6 +3177,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
force_rebuild = false;
rebuild_sched_domains();
}
+
+ free_cpumasks(NULL, ptmp);
}
void cpuset_update_active_cpus(void)
@@ -2666,9 +3489,9 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_lock();
cgrp = task_cs(current)->css.cgroup;
- pr_info("%s cpuset=", current->comm);
+ pr_cont(",cpuset=");
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n",
+ pr_cont(",mems_allowed=%*pbl",
nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 9caeda610249..5f1b87330bee 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = {
* On v2, debug is an implicit controller enabled by "cgroup_debug" boot
* parameter.
*/
-static int __init enable_cgroup_debug(char *str)
+void __init enable_debug_cgroup(void)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
debug_cgrp_subsys.threaded = true;
- return 1;
}
-__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67ebc0a..c9960baaa14f 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
pids_uncharge(pids, 1);
}
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
{
struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.cancel_attach = pids_cancel_attach,
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
- .free = pids_free,
+ .release = pids_release,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
.threaded = true,
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index d3bbb757ee49..1d75ae7f1cb7 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge);
* If IB stack wish a device to participate in rdma cgroup resource
* tracking, it must invoke this API to register with rdma cgroup before
* any user space application can start using the RDMA resources.
- * Returns 0 on success or EINVAL when table length given is beyond
- * supported size.
*/
-int rdmacg_register_device(struct rdmacg_device *device)
+void rdmacg_register_device(struct rdmacg_device *device)
{
INIT_LIST_HEAD(&device->dev_node);
INIT_LIST_HEAD(&device->rpools);
@@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device)
mutex_lock(&rdmacg_mutex);
list_add_tail(&device->dev_node, &rdmacg_devices);
mutex_unlock(&rdmacg_mutex);
- return 0;
}
EXPORT_SYMBOL(rdmacg_register_device);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d503d1a9007c..bb95a35e8c2d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
if (pos == root)
return NULL;
@@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
- if (parent && rstatc->updated_next) {
+ if (rstatc->updated_next) {
+ struct cgroup *parent = cgroup_parent(pos);
struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
struct cgroup_rstat_cpu *nrstatc;
struct cgroup **nextp;
@@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* updated stat.
*/
smp_mb();
+
+ return pos;
}
- return pos;
+ /* only happens for @root */
+ return NULL;
}
/* see cgroup_rstat_flush() */