From 009bcbd0b201d4dc125eb960a61cb6d4d9fdfc72 Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 14 May 2026 14:50:32 +0800 Subject: cgroup/rdma: add rdma.events to track resource limit exhaustion Add per-device hierarchical event counters to track when RDMA resource limits are exceeded. The rdma.events file reports max event counts propagated upward from the cgroup whose limit was hit to all ancestors. This mirrors the design of pids.events, where events are attributed to the cgroup that imposed the limit, not necessarily the cgroup where the allocation was attempted. Userspace can monitor this file via poll/epoll for real-time notification of resource exhaustion. Signed-off-by: Tao Cui Signed-off-by: Tejun Heo --- include/linux/cgroup_rdma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index 80edae03c313..ac691fe7d3f5 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -24,6 +24,9 @@ struct rdma_cgroup { * that belongs to this cgroup. */ struct list_head rpools; + + /* Handle for rdma.events */ + struct cgroup_file events_file; }; struct rdmacg_device { -- cgit v1.2.3 From aefe4847f0891e2e71bedf5478d1cf350f86fc61 Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 14 May 2026 14:50:33 +0800 Subject: cgroup/rdma: add rdma.events.local for per-cgroup allocation failure attribution Add per-cgroup local event counters to track RDMA resource limit exhaustion from the perspective of individual cgroups. The rdma.events.local file reports two per-resource counters: - max: number of times this cgroup's limit was the one that blocked an allocation in the subtree - alloc_fail: number of allocation attempts originating from this cgroup that failed due to an ancestor's limit This mirrors the design of pids.events.local, where events are attributed to the cgroup that imposed the limit, not necessarily the cgroup where the allocation was attempted. 
Also extend rdma.events with a hierarchical alloc_fail counter that tracks allocation failures propagating upward from the requesting cgroup, complementing the existing max counter, so that rdma.events and rdma.events.local share the same output format. Signed-off-by: Tao Cui Signed-off-by: Tejun Heo --- include/linux/cgroup_rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index ac691fe7d3f5..404e746552ca 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -25,8 +25,9 @@ struct rdma_cgroup { */ struct list_head rpools; - /* Handle for rdma.events */ + /* Handles for rdma.events[.local] */ struct cgroup_file events_file; + struct cgroup_file events_local_file; }; struct rdmacg_device { -- cgit v1.2.3 From 3360a5c16d87933fb74b530f5e016eb3dfffee5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:17 -1000 Subject: cgroup: Inline cgroup_has_tasks() in cgroup.h cpuset reads cs->css.cgroup->nr_populated_csets directly in two places to test whether a cgroup has tasks. cgroup.c already has a matching helper, cgroup_has_tasks(). Move it to cgroup.h as static inline and use that instead. This is to prepare for relocation of cgroup->nr_populated_csets. No semantic change. 
Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..ceb87507667e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -639,6 +639,11 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } +static inline bool cgroup_has_tasks(struct cgroup *cgrp) +{ + return cgrp->nr_populated_csets; +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { -- cgit v1.2.3 From 44fabf05634ce9e90b3fb179ea962995b7bbaa09 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:18 -1000 Subject: cgroup: Annotate unlocked nr_populated_* accesses with READ_ONCE/WRITE_ONCE cgroup_update_populated() updates nr_populated_csets, nr_populated_domain_children, and nr_populated_threaded_children under css_set_lock, but cgroup_has_tasks(), cgroup_is_populated(), and cgroup_can_be_thread_root() read them without holding it. Use READ_ONCE/WRITE_ONCE. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ceb87507667e..9f8bef8f3a60 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -639,16 +639,29 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } +/* + * Populated counters: writes happen under css_set_lock. The accessors below + * may read unlocked. What an unpopulated result means depends on context: + * + * - No lock held. Just a snapshot. May race with concurrent updates and is + * useful only as a hint. + * + * - cgroup_mutex held. 
Migration into the cgroup is blocked, so an observed + * !populated stays !populated until cgroup_mutex is dropped. + * + * - CSS_DYING set. The css can no longer be repopulated, so !populated is + * sticky once observed. + */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { - return cgrp->nr_populated_csets; + return READ_ONCE(cgrp->nr_populated_csets); } -/* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + - cgrp->nr_populated_threaded_children; + return READ_ONCE(cgrp->nr_populated_csets) + + READ_ONCE(cgrp->nr_populated_domain_children) + + READ_ONCE(cgrp->nr_populated_threaded_children); } /* returns ino associated with a cgroup */ -- cgit v1.2.3 From c4799253a3ee74ebb27be72fb991c597a5902c01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:19 -1000 Subject: cgroup: Move populated counters to cgroup_subsys_state Later patches replace the cgroup-level finish_destroy_work deferral added by 93618edf7538 ("cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated") with a per-subsys-css deferral. That needs each subsystem css to track its own populated count. Move the populated counters from cgroup onto cgroup_subsys_state. cgroup->self is itself a cgroup_subsys_state and self.parent walks the same chain as cgroup_parent(), so cgroup_update_populated() generalizes to a single css_update_populated() taking a css. The cgroup-side bookkeeping runs only when the walk started from a self css. Keep nr_populated_{domain,threaded}_children on cgroup. Together they sum to self.nr_populated_children, but they stay as dedicated fields so that readers like cgroup_can_be_thread_root() retain unlocked access. css_set_update_populated() also walks the per-subsys-css chain so each subsystem css's hierarchical populated count is maintained. No reader consumes those counts yet. 
Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 24 ++++++++++++++---------- include/linux/cgroup.h | 11 +++++++---- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 50a784da7a81..c4929f7bbe5a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -253,6 +253,15 @@ struct cgroup_subsys_state { */ int nr_descendants; + /* + * Hierarchical populated state. For cgroup->self, nr_populated_csets + * counts populated csets linked via cgrp_cset_link. + * nr_populated_children counts immediate-child csses whose own + * populated state is nonzero. Protected by css_set_lock. + */ + int nr_populated_csets; + int nr_populated_children; + /* * A singly-linked list of css structures to be rstat flushed. * This is a scratch field to be used exclusively by @@ -504,17 +513,12 @@ struct cgroup { int max_descendants; /* - * Each non-empty css_set associated with this cgroup contributes - * one to nr_populated_csets. The counter is zero iff this cgroup - * doesn't have any tasks. - * - * All children which have non-zero nr_populated_csets and/or - * nr_populated_children of their own contribute one to either - * nr_populated_domain_children or nr_populated_threaded_children - * depending on their type. Each counter is zero iff all cgroups - * of the type in the subtree proper don't have any tasks. + * Domain/threaded split of self.nr_populated_children: each counts + * immediate-child cgroups whose subtree is populated and sums to + * self.nr_populated_children. Kept as separate fields to allow readers + * like cgroup_can_be_thread_root() unlocked access. Protected by + * css_set_lock; updated by css_update_populated(). 
*/ - int nr_populated_csets; int nr_populated_domain_children; int nr_populated_threaded_children; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9f8bef8f3a60..c2a8c38d8206 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -654,14 +654,17 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { - return READ_ONCE(cgrp->nr_populated_csets); + return READ_ONCE(cgrp->self.nr_populated_csets); +} + +static inline bool css_is_populated(struct cgroup_subsys_state *css) +{ + return READ_ONCE(css->nr_populated_csets) || READ_ONCE(css->nr_populated_children); } static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return READ_ONCE(cgrp->nr_populated_csets) + - READ_ONCE(cgrp->nr_populated_domain_children) + - READ_ONCE(cgrp->nr_populated_threaded_children); + return css_is_populated(&cgrp->self); } /* returns ino associated with a cgroup */ -- cgit v1.2.3 From cfc1da7e1127b4c8787f4dc25d59987c10c9107f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:20 -1000 Subject: cgroup: Add per-subsys-css kill_css_finish deferral 93618edf7538 ("cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated") deferred kill_css_finish() at the cgroup level: rmdir waits for the entire cgroup's populated count to drop to zero, then fires kill_css_finish() on every subsystem css at once. Replace that with per-subsys-css deferral. Each subsystem css now tracks its own hierarchical populated count and independently defers its kill_css_finish() until its own subtree drains. The rmdir-race fix carries through unchanged in shape. The dying css's ->css_offline() still waits until no PF_EXITING task references it, and v2's cgroup-level machinery goes away. cgroup_apply_control_disable() has the same race shape (PF_EXITING tasks pinning a css whose ->css_offline() is about to run) and stays synchronous here. 
This patch lays the groundwork for fixing it - per-cgroup waiting can't gate one subsys css being killed while the rest of the cgroup stays live, but per-css can. Subtree-wide invariant preserved: a dying ancestor css stays populated through nr_populated_children until every dying descendant's task drains, so the walker fires the ancestor's kill_finish_work only after all descendants have drained. Add paired smp_mb()s in kill_css_sync() and css_update_populated() to fence the StoreLoad on (CSS_DYING, populated counter), guaranteeing that either the walker queues kill_finish_work or the caller fires synchronously. cgroup_destroy_locked() was implicitly fenced by an unrelated css_set_lock pair; cgroup_apply_control_disable() in the next patch is not. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c4929f7bbe5a..de2cd6238c2a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -262,6 +262,9 @@ struct cgroup_subsys_state { int nr_populated_csets; int nr_populated_children; + /* deferred kill_css_finish() queued by css_update_populated() */ + struct work_struct kill_finish_work; + /* * A singly-linked list of css structures to be rstat flushed. * This is a scratch field to be used exclusively by @@ -615,9 +618,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* defers killing csses after removal until cgroup is depopulated */ - struct work_struct finish_destroy_work; - /* used to schedule release agent */ struct work_struct release_agent_work; -- cgit v1.2.3