summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShakeel Butt <shakeel.butt@linux.dev>2026-06-01 09:15:01 -0700
committerAndrew Morton <akpm@linux-foundation.org>2026-06-03 16:25:51 -0700
commit98733f3f0becb1ae0701d021c1748e974e5fa55c (patch)
tree7aa0ce9e3f3913f50ec3f07b57085b06880458ec
parent43e7f189769c512c843184a8a5892ac779a6bd90 (diff)
mm/list_lru: drain before clearing xarray entry on reparent
memcg_reparent_list_lrus() clears the dying memcg's xarray entry with xas_store(&xas, NULL) before reparenting its per-node lists into the parent. This opens a window where a concurrent list_lru_del() arriving for the dying memcg sees xa_load() == NULL, walks to the parent in lock_list_lru_of_memcg(), takes the parent's per-node lock, and calls list_del_init() on an item still physically linked on the dying memcg's list. If another in-flight thread holds the dying memcg's per-node lock at the same moment (another list_lru_del, or a list_lru_walk_one running an isolate callback), both threads modify ->next/->prev pointers on the same physical list under different locks. Adjacent items can corrupt each other's links. Fix it by reversing the order: reparent each per-node list and mark the child's list lru dead and then clear the xarray entry. Any concurrent list_lru op that finds the still-set xarray entry either takes the dying memcg's per-node lock (synchronizing with the drain) or sees LONG_MIN and walks to the parent, where the items now live. Link: https://lore.kernel.org/20260601161501.1444829-1-shakeel.butt@linux.dev Fixes: fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev> Reported-by: Chris Mason <clm@fb.com> Reviewed-by: Kairui Song <kasong@tencent.com> Acked-by: Muchun Song <muchun.song@linux.dev> Cc: Dave Chinner <david@fromorbit.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r--mm/list_lru.c21
1 files changed, 12 insertions, 9 deletions
diff --git a/mm/list_lru.c b/mm/list_lru.c
index dd29bcf8eb5f..9bf7f524796b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -473,26 +473,29 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &memcg_list_lrus, list) {
struct list_lru_memcg *mlru;
- XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
/*
- * Lock the Xarray to ensure no on going list_lru_memcg
- * allocation and further allocation will see css_is_dying().
+ * css_is_dying() check in memcg_list_lru_alloc() avoids
+ * allocating a new mlru since CSS_DYING is already set for this
+ * memcg a rcu grace period ago.
*/
- xas_lock_irq(&xas);
- mlru = xas_store(&xas, NULL);
- xas_unlock_irq(&xas);
+ mlru = xa_load(&lru->xa, memcg->kmemcg_id);
if (!mlru)
continue;
/*
- * With Xarray value set to NULL, holding the lru lock below
- * prevents list_lru_{add,del,isolate} from touching the lru,
- * safe to reparent.
+ * Reparent each per-node list and mark the child dead
+ * (LONG_MIN) before clearing xarray entry otherwise a
+ * concurrent list_lru_del() may corrupt the list if it arrives
+ * after xarray clear but before reparenting as
+ * lock_list_lru_of_memcg will acquire parent's lock while the
+ * item is still on child's list.
*/
for_each_node(i)
memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent);
+ xa_erase_irq(&lru->xa, memcg->kmemcg_id);
+
/*
* Here all list_lrus corresponding to the cgroup are guaranteed
* to remain empty, we can safely free this lru, any further