From 0e0e490f5d5ec2f91209b77a95f9c7185d97cfc6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 30 Apr 2026 15:42:43 -0400 Subject: VFS: use wait_var_event for waiting in d_alloc_parallel() Parallel lookup starts with a call of d_alloc_parallel(). That primitive either returns a matching hashed dentry or allocates a new one in the in-lookup state and returns it to the caller. Once the caller is done with lookup, it indicates so either by call of d_{splice_alias,add}() or by call of d_done_lookup(); at that point dentry leaves the in-lookup state. If d_alloc_parallel() finds a matching in-lookup dentry, it must wait for that dentry to leave the in-lookup state, one way or another. Currently by supplying wait_queue_head to d_alloc_parallel(). If d_alloc_parallel() creates a new in-lookup dentry, the address of that wait_queue_head is stored in ->d_wait of new dentry and stays there while it's in the in-lookup; subsequent d_alloc_parallel() will wait on the queue found in the matching in-lookup dentry. Transition out of in-lookup state wakes waiters on that queue (if any). That works, but the calling conventions are inconvenient - the caller must supply wait_queue_head and make sure that it survives at least until the new in-lookup dentry leaves the in-lookup state. That amounts to boilerplate in the d_alloc_parallel() callers that are followed by a call of d_lookup_done() in the same function; in cases like nfs asynchronous unlink it gets worse than that. This patch changes d_alloc_parallel() to use wake_up_var_locked() to wake up waiters, and wait_var_event_spinlock() to wait. dentry->d_lock is used for synchronisation as it is already held and the relevant times. That eliminates the need of caller-supplied wait_queue_head, simplifying the calling conventions. Better yet, we only need one bit of information stored in dentry itself: whether there are any waiters to be woken up, and that can be easily stored in ->d_flags; ->d_wait goes away. The reason we need that bit (DCACHE_LOOKUP_WAITERS) is that with wait_var machinery the queues are shared with all kinds of stuff and there's no way tell if any of the waiters have anything to do with our dentry; most of the time none of them will be relevant, so we need to avoid the pointless wakeups. Another benefit of the new scheme comes from the fact that wakeups have to be done outside of write-side critical areas of ->i_dir_seq; with the old scheme we need to carry the value picked from ->d_wait from __d_lookup_unhash() to the place where we actually wake the waiters up. Now we can just leave DCACHE_LOOKUP_WAITERS in ->d_flags until we get to doing wakeups - that's done within the same ->d_lock scope, so we are fine; new bit is accessed only under ->d_lock and it's seen only on dentries with DCACHE_PAR_LOOKUP in ->d_flags. __d_lookup_unhash() no longer needs to re-init ->d_lru. That was previously shared (in a union) with ->d_wait but ->d_wait is now gone so it no longer corrupts ->d_lru. Co-developed-by: Al Viro # saner handling of flags Signed-off-by: NeilBrown Signed-off-by: Al Viro --- include/linux/dcache.h | 11 +++++------ include/linux/nfs_xdr.h | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2577c05f84ec..97a887be150a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -116,10 +116,7 @@ struct dentry { * possible! */ - union { - struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ - }; + struct list_head d_lru; /* LRU list */ struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* @@ -210,6 +207,9 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), + DCACHE_LOOKUP_WAITERS = BIT(9), /* A thread is waiting for + * PAR_LOOKUP to clear + */ DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* @@ -256,8 +256,7 @@ extern void d_delete(struct dentry *); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); -extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); +extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); /* weird procfs mess; *NOT* exported */ extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index fcbd21b5685f..6aced49d5f00 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1743,7 +1743,6 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; -- cgit v1.2.3 From 1a967a7ec70ff951716e84739f79b6e167ac1e0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 May 2026 23:00:09 -0400 Subject: fix a race between d_find_any_alias() and final dput() of NORCU dentries Refcount of a NORCU dentry must not be incremented after having dropped to zero. Otherwise we might end up with the following race: CPU1: in fast_dput(d), rcu_read_lock(); CPU1: decrements refcount of d to 0 CPU1: notice that it's unhashed CPU2: grab a reference to d CPU2: dput(d), freeing d CPU1: ... looks like we need to evict d, let's grab ->d_lock, recheck the refcount, etc. and that spin_lock(&d->d_lock) ends up a UAF, despite still being in an RCU read-side critical area started back when the refcount had been positive. If not for DCACHE_NORCU in d->d_flags freeing would've been RCU-delayed, so we'd have grabbed ->d_lock, noticed the negative value stored into refcount by __dentry_kill(), dropped the locks and that would be it. For NORCU dentries freeing is _not_ delayed, though. Most of the non-counting references are excluded for NORCU dentries - they are not allowed to be hashed, they never get placed on LRU, they never get placed into anyone's list of children and while dput_to_list() might put them into a shrink list, nobody bumps refcount of something that had been reached that way. However, inode's list of aliases can be a problem - it does not contribute to dentry refcount (for obvious reasons) and we *do* have places that grab references to something found on that list - that's precisely what d_find_alias() is. In case of d_find_alias() we are safe - it skips unhashed aliases, so all NORCU ones are ignored there. d_find_any_alias() is *not* limited to hashed ones, though, and while it's usually called for directories (which never get NORCU dentries), there are callers that use it to get something for non-directories with no hashed aliases. Having d_find_any_alias() hit a NORCU dentry is not impossible - it can be easily arranged if you have CAP_DAC_READ_SEARCH (memfd_create() + mmap() + name_to_handle_at() for /proc/self/map_files/<...> + munmap() + open_by_handle_at() will do that, and adding a second memfd_create() for mount_fd makes it possible to do that without having memfd pinned). The race window is narrow, and it's probably not feasible on bare hardware, but... It's not hard to fix, fortunately: * separate __d_find_dir_alias() (== current __d_find_any_alias()) to be used for directory inodes. * provide dget_alias_ilocked() that would return false for NORCU dentries with zero refcount and return true incrementing refcount otherwise * make __d_find_any_alias() go over the list of aliases, using dget_alias_ilocked() and returning the alias it succeeds on (normally the first one). Any NORCU alias with zero refcount is going to be evicted by the thread that had dropped the final reference; this makes __d_find_any_alias() pretend it had lost the race with eviction. Signed-off-by: Al Viro --- include/linux/dcache.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 97a887be150a..a3409de3f490 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -365,6 +365,24 @@ static inline struct dentry *dget(struct dentry *dentry) return dentry; } +/* dentry->d_inode->i_lock must be held by caller */ +static inline bool dget_alias_ilocked(struct dentry *dentry) +{ + if (likely(!(READ_ONCE(dentry->d_flags) & DCACHE_NORCU))) { + lockref_get(&dentry->d_lockref); + return true; + } + // NORCU dentries with zero refcount MUST NOT be grabbed + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 0) { + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + return true; + } + spin_unlock(&dentry->d_lock); + return false; +} + extern struct dentry *dget_parent(struct dentry *dentry); /** -- cgit v1.2.3 From 67132b5e5de8a01493ddbb217e936415ca4e09af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Apr 2026 23:39:16 -0400 Subject: kill d_dispose_if_unused() Rename to_shrink_list() into __move_to_shrink_list(), document and export it. Switch d_dispose_if_unused() users to that and kill d_dispose_if_unused() itself. Signed-off-by: Al Viro --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a3409de3f490..4b1ff99608e0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -280,7 +280,7 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); -extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern bool __move_to_shrink_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); -- cgit v1.2.3 From e9895609cb7f9d9712c5b20044edff5f196eec1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2026 18:39:03 -0400 Subject: wind ->s_roots via ->d_sib instead of ->d_hash shrink_dcache_for_umount() is supposed to handle the possibility of some of the dentries to be evicted being in other threads shrink lists; it either kills them, leaving an empty husk to be freed by the owner of shrink list whenever it gets around to that, or it waits for the eviction in progress to get completed. That relies upon dentry remaining attached to the tree until the eviction reaches dentry_unlist() and its ->d_sib gets removed from the list. Unfortunately, the secondary roots are linked via ->d_hash, rather than ->d_sib and they become removed from that list before their inode references are dropped. If shrink_dentry_list() from another thread ends up evicting one of the secondary roots and gets to that point in dentry_kill() when shrink_dcache_for_umount() is looking for secondary roots, the latter will *not* notice anything, possibly leading to warnings about busy inodes at umount time and all kinds of breakage after that. Moreover, shrink_dcache_for_umount() walks the list of secondary roots with no protection whatsoever, so it might end up calling dget() on a dentry that already passed through lockref_mark_dead(&dentry->d_lockref); ending up with corrupted refcount and possible UAF. AFAICS, the most straightforward way to deal with that would be to have secondary roots linked via ->d_sib rather than ->d_hash; then they would remain on the list until killed, and we could use d_add_waiter() machinery to wait for eviction in progress. Changes: * secondary roots look the same as ->s_root from d_unhashed() and d_unlinked() POV now. * secondary roots are represented as "no parent, but on ->d_sib" instead of "no parent, but on ->d_hash". * since ->d_sib is a plain hlist, we protect it with per-superblock spinlock (sb->s_roots_lock) instead of the LSB of the head pointer (for non-root dentries it would be protected by ->d_lock of parent). * __d_obtain_alias() uses ->d_sib for linkage when allocating a secondary root. * d_splice_alias_ops() detects splicing of a secondary root and removes it from the list before calling __d_move(). * dentry_unlist() detects eviction of a secondary root and removes it from the list; no need to play the games for d_walk() sake, since the latter is not going to look for the next sibling of those anyway. * ___d_drop() doesn't care about ->s_roots anymore. * shrink_dcache_for_umount() uses proper locking for access to the list of secondary roots and if it runs into one that is in the middle of eviction waits for that to finish. Signed-off-by: Al Viro --- include/linux/fs/super_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..23d1c2612d0c 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -162,7 +162,8 @@ struct super_block { struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct hlist_head s_roots; /* alternate root dentries for NFS */ + spinlock_t s_roots_lock; struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; -- cgit v1.2.3