From 6a07814ff643b5c8e1353d8c6229f52fde205cde Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 5 Jun 2026 15:53:16 +0200 Subject: kernfs: fix xattr race condition with multiple superblocks Multiple superblocks with different namespaces can share the same kernfs_node when kernfs_test_super() finds a matching root but different namespace. This means multiple inodes from different superblocks can reference the same kernfs_node->iattr->xattrs structure. The VFS layer only holds per-inode locks during xattr operations, which is insufficient to serialize concurrent xattr modifications on the shared kernfs_node. This can lead to race conditions in simple_xattr_set() where the lookup->replace/remove sequence is not atomic with respect to operations from other superblocks. Fix this by protecting xattr operations with the existing hashed kernfs_locks->open_file_mutex[] array, which is already used to protect per-node open file data. The hashed mutex array provides scalable per-node serialization (scaled by CPU count, up to 1024 locks on 32+ CPU systems) with zero memory overhead. Changes: - Rename open_file_mutex[] to node_mutex[] to reflect dual purpose - Add kernfs_node_lock_ptr() and kernfs_node_lock() helpers - Protect simple_xattr_set() calls in kernfs_xattr_set() and kernfs_vfs_user_xattr_set() with the hashed mutex - Update file.c to use new helpers via compatibility wrappers - Update documentation to explain the extended lock usage Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Reported-by: Sashiko Closes: https://sashiko.dev/#/patchset/20260601162454.2116375-1-mszeredi%40redhat.com Assisted-by: Claude:claude-sonnet-4-5 Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260605135322.2632068-2-mszeredi@redhat.com Signed-off-by: Christian Brauner (Amutable) --- include/linux/kernfs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index e21b2f7f4159..351a5101c862 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -76,20 +76,25 @@ struct kernfs_iattrs; * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is - * protected by kernfs_global_locks.open_file_mutex[i]. + * protected by kernfs_global_locks.node_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single - * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node + * locks, use an array of locks (e.g. node_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * + * The hashed mutex array protects per-node data: the kernfs_open_node for + * open file management, and kernfs_node xattr operations (necessary because + * multiple superblocks with different namespaces can share the same + * kernfs_node, making per-inode locking insufficient). + * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { - struct mutex open_file_mutex[NR_KERNFS_LOCKS]; + struct mutex node_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { -- cgit v1.2.3 From 076e5cef28e27febfc09b5f72544d2b857c75201 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 5 Jun 2026 15:53:18 +0200 Subject: simple_xattr: change interface to pass struct simple_xattrs ** Change the simple_xattr API to accept pointer-to-pointer (struct simple_xattrs **) instead of pointer. This allows the functions to handle lazy allocation internally without requiring callers to use simple_xattrs_lazy_alloc(). The simple_xattr_set(), simple_xattr_set_limited() and simple_xattr_add() functions now handle allocation when xattrs is NULL. simple_xattrs_free() now also frees the xattrs structure itself and sets the pointer to NULL. This simplifies callers and removes the need for most callers to explicitly manage xattrs allocation and lifetime. In shmem_initxattrs(), the total required space for all initial xattrs (ispace) is pre-calculated and deducted from sbinfo->free_ispace. Since this patch modifies the function to add new xattrs directly to the inode's &info->xattrs list rather than using a local temporary variable, a failure means that the partially populated info->xattrs list remains attached to the inode. When the VFS caller handles the -ENOMEM error, it drops the newly created inode via iput(), shmem_free_inode() adds freed to sbinfo->free_ispace a second time, permanently inflating the tmpfs free space quota. Fix by substracting already added xattrs from ispace. Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260605135322.2632068-4-mszeredi@redhat.com Signed-off-by: Christian Brauner (Amutable) --- include/linux/xattr.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 8b6601367eae..ded446c1ef81 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -133,26 +133,23 @@ static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) } int simple_xattrs_init(struct simple_xattrs *xattrs); -struct simple_xattrs *simple_xattrs_alloc(void); -struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, - const void *value, int flags); -void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); +void simple_xattrs_free(struct simple_xattrs **xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); void simple_xattr_free_rcu(struct simple_xattr *xattr); -int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, +int simple_xattr_get(struct simple_xattrs **xattrs, const char *name, void *buffer, size_t size); -struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, +struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrs, const char *name, const void *value, size_t size, int flags); -int simple_xattr_set_limited(struct simple_xattrs *xattrs, +int simple_xattr_set_limited(struct simple_xattrs **xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags); -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrs, char *buffer, size_t size); -int simple_xattr_add(struct simple_xattrs *xattrs, +int simple_xattr_add(struct simple_xattrs **xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); @@ -162,10 +159,4 @@ DEFINE_CLASS(simple_xattr, simple_xattr_alloc(value, size), const void *value, size_t size) -DEFINE_CLASS(simple_xattrs, - struct simple_xattrs *, - if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, - simple_xattrs_alloc(), - void) - #endif /* _LINUX_XATTR_H */ -- cgit v1.2.3 From 1e7cd8a53b72a58a44c4d282aed95f6ce0e76db0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 5 Jun 2026 15:53:19 +0200 Subject: simpe_xattr: use per-sb cache Move the hash table to the super block to remove excessive overhead in case of small number of xattrs per inode. Add linked list to the inode, used for listxattr and eviction. Listxattr uses rcu protection to iterate the list of xattrs. Before being made per-sb, lazy allocation was protected by inode lock. Now inode lock no longer provides sufficient exclusion, so use cmpxchg() to ensure atomicity. Though I haven't found a description of this pattern, after some research it seems that cmpxchg_release() and READ_ONCE() should provide the necessary memory barriers. Use simple_xattr_free_rcu() in simple_xattrs_free(). This is needed because the hash table is now shared between inodes and lookup on a different inode might be running the compare function on the just freed element within the RCU grace period. Following stats are based on slabinfo diff, after creating 100k empty files, then adding a "user.test=foo" xattr to each: v7.0 (no rhashtable): File creation: 993.40 bytes/file Xattr addition: 79.99 bytes/file v7.1-rc2 (per-inode rhashtable): File creation: 939.73 bytes/file Xattr addition: 1296.08 bytes/file v7.1-rc2 + this patch (per-sb rhashtable) File creation: 946.84 bytes/file Xattr addition: 111.86 bytes/file The overhead of a single xattr is reduced to nearly v7.0 levels. The per xattr overhead is slightly larger due to the addition of three pointers to struct simple_xattr. Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260605135322.2632068-5-mszeredi@redhat.com Signed-off-by: Christian Brauner (Amutable) --- include/linux/shmem_fs.h | 3 ++- include/linux/xattr.h | 26 ++++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 93a0ba872ebe..69b0177da156 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,7 +48,7 @@ struct shmem_inode_info { }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct simple_xattrs *xattrs; /* list of xattrs */ + struct list_head xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ @@ -89,6 +89,7 @@ struct shmem_sb_info { struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ + struct simple_xattr_cache xa_cache; }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) diff --git a/include/linux/xattr.h b/include/linux/xattr.h index ded446c1ef81..7aaaf4f8aff5 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -106,12 +106,14 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) return handler->prefix ?: handler->name; } -struct simple_xattrs { - struct rhashtable ht; +struct simple_xattr_cache { + struct rhashtable *ht; }; struct simple_xattr { struct rhash_head hash_node; + struct list_head *parent; + struct list_head node; struct rcu_head rcu; char *name; size_t size; @@ -132,27 +134,31 @@ static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) atomic_set(&limits->xattr_size, 0); } -int simple_xattrs_init(struct simple_xattrs *xattrs); -void simple_xattrs_free(struct simple_xattrs **xattrs, size_t *freed_space); +void simple_xattrs_free(struct simple_xattr_cache *cache, struct list_head *xattrs, + size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); void simple_xattr_free_rcu(struct simple_xattr *xattr); -int simple_xattr_get(struct simple_xattrs **xattrs, const char *name, - void *buffer, size_t size); -struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrs, +int simple_xattr_get(struct simple_xattr_cache *cache, struct list_head *xattrs, + const char *name, void *buffer, size_t size); +struct simple_xattr *simple_xattr_set(struct simple_xattr_cache *cache, + struct list_head *xattrs, const char *name, const void *value, size_t size, int flags); -int simple_xattr_set_limited(struct simple_xattrs **xattrs, +int simple_xattr_set_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags); -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrs, +ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size); -int simple_xattr_add(struct simple_xattrs **xattrs, +int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); +void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache); + DEFINE_CLASS(simple_xattr, struct simple_xattr *, if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), -- cgit v1.2.3 From 9722955b54307e9070994f2382ec06af3d7405e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2026 09:40:12 +0200 Subject: bpf: Add simple xattr support to bpffs Add support for extended attributes on bpffs inodes so that user space and BPF LSM programs can attach metadata, for example, a content hash or a security label - to a pinned object or directory. BPF LSM or user space tooling can then uniformly look at this (e.g. security.bpf.*) in similar way to other fs'es. The store is in-memory and non-persistent: it lives only for the lifetime of the mount, like everything else in bpffs. The modelling is similar to tmpfs. bpffs serves the trusted.* and security.* namespaces; user.* is left unsupported. As bpffs is FS_USERNS_MOUNT, security.* is reachable by the unprivileged mounter in a user namespace, and thus we are using the simple_xattr_set_limited infra there (trusted.* needs global CAP_SYS_ADMIN). bpf_fill_super() is open-coded instead of using simple_fill_super(), because the root inode must now be allocated through bpf_fs_alloc_inode() i.e. carry the bpf_fs_inode wrapper and come from the right cache - which requires s_op (and s_xattr) to be installed before the first inode is created. While at it, also harden s_iflags with SB_I_NOEXEC and SB_I_NODEV. bpf_fs_listxattr() is only reachable through the filesystem via i_op->listxattr, so the BPF token inode is left untouched. Name-based fsetxattr()/fgetxattr() on a token fd still work since the get/set handlers are installed at the superblock. For security.* namespace, we use simple_xattr_set_limited() but there was no simple_xattr_add_limited() API yet which was needed in bpf_fs_initxattrs() to avoid underflows in the accounting. The symlink target is freed in bpf_free_inode() rather than in bpf_destroy_inode() so that it is released only after an RCU grace period, as an RCU path walk following the symlink may still dereference inode->i_link in security_inode_follow_link(). Lastly, the bpf_symlink() allocated the symlink target is switched to GFP_KERNEL_ACCOUNT, so the string is charged to the caller's memcg. Signed-off-by: Daniel Borkmann Link: https://patch.msgid.link/20260602074012.416289-1-daniel@iogearbox.net Cc: Christian Brauner Signed-off-by: Christian Brauner (Amutable) --- include/linux/bpf.h | 3 +++ include/linux/xattr.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..434ba91401c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1918,6 +1919,8 @@ struct bpf_mount_opts { u64 delegate_maps; u64 delegate_progs; u64 delegate_attachs; + + struct simple_xattr_cache xa_cache; }; struct bpf_token { diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 7aaaf4f8aff5..54ac3cbc133f 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -155,6 +155,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr); +int simple_xattr_add_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, + struct simple_xattr_limits *limits, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache); -- cgit v1.2.3