From f3cf725cd284b7912d5522babb44721bf38c8887 Mon Sep 17 00:00:00 2001 From: Nan Li Date: Mon, 22 Jun 2026 10:08:35 +0100 Subject: afs: handle CB.InitCallBackState3 requests without a server record The cache manager callback path now attaches the server record to an incoming call through the rxrpc peer's app data. That association is not guaranteed to exist for every callback request, and most callback handlers already tolerate that case. Make CB.InitCallBackState3 follow the same pattern by checking whether a server record was attached before using it. If the peer is not mapped to a server record, trace the request and ignore it, matching the existing behaviour for other unmatched callback requests. This keeps the callback handler consistent with the rest of the cache manager service and avoids depending on peer state that may not be available for a given request. Fixes: 40e8b52fe8c8 ("afs: Use the per-peer app data provided by rxrpc") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Nan Li Signed-off-by: Ren Wei Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-2-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/cmservice.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 5540ae1cad59..263c60c811a5 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -364,6 +364,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); + if (!call->server) { + trace_afs_cm_no_server_u(call, call->request); + return 0; + } + if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) { pr_notice("Callback UUID does not match fileserver UUID\n"); trace_afs_cm_no_server_u(call, call->request); -- 
cgit v1.2.3 From 539dce1144651f7976fa418e618b0b574bf15eeb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 14:52:18 +0200 Subject: fs: refuse O_TMPFILE creation with an unmapped fsuid or fsgid vfs_tmpfile() never checked that the caller's fsuid and fsgid map into the filesystem. On an idmapped mount whose idmapping does not cover the caller's fs{u,g}id, the ->tmpfile() instance initializes the new inode through inode_init_owner(), where mapped_fsuid()/mapped_fsgid() return INVALID_UID/INVALID_GID, and the tmpfile ends up owned by (uid_t)-1. Every other creation path already refuses this: may_o_create() (O_CREAT) and may_create_dentry() (mkdir, mknod, symlink, link) bail out with -EOVERFLOW via fsuidgid_has_mapping() precisely so that an object cannot be created with an owner the filesystem cannot represent. An O_TMPFILE is no exception: it is created I_LINKABLE and linkat(2) can splice it into the namespace afterwards, so the same guarantee must hold. Add the missing fsuidgid_has_mapping() check to vfs_tmpfile(). On a non-idmapped mount the caller's fs{u,g}id always map in the superblock's user namespace, so this is a no-op there and only takes effect on an idmapped mount that does not map the caller. It applies to every filesystem that sets FS_ALLOW_IDMAP and implements ->tmpfile() (tmpfs, ext4, btrfs, xfs, f2fs, ...), and to overlayfs, whose upper-layer tmpfile creation funnels through vfs_tmpfile() via backing_tmpfile_open(). 
Fixes: 8e5389132ab4 ("fs: introduce fsuidgid_has_mapping() helper") Link: https://patch.msgid.link/20260615-work-idmapped-tmpfile-v1-1-754a94d81f83@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner (Amutable) --- fs/namei.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 5cc9f0f466b8..19ce43c9a6e6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4736,6 +4736,10 @@ int vfs_tmpfile(struct mnt_idmap *idmap, int error; int open_flag = file->f_flags; + /* A tmpfile is I_LINKABLE, so guard its owner like may_o_create(). */ + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) + return -EOVERFLOW; + /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) -- cgit v1.2.3 From 4897cb71d4ab1f7e1a214adb1e4b80176702368d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 22 Jun 2026 10:08:36 +0100 Subject: afs: Fix error code in afs_extract_vl_addrs() The error codes on these paths are only set on the first iteration through the loop. Set the correct error code on every iteration. 
Fixes: 0a5143f2f89c ("afs: Implement VL server rotation") Signed-off-by: Dan Carpenter Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-3-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/vl_list.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 3e4966915ea4..003889cf0f18 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -92,7 +92,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, { struct afs_addr_list *alist; const u8 *b = *_b; - int ret = -EINVAL; + int ret; alist = afs_alloc_addrlist(nr_addrs); if (!alist) @@ -110,6 +110,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, case DNS_ADDRESS_IS_IPV4: if (end - b < 4) { _leave(" = -EINVAL [short inet]"); + ret = -EINVAL; goto error; } memcpy(x, b, 4); @@ -122,6 +123,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, case DNS_ADDRESS_IS_IPV6: if (end - b < 16) { _leave(" = -EINVAL [short inet6]"); + ret = -EINVAL; goto error; } memcpy(x, b, 16); -- cgit v1.2.3 From d943e68edc5cb98192d38e31373bb6b6a73230c6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Jun 2026 14:52:19 +0200 Subject: selftests/filesystems: test O_TMPFILE creation on idmapped mounts Add a regression test for the fsuidgid_has_mapping() check in vfs_tmpfile(). It idmaps a detached tmpfs mount so that the caller-visible id range [0, 10000) maps onto the on-disk range [10000, 20000) and checks that: - a caller whose fsuid/fsgid fall outside that range cannot create an O_TMPFILE through the mount and gets -EOVERFLOW instead of an inode owned by (uid_t)-1; - a mapped caller can create an O_TMPFILE, link it into the namespace, and the ownership round-trips through the mount idmap: it is reported as 0 through the mount and stored as 10000 on the underlying tmpfs. 
The test runs entirely as root and uses setfsuid()/setfsgid() to become the unmapped caller, so it needs no helper user. The layer directory is world-writable so that an unmapped caller still clears the directory permission check and reaches the fsuidgid_has_mapping() test. Link: https://patch.msgid.link/20260615-work-idmapped-tmpfile-v1-2-754a94d81f83@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner (Amutable) --- tools/testing/selftests/filesystems/.gitignore | 1 + tools/testing/selftests/filesystems/Makefile | 4 + .../selftests/filesystems/idmapped_tmpfile.c | 168 +++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 tools/testing/selftests/filesystems/idmapped_tmpfile.c diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index 64ac0dfa46b7..a78f894157de 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -5,3 +5,4 @@ fclog file_stressor anon_inode_test kernfs_test +idmapped_tmpfile diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index 85427d7f19b9..a7ec2ba2dd83 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -2,6 +2,10 @@ CFLAGS += $(KHDR_INCLUDES) TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog +TEST_GEN_PROGS += idmapped_tmpfile TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk + +$(OUTPUT)/idmapped_tmpfile: LDLIBS += -lcap +$(OUTPUT)/idmapped_tmpfile: utils.c diff --git a/tools/testing/selftests/filesystems/idmapped_tmpfile.c b/tools/testing/selftests/filesystems/idmapped_tmpfile.c new file mode 100644 index 000000000000..bc411ab8281e --- /dev/null +++ b/tools/testing/selftests/filesystems/idmapped_tmpfile.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include 
+#include +#include +#include + +#include +#include + +#include "kselftest_harness.h" +#include "wrappers.h" +#include "utils.h" + +/* + * The test mount maps caller-visible ids [0, MAP_RANGE) onto the on-disk range + * [MAP_HOST, MAP_HOST + MAP_RANGE). An id outside [0, MAP_RANGE) therefore has + * no mapping in the mount and is not representable in the filesystem. + */ +#define MAP_HOST 10000 +#define MAP_RANGE 10000 +#define UNMAPPED 50000 + +#ifndef MOUNT_ATTR_IDMAP +#define MOUNT_ATTR_IDMAP 0x00100000 +#endif + +#ifndef __NR_mount_setattr +#define __NR_mount_setattr 442 +#endif + +static inline int sys_mount_setattr(int dfd, const char *path, + unsigned int flags, + struct mount_attr *attr, size_t size) +{ + return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); +} + +/* + * Clone @path into a detached mount idmapped so that caller-visible ids + * [0, MAP_RANGE) map onto the on-disk ids [MAP_HOST, MAP_HOST + MAP_RANGE). + * Returns the mount fd, or -1 if idmapped mounts are not available. + */ +static int idmapped_clone(const char *path) +{ + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + int fd_tree, userns_fd, ret; + + fd_tree = sys_open_tree(AT_FDCWD, path, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (fd_tree < 0) + return -1; + + userns_fd = get_userns_fd(MAP_HOST, 0, MAP_RANGE); + if (userns_fd < 0) { + close(fd_tree); + return -1; + } + + attr.userns_fd = userns_fd; + ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); + close(userns_fd); + if (ret) { + close(fd_tree); + return -1; + } + + return fd_tree; +} + +FIXTURE(idmapped_tmpfile) { + char dir[64]; /* non-idmapped path to the layer directory */ +}; + +FIXTURE_SETUP(idmapped_tmpfile) +{ + /* Private mount namespace so test mounts need no cleanup. 
*/ + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + ASSERT_EQ(sys_mount("tmpfs", "/tmp", "tmpfs", 0, NULL), 0); + + snprintf(self->dir, sizeof(self->dir), "/tmp/d"); + ASSERT_EQ(mkdir(self->dir, 0777), 0); + /* World-writable so an unmapped caller still passes permission(). */ + ASSERT_EQ(chmod(self->dir, 0777), 0); +} + +FIXTURE_TEARDOWN(idmapped_tmpfile) +{ +} + +/* + * A caller whose fsuid/fsgid have no mapping in the idmapped mount must not be + * able to create an O_TMPFILE. Without the check in vfs_tmpfile() the inode + * would be created owned by (uid_t)-1 and could then be linked into the + * namespace. + */ +TEST_F(idmapped_tmpfile, unmapped_caller_is_refused) +{ + int mfd, fd; + + mfd = idmapped_clone(self->dir); + if (mfd < 0) + SKIP(return, "idmapped mounts not supported"); + + /* Become a caller outside the mount's [0, MAP_RANGE) range. */ + setfsgid(UNMAPPED); + setfsuid(UNMAPPED); + ASSERT_EQ(setfsuid(-1), UNMAPPED); + + fd = openat(mfd, ".", O_TMPFILE | O_WRONLY, 0644); + ASSERT_LT(fd, 0); + EXPECT_EQ(errno, EOVERFLOW); + if (fd >= 0) + close(fd); + + EXPECT_EQ(close(mfd), 0); +} + +/* + * A mapped caller can create an O_TMPFILE and link it into the namespace; the + * ownership round-trips through the mount idmap. This is what makes refusing + * the unmapped case above necessary in the first place. + */ +TEST_F(idmapped_tmpfile, mapped_caller_creates_and_links) +{ + char path[PATH_MAX]; + struct stat st; + int mfd, fd; + + mfd = idmapped_clone(self->dir); + if (mfd < 0) + SKIP(return, "idmapped mounts not supported"); + + /* Caller is uid/gid 0, which maps to MAP_HOST through the mount. */ + fd = openat(mfd, ".", O_TMPFILE | O_RDWR, 0600); + ASSERT_GE(fd, 0); + + ASSERT_EQ(fstat(fd, &st), 0); + EXPECT_EQ(st.st_uid, 0); + EXPECT_EQ(st.st_gid, 0); + + /* The tmpfile is linkable: splice it into the directory. 
*/ + ASSERT_EQ(linkat(fd, "", mfd, "linked", AT_EMPTY_PATH), 0); + EXPECT_EQ(close(fd), 0); + + ASSERT_EQ(fstatat(mfd, "linked", &st, 0), 0); + EXPECT_EQ(st.st_uid, 0); + EXPECT_EQ(st.st_gid, 0); + + /* On the underlying, non-idmapped tmpfs it is stored as MAP_HOST. */ + snprintf(path, sizeof(path), "%s/linked", self->dir); + ASSERT_EQ(stat(path, &st), 0); + EXPECT_EQ(st.st_uid, MAP_HOST); + EXPECT_EQ(st.st_gid, MAP_HOST); + + EXPECT_EQ(close(mfd), 0); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 0b70716081c6462be9b2928ad736d0d527b09678 Mon Sep 17 00:00:00 2001 From: Matvey Kovalev Date: Mon, 22 Jun 2026 10:08:37 +0100 Subject: afs: fix NULL pointer dereference in afs_get_tree() afs_alloc_sbi() uses kzalloc for memory allocation. And, if ctx->dyn_root is not null, as->cell and as->volume are null. In trace_afs_get_tree() they are dereferenced. KASAN error message: KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 2 PID: 18478 Comm: syz-executor.7 Not tainted 5.10.246-syzkaller #0 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:perf_trace_afs_get_tree+0x1d9/0x550 include/trace/events/afs.h:1365 Call Trace: trace_afs_get_tree include/trace/events/afs.h:1365 [inline] afs_get_tree+0x922/0x1350 fs/afs/super.c:599 vfs_get_tree+0x8e/0x300 fs/super.c:1572 do_new_mount fs/namespace.c:3011 [inline] path_mount+0x14a5/0x2220 fs/namespace.c:3341 do_mount fs/namespace.c:3354 [inline] __do_sys_mount fs/namespace.c:3562 [inline] __se_sys_mount fs/namespace.c:3539 [inline] __x64_sys_mount+0x283/0x300 fs/namespace.c:3539 do_syscall_64+0x33/0x50 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x67/0xd1 Found by Linux Verification Center (linuxtesting.org) with Syzkaller. 
Fixes: 80548b03991f5 ("afs: Add more tracepoints") Cc: stable@vger.kernel.org Signed-off-by: Matvey Kovalev Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-4-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/afs/super.c b/fs/afs/super.c index 942f3e9800d7..dec091e569c4 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -587,7 +587,8 @@ static int afs_get_tree(struct fs_context *fc) } fc->root = dget(sb->s_root); - trace_afs_get_tree(as->cell, as->volume); + if (!ctx->dyn_root) + trace_afs_get_tree(as->cell, as->volume); _leave(" = 0 [%p]", sb); return 0; -- cgit v1.2.3 From 733a984a4ee7345325e47efb505eebfe67b299bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:38 +0100 Subject: afs: Fix double netfs initialisation in afs_root_iget() Fix afs_root_iget() to leave initialisation of the netfs_inode part of the afs_vnode to afs_inode_init_from_status(). 
Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Closes: https://sashiko.dev/#/patchset/20260609081738.770127-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-5-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3f48458694ba..a88995629d72 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -566,7 +566,6 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) vnode = AFS_FS_I(inode); vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); - afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); if (IS_ERR(op)) { -- cgit v1.2.3 From 81e985b4c3a6cbcc443fcdcd3ebda7fcc845d459 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:39 +0100 Subject: afs: Remove setting of AS_RELEASE_ALWAYS for symlinks and mountpoints Regular AFS files correctly use afs_file_aops which have release_folio set as netfs_release_folio, so AS_RELEASE_ALWAYS is valid for them when fscache is enabled (set via afs_vnode_set_cache()). Symlinks and mountpoints in AFS use afs_dir_aops, which does not provide a release_folio callback. However, afs_apply_status() unconditionally calls mapping_set_release_always() for these. In that case, when memory management code attempts to release folios, filemap_release_folio() checks folio_needs_release(), which returns true due to AS_RELEASE_ALWAYS being set. Since there is no release_folio callback, it falls through to try_to_free_buffers(), which at present expects buffer_heads to be non-NULL. For symlinks and mountpoints without buffer_heads, this causes a NULL pointer dereference. 
[dh: Added more bits that were missed] Fixes: eae9e78951bb ("afs: Use netfslib for symlinks, allowing them to be cached") Signed-off-by: Deepakkumar Karn Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-6-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/inode.c | 7 +++---- fs/afs/internal.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index a88995629d72..54ac6ec21daf 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -52,9 +52,9 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren /* * Set parameters for the netfs library */ -static void afs_set_netfs_context(struct afs_vnode *vnode) +static void afs_set_netfs_context(struct afs_vnode *vnode, bool is_file) { - netfs_inode_init(&vnode->netfs, &afs_req_ops, true); + netfs_inode_init(&vnode->netfs, &afs_req_ops, is_file); } /* @@ -126,7 +126,6 @@ static int afs_inode_init_from_status(struct afs_operation *op, } inode->i_mapping->a_ops = &afs_symlink_aops; inode_nohighmem(inode); - mapping_set_release_always(inode->i_mapping); break; default: dump_vnode(vnode, op->file[0].vnode != vnode ? 
op->file[0].vnode : NULL); @@ -136,7 +135,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, i_size_write(inode, status->size); inode_set_bytes(inode, status->size); - afs_set_netfs_context(vnode); + afs_set_netfs_context(vnode, status->type == AFS_FTYPE_FILE); vnode->invalid_before = status->data_version; trace_afs_set_dv(vnode, status->data_version); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0b72a8566299..785c646856d7 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -750,8 +750,6 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; - if (cookie) - mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } -- cgit v1.2.3 From 35b177ef541ae8eefbfbf679c3476bc3fb1eb83c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:40 +0100 Subject: afs: Fix directory inode initialisation order Fix afs_inode_init_from_status() to call afs_set_netfs_context() before the switch to do file type-specific initialisation because local directory changes don't get uploaded to the server, only stored in the cache. This requires that the file size be set before, so move that up too. Without this, NETFS_ICTX_SINGLE_NO_UPLOAD as set on directories gets clobbered. 
Closes: https://sashiko.dev/#/patchset/20260618074903.2374756-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-7-dhowells@redhat.com Fixes: 6dd80936618c ("afs: Use netfslib for directories") cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 54ac6ec21daf..51c28f148845 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -93,6 +93,10 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_gid = make_kgid(&init_user_ns, status->group); set_nlink(&vnode->netfs.inode, status->nlink); + i_size_write(inode, status->size); + inode_set_bytes(inode, status->size); + afs_set_netfs_context(vnode, status->type == AFS_FTYPE_FILE); + switch (status->type) { case AFS_FTYPE_FILE: inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); @@ -133,10 +137,6 @@ static int afs_inode_init_from_status(struct afs_operation *op, return afs_protocol_error(NULL, afs_eproto_file_type); } - i_size_write(inode, status->size); - inode_set_bytes(inode, status->size); - afs_set_netfs_context(vnode, status->type == AFS_FTYPE_FILE); - vnode->invalid_before = status->data_version; trace_afs_set_dv(vnode, status->data_version); inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); -- cgit v1.2.3 From cb39654926f8e7a08ecc1dcb3941628855275940 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Mon, 22 Jun 2026 10:08:41 +0100 Subject: afs: use kvfree() to free memory allocated by kvcalloc() op->more_files is allocated with kvcalloc() but released via afs_put_operation(), which uses kfree() internally. This mismatch prevents the resource from being released properly and may lead to undefined behavior. Fix this by using kvfree() to free op->more_files to match its allocation method. 
Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: Zilin Guan Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-8-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/fs_operation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index c0dbbc6d3716..20801b29521d 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -348,7 +348,7 @@ int afs_put_operation(struct afs_operation *op) for (i = 0; i < op->nr_files - 2; i++) if (op->more_files[i].put_vnode) iput(&op->more_files[i].vnode->netfs.inode); - kfree(op->more_files); + kvfree(op->more_files); } if (op->estate) { -- cgit v1.2.3 From a58edda50a3ec08e6adac1d04dc3e488494e412d Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 22 Jun 2026 10:08:42 +0100 Subject: afs: Remove erroneous seq |= 1 in volume lookup loop The `seq |= 1` operation in the volume lookup loop is incorrect because seq is already incremented at the start of the loop, making it odd in the next iteration, which triggers the lock, but the `|= 1` operation causes seq to be even, resulting in an unintended lockless operation. Remove this erroneous operation to maintain proper lock sequencing. 
Fixes: 32222f09782f ("afs: Apply server breaks to mmap'd files in the call processor") Signed-off-by: Li RongQing Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-9-dhowells@redhat.com Reviewed-by: Oleg Nesterov cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/callback.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 894d2bad6b6c..833ac3178ddc 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -140,7 +140,6 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, break; if (!need_seqretry(&cell->volume_lock, seq)) break; - seq |= 1; /* Want a lock next time */ } done_seqretry(&cell->volume_lock, seq); -- cgit v1.2.3 From 680ba02073415962446e79b10e15ad3b8c87fec5 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Mon, 22 Jun 2026 10:08:43 +0100 Subject: afs: check for duplicate servers in VL server list The DNS response may contain the same server more than once. Check for duplicates by name and port before inserting into the list to avoid duplicate entries. Addresses the TODO comment in afs_extract_vlserver_list(). 
Signed-off-by: Yuto Ohnuki Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-10-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/vl_list.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 003889cf0f18..8e1cf6cdcf71 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -289,8 +289,20 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, afs_put_addrlist(old, afs_alist_trace_put_vlserver_old); } + /* Check for duplicates in the server list */ + for (j = 0; j < vllist->nr_servers; j++) { + struct afs_vlserver *s = vllist->servers[j].server; - /* TODO: Might want to check for duplicates */ + if (s->name_len == server->name_len && + s->port == server->port && + strncasecmp(s->name, server->name, server->name_len) == 0) { + afs_put_vlserver(cell->net, server); + server = NULL; + break; + } + } + if (!server) + continue; /* Insertion-sort by priority and weight */ for (j = 0; j < vllist->nr_servers; j++) { -- cgit v1.2.3 From 2f79d1b93c62470fe02dbdc24770f1ae5a9e1be6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:44 +0100 Subject: afs: Fix bulk lookup malfunction due to change in dir_emit() API afs_do_lookup() and afs_do_lookup_one() use the same directory parsing code as afs_readdir() and were supplying alternative dir_context actors to retrieve dirents, but because lookup needs the vnode's uniquifier as part of the reference, but not the DT flags, the uniquifier was being passed in the dt flags argument to the lookup actors. Unfortunately, commit c644bce62b9c, added to fix overlayfs with fuse, broke this by masking off part of the uniquifier. 
This doesn't matter enough to be directly noticeable, instead causing bulk advance inode lookups to fail (which are retried later) and may cause dir revalidation to malfunction if the uniquifier is changed by masking. Fix this by making the afs directory parsing code take special ->actor values of AFS_LOOKUP or AFS_LOOKUP_ONE instead that tell it to call afs_lookup_filldir() or afs_lookup_one_filldir() directly rather than going through dir_emit(). dir_emit() is still used for readdir. Fixes: c644bce62b9c ("readdir: require opt-in for d_type flags") Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-11-dhowells@redhat.com cc: Amir Goldstein cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/dir.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 498b99ccdf0e..6df56fe9163f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,9 +28,11 @@ static int afs_d_revalidate(struct inode *dir, const struct qstr *name, static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, - loff_t fpos, u64 ino, unsigned dtype); + u64 ino, u32 uniquifier); +#define AFS_LOOKUP_ONE ((filldir_t)0x123UL) static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, - loff_t fpos, u64 ino, unsigned dtype); + u64 ino, u32 uniquifier); +#define AFS_LOOKUP ((filldir_t)0x137UL) static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, @@ -421,11 +423,18 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, } /* found the next entry */ - if (!dir_emit(ctx, dire->u.name, nlen, - ntohl(dire->u.vnode), - 
(ctx->actor == afs_lookup_filldir || - ctx->actor == afs_lookup_one_filldir)? - ntohl(dire->u.unique) : DT_UNKNOWN)) { + if (ctx->actor == AFS_LOOKUP) { + if (!afs_lookup_filldir(ctx, dire->u.name, nlen, + ntohl(dire->u.vnode), + ntohl(dire->u.unique))) + return 0; + } else if (ctx->actor == AFS_LOOKUP_ONE) { + if (!afs_lookup_one_filldir(ctx, dire->u.name, nlen, + ntohl(dire->u.vnode), + ntohl(dire->u.unique))) + return 0; + } else if (!dir_emit(ctx, dire->u.name, nlen, + ntohl(dire->u.vnode), DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } @@ -545,6 +554,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) { afs_dataversion_t dir_version; + ctx->dt_flags_mask = UINT_MAX; return afs_dir_iterate(file_inode(file), ctx, file, &dir_version); } @@ -554,14 +564,14 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) * uniquifier through dtype */ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, - int nlen, loff_t fpos, u64 ino, unsigned dtype) + int nlen, u64 ino, u32 uniquifier) { struct afs_lookup_one_cookie *cookie = container_of(ctx, struct afs_lookup_one_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, - (unsigned long long) ino, dtype); + (unsigned long long) ino, uniquifier); /* insanity checks first */ BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); @@ -574,7 +584,7 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, } cookie->fid.vnode = ino; - cookie->fid.unique = dtype; + cookie->fid.unique = uniquifier; cookie->found = 1; _leave(" = false [found]"); @@ -591,7 +601,7 @@ static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, { struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_lookup_one_cookie cookie = { - .ctx.actor = afs_lookup_one_filldir, + .ctx.actor = AFS_LOOKUP_ONE, .name = *name, .fid.vid = as->volume->vid }; @@ -622,14 +632,14 @@ static int afs_do_lookup_one(struct inode *dir, 
const struct qstr *name, * uniquifier through dtype */ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, - int nlen, loff_t fpos, u64 ino, unsigned dtype) + int nlen, u64 ino, u32 uniquifier) { struct afs_lookup_cookie *cookie = container_of(ctx, struct afs_lookup_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, - (unsigned long long) ino, dtype); + (unsigned long long) ino, uniquifier); /* insanity checks first */ BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); @@ -637,7 +647,7 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, if (cookie->nr_fids < 50) { cookie->fids[cookie->nr_fids].vnode = ino; - cookie->fids[cookie->nr_fids].unique = dtype; + cookie->fids[cookie->nr_fids].unique = uniquifier; cookie->nr_fids++; } @@ -778,7 +788,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) for (i = 0; i < ARRAY_SIZE(cookie->fids); i++) cookie->fids[i].vid = dvnode->fid.vid; - cookie->ctx.actor = afs_lookup_filldir; + cookie->ctx.actor = AFS_LOOKUP; cookie->name = dentry->d_name; cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want * and slot 0 for the directory */ -- cgit v1.2.3 From c9c3b615a462a4023bd148f02c564e175ed10502 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:45 +0100 Subject: afs: Fix misplaced inc of net->cells_outstanding Fix net->cells_outstanding being incremented before the check for failure of idr_alloc_cyclic(), leaving the count incremented on error. 
Fixes: 88c853c3f5c0 ("afs: Fix cell refcounting by splitting the usage counter") Reported-by: Hillf Danton Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-12-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/cell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 9738684dbdd2..e0fab1609f27 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -205,11 +205,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_source = vllist->source; cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ - atomic_inc(&net->cells_outstanding); ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, 2, INT_MAX / 2, GFP_KERNEL); if (ret < 0) goto error; + atomic_inc(&net->cells_outstanding); cell->dynroot_ino = ret; cell->debug_id = atomic_inc_return(&cell_debug_id); -- cgit v1.2.3 From 5597fbd1e7c161914f20315a726e54025b0fdadb Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:46 +0100 Subject: afs: Fix reinitialisation of the inode, in particular ->lock_work It seems that initialising afs_vnode::lock_work a single time in the slab's init function isn't sufficient for work_structs. This results in the DEBUG_OBJECTS debugging stuff producing a warning occasionally when running the generic/131 xfstest: ODEBUG: activate not available (active state 0) object: 0000000016d8760f object type: work_struct hint: afs_lock_work+0x0/0x220 WARNING: lib/debugobjects.c:629 at debug_print_object+0x4b/0x90, CPU#3: locktest/7695 ... CPU: 3 UID: 0 PID: 7695 Comm: locktest Tainted: G S 7.1.0-build3+ #2771 PREEMPT ... RIP: 0010:debug_print_object+0x65/0x90 ... Call Trace: ? 
__pfx_afs_lock_work+0x10/0x10 debug_object_activate+0x122/0x170 insert_work+0x25/0x60 __queue_work+0x2e0/0x340 queue_delayed_work_on+0x48/0x70 afs_fl_release_private+0x57/0x70 locks_release_private+0x5c/0xa0 locks_free_lock+0xe/0x20 posix_lock_inode+0x55f/0x5b0 locks_lock_inode_wait+0x81/0x140 ? file_write_and_wait_range+0x50/0x70 afs_lock+0xcd/0x110 fcntl_setlk+0x10d/0x260 do_fcntl+0x24e/0x5b0 __do_sys_fcntl+0x6a/0x90 do_syscall_64+0x11e/0x310 entry_SYSCALL_64_after_hwframe+0x71/0x79 Fix this by reinitialising ->lock_work after allocating an inode. Also, flush ->lock_work when the inode is being evicted to make sure it's not still running. Fixes: e8d6c554126b ("AFS: implement file locking") Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-13-dhowells@redhat.com cc: Marc Dionne cc: Thomas Gleixner cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/inode.c | 1 + fs/afs/super.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 51c28f148845..14f39a9bea6c 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -680,6 +680,7 @@ void afs_evict_inode(struct inode *inode) inode->i_mapping->a_ops->writepages(inode->i_mapping, &wbc); } + flush_delayed_work(&vnode->lock_work); netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); netfs_free_folioq_buffer(vnode->directory); diff --git a/fs/afs/super.c b/fs/afs/super.c index dec091e569c4..82bb713825a0 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -660,7 +660,6 @@ static void afs_i_init_once(void *_vnode) INIT_LIST_HEAD(&vnode->wb_keys); INIT_LIST_HEAD(&vnode->pending_locks); INIT_LIST_HEAD(&vnode->granted_locks); - INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); INIT_LIST_HEAD(&vnode->cb_mmap_link); seqlock_init(&vnode->cb_lock); } @@ -694,6 +693,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb) init_rwsem(&vnode->rmdir_lock); 
INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); + INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); _leave(" = %p", &vnode->netfs.inode); return &vnode->netfs.inode; -- cgit v1.2.3 From 0f36469d7ce98b362934113c550d08bb0c784231 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:47 +0100 Subject: afs: Fix callback service message parsers to pass through -EAGAIN The AFS filesystem client uses an rxrpc server to listen for callback notifications. Each callback call type handler has a delivery function that parses the incoming request stream, and this should return -EAGAIN if the last packet hasn't yet been seen, but all currently queued received data is consumed. afs_extract_data() does this, but the -EAGAIN return is switched to 0 inadvertently. Fix callback service message parsers to pass through -EAGAIN. Fixes: d001648ec7cf ("rxrpc: Don't expose skbs to in-kernel users [ver #2]") Closes: https://sashiko.dev/#/patchset/20260609081738.770127-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-14-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/cmservice.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 263c60c811a5..db394f101fc6 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -334,7 +334,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) ret = afs_extract_data(call, false); switch (ret) { case 0: break; - case -EAGAIN: return 0; default: return ret; } @@ -456,7 +455,6 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) ret = afs_extract_data(call, false); switch (ret) { case 0: break; - case -EAGAIN: return 0; default: return ret; } -- cgit v1.2.3 From 3b1601471a88f86082fc1f1c2475645cdf59f7d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:48 +0100 Subject: afs: Use scoped_seqlock_read() 
rather than manually doing seqlock stuff This is an addendum to the patch to remove the erroneous seq |= 1 in volume lookup loop. Switch to using scoped_seqlock_read() as suggested by Oleg Nesterov[1]. Signed-off-by: David Howells Link: https://lore.kernel.org/r/aifaeKvz3KemfzaS@redhat.com/ [1] Link: https://patch.msgid.link/20260622090856.2746629-15-dhowells@redhat.com Reviewed-by: Oleg Nesterov cc: Marc Dionne cc: Li RongQing cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/callback.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 833ac3178ddc..dd7a407ea368 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -113,16 +113,12 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, { struct afs_volume *volume = NULL; struct rb_node *p; - int seq = 1; - for (;;) { + scoped_seqlock_read(&cell->volume_lock, ss_lock) { /* Unfortunately, rbtree walking doesn't give reliable results * under just the RCU read lock, so we have to check for * changes. */ - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&cell->volume_lock, &seq); - p = rcu_dereference_raw(cell->volumes.rb_node); while (p) { volume = rb_entry(p, struct afs_volume, cell_node); @@ -138,11 +134,8 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback)) break; - if (!need_seqretry(&cell->volume_lock, seq)) - break; } - done_seqretry(&cell->volume_lock, seq); return volume; } -- cgit v1.2.3 From 794a01110390c1b76f59ece773fb0fbfd89c6f5c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:49 +0100 Subject: afs: Fix missing NULL pointer check in afs_break_some_callbacks() Fix afs_break_some_callbacks() to check to see if afs_lookup_volume_rcu() returned NULL (e.g. the specified volume is unknown). 
Fixes: 8230fd8217b7 ("afs: Make callback processing more efficient.") Closes: https://sashiko.dev/#/patchset/20260609081738.770127-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-16-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/callback.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index dd7a407ea368..74853e0d0435 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -213,7 +213,11 @@ static void afs_break_some_callbacks(struct afs_server *server, rcu_read_lock(); volume = afs_lookup_volume_rcu(server->cell, vid); - if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) { + if (!volume) { + /* Ignore breaks on unknown volumes. */ + rcu_read_unlock(); + *_count = 0; + } else if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) { afs_break_volume_callback(server, volume); *_count -= 1; if (*_count) -- cgit v1.2.3 From d672c276f685a540ed2b2a8bafaed4650a89022c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:50 +0100 Subject: afs: Fix leak of ungot volume Fix afs_lookup_volume_rcu() so that it doesn't leak a dying volume if afs_try_get_volume() fails. 
Fixes: 32222f09782f ("afs: Apply server breaks to mmap'd files in the call processor") Closes: https://sashiko.dev/#/patchset/20260609081738.770127-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-17-dhowells@redhat.com cc: Marc Dionne cc: Deepakkumar Karn cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/callback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 74853e0d0435..61354003c006 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -134,6 +134,7 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback)) break; + volume = NULL; } return volume; -- cgit v1.2.3 From fc10c0ecf06f2981af5d04357612b00051e03e9e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:51 +0100 Subject: afs: Fix vllist leak Fix a leak of the new vllist in afs_update_cell() in the event that it is an empty list (nr_servers == 0), in which case the old list isn't displaced unless the old list is also empty. 
Fixes: d5c32c89b208 ("afs: Fix cell DNS lookup") Closes: https://sashiko.dev/#/patchset/20260609081738.770127-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-18-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/cell.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index e0fab1609f27..fbb8a43aa7cd 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -547,6 +547,8 @@ static int afs_update_cell(struct afs_cell *cell) rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_source = vllist->source; old = p; + } else { + old = vllist; } write_unlock(&cell->vl_servers_lock); afs_put_vlserverlist(cell->net, old); -- cgit v1.2.3 From 55e841836c6f4646490f7b0347192b7a92d431ba Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:52 +0100 Subject: afs: Fix lack of locking around modifications of net->cells_dyn_ino Fix the lack of locking around modifications of net->cells_dyn_ino by taking net->cells_lock exclusively. This also requires the cell to be removed from net->cells_dyn_ino in afs_destroy_cell_work() rather than in afs_cell_destroy() as the latter runs in RCU cleanup context and sleeping locks cannot be taken there. 
Fixes: 1d0b929fc070 ("afs: Change dynroot to create contents on demand") Closes: https://sashiko.dev/#/patchset/20260618074903.2374756-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-19-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/cell.c | 8 +++++++- fs/afs/dynroot.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index fbb8a43aa7cd..9d8937ae24e2 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -205,8 +205,10 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_source = vllist->source; cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ + down_write(&net->cells_lock); ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, 2, INT_MAX / 2, GFP_KERNEL); + up_write(&net->cells_lock); if (ret < 0) goto error; atomic_inc(&net->cells_outstanding); @@ -579,7 +581,6 @@ static void afs_cell_destroy(struct rcu_head *rcu) afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); - idr_remove(&net->cells_dyn_ino, cell->dynroot_ino); kfree(cell->name - 1); kfree(cell); @@ -594,6 +595,11 @@ static void afs_destroy_cell_work(struct work_struct *work) afs_see_cell(cell, afs_cell_trace_destroy); timer_delete_sync(&cell->management_timer); cancel_work_sync(&cell->manager); + + down_write(&cell->net->cells_lock); + idr_remove(&cell->net->cells_dyn_ino, cell->dynroot_ino); + up_write(&cell->net->cells_lock); + call_rcu(&cell->rcu, afs_cell_destroy); } diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 1d5e33bc7502..6e3c8c691ba9 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -278,7 +278,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry } /* - * Transcribe the cell database into readdir 
content under the RCU read lock. + * Transcribe the cell database into readdir content under net->cells_lock. * Each cell produces two entries, one prefixed with a dot and one not. */ static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) -- cgit v1.2.3 From 26f17ce6fa3f05cb5965790499c1839094260de4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:53 +0100 Subject: afs: Fix premature cell exposure through /afs AFS cell records are prematurely exposed through the /afs dynamic root by virtue of adding them immediately to the net->cells_dyn_ino IDR when the cell is allocated rather than when it is added to the lookup tree. This allows a candidate record to be accessed, even if it's actually a duplicate or not published yet. Fix this by not adding the cell to cells_dyn_ino until it's confirmed non-duplicate and is being published. A flag is then used to record whether it is added to the IDR to make removal from the IDR conditional. Closes: https://sashiko.dev/#/patchset/20260618155141.2513212-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-20-dhowells@redhat.com Fixes: 1d0b929fc070 ("afs: Change dynroot to create contents on demand") cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/cell.c | 27 +++++++++++++++++---------- fs/afs/internal.h | 1 + 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 9d8937ae24e2..47a2645768d7 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -205,14 +205,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_source = vllist->source; cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ - down_write(&net->cells_lock); - ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, - 2, INT_MAX / 2, GFP_KERNEL); - up_write(&net->cells_lock); - if (ret < 0) - goto error; 
atomic_inc(&net->cells_outstanding); - cell->dynroot_ino = ret; cell->debug_id = atomic_inc_return(&cell_debug_id); trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); @@ -306,6 +299,13 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, goto cell_already_exists; } + ret = idr_alloc_cyclic(&net->cells_dyn_ino, candidate, + 2, INT_MAX / 2, GFP_KERNEL); + if (ret < 0) + goto cant_alloc_ino; + candidate->dynroot_ino = ret; + set_bit(AFS_CELL_FL_HAVE_INO, &candidate->flags); + cell = candidate; candidate = NULL; afs_use_cell(cell, trace); @@ -380,6 +380,11 @@ no_wait: _leave(" = %p [cell]", cell); return cell; +cant_alloc_ino: + up_write(&net->cells_lock); + afs_put_cell(candidate, afs_cell_trace_put_candidate); + goto error_noput; + cell_already_exists: _debug("cell exists"); cell = cursor; @@ -596,9 +601,11 @@ static void afs_destroy_cell_work(struct work_struct *work) timer_delete_sync(&cell->management_timer); cancel_work_sync(&cell->manager); - down_write(&cell->net->cells_lock); - idr_remove(&cell->net->cells_dyn_ino, cell->dynroot_ino); - up_write(&cell->net->cells_lock); + if (test_bit(AFS_CELL_FL_HAVE_INO, &cell->flags)) { + down_write(&cell->net->cells_lock); + idr_remove(&cell->net->cells_dyn_ino, cell->dynroot_ino); + up_write(&cell->net->cells_lock); + } call_rcu(&cell->rcu, afs_cell_destroy); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 785c646856d7..601f01e5c15f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -388,6 +388,7 @@ struct afs_cell { #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ #define AFS_CELL_FL_CHECK_ALIAS 2 /* Need to check for aliases */ +#define AFS_CELL_FL_HAVE_INO 3 /* Have dynroot_ino */ enum afs_cell_state state; short error; enum dns_record_source dns_source:8; /* Latest source of data from lookup */ -- cgit v1.2.3 From 56b4e4b26f84411d880f968a539207b0a8889c8c Mon Sep 17 00:00:00 2001 From: David Howells 
Date: Mon, 22 Jun 2026 10:08:54 +0100 Subject: afs: Fix the volume AFS_VOLUME_RM_TREE is set on Fix afs_insert_volume_into_cell() to set AFS_VOLUME_RM_TREE on the volume replaced, not the new volume, as it's now removed from the cell's volume tree. Setting it on the new volume instead causes the old volume to be removed from the tree twice and the new volume never to be removed. Fixes: 9a6b294ab496 ("afs: Fix use-after-free due to get/remove race in volume tree") Closes: https://sashiko.dev/#/patchset/20260618074903.2374756-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-21-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/volume.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 9ae5c8ad2e04..4f79d25ec37f 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -40,7 +40,7 @@ static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell, goto found; } - set_bit(AFS_VOLUME_RM_TREE, &volume->flags); + set_bit(AFS_VOLUME_RM_TREE, &p->flags); rb_replace_node_rcu(&p->cell_node, &volume->cell_node, &cell->volumes); } } -- cgit v1.2.3 From 903d37c97228258da71e092f8b4ab260ce81497d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Jun 2026 10:08:55 +0100 Subject: afs: Fix unchecked-length string display in debug statement Fix afs_extract_vlserver_list() to limit the length of the displayed string in a debug statement. 
Fixes: 0a5143f2f89c ("afs: Implement VL server rotation") Closes: https://sashiko.dev/#/patchset/20260618074903.2374756-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260622090856.2746629-22-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/vl_list.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 8e1cf6cdcf71..c1dac5dbed0d 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -200,6 +200,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, b += sizeof(*hdr); while (end - b >= sizeof(bs)) { + int nlen; + bs.name_len = afs_extract_le16(&b); bs.priority = afs_extract_le16(&b); bs.weight = afs_extract_le16(&b); @@ -209,10 +211,12 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, bs.protocol = *b++; bs.nr_addrs = *b++; + nlen = min3(bs.name_len, end - b, 255); + _debug("extract %u %u %u %u %u %u %*.*s", bs.name_len, bs.priority, bs.weight, bs.port, bs.protocol, bs.nr_addrs, - bs.name_len, bs.name_len, b); + bs.name_len, nlen, b); if (end - b < bs.name_len) break; -- cgit v1.2.3 From ebebef925281a336ed1d4bbbefaa5d3b00877f28 Mon Sep 17 00:00:00 2001 From: Jori Koolstra Date: Sun, 14 Jun 2026 21:10:40 +0200 Subject: MAINTAINERS: take over vboxsf from Hans de Goede I talked to Hans de Goede about two weeks ago in person. He expressed he would rather have someone else maintain vboxsf and was thinking about orphaning it. Since I am already doing filesystem stuff anyway, I am fine with doing this. (vboxsf is a thin layer between the vfs and the Virtual Box guest device driver). I have no major plans for vboxsf, but I do want to support passing physical addresses to the host; the communication protocol seems to allow for it and it would mean we can get rid of some kmap calls. 
Signed-off-by: Jori Koolstra Link: https://patch.msgid.link/20260614191040.3007723-1-jkoolstra@xs4all.nl Acked-by: Hans de Goede Signed-off-by: Christian Brauner (Amutable) --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 15011f5752a9..a6f463d20328 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -28725,7 +28725,7 @@ F: include/linux/vbox_utils.h F: include/uapi/linux/vbox*.h VIRTUAL BOX SHARED FOLDER VFS DRIVER -M: Hans de Goede +M: Jori Koolstra L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/vboxsf/* -- cgit v1.2.3 From 681e452683b69a8e1a571cba0f238f8ceacf55d2 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 12 Jun 2026 12:40:41 +0800 Subject: iomap: release pages on atomic dio size mismatch If bio_iov_iter_get_pages() or the bounce helper succeeds but builds a short bio, the REQ_ATOMIC size check rejects it before submission. The old error path only dropped the bio reference, leaving any pages already attached to the bio unreleased. Release or unbounce the pages before falling through to out_put_bio on this error path. 
This bug was reported by sashiko: https://sashiko.dev/#/patchset/20260608073134.95964-1-changfengnan%40bytedance.com Fixes: 9e0933c21c12 ("fs: iomap: Atomic write support") Signed-off-by: Fengnan Chang Link: https://patch.msgid.link/20260612044041.10677-1-changfengnan@bytedance.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Amutable) --- fs/iomap/direct-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b485e3b191da..e2cd5f92babe 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -369,7 +369,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, */ if ((op & REQ_ATOMIC) && WARN_ON_ONCE(ret != iomap_length(iter))) { ret = -EINVAL; - goto out_put_bio; + goto out_bio_release_pages; } if (iter->iomap.flags & IOMAP_F_INTEGRITY) { @@ -393,6 +393,11 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, iomap_dio_submit_bio(iter, dio, bio, pos); return ret; +out_bio_release_pages: + if (dio->flags & IOMAP_DIO_BOUNCE) + bio_iov_iter_unbounce(bio, true, false); + else + bio_release_pages(bio, false); out_put_bio: bio_put(bio); return ret; -- cgit v1.2.3 From 16b02eb4b9b272c221255c20d34ccd5db53a3ed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Sat, 13 Jun 2026 21:10:05 +0000 Subject: proc: only bump parent nlink when registering directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit proc_register() increments the parent directory's link count for every entry it registers, while remove_proc_entry() and remove_proc_subtree() decrement it only when the removed entry is a directory. Regular files thus inflate the parent's count while they exist, and leak one link permanently on every create and remove cycle. 
For example, /proc/bus/pci/00 with twenty-two device files and no subdirectories reports nlink 24 instead of 2, and SR-IOV VF enable and disable cycles, each creating and removing the VF config space entries under /proc/bus/pci/, inflate the link count of that directory without bound. Before commit e06689bf5701 ("proc: change ->nlink under proc_subdir_lock"), the increment lived in proc_mkdir_data() and proc_create_mount_point(), and was therefore applied only to directories. Moving it into proc_register() to bring it under proc_subdir_lock dropped the S_ISDIR check. Thus, move the nlink accounting into pde_subdir_insert() and pde_erase(), only updating it for directories in both, so the link count is always changed together with the directory entry itself. Fixes: e06689bf5701 ("proc: change ->nlink under proc_subdir_lock") Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Krzysztof Wilczyński Link: https://patch.msgid.link/20260613211005.921692-1-kwilczynski@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/proc/generic.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index adc9b9a092b0..26086a283672 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -112,6 +112,8 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, /* Add new node and rebalance tree. 
*/ rb_link_node(&de->subdir_node, parent, new); rb_insert_color(&de->subdir_node, root); + if (S_ISDIR(de->mode)) + dir->nlink++; return true; } @@ -404,7 +406,6 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, write_unlock(&proc_subdir_lock); goto out_free_inum; } - dir->nlink++; write_unlock(&proc_subdir_lock); return dp; @@ -706,6 +707,8 @@ static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) { rb_erase(&pde->subdir_node, &parent->subdir); RB_CLEAR_NODE(&pde->subdir_node); + if (S_ISDIR(pde->mode)) + parent->nlink--; } /* @@ -731,8 +734,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = NULL; } else { pde_erase(de, parent); - if (S_ISDIR(de->mode)) - parent->nlink--; } } write_unlock(&proc_subdir_lock); @@ -791,8 +792,6 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) continue; } next = de->parent; - if (S_ISDIR(de->mode)) - next->nlink--; write_unlock(&proc_subdir_lock); proc_entry_rundown(de); -- cgit v1.2.3 From e348eecd4d8fa8d18a5157ff59f7be1dc59c5928 Mon Sep 17 00:00:00 2001 From: Souvik Banerjee Date: Fri, 1 May 2026 23:27:35 +0000 Subject: ovl: use linked upper dentry in copy-up tmpfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ovl_copy_up_tmpfile() stores the disconnected O_TMPFILE dentry as the overlay's upper dentry reference via ovl_inode_update(). vfs_tmpfile() allocated this dentry via d_alloc(parentpath->dentry, &slash_name), so d_name is "/" and d_parent is c->workdir. Local upper filesystems (ext4, btrfs, xfs, ...) immediately rename it to "#" via d_mark_tmpfile() inside their ->tmpfile() op; FUSE and virtiofs do not, so both fields stay that way. Neither identifies the destination directory and filename where ovl_do_link() actually linked the file. When the upper filesystem implements ->d_revalidate() (e.g. 
FUSE or virtiofs), ovl_revalidate_real() calls it with the dentry's parent inode and a snapshot of d_name. The server tries to look up "/" inside c->workdir, fails, and overlayfs reports -ESTALE. This causes persistent ESTALE errors for any file that was copied up via the tmpfile path, breaking dpkg, apt, and other tools that do rename-over-existing on overlayfs with a FUSE/virtiofs upper. Before commit 6b52243f633e ("ovl: fold copy-up helpers into callers"), the tmpfile copy-up path used a dedicated helper ovl_link_tmpfile() that captured the linked destination dentry returned by ovl_do_link(): err = ovl_do_link(temp, udir, upper); ... if (!err) *newdentry = dget(upper); and published it via ovl_inode_update(d_inode(c->dentry), newdentry). The fold inlined ovl_do_link() into ovl_copy_up_tmpfile() but dropped the dget(upper) capture, and rewrote the publish line as ovl_inode_update(d_inode(c->dentry), dget(temp)) — where temp is the disconnected O_TMPFILE dentry. Fix by keeping a reference to the linked destination dentry after ovl_do_link() succeeds, and publishing that dentry at the existing ovl_inode_update() call site. The non-tmpfile/workdir path continues to publish the renamed temporary dentry. 
Reproducer: - Mount overlayfs with virtiofs (or a FUSE fs whose server advertises FUSE_TMPFILE) as upper - Run: dpkg -i - Observe: "error installing new file '...': Stale file handle" Fixes: 6b52243f633e ("ovl: fold copy-up helpers into callers") Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Souvik Banerjee Link: https://patch.msgid.link/20260501232735.2610824-1-souvik@amlalabs.com Reviewed-by: Amir Goldstein Reviewed-by: Miklos Szeredi Signed-off-by: Christian Brauner (Amutable) --- fs/overlayfs/copy_up.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 13cb60b52bd6..e963701b4c87 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -853,7 +853,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(c->destdir); - struct dentry *temp, *upper; + struct dentry *temp, *upper, *newdentry = NULL; struct file *tmpfile; int err; @@ -889,6 +889,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, temp, udir, upper); + if (!err) { + /* + * Record the linked dentry -- not the disconnected + * O_TMPFILE dentry -- so that ->d_revalidate() on + * the upper fs sees the real parent/name. 
+ */ + newdentry = dget(upper); + } end_creating(upper); } @@ -903,7 +911,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); - ovl_inode_update(d_inode(c->dentry), dget(temp)); + ovl_inode_update(d_inode(c->dentry), newdentry); out: ovl_end_write(c->dentry); -- cgit v1.2.3 From fb3e566cafc38fe3ba35e6843a2d529a3748870c Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 18 Jun 2026 10:39:22 -0400 Subject: minix: avoid overflow in bitmap block count calculation minix_check_superblock() uses minix_blocks_needed() to verify that the on-disk imap and zmap block counts are large enough for the advertised inode and zone counts. The helper currently performs DIV_ROUND_UP() in unsigned int arithmetic. A Minix v3 image can set s_ninodes or s_zones near UINT_MAX so the addition inside DIV_ROUND_UP() wraps to zero. That makes a zero imap/zmap block count look valid, after which minix_fill_super() can dereference s_imap[0] or s_zmap[0] even though no bitmap buffers were allocated. Impact: mounting a crafted Minix v3 image whose s_ninodes or s_zones is near UINT_MAX makes minix_check_superblock() accept a zero bitmap-block count and minix_fill_super() dereference s_imap[0]/s_zmap[0], panicking the kernel. The divisor is the bitmap capacity in bits, blocksize * 8, which is always a power of two: minix_fill_super() obtains the block size through sb_set_blocksize(), and blk_validate_block_size() rejects any size that is not a power of two. Use DIV_ROUND_UP_POW2(), which divides before adding the round-up term and so cannot overflow for a power-of-two divisor. 
Fixes: 8c97a6ddc956 ("minix: Add required sanity checking to minix_check_superblock()") Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Michael Bommarito Link: https://patch.msgid.link/20260618143922.3066874-1-michael.bommarito@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner (Amutable) --- fs/minix/minix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/minix/minix.h b/fs/minix/minix.h index f2025c9b5825..9e52d4302f0d 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -97,7 +97,7 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) { - return DIV_ROUND_UP(bits, blocksize * 8); + return DIV_ROUND_UP_POW2(bits, blocksize * 8); } #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ -- cgit v1.2.3 From 8c256fba2b46020004201c500b2a1fbc707a33ef Mon Sep 17 00:00:00 2001 From: Hongling Zeng Date: Wed, 17 Jun 2026 16:50:49 +0800 Subject: cachefiles: Fix double unlock in nomem_d_alloc error path When start_creating() fails and returns -ENOMEM, it has already released the parent directory lock in __start_dirop(): static struct dentry *__start_dirop(...) { ... inode_lock_nested(dir, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(name, parent, lookup_flags); if (IS_ERR(dentry)) inode_unlock(dir); <-- Lock released on error return dentry; } However, the nomem_d_alloc error path in cachefiles_get_directory() unconditionally calls inode_unlock(d_inode(dir)) again, causing a double unlock that corrupts the rwsem state. This is a leftover from commit 7ab96df840e60 which replaced manual locking with start_creating() but failed to update the nomem_d_alloc path (while correctly updating mkdir_error and lookup_error paths). 
Fixes: 7ab96df840e6 ("VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating()") Signed-off-by: Hongling Zeng Link: https://patch.msgid.link/20260617085049.730789-1-zenghongling@kylinos.cn Signed-off-by: Christian Brauner (Amutable) --- fs/cachefiles/namei.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 2937db690b40..2c46f0decb02 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -209,7 +209,6 @@ lookup_error: return ERR_PTR(ret); nomem_d_alloc: - inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3 From fd5637a2fe6dd4448392738691d63e5559fafb12 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 9 Jun 2026 20:46:56 +0200 Subject: ovl: fix comment about locking order Forgot to update the comment when we changed the locking order. Fixes: 162d06444070c ("ovl: reorder ovl_want_write() after ovl_inode_lock()") Signed-off-by: Amir Goldstein Link: https://patch.msgid.link/20260609184656.1916631-1-amir73il@gmail.com Signed-off-by: Christian Brauner (Amutable) --- fs/overlayfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 00c69707bda9..bc71231cad53 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -783,8 +783,8 @@ static const struct address_space_operations ovl_aops = { * * This chain is valid: * - inode->i_rwsem (inode_lock[2]) - * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) * * And this chain is valid: @@ -797,8 +797,8 @@ static const struct address_space_operations ovl_aops = { * held, because it is in reverse order of the non-nested case using the same * upper fs: * - inode->i_rwsem (inode_lock[1]) - * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) * - OVL_I(inode)->lock (ovl_inode_lock[1]) + * - 
upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) */ #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH -- cgit v1.2.3 From 6a2875517c778ac1111b6920e94cbab91cda8724 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 16 Jun 2026 18:33:46 +0200 Subject: fat: stop reading directory entries past the end-of-directory marker The FAT specification[1] (FAT Directory Structure -> "DIR_Name[0]") states: If DIR_Name[0] == 0x00, then the directory entry is free (same as for 0xE5), and there are no allocated directory entries after this one (all of the DIR_Name[0] bytes in all of the entries after this one are also set to 0). The special 0 value, rather than the 0xE5 value, indicates to FAT file system driver code that the rest of the entries in this directory do not need to be examined because they are all free. Linux did not honour this. fat_get_entry() kept advancing past the 0x00 terminator; if the trailing on-disk slots were not zero-filled (buggy formatters, read-only media written by other operating systems, on-disk corruption) the driver surfaced arbitrary bytes as real directory entries. On a typical affected image, `ls /mnt` returns ~150 bogus entries with random binary names, multi-gigabyte sizes, dates ranging from 1980 to 2106, and a flood of -EIO from stat(). Earlier attempts (v1..v3, see [2][3][4]) added `de->name[0] == 0` guards at each call site. As Hirofumi pointed out on v3, those guards reject the entry but fat_get_entry() has already advanced *pos past it; the next readdir() resumes after the marker and walks straight back into the garbage. His suggestion was to centralise the check. This patch: * Adds fat_get_entry_eod(), a small wrapper around fat_get_entry() that returns -1 when name[0] == 0 and seeks *pos to dir->i_size. 
Per spec every slot after the 0x00 marker is also zero, so jumping to the end of the directory is correct: subsequent reads return -1 from fat_bmap() without re-fetching trailing zero slots, and callers persisting *pos across invocations (notably readdir's ctx->pos) keep reporting end-of-directory on re-entry. * Converts the read/search paths to use the new wrapper: fat_parse_long(), fat_search_long(), __fat_readdir(), and fat_get_short_entry() -- the last covers fat_get_dotdot_entry(), fat_dir_empty(), fat_subdirs(), fat_scan(), and fat_scan_logstart() transitively. * Leaves fat_add_entries() and __fat_remove_entries() on raw fat_get_entry(): the write paths legitimately need to operate on free/zero slots. fat_add_entries() additionally detects an allocated entry past a 0x00 marker (the spec violation that produces the garbage) and treats it as filesystem corruption: fat_fs_error_ratelimit() is called -- which honours the configured errors= mount option (panic / remount-ro / continue) -- and the operation returns -EIO so we don't write fresh entries into an already-corrupt directory. 
[1] https://download.microsoft.com/download/1/6/1/161ba512-40e2-4cc9-843a-923143f3456c/fatgen103.doc [2] https://lore.kernel.org/lkml/20181207013410.7050-1-mcroce@redhat.com/ [3] https://lore.kernel.org/lkml/20181216231510.26854-1-mcroce@redhat.com/ [4] https://lore.kernel.org/lkml/20190201001408.7453-1-mcroce@redhat.com/ Reported-by: Timothy Redaelli Suggested-by: OGAWA Hirofumi Signed-off-by: Matteo Croce Link: https://patch.msgid.link/20260616163346.32603-1-technoboy85@gmail.com Acked-by: OGAWA Hirofumi Signed-off-by: Christian Brauner (Amutable) --- fs/fat/dir.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4f6f42f33613..c6cca5d00ffd 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -130,6 +130,31 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, return fat__get_entry(dir, pos, bh, de); } +/* + * Like fat_get_entry(), but honour the FAT end-of-directory marker: + * a dirent whose first name byte is NUL terminates iteration per the + * spec, which also guarantees that every following slot is zeroed. + * Skip straight to the end of the directory so the next call returns + * -1 from fat_bmap() without re-reading the trailing zero slots, and + * so callers that persist *pos across invocations (e.g. readdir's + * ctx->pos) keep reporting EOD. Release *bh and set it to NULL to + * match fat_get_entry()'s contract that *bh is NULL on the -1 return. + */ +static int fat_get_entry_eod(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + int err = fat_get_entry(dir, pos, bh, de); + + if (err == 0 && (*de)->name[0] == 0) { + brelse(*bh); + *bh = NULL; + *pos = dir->i_size; + return -1; + } + return err; +} + /* * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII. 
* If uni_xlate is enabled and we can't get a 1:1 conversion, use a @@ -327,7 +352,7 @@ parse_long: if (ds->id & 0x40) (*unicode)[offset + 13] = 0; - if (fat_get_entry(dir, pos, bh, de) < 0) + if (fat_get_entry_eod(dir, pos, bh, de) < 0) return PARSE_EOF; if (slot == 0) break; @@ -489,7 +514,7 @@ int fat_search_long(struct inode *inode, const unsigned char *name, err = -ENOENT; while (1) { - if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + if (fat_get_entry_eod(inode, &cpos, &bh, &de) == -1) goto end_of_dir; parse_record: nr_slots = 0; @@ -601,7 +626,7 @@ static int __fat_readdir(struct inode *inode, struct file *file, bh = NULL; get_new: - if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + if (fat_get_entry_eod(inode, &cpos, &bh, &de) == -1) goto end_of_dir; parse_record: nr_slots = 0; @@ -885,7 +910,7 @@ static int fat_get_short_entry(struct inode *dir, loff_t *pos, struct buffer_head **bh, struct msdos_dir_entry **de) { - while (fat_get_entry(dir, pos, bh, de) >= 0) { + while (fat_get_entry_eod(dir, pos, bh, de) >= 0) { /* free entry or long name entry or volume label */ if (!IS_FREE((*de)->name) && !((*de)->attr & ATTR_VOLUME)) return 0; @@ -1302,6 +1327,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct msdos_dir_entry *de; int err, free_slots, i, nr_bhs; loff_t pos; + bool saw_eod; sinfo->nr_slots = nr_slots; @@ -1310,12 +1336,15 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, bh = prev = NULL; pos = 0; err = -ENOSPC; + saw_eod = false; while (fat_get_entry(dir, &pos, &bh, &de) > -1) { /* check the maximum size of directory */ if (pos >= FAT_MAX_DIR_SIZE) goto error; if (IS_FREE(de->name)) { + if (de->name[0] == 0) + saw_eod = true; if (prev != bh) { get_bh(bh); bhs[nr_bhs] = prev = bh; @@ -1325,6 +1354,13 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, if (free_slots == nr_slots) goto found; } else { + if (saw_eod) { + fat_fs_error_ratelimit(sb, + "allocated dir entry found after 
end-of-directory marker (i_pos %lld)", + MSDOS_I(dir)->i_pos); + err = -EIO; + goto error; + } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); prev = NULL; -- cgit v1.2.3 From 704d48d81dc41470e108811c32c577ada66192d4 Mon Sep 17 00:00:00 2001 From: Farhad Alemi Date: Mon, 1 Jun 2026 20:10:08 -0700 Subject: freevxfs: don't BUG() on unknown typed-extent type vxfs_bmap_typed() handles four typed-extent types and calls BUG() in its default case, so an on-disk typed extent with any other type value crashes the kernel. It is reachable from ioctl(FIBMAP) on a regular file: kernel BUG at fs/freevxfs/vxfs_bmap.c:230! RIP: vxfs_bmap_typed fs/freevxfs/vxfs_bmap.c:230 [inline] vxfs_bmap1+0x128a/0x12d0 fs/freevxfs/vxfs_bmap.c:257 Replace the BUG() with WARN_ON_ONCE() and return 0 -- the value vxfs_bmap_typed() already returns on failure (and from the DEV4 case above); vxfs_getblk() maps 0 to -EIO, so the ioctl fails cleanly. Reported-by: Farhad Alemi Signed-off-by: Farhad Alemi Link: https://patch.msgid.link/CA+0ovChveuAwv=t15dr2m09E32bM48hHJxvfeEYZOhdNiEc9Tw@mail.gmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Amutable) --- fs/freevxfs/vxfs_bmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index e85222892038..1b8216eb1d90 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -227,7 +227,8 @@ vxfs_bmap_typed(struct inode *ip, long iblock) return 0; } default: - BUG(); + WARN_ON_ONCE(1); + return 0; } } -- cgit v1.2.3 From 18227a6bc98bd0ba96ed3ce9d5b28776a5a28dfc Mon Sep 17 00:00:00 2001 From: Bryam Vargas Date: Fri, 19 Jun 2026 04:38:20 -0500 Subject: orangefs: keep the readdir entry size 64-bit in fill_from_part() fill_from_part() computes the size of a directory entry in size_t but stores it in a __u32. 
An entry length near U32_MAX wraps it to a small value, bypasses the bounds check, and is then used to index the entry, reading far past the directory part -- an out-of-bounds read that oopses the kernel. Compute the size as a u64 so it cannot truncate; the bounds check then rejects the entry. The trailer is supplied by the userspace client. Fixes: 480e3e532e31 ("orangefs: support very large directories") Cc: stable@vger.kernel.org Signed-off-by: Bryam Vargas Link: https://patch.msgid.link/20260619-b4-disp-50d2bd59-v1-1-ce332969b4a2@proton.me Signed-off-by: Christian Brauner (Amutable) --- fs/orangefs/dir.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 6e2ebc8b9867..115b2c2f5269 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -191,7 +191,8 @@ static int fill_from_part(struct orangefs_dir_part *part, { const int offset = sizeof(struct orangefs_readdir_response_s); struct orangefs_khandle *khandle; - __u32 *len, padlen; + __u32 *len; + u64 padlen; loff_t i; char *s; i = ctx->pos & ~PART_MASK; @@ -215,8 +216,8 @@ static int fill_from_part(struct orangefs_dir_part *part, * len is the size of the string itself. padlen is the * total size of the encoded string. */ - padlen = (sizeof *len + *len + 1) + - (8 - (sizeof *len + *len + 1)%8)%8; + padlen = (u64)sizeof *len + *len + 1; + padlen += (8 - padlen % 8) % 8; if (part->len < i + padlen + sizeof *khandle) goto next; s = (void *)part + offset + i + sizeof *len; -- cgit v1.2.3 From 3f8c65b06fafc3f779abda5f7b81707411d05d4c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jun 2026 11:32:27 +0200 Subject: bpf: have bpf_real_data_inode() take a struct file bpf_real_data_inode() must be usable from the bprm_check_security, mmap_file and file_mprotect hooks for systemd's RestrictFilesystemAccess BPF LSM program, so have it take a struct file instead of a dentry. 
Amir Goldstein suggests: While doing so, rename it from bpf_real_inode() to bpf_real_data_inode(). For a regular file on a union/overlay filesystem it resolves to the underlying inode that hosts the data, but for a non-regular file it returns the overlay inode. The new name makes the "inode hosting the data" intent explicit and avoids the ambiguity of "the real inode backing a file". Document the non-regular-file behavior in the kfunc too. Both the signature change and the rename are safe because the kfunc landed this cycle and has no released users. Link: https://patch.msgid.link/20260623-work-bpf-real_inode-v2-1-8e8b57dd25f7@kernel.org Fixes: 9af8c8a54f6e ("bpf: add bpf_real_inode() kfunc") Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Amutable) --- fs/bpf_fs_kfuncs.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 768aca2dc0f0..f1863a891db6 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -360,18 +360,23 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s #endif /* CONFIG_CGROUPS */ /** - * bpf_real_inode - get the real inode backing a dentry - * @dentry: dentry to resolve + * bpf_real_data_inode - get the real inode hosting a file's data + * @file: file to resolve * - * If the dentry is on a union/overlay filesystem, return the underlying, real - * inode that hosts the data. Otherwise return the inode attached to the - * dentry itself. + * Resolve @file to the inode that hosts its data. For a regular file on a + * union/overlay filesystem this is the underlying (upper or lower) inode that + * stores the data, not the overlay inode. * - * Return: The real inode backing the dentry, or NULL for a negative dentry. + * Data resolution only applies to regular files. For a non-regular file (e.g. 
+ * a device node, fifo or socket) on a union/overlay filesystem the overlay + * inode itself is returned; for any file on a non-union filesystem the inode + * attached to @file is returned. + * + * Return: The inode hosting @file's data, or NULL. */ -__bpf_kfunc struct inode *bpf_real_inode(struct dentry *dentry) +__bpf_kfunc struct inode *bpf_real_data_inode(struct file *file) { - return d_real_inode(dentry); + return d_real_inode(file_dentry(file)); } __bpf_kfunc_end_defs(); @@ -384,7 +389,7 @@ BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_real_inode, KF_SLEEPABLE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_real_data_inode, KF_SLEEPABLE | KF_RET_NULL) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) -- cgit v1.2.3 From 597a7bc7630035580e941a548cb646618c1c5933 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 16 Jun 2026 16:08:17 +0200 Subject: xfs: fix the error unwind in xfs_open_devices() Since the rt and log block devices are closed in xfs_free_buftarg() the buftarg owns the device file. The error unwind does not respect that: when the log buftarg allocation fails, out_free_rtdev_targ frees the rt buftarg - releasing rtdev_file - and then falls through to out_close_rtdev and releases it a second time. The unwind also leaves mp->m_rtdev_targp and mp->m_ddev_targp pointing to the freed buftargs. The failed mount continues into deactivate_locked_super() -> xfs_kill_sb() -> xfs_mount_free(), which frees them again. Clear the buftarg pointers once the unwind freed them and clear rtdev_file once the rt buftarg owns it, so nothing is released twice. 
Reachable when a buftarg allocation fails after the data buftarg was set up: an I/O error in sync_blockdev() or an allocation failure in xfs_init_buftarg() while mounting with external rt and log devices. Link: https://patch.msgid.link/20260616-work-super-bdev_holder_global-v2-1-7df6b864028e@kernel.org Fixes: 41233576e9a4 ("xfs: close the RT and log block devices in xfs_free_buftarg") Signed-off-by: Christian Brauner (Amutable) --- fs/xfs/xfs_super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index eac7f9503805..8531d526fc44 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -534,8 +534,11 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) xfs_free_buftarg(mp->m_rtdev_targp); + mp->m_rtdev_targp = NULL; + rtdev_file = NULL; /* released by xfs_free_buftarg() */ out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); + mp->m_ddev_targp = NULL; out_close_rtdev: if (rtdev_file) bdev_fput(rtdev_file); -- cgit v1.2.3 From 55ec50d046c03b3724741957f7b007856e36dbe7 Mon Sep 17 00:00:00 2001 From: Morduan Zang Date: Wed, 24 Jun 2026 14:26:22 +0800 Subject: iomap: guard io_size EOF trim against concurrent truncate underflow Commit 51d20d1dacbe ("iomap: fix zero padding data issue in concurrent append writes") changed ioend accounting so that io_size tracks only valid data within EOF. This trims io_size when a writeback range extends past end_pos: ioend->io_size += map_len; if (ioend->io_offset + ioend->io_size > end_pos) ioend->io_size = end_pos - ioend->io_offset; However, if end_pos ends up below ioend->io_offset, the subtraction becomes negative and is stored in size_t io_size, causing an unsigned wrap to a huge value. This can happen when writeback continues past byte-level EOF up to a block-aligned range, or when a concurrent truncate shrinks the file after end_pos was sampled in iomap_writeback_handle_eof().
A wrapped io_size can mislead append detection and corrupt completion-time size handling, since filesystem end_io paths consume io_size for decisions such as on-disk EOF updates and unwritten/COW completion ranges. Fix this by clamping io_size to zero when EOF has moved to or before the ioend start offset. This preserves the original intent of trimming io_size to valid in-EOF data while avoiding the underflow. Fixes: 51d20d1dacbe ("iomap: fix zero padding data issue in concurrent append writes") Suggested-by: Christoph Hellwig Signed-off-by: Morduan Zang Link: https://patch.msgid.link/9E38E2659B47DC2A+20260624062622.337469-1-zhangdandan@uniontech.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Amutable) --- fs/iomap/ioend.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index f7c3e0c70fd7..0565328764c1 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -298,8 +298,12 @@ new_ioend: * appending writes. */ ioend->io_size += map_len; - if (ioend->io_offset + ioend->io_size > end_pos) - ioend->io_size = end_pos - ioend->io_offset; + if (ioend->io_offset + ioend->io_size > end_pos) { + if (ioend->io_offset >= end_pos) + ioend->io_size = 0; + else + ioend->io_size = end_pos - ioend->io_offset; + } wbc_account_cgroup_owner(wpc->wbc, folio, map_len); return map_len; -- cgit v1.2.3 From f718c9fa87bec45eca57189aa05647741ae9eb14 Mon Sep 17 00:00:00 2001 From: Alan Urmancheev Date: Tue, 23 Jun 2026 01:23:22 -0400 Subject: exec: fix off-by-one in binfmt max rewrite depth comment The loop in exec_binprm() permits depth values 0 through 5, up to 5 successive binfmt rewrites (setting bprm->interpreter) until the 6th one would fail on depth > 5 and return -ELOOP. The comment claimed 4 levels, which was wrong. Adjusting the code to allow only 4 rewrites would be breaking userland, so fix the comment and not the code. 
Reproducer (a chain of shebanged scripts followed by an ELF binary): #!/bin/sh tmp=$(mktemp -d) echo $tmp cd $tmp mk () { echo $2 > $1; chmod +x $1; } for i in $(seq 4); do mk $i "#!$((i + 1))" done mk 5 '#!/bin/true' ./1 && echo '5 binfmt rewrites OK (1 -> 2 -> 3 -> 4 -> 5 -> /bin/true)' mk 5 '#!6' mk 6 '#!/bin/true' ./1 || echo '6 binfmt rewrites KO (1 -> 2 -> 3 -> 4 -> 5 -> 6 -> /bin/true)' Signed-off-by: Alan Urmancheev Link: https://patch.msgid.link/20260623052322.74711-1-alan.urman@gmail.com Signed-off-by: Christian Brauner (Amutable) --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index b92fe7db176c..d5993cedc829 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1717,7 +1717,7 @@ static int exec_binprm(struct linux_binprm *bprm) old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); - /* This allows 4 levels of binfmt rewrites before failing hard. */ + /* This allows 5 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) -- cgit v1.2.3 From b61cbeadaa83a712afb2f759aa7e65d43cdef322 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:19 +0100 Subject: netfs: Fix decision whether to disallow write-streaming due to fscache use netfs_perform_write() buffers data by writing it into the pagecache for later writeback. If the folio it wants to write to isn't present, it uses "write streaming" in which it will store partial data in a non-uptodate, but dirty folio. However, when fscache is in use, this is a potential problem as writes to the cache have to be aligned to the cache backend's DIO granularity, and so netfs_perform_write() attempts to suppress write-streaming in such a case, requiring the folio content to be fetched first unless the entire folio is going to be overwritten. This allows the content to be written to the cache too.
Unfortunately, the test netfs_perform_write() uses isn't correct because it doesn't take into account the fact that the object lookup is asynchronous and farmed off to a work queue, so there's a short window in which the cache is doing a lookup but the test fails because the answer is undefined. This can be triggered by the generic/464 xfstest, and causes a warning to be emitted in cachefiles (in code not yet upstream) because it sees a write that doesn't have its bounds rounded out to DIO alignment. Fix this by changing the condition to whether FSCACHE_COOKIE_IS_CACHING is set on a cookie rather than whether the cookie is marked enabled. Note that this is really just a hint as to whether we allow write streaming or not and no other aspects of the cookie or cache object are accessed. Also apply the same fix to netfs_write_begin(). Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-2-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/buffered_read.c | 2 +- fs/netfs/buffered_write.c | 2 +- fs/netfs/internal.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 76d0f6a29aba..24a8a5418e31 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -659,7 +659,7 @@ retry: * within the cache granule containing the EOF, in which case we need * to preload the granule. 
*/ - if (!netfs_is_cache_enabled(ctx) && + if (!netfs_is_cache_maybe_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); goto have_folio_no_wait; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 6bde3320bcec..2cdb68e6b16f 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -277,7 +277,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * caching service temporarily because the backing store got * culled. */ - if (netfs_is_cache_enabled(ctx)) { + if (netfs_is_cache_maybe_enabled(ctx)) { if (finfo) { netfs_stat(&netfs_n_wh_wstream_conflict); goto flush_content; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 645996ecfc80..d889caa401dc 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -239,6 +239,18 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) #endif } +static inline bool netfs_is_cache_maybe_enabled(struct netfs_inode *ctx) +{ +#if IS_ENABLED(CONFIG_FSCACHE) + struct fscache_cookie *cookie = ctx->cache; + + return fscache_cookie_valid(cookie) && + test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags); +#else + return false; +#endif +} + /* * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap). */ -- cgit v1.2.3 From dbd6f56d975b23241b7bbb11bb8f562af548a0aa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:20 +0100 Subject: netfs: Fix netfs_create_write_req() to handle async cache object creation netfs_create_write_req() will skip caching if the fscache cookie is disabled, but this is a problem because async cache object creation might not have got far enough yet for the cookie to have been enabled - thereby causing the call to fscache_begin_write_operation() to be skipped. Fix this by removing the checks on the cookie and delegating this to fscache_begin_write_operation().
Fixes: 7b589a9b45ae ("netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags") Closes: https://sashiko.dev/#/patchset/20260624115737.2964520-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-3-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/write_issue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index c03c7cc45e47..4f55228f0fd4 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -106,7 +106,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (is_cacheable && netfs_is_cache_enabled(ictx)) + if (is_cacheable) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0) goto nomem; -- cgit v1.2.3 From af6830cc12dfe86c832dccc9c9878a93aaa22f83 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:21 +0100 Subject: cachefiles: Fix double fput Fix a double fput() in error handling in cachefiles_create_tmpfile(). 
Link: https://sashiko.dev/#/patchset/20260608145432.681865-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-4-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/cachefiles/namei.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 2c46f0decb02..67793898148b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -466,7 +466,6 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { - fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); goto err_unuse; } -- cgit v1.2.3 From 511a018ed2afd8d415edd307ce7ad2048506f6a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:22 +0100 Subject: cachefiles: Fix file burial to take lock when unsetting S_KERNEL_FILE Fix cachefiles_bury_object() to lock the inode of the file being buried whilst it unsets the S_KERNEL_FILE flag. 
Fixes: 07a90e97400c ("cachefiles: Implement culling daemon commands") Closes: https://sashiko.dev/#/patchset/20260616100821.2062304-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-5-dhowells@redhat.com cc: Paulo Alcantara cc: NeilBrown cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/cachefiles/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 67793898148b..8a9f6be15828 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -374,7 +374,7 @@ try_again: "Rename failed with error %d", ret); } - __cachefiles_unmark_inode_in_use(object, d_inode(rep)); + cachefiles_do_unmark_inode_in_use(object, d_inode(rep)); end_renaming(&rd); _leave(" = 0"); return 0; -- cgit v1.2.3 From 55f4bb9373ca4a521f3b0119366db92715a39b81 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:23 +0100 Subject: iov_iter: Fix potential underflow in iov_iter_extract_xarray_pages() In iov_iter_extract_xarray_pages(), if no pages are extracted because there's a hole (or something otherwise unextractable) in the xarray, then the calculation of maxsize at the end can go wrong if the starting offset is not zero. Fix this by returning 0 in such a case and freeing the page array if allocated here rather than being passed in. Note that in the near future, ITER_XARRAY should be removed. 
Fixes: 7d58fe731028 ("iov_iter: Add a function to extract a page list from an iterator") Link: https://sashiko.dev/#/patchset/20260608145432.681865-1-dhowells%40redhat.com Link: https://sashiko.dev/#/patchset/20260616100821.2062304-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-6-dhowells@redhat.com Reviewed-by: Christoph Hellwig cc: Paulo Alcantara cc: Matthew Wilcox cc: Christoph Hellwig cc: Jens Axboe cc: Mike Marshall cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- lib/iov_iter.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 273919b16161..0f320b4e82a8 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1568,6 +1568,7 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct folio *folio; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; + bool will_alloc = !*pages; XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT); offset = pos & ~PAGE_MASK; @@ -1595,6 +1596,14 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, } rcu_read_unlock(); + if (!nr) { + if (will_alloc) { + kvfree(*pages); + *pages = NULL; + } + return 0; + } + maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; -- cgit v1.2.3 From 70531f4f3a143f81baf549da7f59a24a9f87a65c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:24 +0100 Subject: iov_iter: Fix missing alloc fail check in iov_iter_extract_bvec_pages() Fix iov_iter_extract_bvec_pages() to check if want_pages_array() fails and, if so, return -ENOMEM appropriately. 
Fixes: e4e535bff2bc ("iov_iter: don't require contiguous pages in iov_iter_extract_bvec_pages") Link: https://sashiko.dev/#/patchset/20260608145432.681865-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-7-dhowells@redhat.com Reviewed-by: Christoph Hellwig cc: Ming Lei cc: Paulo Alcantara cc: Matthew Wilcox cc: Christoph Hellwig cc: Jens Axboe cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- lib/iov_iter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 0f320b4e82a8..3dfad70328eb 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1637,6 +1637,8 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); + if (!maxpages) + return -ENOMEM; while (bi.bi_size && bi.bi_idx < i->nr_segs) { struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); -- cgit v1.2.3 From 72698020e15db16fc141e191b460bc335263b0ad Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:25 +0100 Subject: iov_iter: Fix a memory leak in iov_iter_extract_user_pages() There's a potential memory leak in callers of iov_iter_extract_user_pages() whereby if a pages array is allocated in the function, it isn't freed before returning an error or 0. Now, it's not a leak per se in iov_iter_extract_user_pages() as, if an array is allocated, it's returned through *pages, so it's incumbent on the caller to free it. However, not all callers do. Fix this by freeing the table and clearing *pages before returning an error or 0. Note that iov_iter_extract_pages() and its subfunctions are allowed to return 0 without returning an array (for instance if the iterator count is 0).
Fixes: 7d58fe731028 ("iov_iter: Add a function to extract a page list from an iterator") Closes: https://sashiko.dev/#/patchset/20260616100821.2062304-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-8-dhowells@redhat.com Reviewed-by: Christoph Hellwig cc: Paulo Alcantara cc: Matthew Wilcox cc: Christoph Hellwig cc: Jens Axboe cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- lib/iov_iter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 3dfad70328eb..c2484551a4e8 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1756,6 +1756,7 @@ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, unsigned long addr; unsigned int gup_flags = 0; size_t offset; + bool will_alloc = !*pages; int res; if (i->data_source == ITER_DEST) @@ -1772,8 +1773,14 @@ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); - if (unlikely(res <= 0)) + if (unlikely(res <= 0)) { + if (will_alloc) { + kvfree(*pages); + *pages = NULL; + } return res; + } + maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; -- cgit v1.2.3 From 0442e23a5f72c74ba18882e4a2eed305c687009d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:26 +0100 Subject: iov_iter: Remove unused variable in kunit_iov_iter.c Remove the no longer used variable 'b' from iov_kunit_copy_to_bvec(). The variable is initialised and incremented, but nothing now makes use of the value. 
Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-9-dhowells@redhat.com Reviewed-by: Christoph Hellwig cc: Ming Lei cc: Paulo Alcantara cc: Matthew Wilcox cc: Christoph Hellwig cc: Jens Axboe cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- lib/tests/kunit_iov_iter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/tests/kunit_iov_iter.c b/lib/tests/kunit_iov_iter.c index 1e6fce9cb255..d9690ba1db88 100644 --- a/lib/tests/kunit_iov_iter.c +++ b/lib/tests/kunit_iov_iter.c @@ -283,7 +283,7 @@ static void __init iov_kunit_copy_to_bvec(struct kunit *test) struct page **spages, **bpages; u8 *scratch, *buffer; size_t bufsize, npages, size, copied; - int i, b, patt; + int i, patt; bufsize = 0x100000; npages = bufsize / PAGE_SIZE; @@ -306,10 +306,9 @@ static void __init iov_kunit_copy_to_bvec(struct kunit *test) KUNIT_EXPECT_EQ(test, iter.nr_segs, 0); /* Build the expected image in the scratch buffer. */ - b = 0; patt = 0; memset(scratch, 0, bufsize); - for (pr = bvec_test_ranges; pr->from >= 0; pr++, b++) { + for (pr = bvec_test_ranges; pr->from >= 0; pr++) { u8 *p = scratch + pr->page * PAGE_SIZE; for (i = pr->from; i < pr->to; i++) -- cgit v1.2.3 From 2bcd3ab3728752425ff5ab1e4be1698eba13d0d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:27 +0100 Subject: scatterlist: Fix offset in folio calc in extract_xarray_to_sg() Fix the calculation of the offset in the folio being extracted in extract_xarray_to_sg(). Note that in the near future, ITER_XARRAY should be removed. 
Fixes: f5f82cd18732 ("Move netfs_extract_iter_to_sg() to lib/scatterlist.c") Link: https://sashiko.dev/#/patchset/20260608145432.681865-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-10-dhowells@redhat.com Reviewed-by: Christoph Hellwig cc: Paulo Alcantara cc: Matthew Wilcox cc: Christoph Hellwig cc: Jens Axboe cc: Mike Marshall cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- lib/scatterlist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b7fe91ef35b8..6ea40d2e6247 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -1366,6 +1366,7 @@ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, sg_max--; maxsize -= len; + start += len; ret += len; if (maxsize <= 0 || sg_max == 0) break; -- cgit v1.2.3 From fa746e23d1094f9a68afe5973746b0e32078fd8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:28 +0100 Subject: netfs: Fix kdoc warning Fix a kdoc warning due to a misnamed parameter in the description. 
Reported-by: Matthew Wilcox Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-11-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 243c0f737938..bdc270e84b30 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -753,7 +753,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, /** * netfs_resize_file - Note that a file got resized - * @ctx: The netfs inode being resized + * @ictx: The netfs inode being resized * @new_i_size: The new file size * @changed_on_server: The change was applied to the server * -- cgit v1.2.3 From 41376400c4717fed43490030902f9e4c9062b285 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:29 +0100 Subject: netfs: Replace wb_lock with a bit lock for asynchronicity The netfs_inode::wb_lock mutex is used to prevent multiple simultaneous writebacks from fighting each other (a writeback thread will write multiple discontiguous regions within the same request). The mutex, however, only serialises the issuing of subrequests; it doesn't serialise the collection of results, and, in particular, the updating of file size information and fscache populatedness data. Unfortunately, the mutex cannot be held around the entire process as it has to be unlocked in the same thread in which it is locked - and we don't want to hold up the allocator whilst we complete the writeback. Fix this by replacing the mutex with a bit flag and a list of lock waiters so that the lock can be dropped in the collector thread after collection is complete. 
Link: https://sashiko.dev/#/patchset/20260608145432.681865-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-12-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/afs/symlink.c | 4 +- fs/netfs/locking.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/netfs/write_collect.c | 10 +++++ fs/netfs/write_issue.c | 37 +++++-------------- include/linux/netfs.h | 11 +++++- 5 files changed, 126 insertions(+), 31 deletions(-) diff --git a/fs/afs/symlink.c b/fs/afs/symlink.c index ed5868369f37..16b4823cb7b7 100644 --- a/fs/afs/symlink.c +++ b/fs/afs/symlink.c @@ -255,11 +255,11 @@ int afs_symlink_writepages(struct address_space *mapping, } if (ret == 0) { - mutex_lock(&vnode->netfs.wb_lock); + netfs_wb_begin(&vnode->netfs, false); netfs_free_folioq_buffer(vnode->directory); vnode->directory = NULL; vnode->directory_size = 0; - mutex_unlock(&vnode->netfs.wb_lock); + netfs_wb_end(&vnode->netfs); } else if (ret == 1) { ret = 0; /* Skipped write due to lock conflict. */ } diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c index 2249ecd09d0a..4e3be2b81504 100644 --- a/fs/netfs/locking.c +++ b/fs/netfs/locking.c @@ -9,6 +9,11 @@ #include #include "internal.h" +struct netfs_wb_waiter { + struct list_head link; /* Link in ictx->wb_queue */ + struct task_struct *waiter; /* Waiter task; cleared when lock granted */ +}; + /* * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish * @inode: inode to wait for @@ -203,3 +208,93 @@ void netfs_end_io_direct(struct inode *inode) up_read(&inode->i_rwsem); } EXPORT_SYMBOL(netfs_end_io_direct); + +/* + * Wait to have exclusive access to writeback. 
+ */ +static bool netfs_wb_begin_wait(struct netfs_inode *ictx) +{ + struct netfs_wb_waiter waiter = {}; + struct task_struct *tsk = current; + bool got = false; + + spin_lock(&ictx->lock); + + if (test_and_set_bit_lock(NETFS_ICTX_WB_LOCK, &ictx->flags)) { + get_task_struct(tsk); + waiter.waiter = tsk; + list_add_tail(&waiter.link, &ictx->wb_queue); + } else { + got = true; + } + spin_unlock(&ictx->lock); + + if (!got) { + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + /* Read waiter before accessing inode state. */ + if (smp_load_acquire(&waiter.waiter) == NULL) + break; + schedule(); + } + } + __set_current_state(TASK_RUNNING); + return true; +} + +/** + * netfs_wb_begin - Begin writeback, waiting if need be + * @ictx: The inode to get writeback access on + * @nowait: Return failure immediately rather than waiting if true + * + * Begin writeback to an inode, waiting for exclusive access if @nowait is + * false. This prevents collection from being done out of order with respect + * to the issuance of write subrequests. + * + * Note that writeback may be ended in a different process (e.g. the collection + * function on a workqueue) than started it. + * + * Return: True if can proceed, false if denied. + */ +bool netfs_wb_begin(struct netfs_inode *ictx, bool nowait) +{ + if (!test_and_set_bit_lock(NETFS_ICTX_WB_LOCK, &ictx->flags)) + return true; + if (nowait) { + netfs_stat(&netfs_n_wb_lock_skip); + return false; + } + netfs_stat(&netfs_n_wb_lock_wait); + return netfs_wb_begin_wait(ictx); +} +EXPORT_SYMBOL(netfs_wb_begin); + +/* netfs_wb_end - End writeback + * @ictx: The inode we have writeback access to + * + * End writeback access on an inode, waking up the next writeback request. 
+ */ +void netfs_wb_end(struct netfs_inode *ictx) +{ + struct netfs_wb_waiter *waiter; + struct task_struct *tsk; + + WARN_ON_ONCE(!test_bit(NETFS_ICTX_WB_LOCK, &ictx->flags)); + + spin_lock(&ictx->lock); + + waiter = list_first_entry_or_null(&ictx->wb_queue, struct netfs_wb_waiter, link); + if (waiter) { + list_del(&waiter->link); + tsk = waiter->waiter; + /* Write inode state before clearing waiter. */ + smp_store_release(&waiter->waiter, NULL); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + clear_bit_unlock(NETFS_ICTX_WB_LOCK, &ictx->flags); + } + + spin_unlock(&ictx->lock); +} +EXPORT_SYMBOL(netfs_wb_end); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 24fc2bb2f8a4..210eb8f3958d 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -408,6 +408,16 @@ bool netfs_write_collection(struct netfs_io_request *wreq) netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ + switch (wreq->origin) { + case NETFS_WRITEBACK: + case NETFS_WRITEBACK_SINGLE: + case NETFS_WRITETHROUGH: + netfs_wb_end(ictx); + break; + default: + break; + } + if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); wreq->iocb->ki_pos += written; diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 4f55228f0fd4..2473bce37649 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -551,14 +551,8 @@ int netfs_writepages(struct address_space *mapping, struct folio *folio; int error = 0; - if (!mutex_trylock(&ictx->wb_lock)) { - if (wbc->sync_mode == WB_SYNC_NONE) { - netfs_stat(&netfs_n_wb_lock_skip); - return 0; - } - netfs_stat(&netfs_n_wb_lock_wait); - mutex_lock(&ictx->wb_lock); - } + if (!netfs_wb_begin(ictx, wbc->sync_mode == WB_SYNC_NONE)) + return 0; /* Need the first folio to be able to set up the op. 
*/ folio = writeback_iter(mapping, wbc, NULL, &error); @@ -593,8 +587,6 @@ int netfs_writepages(struct address_space *mapping, } while ((folio = writeback_iter(mapping, wbc, folio, &error))); netfs_end_issue_write(wreq); - - mutex_unlock(&ictx->wb_lock); netfs_wake_collector(wreq); netfs_put_request(wreq, netfs_rreq_trace_put_return); @@ -604,7 +596,7 @@ int netfs_writepages(struct address_space *mapping, couldnt_start: netfs_kill_dirty_pages(mapping, wbc, folio); out: - mutex_unlock(&ictx->wb_lock); + netfs_wb_end(ictx); _leave(" = %d", error); return error; } @@ -618,12 +610,12 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len struct netfs_io_request *wreq = NULL; struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp)); - mutex_lock(&ictx->wb_lock); + netfs_wb_begin(ictx, false); wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, NETFS_WRITETHROUGH); if (IS_ERR(wreq)) { - mutex_unlock(&ictx->wb_lock); + netfs_wb_end(ictx); return wreq; } @@ -685,7 +677,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *writethrough_cache) { - struct netfs_inode *ictx = netfs_inode(wreq->inode); ssize_t ret; _enter("R=%x", wreq->debug_id); @@ -699,8 +690,6 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c netfs_end_issue_write(wreq); - mutex_unlock(&ictx->wb_lock); - if (wreq->iocb) ret = -EIOCBQUEUED; else @@ -847,15 +836,10 @@ int netfs_writeback_single(struct address_space *mapping, if (WARN_ON_ONCE(!iov_iter_is_folioq(iter))) return -EIO; - if (!mutex_trylock(&ictx->wb_lock)) { - if (wbc->sync_mode == WB_SYNC_NONE) { - /* The VFS will have undirtied the inode. 
*/ - netfs_single_mark_inode_dirty(&ictx->inode); - netfs_stat(&netfs_n_wb_lock_skip); - return 1; - } - netfs_stat(&netfs_n_wb_lock_wait); - mutex_lock(&ictx->wb_lock); + if (!netfs_wb_begin(ictx, wbc->sync_mode == WB_SYNC_NONE)) { + /* The VFS will have undirtied the inode. */ + netfs_single_mark_inode_dirty(&ictx->inode); + return 1; } wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE); @@ -893,7 +877,6 @@ stop: smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); - mutex_unlock(&ictx->wb_lock); netfs_wake_collector(wreq); netfs_put_request(wreq, netfs_rreq_trace_put_return); @@ -901,7 +884,7 @@ stop: return ret; couldnt_start: - mutex_unlock(&ictx->wb_lock); + netfs_wb_end(ictx); _leave(" = %d", ret); return ret; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index bdc270e84b30..1bc120d61c5b 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -61,14 +61,16 @@ struct netfs_inode { #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cache; #endif - struct mutex wb_lock; /* Writeback serialisation */ + struct list_head wb_queue; /* Queue of processes wanting to do writeback */ loff_t _remote_i_size; /* Size of the remote file */ loff_t _zero_point; /* Size after which we assume there's no data * on the server */ + spinlock_t lock; /* Lock covering wb_queue */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ +#define NETFS_ICTX_WB_LOCK 2 /* Writeback serialisation lock */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ #define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */ }; @@ -462,6 +464,10 @@ int netfs_alloc_folioq_buffer(struct address_space *mapping, size_t *_cur_size, ssize_t size, gfp_t gfp); void netfs_free_folioq_buffer(struct folio_queue *fq); +/* 
Writeback exclusion API. */ +bool netfs_wb_begin(struct netfs_inode *ictx, bool nowait); +void netfs_wb_end(struct netfs_inode *ictx); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query @@ -743,7 +749,8 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif - mutex_init(&ctx->wb_lock); + INIT_LIST_HEAD(&ctx->wb_queue); + spin_lock_init(&ctx->lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { ctx->_zero_point = ctx->_remote_i_size; -- cgit v1.2.3 From ba6a9f6533c77c628eef0c0c5c19cd316e2be1b4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:30 +0100 Subject: netfs: Fix writethrough to use collection offload Fix writethrough write to set NETFS_RREQ_OFFLOAD_COLLECTION on the request so that collection is processed asynchronously rather than only right at the end - and also so that asynchronous O_SYNC writes get collected at all. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Closes: https://sashiko.dev/#/patchset/20260616100821.2062304-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-13-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/write_issue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 2473bce37649..3b363ce12f3f 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -620,6 +620,7 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len } wreq->io_streams[0].avail = true; + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); trace_netfs_write(wreq, netfs_write_trace_writethrough); return wreq; } -- cgit v1.2.3 From ac5f95ac5d6d0f4c567b8b642825705a2bf0d79e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:31 +0100 
Subject: netfs: Fix writeback error handling Fix the error handling in writeback_iter() loop. If an error occurs, writeback_iter() needs to be called again with *error set to the error so that it can clean up iteration state. Further, the current folio needs unlocking and redirtying. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Link: https://sashiko.dev/#/patchset/20260619140646.2633762-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-14-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/write_issue.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 3b363ce12f3f..3682896c3fdf 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -582,8 +582,6 @@ int netfs_writepages(struct address_space *mapping, } error = netfs_write_folio(wreq, wbc, folio); - if (error < 0) - break; } while ((folio = writeback_iter(mapping, wbc, folio, &error))); netfs_end_issue_write(wreq); @@ -594,7 +592,14 @@ int netfs_writepages(struct address_space *mapping, return error; couldnt_start: - netfs_kill_dirty_pages(mapping, wbc, folio); + if (error == -ENOMEM) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + folio = writeback_iter(mapping, wbc, folio, &error); + WARN_ON_ONCE(folio != NULL); + } else { + netfs_kill_dirty_pages(mapping, wbc, folio); + } out: netfs_wb_end(ictx); _leave(" = %d", error); -- cgit v1.2.3 From b6a713fd34b9498ee2164d5d3e8460732a392efc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:32 +0100 Subject: netfs: Fix folio state after ENOMEM whilst under writeback iteration Fix the state of the current folio when ENOMEM occurs during writeback iteration. 
The folio needs to be redirtied and unlocked before the terminal writeback_iter() is invoked. Fixes: 06fa229ceb36 ("netfs: Abstract out a rolling folio buffer implementation") Link: https://sashiko.dev/#/patchset/20260619140646.2633762-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-15-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/write_issue.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 3682896c3fdf..f2761c99795a 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -582,6 +582,10 @@ int netfs_writepages(struct address_space *mapping, } error = netfs_write_folio(wreq, wbc, folio); + if (error == -ENOMEM) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + } } while ((folio = writeback_iter(mapping, wbc, folio, &error))); netfs_end_issue_write(wreq); -- cgit v1.2.3 From 64f04f9789237728be4e1836151848af350d1374 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Jun 2026 15:06:33 +0100 Subject: netfs: Fix DIO write retry for filesystems without a ->prepare_write() Fix netfs_unbuffered_write() so that it doesn't re-issue a write twice when the filesystem doesn't have a ->prepare_write(). The resetting of the iterator and the call to netfs_reissue_write() should just be removed as almost everything it does is done again when the loop it's in goes back to the top. It does, however, still need the IN_PROGRESS flag setting, so that (and the stat inc) are moved out of the if-statement. Further, the MADE_PROGRESS flags should be cleared and wreq->transferred should be updated, so fix those too. 
Reported-by: syzbot+3c74b1f0c372e98efc32@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3c74b1f0c372e98efc32 Signed-off-by: David Howells Link: https://patch.msgid.link/20260625140640.3116900-16-dhowells@redhat.com cc: Paulo Alcantara cc: hongao cc: ChenXiaoSong cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/direct_write.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 25f8ceb15fad..c16fbad286a1 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -166,13 +166,16 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq) */ subreq->error = -EAGAIN; trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - if (subreq->transferred > 0) + if (subreq->transferred > 0) { iov_iter_advance(&wreq->buffer.iter, subreq->transferred); + wreq->transferred += subreq->transferred; + } if (stream->source == NETFS_UPLOAD_TO_SERVER && wreq->netfs_ops->retry_request) wreq->netfs_ops->retry_request(wreq, stream); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); @@ -186,17 +189,10 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq) netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - if (stream->prepare_write) { + if (stream->prepare_write) stream->prepare_write(subreq); - __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_stat(&netfs_n_wh_retry_write_subreq); - } else { - struct iov_iter source; - - netfs_reset_iter(subreq); - source = subreq->io_iter; - netfs_reissue_write(stream, subreq, &source); - } + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); } netfs_unbuffered_write_done(wreq); -- cgit v1.2.3 From 
6c732471740bc2ac9b0946134f9f551dc75f4369 Mon Sep 17 00:00:00 2001 From: David Lee Date: Wed, 1 Jul 2026 11:44:28 +0000 Subject: fhandle: reject detached mounts in capable_wrt_mount() The recent fhandle RCU fix moved the mount namespace capability check into capable_wrt_mount(), so a non-NULL mnt_namespace survives the ns_capable() dereference. The helper still assumes the later READ_ONCE(mount->mnt_ns) must be non-NULL because may_decode_fh() checked is_mounted() first. That assumption is not stable. A detached mount from open_tree(..., OPEN_TREE_CLONE) can be dissolved on fput while open_by_handle_at() is between those checks, and umount_tree() can clear mount->mnt_ns. If the helper observes NULL, it dereferences mnt_ns->user_ns and panics. Return false when the RCU read observes a detached mount. This keeps the relaxed permission path conservative: a mount no longer attached to a namespace cannot authorize open_by_handle_at() access. Fixes: 620c266f3949 ("fhandle: relax open_by_handle_at() permission checks") Cc: stable@vger.kernel.org Signed-off-by: David Lee Assisted-by: LLM Link: https://patch.msgid.link/20260701114438.24431-1-david.lee@trailofbits.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner (Amutable) --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 1ca7eb3a6cb5..f8829231e3d7 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -295,7 +295,7 @@ static bool capable_wrt_mount(struct mount *mount) */ guard(rcu)(); mnt_ns = READ_ONCE(mount->mnt_ns); - return ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN); + return mnt_ns && ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN); } static inline int may_decode_fh(struct handle_to_path_ctx *ctx, -- cgit v1.2.3 From 044472d5ee7d71f918fa3f61bd65e4933a0c006e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2026 14:17:38 +0200 Subject: iomap: consolidate bio submission Add a iomap_bio_submit_read_endio helper factored out of 
iomap_bio_submit_read so that all ->submit_read implementations for iomap_read_ops that use iomap_bio_read_folio_range can share the logic. Right now that logic is mostly trivial, but already has a bug for XFS because the XFS version is too trivial: file system integrity validation needs a workqueue context and thus can't happen from the default iomap bi_end_io I/O handler. Unfortunately the iomap refactoring just before fs integrity landed moved code around here and the call got misplaced, meaning it never got called. The PI information still is verified by the block layer, but the offloading is less efficient (and the future userspace interface can't get at it). Fixes: 0b10a370529c ("iomap: support T10 protection information") Cc: stable@vger.kernel.org # v7.1 Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260629121750.3392300-2-hch@lst.de Acked-by: Namjae Jeon Reviewed-by: "Darrick J. Wong" Reviewed-by: Joanne Koong Signed-off-by: Christian Brauner (Amutable) --- fs/exfat/iomap.c | 5 +---- fs/iomap/bio.c | 13 ++++++++++--- fs/ntfs/aops.c | 6 ++---- fs/ntfs3/inode.c | 5 +---- fs/xfs/xfs_aops.c | 3 +-- include/linux/iomap.h | 2 ++ 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/exfat/iomap.c b/fs/exfat/iomap.c index 1aac38e63fe6..190fc6471f84 100644 --- a/fs/exfat/iomap.c +++ b/fs/exfat/iomap.c @@ -253,10 +253,7 @@ static void exfat_iomap_read_end_io(struct bio *bio) static void exfat_iomap_bio_submit_read(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx) { - struct bio *bio = ctx->read_ctx; - - bio->bi_end_io = exfat_iomap_read_end_io; - submit_bio(bio); + iomap_bio_submit_read_endio(iter, ctx, exfat_iomap_read_end_io); } const struct iomap_read_ops exfat_iomap_bio_read_ops = { diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 4504f4633f17..0f31e35567b4 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -78,15 +78,23 @@ u32 iomap_finish_ioend_buffered_read(struct iomap_ioend *ioend) return
__iomap_read_end_io(&ioend->io_bio, ioend->io_error); } -static void iomap_bio_submit_read(const struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx) +void iomap_bio_submit_read_endio(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, bio_end_io_t end_io) { struct bio *bio = ctx->read_ctx; + bio->bi_end_io = end_io; if (iter->iomap.flags & IOMAP_F_INTEGRITY) fs_bio_integrity_alloc(bio); submit_bio(bio); } +EXPORT_SYMBOL_GPL(iomap_bio_submit_read_endio); + +static void iomap_bio_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) +{ + return iomap_bio_submit_read_endio(iter, ctx, iomap_read_end_io); +} static struct bio_set *iomap_read_bio_set(struct iomap_read_folio_ctx *ctx) { @@ -127,7 +135,6 @@ static void iomap_read_alloc_bio(const struct iomap_iter *iter, if (ctx->rac) bio->bi_opf |= REQ_RAHEAD; bio->bi_iter.bi_sector = iomap_sector(iomap, iter->pos); - bio->bi_end_io = iomap_read_end_io; bio_add_folio_nofail(bio, folio, plen, offset_in_folio(folio, iter->pos)); ctx->read_ctx = bio; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 1fbf832ad165..f2bb56506046 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -38,11 +38,9 @@ static void ntfs_iomap_read_end_io(struct bio *bio) } static void ntfs_iomap_bio_submit_read(const struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx) + struct iomap_read_folio_ctx *ctx) { - struct bio *bio = ctx->read_ctx; - bio->bi_end_io = ntfs_iomap_read_end_io; - submit_bio(bio); + iomap_bio_submit_read_endio(iter, ctx, ntfs_iomap_read_end_io); } static const struct iomap_read_ops ntfs_iomap_bio_read_ops = { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index c43101cc064d..0c9bd669117d 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -608,10 +608,7 @@ static void ntfs_iomap_read_end_io(struct bio *bio) static void ntfs_iomap_bio_submit_read(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx) { - struct bio *bio = ctx->read_ctx; - - bio->bi_end_io = 
ntfs_iomap_read_end_io; - submit_bio(bio); + iomap_bio_submit_read_endio(iter, ctx, ntfs_iomap_read_end_io); } static const struct iomap_read_ops ntfs_iomap_bio_read_ops = { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2a0c54256e93..51293b6f331f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -764,8 +764,7 @@ xfs_bio_submit_read( /* defer read completions to the ioend workqueue */ iomap_init_ioend(iter->inode, bio, ctx->read_ctx_file_offset, 0); - bio->bi_end_io = xfs_end_bio; - submit_bio(bio); + iomap_bio_submit_read_endio(iter, ctx, xfs_end_bio); } static const struct iomap_read_ops xfs_iomap_read_ops = { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3582ed1fe236..56b43d594e6e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -622,6 +622,8 @@ extern struct bio_set iomap_ioend_bioset; #ifdef CONFIG_BLOCK int iomap_bio_read_folio_range(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen); +void iomap_bio_submit_read_endio(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, bio_end_io_t end_io); extern const struct iomap_read_ops iomap_bio_read_ops; -- cgit v1.2.3 From 3372eb0384b791faf133806da287819f5bfaad76 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Jun 2026 14:17:39 +0200 Subject: fuse: call fuse_send_readpages explicitly from fuse_readahead Move the call to fuse_send_readpages from the iomap ->submit_read method to the fuse readahead implementation. fuse_read_folio() does not need to call fuse_send_readpages() because it always does reads synchronously (the iomap->submit_read method for this was a no-op since data->ia is always NULL for fuse_read_folio()). This prepares for an iomap fix that will call ->submit_read after each iomap. Signed-off-by: Joanne Koong Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260629121750.3392300-3-hch@lst.de Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner (Amutable) --- fs/fuse/file.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e052a0d44dee..ceada75310b8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -981,19 +981,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, return ret; } -static void fuse_iomap_submit_read(const struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx) -{ - struct fuse_fill_read_data *data = ctx->read_ctx; - - if (data->ia) - fuse_send_readpages(data->ia, data->file, data->nr_bytes, - data->fc->async_read); -} - static const struct iomap_read_ops fuse_iomap_read_ops = { .read_folio_range = fuse_iomap_read_folio_range_async, - .submit_read = fuse_iomap_submit_read, }; static int fuse_read_folio(struct file *file, struct folio *folio) @@ -1116,6 +1105,9 @@ static void fuse_readahead(struct readahead_control *rac) return; iomap_readahead(&fuse_iomap_ops, &ctx, NULL); + if (data.ia) + fuse_send_readpages(data.ia, data.file, data.nr_bytes, + fc->async_read); } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) -- cgit v1.2.3 From c1fb97d31782f5a8c66d127624626accbb0dd8bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2026 14:17:40 +0200 Subject: iomap: submit read bio after each extent Currently the iomap buffered read path tries to build up read context (i.e. bios for the typical block based case) over multiple iomaps as long as the sector matches. This does not take into account files that can map to multiple different devices. While this could be fixed by a bdev check in iomap_bio_read_folio_range, the building up of I/O over iomaps actually was a problem for the not yet merged ext2 iomap port, as that does want to send out I/O at the end of an indirect block mapped range. So instead of adding more checks move over to a model where a bio only spans a single iomap. 
Change ->submit_read to be called after each iteration so that the bio based users submit the bio after each iomap. Fuse is unchanged because the previous commit stopped using ->submit_read for it. Fixes: dfeab2e95a75 ("erofs: add multiple device support") Reported-by: Kelu Ye Reported-by: Yifan Zhao Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260629121750.3392300-4-hch@lst.de Tested-by: Yifan Zhao Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner (Amutable) --- fs/iomap/bio.c | 2 ++ fs/iomap/buffered-io.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 0f31e35567b4..dc8ac7e370a5 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -87,6 +87,8 @@ void iomap_bio_submit_read_endio(const struct iomap_iter *iter, if (iter->iomap.flags & IOMAP_F_INTEGRITY) fs_bio_integrity_alloc(bio); submit_bio(bio); + + ctx->read_ctx = NULL; } EXPORT_SYMBOL_GPL(iomap_bio_submit_read_endio); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8d4806dc46d4..276720bc18dc 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -642,12 +642,12 @@ void iomap_read_folio(const struct iomap_ops *ops, fsverity_readahead(ctx->vi, folio->index, folio_nr_pages(folio)); - while ((ret = iomap_iter(&iter, ops)) > 0) + while ((ret = iomap_iter(&iter, ops)) > 0) { iter.status = iomap_read_folio_iter(&iter, ctx, &bytes_submitted); - - if (ctx->read_ctx && ctx->ops->submit_read) - ctx->ops->submit_read(&iter, ctx); + if (ctx->read_ctx && ctx->ops->submit_read) + ctx->ops->submit_read(&iter, ctx); + } if (ctx->cur_folio) iomap_read_end(ctx->cur_folio, bytes_submitted); @@ -718,12 +718,12 @@ void iomap_readahead(const struct iomap_ops *ops, fsverity_readahead(ctx->vi, readahead_index(rac), readahead_count(rac)); - while (iomap_iter(&iter, ops) > 0) + while (iomap_iter(&iter, ops) > 0) { iter.status = iomap_readahead_iter(&iter, ctx, &cur_bytes_submitted); - - if 
(ctx->read_ctx && ctx->ops->submit_read) - ctx->ops->submit_read(&iter, ctx); + if (ctx->read_ctx && ctx->ops->submit_read) + ctx->ops->submit_read(&iter, ctx); + } if (ctx->cur_folio) iomap_read_end(ctx->cur_folio, cur_bytes_submitted); -- cgit v1.2.3 From 5c6ce05e406520290c1d89da97fb3cd70c09137d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Jul 2026 09:23:02 +0100 Subject: netfs: Fix barriering when walking subrequest list Fix the barriering used when walking the subrequest list in retry as there's a possibility of seeing a subreq that's just been added by the application thread. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Link: https://sashiko.dev/#/patchset/20260608145432.681865-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/138807.1782980582@warthog.procyon.org.uk Reviewed-by: Paulo Alcantara (Red Hat) cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/netfs/read_retry.c | 7 ++++++- fs/netfs/write_retry.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index f59a70f3a086..2b42758e01ec 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -98,7 +98,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) goto abandon; } - list_for_each_continue(next, &stream->subrequests) { + for (;;) { + /* Read pointer to subreq before reading subreq state. 
*/ + next = smp_load_acquire(&next->next); + if (next == &stream->subrequests) + break; + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); if (subreq->start + subreq->transferred != start + len || test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index 32735abfa03f..058bc7a166a5 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -72,7 +72,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) return; - list_for_each_continue(next, &stream->subrequests) { + for (;;) { + /* Read pointer to subreq before reading subreq state. */ + next = smp_load_acquire(&next->next); + if (next == &stream->subrequests) + break; + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); if (subreq->start + subreq->transferred != start + len || test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || -- cgit v1.2.3