diff options
Diffstat (limited to 'fs')
148 files changed, 2436 insertions, 1271 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index d3aefbec4de6..34c115d7c250 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -75,17 +75,4 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) int v9fs_open_to_dotl_flags(int flags); -static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) -{ - /* - * 32-bit need the lock, concurrent updates could break the - * sequences and make i_size_read() loop forever. - * 64-bit updates are atomic and can skip the locking. - */ - if (sizeof(i_size) > sizeof(long)) - spin_lock(&inode->i_lock); - i_size_write(inode, i_size); - if (sizeof(i_size) > sizeof(long)) - spin_unlock(&inode->i_lock); -} #endif diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d1508b1fe109..f468acb8ee7d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1141,11 +1141,13 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - v9inode->netfs.remote_i_size = stat->length; + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, stat->length); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) - v9fs_i_size_write(inode, stat->length); + i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ inode->i_blocks = (stat->length + 512 - 1) >> 9; + spin_unlock(&inode->i_lock); v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 71796a89bcf4..141fb54db65d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -634,10 +634,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - v9inode->netfs.remote_i_size = stat->st_size; + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, stat->st_size); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) - v9fs_i_size_write(inode, stat->st_size); + i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; + spin_unlock(&inode->i_lock); } else { if (stat->st_result_mask & P9_STATS_ATIME) { inode_set_atime(inode, stat->st_atime_sec, @@ -662,13 +664,15 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } + spin_lock(&inode->i_lock); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) { - v9inode->netfs.remote_i_size = stat->st_size; - v9fs_i_size_write(inode, stat->st_size); + netfs_write_remote_i_size(inode, stat->st_size); + i_size_write(inode, stat->st_size); } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; + spin_unlock(&inode->i_lock); } if (stat->st_result_mask & P9_STATS_GEN) inode->i_generation = stat->st_gen; diff --git a/fs/afs/Makefile b/fs/afs/Makefile index b49b8fe682f3..0d8f1982d596 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -30,6 +30,7 @@ kafs-y := \ server.o \ server_list.o \ super.o \ + symlink.o \ validation.o \ vlclient.o \ vl_alias.o \ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index aaaa55878ffd..498b99ccdf0e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -44,6 +44,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +static int afs_dir_writepages(struct address_space *mapping, + struct writeback_control *wbc); const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -68,7 +70,7 @@ const struct inode_operations afs_dir_inode_operations = { }; const struct address_space_operations afs_dir_aops = { - .writepages = afs_single_writepages, + .writepages = afs_dir_writepages, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -233,22 +235,13 @@ static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file) struct iov_iter iter; ssize_t ret; loff_t i_size; - bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && - !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); i_size = i_size_read(&dvnode->netfs.inode); - if (is_dir) { - if (i_size < AFS_DIR_BLOCK_SIZE) - return afs_bad(dvnode, afs_file_error_dir_small); - if (i_size > AFS_DIR_BLOCK_SIZE * 1024) { - trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - return -EFBIG; - } - } else { - if (i_size > AFSPATHMAX) { - trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - return -EFBIG; - } + if (i_size < AFS_DIR_BLOCK_SIZE) + return afs_bad(dvnode, afs_file_error_dir_small); + if (i_size > AFS_DIR_BLOCK_SIZE * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; } /* Expand the storage. TODO: Shrink the storage too. */ @@ -277,24 +270,18 @@ static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file) * buffer. */ ret = -ESTALE; - } else if (is_dir) { + } else { int ret2 = afs_dir_check(dvnode); if (ret2 < 0) ret = ret2; - } else if (i_size < folioq_folio_size(dvnode->directory, 0)) { - /* NUL-terminate a symlink. */ - char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0); - - symlink[i_size] = 0; - kunmap_local(symlink); } } return ret; } -ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) +static ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) { ssize_t ret; @@ -1763,13 +1750,20 @@ error: return ret; } +static void afs_symlink_put(struct afs_operation *op) +{ + kfree(op->create.symlink); + op->create.symlink = NULL; + afs_create_put(op); +} + static const struct afs_operation_ops afs_symlink_operation = { .issue_afs_rpc = afs_fs_symlink, .issue_yfs_rpc = yfs_fs_symlink, .success = afs_create_success, .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, - .put = afs_create_put, + .put = afs_symlink_put, }; /* @@ -1779,7 +1773,9 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content) { struct afs_operation *op; + struct afs_symlink *symlink; struct afs_vnode *dvnode = AFS_FS_I(dir); + size_t clen = strlen(content); int ret; _enter("{%llx:%llu},{%pd},%s", @@ -1791,12 +1787,20 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto error; ret = -EINVAL; - if (strlen(content) >= AFSPATHMAX) + if (clen >= AFSPATHMAX) + goto error; + + ret = -ENOMEM; + symlink = kmalloc_flex(struct afs_symlink, content, clen + 1, GFP_KERNEL); + if (!symlink) goto error; + refcount_set(&symlink->ref, 1); + memcpy(symlink->content, content, clen + 1); op = afs_alloc_operation(NULL, dvnode->volume); if (IS_ERR(op)) { ret = PTR_ERR(op); + kfree(symlink); goto error; } @@ -1808,7 +1812,7 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, op->dentry = dentry; op->ops = &afs_symlink_operation; op->create.reason = afs_edit_dir_for_symlink; - op->create.symlink = content; + op->create.symlink = symlink; op->mtime = current_time(dir); ret = afs_do_sync_operation(op); afs_dir_unuse_cookie(dvnode, ret); @@ -2192,28 +2196,33 @@ error: } /* - * Write the file contents to the cache as a single blob. + * Write the directory contents to the cache as a single blob. */ -int afs_single_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int afs_dir_writepages(struct address_space *mapping, + struct writeback_control *wbc) { struct afs_vnode *dvnode = AFS_FS_I(mapping->host); struct iov_iter iter; - bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && - !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); int ret = 0; /* Need to lock to prevent the folio queue and folios from being thrown * away. */ - down_read(&dvnode->validate_lock); + if (!down_read_trylock(&dvnode->validate_lock)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + /* The VFS will have undirtied the inode. */ + netfs_single_mark_inode_dirty(&dvnode->netfs.inode); + return 0; + } + down_read(&dvnode->validate_lock); + } - if (is_dir ? - test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) : - atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) { + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size_read(&dvnode->netfs.inode)); ret = netfs_writeback_single(mapping, wbc, &iter); + if (ret == 1) + ret = 0; /* Skipped write due to lock conflict. */ } up_read(&dvnode->validate_lock); diff --git a/fs/afs/file.c b/fs/afs/file.c index 85696ac984cc..0467742bfeee 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -427,21 +427,35 @@ static void afs_free_request(struct netfs_io_request *rreq) afs_put_wb_key(rreq->netfs_priv2); } -static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +/* + * Set the file size and block count, taking ->cb_lock and ->i_lock to maintain + * coherency and prevent 64-bit tearing on 32-bit arches. + * + * Also, estimate the number of 512 bytes blocks used, rounded up to nearest 1K + * for consistency with other AFS clients. + */ +void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size) { - struct afs_vnode *vnode = AFS_FS_I(inode); + struct inode *inode = &vnode->netfs.inode; loff_t i_size; write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); if (new_i_size > i_size) { - i_size_write(&vnode->netfs.inode, new_i_size); - inode_set_bytes(&vnode->netfs.inode, new_i_size); + i_size_write(inode, new_i_size); + inode_set_bytes(inode, round_up(new_i_size, 1024)); } + spin_unlock(&inode->i_lock); write_sequnlock(&vnode->cb_lock); fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +{ + afs_set_i_size(AFS_FS_I(inode), new_i_size); +} + static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { struct afs_vnode *vnode = AFS_FS_I(wreq->inode); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 95494d5f2b8a..a2ffd60889f8 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -886,7 +886,7 @@ void afs_fs_symlink(struct afs_operation *op) namesz = name->len; padsz = (4 - (namesz & 3)) & 3; - c_namesz = strlen(op->create.symlink); + c_namesz = strlen(op->create.symlink->content); c_padsz = (4 - (c_namesz & 3)) & 3; reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4); @@ -910,7 +910,7 @@ void afs_fs_symlink(struct afs_operation *op) bp = (void *) bp + padsz; } *bp++ = htonl(c_namesz); - memcpy(bp, op->create.symlink, c_namesz); + memcpy(bp, op->create.symlink->content, c_namesz); bp = (void *) bp + c_namesz; if (c_padsz > 0) { memset(bp, 0, c_padsz); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index a5173434f786..3f48458694ba 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,96 +25,6 @@ #include "internal.h" #include "afs_fs.h" -void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op) -{ - size_t size = strlen(op->create.symlink) + 1; - size_t dsize = 0; - char *p; - - if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size, - mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0) - return; - - vnode->directory_size = dsize; - p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); - memcpy(p, op->create.symlink, size); - kunmap_local(p); - set_bit(AFS_VNODE_DIR_READ, &vnode->flags); - netfs_single_mark_inode_dirty(&vnode->netfs.inode); -} - -static void afs_put_link(void *arg) -{ - struct folio *folio = virt_to_folio(arg); - - kunmap_local(arg); - folio_put(folio); -} - -const char *afs_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) -{ - struct afs_vnode *vnode = AFS_FS_I(inode); - struct folio *folio; - char *content; - ssize_t ret; - - if (!dentry) { - /* RCU pathwalk. */ - if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode)) - return ERR_PTR(-ECHILD); - goto good; - } - - if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) - goto fetch; - - ret = afs_validate(vnode, NULL); - if (ret < 0) - return ERR_PTR(ret); - - if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && - test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) - goto good; - -fetch: - ret = afs_read_single(vnode, NULL); - if (ret < 0) - return ERR_PTR(ret); - set_bit(AFS_VNODE_DIR_READ, &vnode->flags); - -good: - folio = folioq_folio(vnode->directory, 0); - folio_get(folio); - content = kmap_local_folio(folio, 0); - set_delayed_call(callback, afs_put_link, content); - return content; -} - -int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen) -{ - DEFINE_DELAYED_CALL(done); - const char *content; - int len; - - content = afs_get_link(dentry, d_inode(dentry), &done); - if (IS_ERR(content)) { - do_delayed_call(&done); - return PTR_ERR(content); - } - - len = umin(strlen(content), buflen); - if (copy_to_user(buffer, content, len)) - len = -EFAULT; - do_delayed_call(&done); - return len; -} - -static const struct inode_operations afs_symlink_inode_operations = { - .get_link = afs_get_link, - .readlink = afs_readlink, -}; - static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) { static unsigned long once_only; @@ -214,7 +124,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_mode = S_IFLNK | status->mode; inode->i_op = &afs_symlink_inode_operations; } - inode->i_mapping->a_ops = &afs_dir_aops; + inode->i_mapping->a_ops = &afs_symlink_aops; inode_nohighmem(inode); mapping_set_release_always(inode->i_mapping); break; @@ -224,7 +134,8 @@ static int afs_inode_init_from_status(struct afs_operation *op, return afs_protocol_error(NULL, afs_eproto_file_type); } - afs_set_i_size(vnode, status->size); + i_size_write(inode, status->size); + inode_set_bytes(inode, status->size); afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; @@ -253,7 +164,8 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->netfs.inode; + struct netfs_inode *ictx = &vnode->netfs; + struct inode *inode = &ictx->inode; struct timespec64 t; umode_t mode; bool unexpected_jump = false; @@ -336,6 +248,8 @@ static void afs_apply_status(struct afs_operation *op, } if (data_changed) { + unsigned long long zero_point, size = status->size; + inode_set_iversion_raw(inode, status->data_version); /* Only update the size if the data version jumped. If the @@ -343,16 +257,25 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ - vnode->netfs.remote_i_size = status->size; - if (change_size || status->size > i_size_read(inode)) { - afs_set_i_size(vnode, status->size); + spin_lock(&inode->i_lock); + + if (change_size || size > i_size_read(inode)) { + /* We can read the sizes directly as we hold i_lock. */ + zero_point = ictx->_zero_point; + if (unexpected_jump) - vnode->netfs.zero_point = status->size; + zero_point = size; + netfs_write_sizes(inode, size, size, zero_point); + inode_set_bytes(inode, size); inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); + } else { + netfs_write_remote_i_size(inode, size); } + spin_unlock(&inode->i_lock); + if (op->ops == &afs_fetch_data_operation) - op->fetch.subreq->rreq->i_size = status->size; + op->fetch.subreq->rreq->i_size = size; } } @@ -709,7 +632,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, * it, but we need to give userspace the server's size. */ if (S_ISDIR(inode->i_mode)) - stat->size = vnode->netfs.remote_i_size; + stat->size = netfs_read_remote_i_size(inode); } while (read_seqretry(&vnode->cb_lock, seq)); return 0; @@ -756,12 +679,14 @@ void afs_evict_inode(struct inode *inode) .range_end = LLONG_MAX, }; - afs_single_writepages(inode->i_mapping, &wbc); + inode->i_mapping->a_ops->writepages(inode->i_mapping, &wbc); } netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); netfs_free_folioq_buffer(vnode->directory); + if (vnode->symlink) + afs_evict_symlink(vnode); afs_set_cache_aux(vnode, &aux); netfs_clear_inode_writeback(inode, &aux); @@ -889,7 +814,7 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->netfs.remote_i_size) { + attr->ia_size > netfs_read_remote_i_size(inode)) { truncate_setsize(inode, attr->ia_size); netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 599353c33337..0b72a8566299 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -710,6 +710,7 @@ struct afs_vnode { #define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ struct folio_queue *directory; /* Directory contents */ + struct afs_symlink __rcu *symlink; /* Symlink content */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ @@ -777,6 +778,15 @@ struct afs_permits { }; /* + * Copy of symlink content for normal use. + */ +struct afs_symlink { + struct rcu_head rcu; + refcount_t ref; + char content[]; +}; + +/* * Error prioritisation and accumulation. */ struct afs_error { @@ -887,7 +897,7 @@ struct afs_operation { struct { int reason; /* enum afs_edit_dir_reason */ mode_t mode; - const char *symlink; + struct afs_symlink *symlink; } create; struct { bool need_rehash; @@ -1098,13 +1108,10 @@ extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; -ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) __acquires(&dvnode->validate_lock); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); -int afs_single_writepages(struct address_space *mapping, - struct writeback_control *wbc); /* * dir_edit.c @@ -1157,6 +1164,7 @@ extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); void afs_fetch_data_async_rx(struct work_struct *work); void afs_fetch_data_immediate_cancel(struct afs_call *call); +void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size); /* * flock.c @@ -1246,10 +1254,6 @@ extern void afs_fs_probe_cleanup(struct afs_net *); */ extern const struct afs_operation_ops afs_fetch_status_operation; -void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); -const char *afs_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback); -int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); @@ -1600,6 +1604,21 @@ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* + * symlink.c + */ +extern const struct inode_operations afs_symlink_inode_operations; +extern const struct address_space_operations afs_symlink_aops; + +void afs_invalidate_symlink(struct afs_vnode *vnode); +void afs_evict_symlink(struct afs_vnode *vnode); +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback); +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); +int afs_symlink_writepages(struct address_space *mapping, + struct writeback_control *wbc); + +/* * validation.c */ bool afs_check_validity(const struct afs_vnode *vnode); @@ -1759,16 +1778,6 @@ static inline void afs_update_dentry_version(struct afs_operation *op, } /* - * Set the file size and block count. Estimate the number of 512 bytes blocks - * used, rounded up to nearest 1K for consistency with other AFS clients. - */ -static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) -{ - i_size_write(&vnode->netfs.inode, size); - vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; -} - -/* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). diff --git a/fs/afs/symlink.c b/fs/afs/symlink.c new file mode 100644 index 000000000000..ed5868369f37 --- /dev/null +++ b/fs/afs/symlink.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS filesystem symbolic link handling + * + * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/iov_iter.h> +#include "internal.h" + +static void afs_put_symlink(struct afs_symlink *symlink) +{ + if (refcount_dec_and_test(&symlink->ref)) + kfree_rcu(symlink, rcu); +} + +static void afs_replace_symlink(struct afs_vnode *vnode, struct afs_symlink *symlink) +{ + struct afs_symlink *old; + + old = rcu_replace_pointer(vnode->symlink, symlink, + lockdep_is_held(&vnode->validate_lock)); + if (old) + afs_put_symlink(old); +} + +/* + * In the event that a third-party update of a symlink occurs, dispose of the + * copy of the old contents. Called under ->validate_lock. + */ +void afs_invalidate_symlink(struct afs_vnode *vnode) +{ + afs_replace_symlink(vnode, NULL); +} + +/* + * Dispose of a symlink copy during inode deletion. + */ +void afs_evict_symlink(struct afs_vnode *vnode) +{ + struct afs_symlink *old; + + old = rcu_replace_pointer(vnode->symlink, NULL, true); + if (old) + afs_put_symlink(old); + +} + +/* + * Set up a locally created symlink inode for immediate write to the cache. + */ +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op) +{ + struct afs_symlink *symlink = op->create.symlink; + size_t dsize = 0; + size_t size = strlen(symlink->content) + 1; + char *p; + + rcu_assign_pointer(vnode->symlink, symlink); + op->create.symlink = NULL; + + if (!fscache_cookie_enabled(netfs_i_cookie(&vnode->netfs))) + return; + + if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size, + mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0) + return; + + vnode->directory_size = dsize; + p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); + memcpy(p, symlink->content, size); + kunmap_local(p); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); +} + +/* + * Read a symlink in a single download. + */ +static ssize_t afs_do_read_symlink(struct afs_vnode *vnode) +{ + struct afs_symlink *symlink; + struct iov_iter iter; + ssize_t ret; + loff_t i_size; + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size > PAGE_SIZE - 1) { + trace_afs_file_error(vnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + + if (!vnode->directory) { + size_t cur_size = 0; + + ret = netfs_alloc_folioq_buffer(NULL, + &vnode->directory, &cur_size, PAGE_SIZE, + mapping_gfp_mask(vnode->netfs.inode.i_mapping)); + vnode->directory_size = PAGE_SIZE - 1; + if (ret < 0) + return ret; + } + + iov_iter_folio_queue(&iter, ITER_DEST, vnode->directory, 0, 0, PAGE_SIZE); + + /* AFS requires us to perform the read of a symlink as a single unit to + * avoid issues with the content being changed between reads. + */ + ret = netfs_read_single(&vnode->netfs.inode, NULL, &iter); + if (ret >= 0) { + i_size = ret; + if (i_size > PAGE_SIZE - 1) { + trace_afs_file_error(vnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + vnode->directory_size = i_size; + + /* Copy the symlink. */ + symlink = kmalloc_flex(struct afs_symlink, content, i_size + 1, + GFP_KERNEL); + if (!symlink) + return -ENOMEM; + + refcount_set(&symlink->ref, 1); + symlink->content[i_size] = 0; + + const char *s = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); + + memcpy(symlink->content, s, i_size); + kunmap_local(s); + + afs_replace_symlink(vnode, symlink); + } + + if (!fscache_cookie_enabled(netfs_i_cookie(&vnode->netfs))) { + netfs_free_folioq_buffer(vnode->directory); + vnode->directory = NULL; + vnode->directory_size = 0; + } + + return ret; +} + +static ssize_t afs_read_symlink(struct afs_vnode *vnode) +{ + ssize_t ret; + + fscache_use_cookie(afs_vnode_cache(vnode), false); + ret = afs_do_read_symlink(vnode); + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); + return ret; +} + +static void afs_put_link(void *arg) +{ + afs_put_symlink(arg); +} + +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct afs_symlink *symlink; + struct afs_vnode *vnode = AFS_FS_I(inode); + ssize_t ret; + + if (!dentry) { + /* RCU pathwalk. */ + symlink = rcu_dereference(vnode->symlink); + if (!symlink || !afs_check_validity(vnode)) + return ERR_PTR(-ECHILD); + set_delayed_call(callback, NULL, NULL); + return symlink->content; + } + + if (vnode->symlink) { + ret = afs_validate(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + + down_read(&vnode->validate_lock); + if (vnode->symlink) + goto good; + up_read(&vnode->validate_lock); + } + + if (down_write_killable(&vnode->validate_lock) < 0) + return ERR_PTR(-ERESTARTSYS); + if (!vnode->symlink) { + ret = afs_read_symlink(vnode); + if (ret < 0) { + up_write(&vnode->validate_lock); + return ERR_PTR(ret); + } + } + + downgrade_write(&vnode->validate_lock); + +good: + symlink = rcu_dereference_protected(vnode->symlink, + lockdep_is_held(&vnode->validate_lock)); + refcount_inc(&symlink->ref); + up_read(&vnode->validate_lock); + + set_delayed_call(callback, afs_put_link, symlink); + return symlink->content; +} + +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + DEFINE_DELAYED_CALL(done); + const char *content; + int len; + + content = afs_get_link(dentry, d_inode(dentry), &done); + if (IS_ERR(content)) { + do_delayed_call(&done); + return PTR_ERR(content); + } + + len = umin(strlen(content), buflen); + if (copy_to_user(buffer, content, len)) + len = -EFAULT; + do_delayed_call(&done); + return len; +} + +/* + * Write the symlink contents to the cache as a single blob. We then throw + * away the page we used to receive it. + */ +int afs_symlink_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + int ret = 0; + + if (!down_read_trylock(&vnode->validate_lock)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + /* The VFS will have undirtied the inode. */ + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + return 0; + } + down_read(&vnode->validate_lock); + } + + if (vnode->directory && + atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) { + iov_iter_folio_queue(&iter, ITER_SOURCE, vnode->directory, 0, 0, + i_size_read(&vnode->netfs.inode)); + ret = netfs_writeback_single(mapping, wbc, &iter); + } + + if (ret == 0) { + mutex_lock(&vnode->netfs.wb_lock); + netfs_free_folioq_buffer(vnode->directory); + vnode->directory = NULL; + vnode->directory_size = 0; + mutex_unlock(&vnode->netfs.wb_lock); + } else if (ret == 1) { + ret = 0; /* Skipped write due to lock conflict. */ + } + + up_read(&vnode->validate_lock); + return ret; +} + +const struct inode_operations afs_symlink_inode_operations = { + .get_link = afs_get_link, + .readlink = afs_readlink, +}; + +const struct address_space_operations afs_symlink_aops = { + .writepages = afs_symlink_writepages, +}; diff --git a/fs/afs/validation.c b/fs/afs/validation.c index 0ba8336c9025..e997563af658 100644 --- a/fs/afs/validation.c +++ b/fs/afs/validation.c @@ -465,11 +465,17 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->cb_ro_snapshot = cb_ro_snapshot; vnode->cb_scrub = cb_scrub; - /* if the vnode's data version number changed then its contents are - * different */ + /* If the vnode's data version number changed then its contents are + * different. Note that afs_apply_status() doesn't set ZAP_DATA on + * directories. + */ zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - if (zap) - afs_zap_data(vnode); + if (zap) { + if (S_ISREG(vnode->netfs.inode.i_mode)) + afs_zap_data(vnode); + else if (S_ISLNK(vnode->netfs.inode.i_mode)) + afs_invalidate_symlink(vnode); + } up_write(&vnode->validate_lock); _leave(" = 0"); return 0; diff --git a/fs/afs/write.c b/fs/afs/write.c index fcfed9d24e0a..7f34b939706a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -142,7 +142,7 @@ static void afs_issue_write_worker(struct work_struct *work) afs_begin_vnode_operation(op); op->store.write_iter = &subreq->io_iter; - op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size); + op->store.i_size = umax(pos + len, netfs_read_remote_i_size(&vnode->netfs.inode)); op->mtime = inode_get_mtime(&vnode->netfs.inode); afs_wait_for_operation(op); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 24fb562ebd33..d941179730a9 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -960,7 +960,7 @@ void yfs_fs_symlink(struct afs_operation *op) _enter(""); - contents_sz = strlen(op->create.symlink); + contents_sz = strlen(op->create.symlink->content); call = afs_alloc_flat_call(op->net, &yfs_RXYFSSymlink, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -981,7 +981,7 @@ void yfs_fs_symlink(struct afs_operation *op) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &dvp->fid); bp = xdr_encode_name(bp, name); - bp = xdr_encode_string(bp, op->create.symlink, contents_sz); + bp = xdr_encode_string(bp, op->create.symlink->content, contents_sz); bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime); yfs_check_req(call, bp); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e6f5a17a13e3..b611c64119db 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2412,29 +2412,25 @@ static struct btrfs_block_group *btrfs_create_block_group( */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - u64 start = 0; + struct rb_node *node; int ret = 0; - while (1) { + /* + * This is called during mount from btrfs_read_block_groups(), before + * any background threads are started, so no concurrent writers can + * modify the mapping_tree. No lock is needed here. + */ + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { struct btrfs_chunk_map *map; struct btrfs_block_group *bg; - /* - * btrfs_find_chunk_map() will return the first chunk map - * intersecting the range, so setting @length to 1 is enough to - * get the first chunk. - */ - map = btrfs_find_chunk_map(fs_info, start, 1); - if (!map) - break; - + map = rb_entry(node, struct btrfs_chunk_map, rb_node); bg = btrfs_lookup_block_group(fs_info, map->start); if (unlikely(!bg)) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); ret = -EUCLEAN; - btrfs_free_chunk_map(map); break; } if (unlikely(bg->start != map->start || bg->length != map->chunk_len || @@ -2447,12 +2443,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; - btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b2393a48a8fe..a02b62e0a8f3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -407,22 +407,18 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - /* - * Avoid direct reclaim when the caller does not allow it. Since - * add_ra_bio_pages() is always speculative, suppress allocation warnings - * in either case. - */ + /* Avoid direct reclaim when the caller does not allow it. */ + constraint_gfp = ~__GFP_FS; + cache_gfp = GFP_NOFS | __GFP_NOWARN; if (!direct_reclaim) { - constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN; - cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; - } else { - constraint_gfp = (~__GFP_FS) | __GFP_NOWARN; - cache_gfp = GFP_NOFS | __GFP_NOWARN; + constraint_gfp &= ~__GFP_DIRECT_RECLAIM; + cache_gfp &= ~__GFP_DIRECT_RECLAIM; } while (cur < compressed_end) { pgoff_t page_end; pgoff_t pg_index = cur >> PAGE_SHIFT; + gfp_t masked_constraint_gfp; u32 add_size; if (pg_index > end_index) @@ -449,8 +445,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), - 0, NULL); + /* + * Since add_ra_bio_pages() is always speculative, suppress + * allocation warnings. + */ + masked_constraint_gfp = mapping_gfp_constraint(mapping, constraint_gfp); + masked_constraint_gfp |= __GFP_NOWARN; + + folio = filemap_alloc_folio(masked_constraint_gfp, 0, NULL); if (!folio) break; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8a11be02eeb9..c0a30bb213d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4686,6 +4686,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, free_extent_buffer_stale(eb); } } + btrfs_extent_io_tree_release(dirty_pages); } static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9284c0a81bef..6b79bff241f2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1246,7 +1246,9 @@ static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, write_unlock(&tree->lock); next: from = btrfs_ino(inode) + 1; - cond_resched_lock(&root->inodes.xa_lock); + xa_unlock(&root->inodes); + cond_resched(); + xa_lock(&root->inodes); } xa_unlock(&root->inodes); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cf1cb5c4db75..8c171ed07008 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -633,7 +633,7 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - return 0; + goto mark_dirty; } } @@ -661,7 +661,7 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - return 0; + goto mark_dirty; } } @@ -788,7 +788,12 @@ again: } } - return 0; +mark_dirty: + ret = btrfs_inode_set_file_extent_range(inode, start, end - start); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; } /* diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a4758d94b32e..a8aa086a4df8 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -155,6 +155,7 @@ enum { BTRFS_FS_LOG_RECOVERING, BTRFS_FS_OPEN, BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_SQUOTA_ENABLING, BTRFS_FS_UPDATE_UUID_TREE_GEN, BTRFS_FS_CREATING_FREE_SPACE_TREE, BTRFS_FS_BTREE_ERR, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 906d5c21ebc4..1ca1cbdf25bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9299,10 +9299,38 @@ next: if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { + u64 range_start; + u64 range_end; + if (cur_offset > actual_len) i_size = actual_len; else i_size = cur_offset; + + /* + * Make sure the file_extent_tree covers the entire + * range [old_i_size, new_i_size) before we update + * disk_i_size. Without this, a previous KEEP_SIZE + * prealloc that extended past i_size (and was lost + * across umount/mount because file_extent_tree is + * only populated up to round_up(i_size) on inode + * load) can leave a gap inside this range. That gap + * would cause btrfs_inode_safe_disk_i_size_write() + * (via find_contiguous_extent_bit() starting at 0) + * to truncate disk_i_size to the start of the gap, + * making the persisted size smaller than i_size. + */ + range_start = round_down(inode->i_size, fs_info->sectorsize); + range_end = round_up(i_size, fs_info->sectorsize); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + range_start, range_end - range_start); + if (ret) { + btrfs_abort_transaction(trans, ret); + if (own_trans) + btrfs_end_transaction(trans); + break; + } + i_size_write(inode, i_size); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } @@ -10671,7 +10699,9 @@ struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) break; from = btrfs_ino(inode) + 1; - cond_resched_lock(&root->inodes.xa_lock); + xa_unlock(&root->inodes); + cond_resched(); + xa_lock(&root->inodes); } xa_unlock(&root->inodes); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cdf736d3a4e5..6838faceb6d5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1107,7 +1107,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); - btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); + /* + * Set the enable generation to the next transaction, as we cannot + * ensure that extents written during this transaction will see any + * state we have set here. So we should treat all extents of the + * transaction as coming in before squotas was enabled. + */ + btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid + 1); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } @@ -1210,7 +1216,15 @@ out_add_root: goto out_free_path; } - fs_info->qgroup_enable_gen = trans->transid; + /* + * Set fs_info->qgroup_enable_gen and BTRFS_FS_SQUOTA_ENABLING + * under the transaction handle. We want to ensure that all extents in + * the next transaction definitely see them. + */ + if (simple) { + fs_info->qgroup_enable_gen = trans->transid + 1; + set_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags); + } mutex_unlock(&fs_info->qgroup_ioctl_lock); /* @@ -1224,9 +1238,15 @@ out_add_root: */ ret = btrfs_commit_transaction(trans); trans = NULL; + mutex_lock(&fs_info->qgroup_ioctl_lock); - if (ret) + if (ret) { + if (simple) { + clear_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags); + fs_info->qgroup_enable_gen = 0; + } goto out_free_path; + } /* * Set quota enabled flag after committing the transaction, to avoid @@ -1236,6 +1256,8 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (simple) + clear_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ @@ -1715,32 +1737,24 @@ out: return ret; } -static bool can_delete_parent_qgroup(struct btrfs_qgroup *qgroup) - +static bool can_delete_parent_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { ASSERT(btrfs_qgroup_level(qgroup->qgroupid)); + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + squota_check_parent_usage(fs_info, qgroup); return list_empty(&qgroup->members); } /* - * Return true if we can delete the squota qgroup and false otherwise. - * - * Rules for whether we can delete: - * - * A subvolume qgroup can be removed iff the subvolume is fully deleted, which - * is iff there is 0 usage in the qgroup. - * - * A higher level qgroup can be removed iff it has no members. - * Note: We audit its usage to warn on inconsitencies without blocking deletion. + * Because a shared extent can outlive its owning subvolume, we cannot delete a + * subvol squota qgroup until all of the extents it owns are gone, even if the + * subvolume itself has been deleted. */ -static bool can_delete_squota_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) +static bool can_delete_squota_subvol_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) { ASSERT(btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); - - if (btrfs_qgroup_level(qgroup->qgroupid) > 0) { - squota_check_parent_usage(fs_info, qgroup); - return can_delete_parent_qgroup(qgroup); - } + ASSERT(btrfs_qgroup_level(qgroup->qgroupid) == 0); return !(qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr); } @@ -1754,14 +1768,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup { struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); - - /* Since squotas cannot be inconsistent, they have special rules for deletion. */ - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) - return can_delete_squota_qgroup(fs_info, qgroup); + int ret; /* For higher level qgroup, we can only delete it if it has no child. */ if (btrfs_qgroup_level(qgroup->qgroupid)) - return can_delete_parent_qgroup(qgroup); + return can_delete_parent_qgroup(fs_info, qgroup); /* * For level-0 qgroups, we can only delete it if it has no subvolume @@ -1777,10 +1788,21 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup return -ENOMEM; /* - * The @ret from btrfs_find_root() exactly matches our definition for - * the return value, thus can be returned directly. + * Any subvol qgroup, regardless of mode, cannot be deleted if the + * subvol still exists. */ - return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); + ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); + /* + * btrfs_find_root returns <0 on error, 0 if found, and >0 if not, + * so the "found" and "error" cases match our desired return values. + */ + if (ret <= 0) + return ret; + + /* Squotas require additional checks, even if the subvol is deleted. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return can_delete_squota_subvol_qgroup(fs_info, qgroup); + return 1; } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) @@ -4922,7 +4944,8 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, u64 num_bytes = delta->num_bytes; const int sign = (delta->is_inc ? 1 : -1); - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE && + !test_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags)) return 0; if (!btrfs_is_fstree(root)) @@ -4934,8 +4957,9 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, root); - if (!qgroup) { - ret = -ENOENT; + if (WARN_ON_ONCE(!qgroup)) { + btrfs_warn(fs_info, "squota failed to find qgroup for root %llu", root); + ret = 0; goto out; } @@ -4944,8 +4968,19 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, list_for_each_entry(qg, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg->excl += num_bytes * sign; - qg->rfer += num_bytes * sign; + ASSERT(qg->excl == qg->rfer); + if (WARN_ON_ONCE(sign < 0 && qg->excl < num_bytes)) { + btrfs_warn(fs_info, + "squota underflow qg %hu/%llu excl %llu num_bytes %llu", + btrfs_qgroup_level(qg->qgroupid), + btrfs_qgroup_subvolid(qg->qgroupid), + qg->excl, num_bytes); + qg->excl = 0; + qg->rfer = 0; + } else { + qg->excl += num_bytes * sign; + qg->rfer += num_bytes * sign; + } qgroup_dirty(fs_info, qg); list_for_each_entry(glist, &qg->groups, next_group) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 248adb785051..194f581b36f3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1293,14 +1293,13 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); - if (ret) return ret; - else if (ret2) + if (ret2) return ret2; - else - return 0; + + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); + return 0; } /* diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1b83ed0e0a63..2937db690b40 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -130,6 +130,8 @@ retry: ret = cachefiles_inject_write_error(); if (ret == 0) { subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL); + if (IS_ERR(subdir)) + ret = PTR_ERR(subdir); } else { end_creating(subdir); subdir = ERR_PTR(ret); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1454760332ff..0a86f672cc09 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1336,6 +1336,7 @@ void ceph_process_folio_batch(struct address_space *mapping, ceph_wbc, folio); if (rc == -ENODATA) { folio_unlock(folio); + folio_put(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } else if (rc == -E2BIG) { @@ -1346,6 +1347,7 @@ void ceph_process_folio_batch(struct address_space *mapping, if (!folio_clear_dirty_for_io(folio)) { doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); folio_unlock(folio); + folio_put(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 4dc9426643e8..053d5bf0c9f0 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -228,12 +228,19 @@ static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, restart: realm = ceph_inode(inode)->i_snap_realm; - if (realm) + if (realm) { ceph_get_snap_realm(mdsc, realm); - else - pr_err_ratelimited_client(cl, - "%p %llx.%llx null i_snap_realm\n", - inode, ceph_vinop(inode)); + } else { + /* + * i_snap_realm is NULL when all caps have been released, e.g. + * after an MDS session rejection. This is a transient state; + * the realm will be restored once caps are re-granted. + * Treat it as "no quota realm found". + */ + doutc(cl, "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + } + while (realm) { bool has_inode; @@ -340,12 +347,19 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, down_read(&mdsc->snap_rwsem); restart: realm = ceph_inode(inode)->i_snap_realm; - if (realm) + if (realm) { ceph_get_snap_realm(mdsc, realm); - else - pr_err_ratelimited_client(cl, - "%p %llx.%llx null i_snap_realm\n", - inode, ceph_vinop(inode)); + } else { + /* + * i_snap_realm is NULL when all caps have been released, e.g. + * after an MDS session rejection. This is a transient state; + * the realm will be restored once caps are re-granted. + * Treat it as "quota not exceeded". + */ + doutc(cl, "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + } + while (realm) { bool has_inode; @@ -496,6 +510,9 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) u64 total = 0, used, free; bool is_updated = false; + if (!ceph_has_realms_with_quotas(d_inode(fsc->sb->s_root))) + return false; + down_read(&mdsc->snap_rwsem); get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, &realm, true); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5f87f62091a1..e773be07f767 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1254,6 +1254,22 @@ retry: ceph_vinop(inode), name, ceph_cap_string(issued)); __build_xattrs(inode); + /* + * __build_xattrs() may have released and reacquired i_ceph_lock, + * during which handle_cap_grant() could have replaced i_xattrs.blob + * with a newer MDS-provided blob and bumped i_xattrs.version. If that + * caused __build_xattrs() to rebuild the rb-tree from the new blob, + * count/names_size/vals_size may now be larger than when + * required_blob_size was computed above. Recompute it here so the + * prealloc_blob size check below reflects the current tree state. + */ + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if (required_blob_size > mdsc->mdsmap->m_max_xattr_size) { + doutc(cl, "sync (size too large): %d > %llu\n", + required_blob_size, mdsc->mdsmap->m_max_xattr_size); + goto do_sync; + } + if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob; @@ -1294,6 +1310,7 @@ retry: do_sync: spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); do_sync_unlocked: if (lock_snap_rwsem) up_read(&mdsc->snap_rwsem); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1c5224cf183e..733c19571f1c 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -191,13 +191,10 @@ static const struct dentry_operations efivarfs_d_ops = { static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) { + struct qstr q = QSTR(name); struct dentry *d; - struct qstr q; int err; - q.name = name; - q.len = strlen(name); - err = efivarfs_d_hash(parent, &q); if (err) return ERR_PTR(err); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 41e311019a25..df7ea019526d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -89,13 +89,11 @@ static int erofs_init_inode_xattrs(struct inode *inode) vi->xattr_isize - sizeof(struct erofs_xattr_ibody_header)) { erofs_err(sb, "invalid h_shared_count %u @ nid %llu", vi->xattr_shared_count, vi->nid); - erofs_put_metabuf(&buf); ret = -EFSCORRUPTED; goto out_unlock; } vi->xattr_shared_xattrs = kmalloc_objs(uint, vi->xattr_shared_count); if (!vi->xattr_shared_xattrs) { - erofs_put_metabuf(&buf); ret = -ENOMEM; goto out_unlock; } @@ -112,12 +110,12 @@ static int erofs_init_inode_xattrs(struct inode *inode) } vi->xattr_shared_xattrs[i] = le32_to_cpu(*xattr_id); } - erofs_put_metabuf(&buf); /* paired with smp_mb() at the beginning of the function. */ smp_mb(); set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); out_unlock: + erofs_put_metabuf(&buf); clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); return ret; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 43bb5a6a9924..c6240dccbb0f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1455,6 +1455,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; if (z_erofs_in_atomic()) { + /* See `sync_decompress` in sysfs-fs-erofs for more details */ + if (sbi->sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; @@ -1471,9 +1474,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, #else queue_work(z_erofs_workqueue, &io->u.work); #endif - /* See `sync_decompress` in sysfs-fs-erofs for more details */ - if (sbi->sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) - sbi->sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; return; } gfp_flag = memalloc_noio_save(); @@ -1509,8 +1509,15 @@ repeat: DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); folio = page_folio(zbv.page); - /* For preallocated managed folios, add them to page cache here */ + /* + * Preallocated folios are added to the managed cache here rather than + * in z_erofs_bind_cache() in order to keep these folios locked in + * increasing (physical) address order. + * Clear folio->private before these folios become visible to others in + * the managed cache to avoid duplicate additions for unaligned extents. + */ if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) { + folio->private = NULL; tocache = true; goto out_tocache; } @@ -1546,14 +1553,8 @@ repeat: } return; } - /* - * Already linked with another pcluster, which only appears in - * crafted images by fuzzers for now. But handle this anyway. - */ - tocache = false; /* use temporary short-lived pages */ } else { DBG_BUGON(1); /* referenced managed folios can't be truncated */ - tocache = true; } folio_unlock(folio); folio_put(folio); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a72db36096ca..e1a02a2c8406 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -716,7 +716,7 @@ static int z_erofs_map_sanity_check(struct inode *inode, } if (map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX) { - if (sbi->available_compr_algs ^ BIT(map->m_algorithmformat)) { + if (!(sbi->available_compr_algs & BIT(map->m_algorithmformat))) { erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", map->m_algorithmformat, EROFS_I(inode)->nid); return -EFSCORRUPTED; diff --git a/fs/fhandle.c b/fs/fhandle.c index 642e3d569497..1ca7eb3a6cb5 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -285,6 +285,19 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, return 0; } +static bool capable_wrt_mount(struct mount *mount) +{ + struct mnt_namespace *mnt_ns; + + /* + * For ->mnt_ns access. + * The following READ_ONCE() is semantically rcu_dereference(). + */ + guard(rcu)(); + mnt_ns = READ_ONCE(mount->mnt_ns); + return ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN); +} + static inline int may_decode_fh(struct handle_to_path_ctx *ctx, unsigned int o_flags) { @@ -320,8 +333,7 @@ static inline int may_decode_fh(struct handle_to_path_ctx *ctx, if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; else if (is_mounted(root->mnt) && - ns_capable(real_mount(root->mnt)->mnt_ns->user_ns, - CAP_SYS_ADMIN) && + capable_wrt_mount(real_mount(root->mnt)) && !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5dda7080f4a9..c105aaf9ff5d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1793,6 +1793,10 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) goto out_up_killsb; + if (!S_ISREG(inode->i_mode)) { + err = -EINVAL; + goto out_iput; + } mapping = inode->i_mapping; file_size = i_size_read(inode); @@ -1912,6 +1916,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) break; + if (!folio_test_uptodate(folio)) { + folio_put(folio); + break; + } folio_offset = offset_in_folio(folio, pos); nr_bytes = min(folio_size(folio) - folio_offset, num); @@ -1966,7 +1974,10 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, inode = fuse_ilookup(fc, nodeid, &fm); if (inode) { - err = fuse_retrieve(fm, inode, &outarg); + if (!S_ISREG(inode->i_mode)) + err = -EINVAL; + else + err = fuse_retrieve(fm, inode, &outarg); iput(inode); } up_read(&fc->killsb); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c59452d60b8d..f94f3dc082c6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2176,7 +2176,10 @@ static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, WARN_ON(!ap->num_folios); - /* Reached max pages */ + /* Reached max pages or max folio slots */ + if (ap->num_folios >= fc->max_pages) + return true; + if (DIV_ROUND_UP(bytes, PAGE_SIZE) > fc->max_pages) return true; diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 66617b1557c6..f5150372618e 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -372,8 +372,8 @@ int hpfs_check_free_dnodes(struct super_block *s, int n) return 0; } } + hpfs_brelse4(&qbh); } - hpfs_brelse4(&qbh); i = 0; if (hpfs_sb(s)->sb_c_bitmap != -1) { bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8b05bec08e04..78d61bf2bd9b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,15 +96,8 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) +static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - /* Unfortunate we have to reassign vma->vm_private_data. */ - return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); -} - -static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) -{ - struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -119,8 +112,8 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); - desc->vm_ops = &hugetlb_vm_ops; + vma_set_flags(vma, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); + vma->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -129,16 +122,16 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (desc->pgoff & PGOFF_LOFFT_MAX) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)vma_desc_size(desc); - len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); + vma_len = (loff_t)(vma->vm_end - vma->vm_start); + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -148,7 +141,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) ret = -ENOMEM; - vma_flags = desc->vma_flags; + vma_flags = vma->flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -158,30 +151,17 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) vma_flags_set(&vma_flags, VMA_NORESERVE_BIT); if (hugetlb_reserve_pages(inode, - desc->pgoff >> huge_page_order(h), - len >> huge_page_shift(h), desc, - vma_flags) < 0) + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, + vma_flags) < 0) goto out; ret = 0; - if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_test(vma, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); - if (!ret) { - /* Allocate the VMA lock after we set it up. */ - desc->action.success_hook = hugetlb_file_mmap_prepare_success; - /* - * We cannot permit the rmap finding this VMA in the time - * between the VMA being inserted into the VMA tree and the - * completion/success hook being invoked. - * - * This is because we establish a per-VMA hugetlb lock which can - * be raced by rmap. - */ - desc->action.hide_from_rmap_until_complete = true; - } return ret; } @@ -1227,7 +1207,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap_prepare = hugetlbfs_file_mmap_prepare, + .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/fs/inode.c b/fs/inode.c index 6a3cbc7dcd28..62c579a0cf7d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2124,7 +2124,13 @@ static int inode_update_cmtime(struct inode *inode, unsigned int flags) inode_iversion_need_inc(inode)) return -EAGAIN; } else { - if (inode_maybe_inc_iversion(inode, !!dirty)) + /* + * Don't force iversion increment for pure lazytime + * updates (I_DIRTY_TIME only), let I_VERSION_QUERIED + * dictate whether the increment is needed. + */ + if (inode_maybe_inc_iversion(inode, + dirty != I_DIRTY_TIME)) dirty |= I_DIRTY_SYNC; } } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d7b648421a70..d55b936e6986 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -400,6 +400,11 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, bool uptodate = !error; bool finished = true; + if (error) + fserror_report_io(folio->mapping->host, FSERR_BUFFERED_READ, + folio_pos(folio) + off, len, error, + GFP_ATOMIC); + if (ifs) { unsigned long flags; @@ -411,11 +416,6 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (error) - fserror_report_io(folio->mapping->host, FSERR_BUFFERED_READ, - folio_pos(folio) + off, len, error, - GFP_ATOMIC); - if (finished) folio_end_read(folio, uptodate); } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b0a6549b3848..b36ee619cdcd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -355,7 +355,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, if (dio->flags & IOMAP_DIO_BOUNCE) ret = bio_iov_iter_bounce(bio, dio->submit.iter, - iomap_max_bio_size(&iter->iomap)); + iomap_max_bio_size(&iter->iomap), alignment); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 60c4a0e0fca5..442d62679262 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -309,7 +309,7 @@ static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, out1: jfs_info("jfs_mkdir: rc:%d", rc); - return ERR_PTR(rc); + return rc ? ERR_PTR(rc) : NULL; } /* diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index a7c85ab6d4b5..1db6cb352542 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -332,7 +332,7 @@ int nlmsvc_dispatch(struct svc_rqst *rqstp); * File handling for the server personality */ __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nlm_lock *); + struct nlm_lock *, int); void nlm_release_file(struct nlm_file *); void nlmsvc_put_lockowner(struct nlm_lockowner *); void nlmsvc_release_lockowner(struct nlm_lock *); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 5de41e249534..41cab858de57 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -146,8 +146,11 @@ nlm4svc_lookup_file(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_lock *lock, struct nlm_file **filp, struct nlm4_lock *xdr_lock, unsigned char type) { + bool is_test = (rqstp->rq_proc == NLMPROC4_TEST || + rqstp->rq_proc == NLMPROC4_TEST_MSG); struct file_lock *fl = &lock->fl; struct nlm_file *file = NULL; + int mode; __be32 error; if (xdr_lock->fh.len > NFS_MAXFHSIZE) @@ -170,7 +173,8 @@ nlm4svc_lookup_file(struct svc_rqst *rqstp, struct nlm_host *host, fl->c.flc_type = type; lockd_set_file_lock_range4(fl, lock->lock_start, lock->lock_len); - error = nlm_lookup_file(rqstp, &file, lock); + mode = is_test ? O_RDWR : lock_to_openmode(fl); + error = nlm_lookup_file(rqstp, &file, lock, mode); switch (error) { case nlm_granted: break; @@ -184,7 +188,8 @@ nlm4svc_lookup_file(struct svc_rqst *rqstp, struct nlm_host *host, *filp = file; fl->c.flc_flags = FL_POSIX; - fl->c.flc_file = file->f_file[lock_to_openmode(fl)]; + fl->c.flc_file = is_test ? nlmsvc_file_file(file) + : file->f_file[mode]; fl->c.flc_pid = current->tgid; fl->fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index b98b1d0ada35..f4520149d6d7 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -613,7 +613,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *conflock) { int error; - int mode; __be32 ret; dprintk("lockd: nlmsvc_testlock(%s/%llu, ty=%d, %Ld-%Ld)\n", @@ -631,14 +630,13 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - mode = lock_to_openmode(&lock->fl); locks_init_lock(&conflock->fl); /* vfs_test_lock only uses start, end, and owner, but tests flc_file */ conflock->fl.c.flc_file = lock->fl.c.flc_file; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; conflock->fl.c.flc_owner = lock->fl.c.flc_owner; - error = vfs_test_lock(file->f_file[mode], &conflock->fl); + error = vfs_test_lock(lock->fl.c.flc_file, &conflock->fl); if (error) { ret = nlm_lck_denied_nolocks; goto out; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 749abf8886ba..c0a3487719e2 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -68,6 +68,8 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, struct nlm_host *host = NULL; struct nlm_file *file = NULL; struct nlm_lock *lock = &argp->lock; + bool is_test = (rqstp->rq_proc == NLMPROC_TEST || + rqstp->rq_proc == NLMPROC_TEST_MSG); int mode; __be32 error = 0; @@ -83,15 +85,22 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - error = cast_status(nlm_lookup_file(rqstp, &file, lock)); + mode = lock_to_openmode(&lock->fl); + + if (is_test) + mode = O_RDWR; + + error = cast_status(nlm_lookup_file(rqstp, &file, lock, mode)); if (error != 0) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - mode = lock_to_openmode(&lock->fl); lock->fl.c.flc_flags = FL_POSIX; - lock->fl.c.flc_file = file->f_file[mode]; + if (is_test) + lock->fl.c.flc_file = nlmsvc_file_file(file); + else + lock->fl.c.flc_file = file->f_file[mode]; lock->fl.c.flc_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 344e6c187cde..9da9d6e0b42e 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -83,23 +83,36 @@ int lock_to_openmode(struct file_lock *lock) * * We have to make sure we have the right credential to open * the file. + * + * @mode is O_RDONLY, O_WRONLY, or O_RDWR. O_RDWR means success + * is achieved with EITHER O_RDONLY or O_WRONLY; it does not + * require both. */ static __be32 nlm_do_fopen(struct svc_rqst *rqstp, struct nlm_file *file, int mode) { - struct file **fp = &file->f_file[mode]; - __be32 nlmerr = nlm_granted; + __be32 nlmerr = nlm__int__failed; + __be32 deferred = 0; int error; + int m; + + for (m = O_RDONLY; m <= O_WRONLY; m++) { + struct file **fp = &file->f_file[m]; + + if (mode != O_RDWR && mode != m) + continue; + if (*fp) + return nlm_granted; - if (*fp) - return nlmerr; + error = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, m); + if (!error) + return nlm_granted; - error = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); - if (error) { dprintk("lockd: open failed (errno %d)\n", error); switch (error) { case -EWOULDBLOCK: nlmerr = nlm__int__drop_reply; + deferred = nlmerr; break; case -ESTALE: nlmerr = nlm__int__stale_fh; @@ -110,7 +123,7 @@ static __be32 nlm_do_fopen(struct svc_rqst *rqstp, } } - return nlmerr; + return deferred ? deferred : nlmerr; } /* @@ -119,17 +132,15 @@ static __be32 nlm_do_fopen(struct svc_rqst *rqstp, */ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, - struct nlm_lock *lock) + struct nlm_lock *lock, int mode) { struct nlm_file *file; unsigned int hash; __be32 nfserr; - int mode; nlm_debug_print_fh("nlm_lookup_file", &lock->fh); hash = file_hash(&lock->fh); - mode = lock_to_openmode(&lock->fl); /* Lock file table */ mutex_lock(&nlm_file_mutex); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 6472c4ea3d1e..cb61fbdb52e9 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -375,6 +375,8 @@ int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_ continue; seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); + if (seq_has_overflowed(seq)) + return -EAGAIN; seq->count++; /* mappings are separated by \0 */ if (seq_has_overflowed(seq)) diff --git a/fs/mount.h b/fs/mount.h index e0816c11a198..5c120f8361bd 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -71,7 +71,15 @@ struct mount { struct hlist_head mnt_slave_list;/* list of slave mounts */ struct hlist_node mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ - struct mnt_namespace *mnt_ns; /* containing namespace */ + /* + * Containing namespace (active or deactivating, non-refcounted). + * Normally protected by namespace_sem. + * Can also be accessed locklessly under RCU. RCU readers can't rely on + * the namespace still being active, but implicitly hold a passive + * reference (because an RCU delay happens between a namespace being + * deactivated and the corresponding passive refcount drop). + */ + struct mnt_namespace *mnt_ns; struct mountpoint *mnt_mp; /* where is it mounted */ union { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ diff --git a/fs/namei.c b/fs/namei.c index c7fac83c9a85..4787244ca4a7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5024,6 +5024,7 @@ struct file *dentry_create(struct path *path, int flags, umode_t mode, { struct file *file __free(fput) = NULL; struct dentry *dentry = path->dentry; + struct dentry *orig_dentry = dentry; struct dentry *dir = dentry->d_parent; struct inode *dir_inode = d_inode(dir); struct mnt_idmap *idmap; @@ -5043,9 +5044,18 @@ struct file *dentry_create(struct path *path, int flags, umode_t mode, if (create_error) flags &= ~O_CREAT; + /* atomic_open will dput(dentry) on error */ + dget(orig_dentry); dentry = atomic_open(path, dentry, file, flags, mode); error = PTR_ERR_OR_ZERO(dentry); + if (IS_ERR(dentry)) + /* keep the original */ + dentry = orig_dentry; + else + /* Drop the extra reference */ + dput(orig_dentry); + if (unlikely(create_error) && error == -ENOENT) error = create_error; diff --git a/fs/namespace.c b/fs/namespace.c index fe919abd2f01..f5905f4ec560 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1079,7 +1079,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); - mnt->mnt_ns = ns; + WRITE_ONCE(mnt->mnt_ns, ns); while (*link) { parent = *link; if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { @@ -1434,7 +1434,7 @@ EXPORT_SYMBOL(mntget); void mnt_make_shortterm(struct vfsmount *mnt) { if (mnt) - real_mount(mnt)->mnt_ns = NULL; + WRITE_ONCE(real_mount(mnt)->mnt_ns, NULL); } /** @@ -1806,7 +1806,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) ns->nr_mounts--; __touch_mnt_namespace(ns); } - p->mnt_ns = NULL; + WRITE_ONCE(p->mnt_ns, NULL); if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a8c0d86118c5..76d0f6a29aba 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -156,9 +156,8 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } -static void netfs_queue_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq, - bool last_subreq) +void netfs_queue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -169,7 +168,8 @@ static void netfs_queue_read(struct netfs_io_request *rreq, * remove entries off of the front. */ spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); + /* Write IN_PROGRESS before pointer to new subreq */ + list_add_tail_release(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; @@ -178,11 +178,6 @@ static void netfs_queue_read(struct netfs_io_request *rreq, } } - if (last_subreq) { - smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - } - spin_unlock(&rreq->lock); } @@ -214,7 +209,6 @@ static void netfs_issue_read(struct netfs_io_request *rreq, static void netfs_read_to_pagecache(struct netfs_io_request *rreq, struct readahead_control *ractl) { - struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; @@ -233,10 +227,13 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, subreq->start = start; subreq->len = size; + netfs_queue_read(rreq, subreq); + source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); subreq->source = source; if (source == NETFS_DOWNLOAD_FROM_SERVER) { - unsigned long long zp = umin(ictx->zero_point, rreq->i_size); + unsigned long long zero_point = netfs_read_zero_point(rreq->inode); + unsigned long long zp = umin(zero_point, rreq->i_size); size_t len = subreq->len; if (unlikely(rreq->origin == NETFS_READ_SINGLE)) @@ -252,7 +249,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx", rreq->debug_id, subreq->debug_index, subreq->len, size, - subreq->start, ictx->zero_point, rreq->i_size); + subreq->start, zero_point, rreq->i_size); + netfs_cancel_read(subreq, ret); break; } subreq->len = len; @@ -261,12 +259,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - subreq->error = ret; - /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, - netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, - netfs_sreq_trace_put_cancel); + netfs_cancel_read(subreq, ret); break; } trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); @@ -289,24 +282,29 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, pr_err("Unexpected read source %u\n", source); WARN_ON_ONCE(1); + netfs_cancel_read(subreq, ret); break; issue: slice = netfs_prepare_read_iterator(subreq, ractl); if (slice < 0) { ret = slice; - subreq->error = ret; - trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); - /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); + netfs_cancel_read(subreq, ret); break; } - size -= slice; start += slice; + size -= slice; + if (size <= 0) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } - netfs_queue_read(rreq, subreq, size <= 0); netfs_issue_read(rreq, subreq); + + if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + netfs_wait_for_paused_read(rreq); + if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) + break; cond_resched(); } while (size > 0); @@ -397,6 +395,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) { struct netfs_io_request *rreq; struct address_space *mapping = folio->mapping; + struct netfs_group *group = netfs_folio_group(folio); struct netfs_folio *finfo = netfs_folio_info(folio); struct netfs_inode *ctx = netfs_inode(mapping->host); struct folio *sink = NULL; @@ -458,14 +457,20 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) netfs_read_to_pagecache(rreq, NULL); - if (sink) - folio_put(sink); - ret = netfs_wait_for_read(rreq); if (ret >= 0) { + if (group) + folio_change_private(folio, group); + else + folio_detach_private(folio); + kfree(finfo); + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); flush_dcache_folio(folio); folio_mark_uptodate(folio); } + + if (sink) + folio_put(sink); folio_unlock(folio); netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; @@ -498,10 +503,10 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct netfs_inode *ctx = netfs_inode(mapping->host); int ret; - if (folio_test_dirty(folio)) { - trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + folio_wait_writeback(folio); + + if (folio_test_dirty(folio)) return netfs_read_gaps(file, folio); - } _enter("%lx", folio->index); @@ -667,7 +672,7 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio->index; + rreq->no_unlock_folio = folio; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); @@ -684,9 +689,9 @@ retry: netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); + netfs_put_request(rreq, netfs_rreq_trace_put_return); if (ret < 0) goto error; - netfs_put_request(rreq, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_private_2_killable(folio); @@ -733,7 +738,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, goto error; } - rreq->no_unlock_folio = folio->index; + rreq->no_unlock_folio = folio; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 05ea5b0cc0e8..6bde3320bcec 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -12,24 +12,6 @@ #include <linux/slab.h> #include "internal.h" -static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) -{ - if (netfs_group) - folio_attach_private(folio, netfs_get_group(netfs_group)); -} - -static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) -{ - void *priv = folio_get_private(folio); - - if (unlikely(priv != netfs_group)) { - if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) - folio_attach_private(folio, netfs_get_group(netfs_group)); - else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) - folio_detach_private(folio); - } -} - /* * Grab a folio for writing and lock it. Attempt to allocate as large a folio * as possible to hold as much of the remaining length as possible in one go. @@ -149,6 +131,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } do { + enum netfs_folio_trace trace; struct netfs_folio *finfo; struct netfs_group *group; unsigned long long fpos; @@ -156,6 +139,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, size_t offset; /* Offset into pagecache folio */ size_t part; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + void *priv; offset = pos & (max_chunk - 1); part = min(max_chunk - offset, iov_iter_count(iter)); @@ -201,73 +185,99 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto error_folio_unlock; } - /* Decide how we should modify a folio. We might be attempting - * to do write-streaming, in which case we don't want to a - * local RMW cycle if we can avoid it. If we're doing local - * caching or content crypto, we award that priority over - * avoiding RMW. If the file is open readably, then we also - * assume that we may want to read what we wrote. - */ finfo = netfs_folio_info(folio); group = netfs_folio_group(folio); + /* If the requested group differs from the group set on the + * page, then we need to flush out the folio if it has a group + * set (ie. is non-NULL). Note that COPY_TO_CACHE is a special + * case, being a netfs annotation rather than an actual group. + * + * The filesystem isn't permitted to mix writes with groups and + * writes without groups as the NULL group is used to indicate + * that no group is set. + */ if (unlikely(group != netfs_group) && - group != NETFS_FOLIO_COPY_TO_CACHE) + group != NETFS_FOLIO_COPY_TO_CACHE && + group) { + WARN_ON_ONCE(!netfs_group); goto flush_content; + } + /* Decide how we should modify a folio. We might be attempting + * to do write-streaming, as we don't want to a local RMW cycle + * if we can avoid it. If we're doing local caching or content + * crypto, we award that priority over avoiding RMW. If the + * file is open readably, then we let ->read_folio() fill in + * the gaps. + */ if (folio_test_uptodate(folio)) { if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - netfs_set_group(folio, netfs_group); - trace_netfs_folio(folio, netfs_folio_is_uptodate); - goto copied; + trace = netfs_folio_is_uptodate; + goto copied_uptodate; } /* If the page is above the zero-point then we assume that the * server would just return a block of zeros or a short read if * we try to read it. */ - if (fpos >= ctx->zero_point) { + if (fpos >= netfs_read_zero_point(inode)) { folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; folio_zero_segment(folio, offset + copied, flen); - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace_netfs_folio(folio, netfs_modify_and_clear); - goto copied; + if (finfo) + trace = netfs_modify_and_clear_rm_finfo; + else + trace = netfs_modify_and_clear; + goto mark_uptodate; } /* See if we can write a whole folio in one go. */ if (!maybe_trouble && offset == 0 && part >= flen) { copied = copy_folio_from_iter_atomic(folio, offset, part, iter); - if (unlikely(copied == 0)) + if (likely(copied == part)) { + if (finfo) + trace = netfs_whole_folio_modify_filled; + else + trace = netfs_whole_folio_modify; + goto mark_uptodate; + } + if (copied == 0) goto copy_failed; - if (unlikely(copied < part)) { + if (!finfo || copied <= finfo->dirty_offset) { maybe_trouble = true; iov_iter_revert(iter, copied); copied = 0; folio_unlock(folio); goto retry; } - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace_netfs_folio(folio, netfs_whole_folio_modify); + + /* We overwrote some existing dirty data, so we have to + * accept the partial write. + */ + finfo->dirty_len += finfo->dirty_offset; + if (finfo->dirty_len == flen) { + trace = netfs_whole_folio_modify_filled_efault; + goto mark_uptodate; + } + if (copied > finfo->dirty_len) + finfo->dirty_len = copied; + finfo->dirty_offset = 0; + trace = netfs_whole_folio_modify_efault; goto copied; } /* We don't want to do a streaming write on a file that loses * caching service temporarily because the backing store got - * culled and we don't really want to get a streaming write on - * a file that's open for reading as ->read_folio() then has to - * be able to flush it. + * culled. */ - if ((file->f_mode & FMODE_READ) || - netfs_is_cache_enabled(ctx)) { + if (netfs_is_cache_enabled(ctx)) { if (finfo) { netfs_stat(&netfs_n_wh_wstream_conflict); goto flush_content; @@ -282,11 +292,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - netfs_set_group(folio, netfs_group); - trace_netfs_folio(folio, netfs_just_prefetch); - goto copied; + trace = netfs_just_prefetch; + goto copied_uptodate; } + /* Do a streaming write on a folio that has nothing in it yet. */ if (!finfo) { ret = -EIO; if (WARN_ON(folio_get_private(folio))) @@ -295,10 +305,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(copied == 0)) goto copy_failed; if (offset == 0 && copied == flen) { - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace_netfs_folio(folio, netfs_streaming_filled_page); - goto copied; + trace = netfs_streaming_filled_page; + goto mark_uptodate; } finfo = kzalloc_obj(*finfo); @@ -312,7 +320,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len = copied; folio_attach_private(folio, (void *)((unsigned long)finfo | NETFS_FOLIO_INFO)); - trace_netfs_folio(folio, netfs_streaming_write); + trace = netfs_streaming_write; goto copied; } @@ -326,16 +334,10 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto copy_failed; finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); - folio_mark_uptodate(folio); - kfree(finfo); - trace_netfs_folio(folio, netfs_streaming_cont_filled_page); - } else { - trace_netfs_folio(folio, netfs_streaming_write_cont); + trace = netfs_streaming_cont_filled_page; + goto mark_uptodate; } + trace = netfs_streaming_write_cont; goto copied; } @@ -349,7 +351,38 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto out; continue; + /* Mark a folio as being up to data when we've filled it + * completely. If the folio has a group attached, then it must + * be the same group, otherwise we should have flushed it out + * above. We have to get rid of the netfs_folio struct if + * there was one. + */ + mark_uptodate: + folio_mark_uptodate(folio); + + copied_uptodate: + priv = folio_get_private(folio); + if (likely(priv == netfs_group)) { + /* Already set correctly; no change required. */ + } else if (priv == NETFS_FOLIO_COPY_TO_CACHE) { + if (!netfs_group) + folio_detach_private(folio); + else + folio_change_private(folio, netfs_get_group(netfs_group)); + } else if (!priv) { + folio_attach_private(folio, netfs_get_group(netfs_group)); + } else { + WARN_ON_ONCE(!finfo); + if (netfs_group) + /* finfo->netfs_group has a ref */ + folio_change_private(folio, netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } + copied: + trace_netfs_folio(folio, trace); flush_dcache_folio(folio); /* Update the inode size if we moved the EOF marker */ @@ -510,6 +543,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_NOPAGE; + void *priv; int err; _enter("%lx", folio->index); @@ -530,7 +564,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr } group = netfs_folio_group(folio); - if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { + if (group && + group != netfs_group && + group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); err = filemap_fdatawrite_range(mapping, folio_pos(folio), @@ -552,7 +588,19 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); else trace_netfs_folio(folio, netfs_folio_trace_mkwrite); - netfs_set_group(folio, netfs_group); + + priv = folio_get_private(folio); + if (priv != netfs_group) { + if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_detach_private(folio); + else if (netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_change_private(folio, netfs_get_group(netfs_group)); + else if (netfs_group && !priv) + folio_attach_private(folio, netfs_get_group(netfs_group)); + else + WARN_ON_ONCE(1); + } + file_update_time(file); set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags); if (ictx->ops->post_modify) diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index f72e6da88cca..6a8fb0d55e04 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -45,12 +45,11 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) * Perform a read to a buffer from the server, slicing up the region to be read * according to the network rsize. */ -static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) +static void netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) { - struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned long long start = rreq->start; ssize_t size = rreq->len; - int ret = 0; + int ret; do { struct netfs_io_subrequest *subreq; @@ -58,7 +57,10 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) subreq = netfs_alloc_subrequest(rreq); if (!subreq) { - ret = -ENOMEM; + /* Stash the error in the request if there's not + * already an error set. + */ + cmpxchg(&rreq->error, 0, -ENOMEM); break; } @@ -66,25 +68,13 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) subreq->start = start; subreq->len = size; - __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - - spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - if (!stream->active) { - stream->collected_to = subreq->start; - /* Store list pointers before active flag */ - smp_store_release(&stream->active, true); - } - } - trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock(&rreq->lock); + netfs_queue_read(rreq, subreq); netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); + netfs_cancel_read(subreq, ret); break; } } @@ -113,8 +103,6 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); netfs_wake_collector(rreq); } - - return ret; } /* @@ -137,21 +125,17 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) // TODO: Use bounce buffer if requested inode_dio_begin(rreq->inode); + netfs_dispatch_unbuffered_reads(rreq); - ret = netfs_dispatch_unbuffered_reads(rreq); - - if (!rreq->submitted) { - netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); - inode_dio_end(rreq->inode); - ret = 0; - goto out; - } + /* The collector will get run, even if we don't manage to submit any + * subreqs, so we shouldn't call inode_dio_end() here. + */ if (sync) ret = netfs_wait_for_read(rreq); else ret = -EIOCBQUEUED; -out: + _leave(" = %zd", ret); return ret; } diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index f9ab69de3e29..25f8ceb15fad 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -376,8 +376,10 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) goto out; end = iocb->ki_pos + iov_iter_count(from); - if (end > ictx->zero_point) - ictx->zero_point = end; + spin_lock(&inode->i_lock); + if (end > ictx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d436e20d3418..645996ecfc80 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,6 +23,8 @@ /* * buffered_read.c */ +void netfs_queue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq); void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); @@ -108,6 +110,7 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq, */ bool netfs_read_collection(struct netfs_io_request *rreq); void netfs_read_collection_worker(struct work_struct *work); +void netfs_cancel_read(struct netfs_io_subrequest *subreq, int error); void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); /* diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 154a14bb2d7f..b375567e0520 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -22,7 +22,7 @@ * * Extract the page fragments from the given amount of the source iterator and * build up a second iterator that refers to all of those bits. This allows - * the original iterator to disposed of. + * the original iterator to be disposed of. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA be * allowed on the pages extracted. @@ -43,7 +43,7 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, unsigned int max_pages; unsigned int npages = 0; unsigned int i; - ssize_t ret; + ssize_t ret = 0; size_t count = orig_len, offset, len; size_t bv_size, pg_size; @@ -67,26 +67,29 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, ret = iov_iter_extract_pages(orig, &pages, count, max_pages - npages, extraction_flags, &offset); - if (ret < 0) { - pr_err("Couldn't get user pages (rc=%zd)\n", ret); + if (unlikely(ret <= 0)) { + ret = ret ?: -EIO; break; } - if (ret > count) { - pr_err("get_pages rc=%zd more than %zu\n", ret, count); + if (WARN(ret > count, + "%s: extract_pages overrun %zd > %zu bytes\n", + __func__, ret, count)) { + ret = -EIO; break; } - count -= ret; - ret += offset; - cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE); - - if (npages + cur_npages > max_pages) { - pr_err("Out of bvec array capacity (%u vs %u)\n", - npages + cur_npages, max_pages); + cur_npages = DIV_ROUND_UP(offset + ret, PAGE_SIZE); + if (WARN(cur_npages > max_pages - npages, + "%s: extract_pages overrun %u > %u pages\n", + __func__, npages + cur_npages, max_pages)) { + ret = -EIO; break; } + count -= ret; + ret += offset; + for (i = 0; i < cur_npages; i++) { len = ret > PAGE_SIZE ? PAGE_SIZE : ret; bvec_set_page(bv + npages + i, *pages++, len - offset, offset); @@ -97,6 +100,18 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, npages += cur_npages; } + /* Note: Don't try to clean up after EIO. Either we got no pages, so + * nothing to clean up, or we got a buffer overrun, memory corruption + * and can't trust the stuff in the buffer (a WARN was emitted). + */ + + if (ret < 0 && (ret == -ENOMEM || npages == 0)) { + for (i = 0; i < npages; i++) + unpin_user_page(bv[i].bv_page); + kvfree(bv); + return ret; + } + iov_iter_bvec(new, orig->data_source, bv, npages, orig_len - count); return npages; } diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 6df89c92b10b..5d554512ed23 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -211,18 +211,25 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; - struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + struct inode *inode = folio_inode(folio); + struct netfs_inode *ctx = netfs_inode(inode); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); if (offset == 0 && length == flen) { - unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long i_size, remote_i_size, zero_point; unsigned long long fpos = folio_pos(folio), end; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); end = umin(fpos + flen, i_size); - if (fpos < i_size && end > ctx->zero_point) - ctx->zero_point = end; + if (fpos < i_size && end > zero_point) { + spin_lock(&inode->i_lock); + end = umin(fpos + flen, inode->i_size); + if (fpos < i_size && end > ctx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); + } } folio_wait_private_2(folio); /* [DEPRECATED] */ @@ -255,7 +262,8 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) goto erase_completely; /* Move the start of the data. */ finfo->dirty_len = fend - iend; - finfo->dirty_offset = offset; + finfo->dirty_offset = iend; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_front); return; } @@ -264,12 +272,14 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) */ if (iend >= fend) { finfo->dirty_len = offset - fstart; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_tail); return; } /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ + trace_netfs_folio(folio, netfs_folio_trace_invalidate_middle); } return; @@ -277,8 +287,9 @@ erase_completely: netfs_put_group(netfs_folio_group(folio)); folio_detach_private(folio); folio_clear_uptodate(folio); + folio_cancel_dirty(folio); kfree(finfo); - return; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_all); } EXPORT_SYMBOL(netfs_invalidate_folio); @@ -292,15 +303,22 @@ EXPORT_SYMBOL(netfs_invalidate_folio); */ bool netfs_release_folio(struct folio *folio, gfp_t gfp) { - struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); - unsigned long long end; + struct inode *inode = folio_inode(folio); + struct netfs_inode *ctx = netfs_inode(inode); + unsigned long long i_size, remote_i_size, zero_point, end; if (folio_test_dirty(folio)) return false; - end = umin(folio_next_pos(folio), i_size_read(&ctx->inode)); - if (end > ctx->zero_point) - ctx->zero_point = end; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + end = folio_next_pos(folio); + if (end > zero_point) { + spin_lock(&inode->i_lock); + end = umin(end, ctx->_remote_i_size); + if (end > ctx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); + } if (folio_test_private(folio)) return false; @@ -356,6 +374,7 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, DEFINE_WAIT(myself); list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + smp_rmb(); /* Read ->next before IN_PROGRESS. */ if (!netfs_check_subreq_in_progress(subreq)) continue; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index e5f6665b3341..23660a590124 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -83,7 +83,7 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq, } just_unlock: - if (folio->index == rreq->no_unlock_folio && + if (folio == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { _debug("no unlock"); } else { @@ -205,8 +205,10 @@ reassess: * in progress. The issuer thread may be adding stuff to the tail * whilst we're doing this. */ - front = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); + front = list_first_entry_or_null_acquire(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + /* Read first subreq pointer before IN_PROGRESS flag. */ + while (front) { size_t transferred; @@ -576,6 +578,17 @@ skip_error_checks: EXPORT_SYMBOL(netfs_read_subreq_terminated); /* + * Cancel a read subrequest due to preparation failure. + */ +void netfs_cancel_read(struct netfs_io_subrequest *subreq, int error) +{ + trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); + subreq->error = error; + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + netfs_read_subreq_terminated(subreq); +} + +/* * Handle termination of a read from the cache. */ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error) diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index cca9ac43c077..f59a70f3a086 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -175,7 +175,9 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); + spin_lock(&rreq->lock); list_del(&subreq->rreq_link); + spin_unlock(&rreq->lock); netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; @@ -203,8 +205,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) refcount_read(&subreq->ref), netfs_sreq_trace_new); + spin_lock(&rreq->lock); list_add(&subreq->rreq_link, &to->rreq_link); - to = list_next_entry(to, rreq_link); + spin_unlock(&rreq->lock); + to = subreq; trace_netfs_sreq(subreq, netfs_sreq_trace_retry); stream->sreq_max_len = umin(len, rreq->rsize); @@ -288,8 +292,15 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) struct folio *folio = folioq_folio(p, slot); if (folio && !folioq_is_marked2(p, slot)) { - trace_netfs_folio(folio, netfs_folio_trace_abandon); - folio_unlock(folio); + if (folio == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, + &rreq->flags)) { + _debug("no unlock"); + } else { + trace_netfs_folio(folio, + netfs_folio_trace_abandon); + folio_unlock(folio); + } } } } diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index d0e23bc42445..8833550d2eb6 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -89,7 +89,6 @@ static void netfs_single_read_cache(struct netfs_io_request *rreq, */ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) { - struct netfs_io_stream *stream = &rreq->io_streams[0]; struct netfs_io_subrequest *subreq; int ret = 0; @@ -102,14 +101,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) subreq->len = rreq->len; subreq->io_iter = rreq->buffer.iter; - __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - - spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); - trace_netfs_sreq(subreq, netfs_sreq_trace_added); - /* Store list pointers before active flag */ - smp_store_release(&stream->active, true); - spin_unlock(&rreq->lock); + netfs_queue_read(rreq, subreq); netfs_single_cache_prepare_read(rreq, subreq); switch (subreq->source) { @@ -121,10 +113,14 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) goto cancel; } + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); rreq->netfs_ops->issue_read(subreq); rreq->submitted += subreq->len; break; case NETFS_READ_FROM_CACHE: + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_single_read_cache(rreq, subreq); rreq->submitted += subreq->len; @@ -134,14 +130,15 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) pr_warn("Unexpected single-read source %u\n", subreq->source); WARN_ON_ONCE(true); ret = -EIO; - break; + goto cancel; } - smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); return ret; cancel: - netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); + netfs_cancel_read(subreq, ret); + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + netfs_wake_collector(rreq); return ret; } diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index b194447f4b11..24fc2bb2f8a4 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -57,7 +57,8 @@ static void netfs_dump_request(const struct netfs_io_request *rreq) int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; - struct netfs_inode *ictx = netfs_inode(folio->mapping->host); + struct inode *inode = folio_inode(folio); + struct netfs_inode *ictx = netfs_inode(inode); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -69,8 +70,10 @@ int netfs_folio_written_back(struct folio *folio) unsigned long long fend; fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; - if (fend > ictx->zero_point) - ictx->zero_point = fend; + spin_lock(&ictx->inode.i_lock); + if (fend > ictx->_zero_point) + netfs_write_zero_point(inode, fend); + spin_unlock(&ictx->inode.i_lock); folio_detach_private(folio); group = finfo->netfs_group; @@ -228,8 +231,10 @@ reassess_streams: if (!smp_load_acquire(&stream->active)) continue; - front = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); + front = list_first_entry_or_null_acquire(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + /* Read first subreq pointer before IN_PROGRESS flag. */ + while (front) { trace_netfs_collect_sreq(wreq, front); //_debug("sreq [%x] %llx %zx/%zx", diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 2db688f94125..c03c7cc45e47 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -204,7 +204,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq, * remove entries off of the front. */ spin_lock(&wreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); + /* Write IN_PROGRESS before pointer to new subreq */ + list_add_tail_release(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; @@ -413,12 +414,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (streamw) netfs_issue_write(wreq, cache); - /* Flip the page to the writeback state and unlock. If we're called - * from write-through, then the page has already been put into the wb - * state. - */ - if (wreq->origin == NETFS_WRITEBACK) - folio_start_writeback(folio); + folio_start_writeback(folio); folio_unlock(folio); if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { @@ -646,29 +642,41 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache) { + int ret; + _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end); - if (!*writethrough_cache) { - if (folio_test_dirty(folio)) - /* Sigh. mmap. */ - folio_clear_dirty_for_io(folio); + /* The folio is locked. */ + if (*writethrough_cache != folio) { + if (*writethrough_cache) { + /* Did the folio get moved? */ + folio_put(*writethrough_cache); + *writethrough_cache = NULL; + } /* We can make multiple writes to the folio... */ - folio_start_writeback(folio); if (wreq->len == 0) trace_netfs_folio(folio, netfs_folio_trace_wthru); else trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); *writethrough_cache = folio; + folio_get(folio); } wreq->len += copied; - if (!to_page_end) + + if (!to_page_end) { + folio_mark_dirty(folio); + folio_unlock(folio); return 0; + } + ret = netfs_write_folio(wreq, wbc, folio); + folio_put(*writethrough_cache); *writethrough_cache = NULL; - return netfs_write_folio(wreq, wbc, folio); + wreq->submitted = wreq->len; + return ret; } /* @@ -682,8 +690,12 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c _enter("R=%x", wreq->debug_id); - if (writethrough_cache) + if (writethrough_cache) { + folio_lock(writethrough_cache); netfs_write_folio(wreq, wbc, writethrough_cache); + folio_put(writethrough_cache); + wreq->submitted = wreq->len; + } netfs_end_issue_write(wreq); @@ -818,6 +830,9 @@ static int netfs_write_folio_single(struct netfs_io_request *wreq, * * Write a monolithic, non-pagecache object back to the server and/or * the cache. + * + * Return: 0 if successful; 1 if skipped due to lock conflict and WB_SYNC_NONE; + * or a negative error code. */ int netfs_writeback_single(struct address_space *mapping, struct writeback_control *wbc, @@ -834,8 +849,10 @@ int netfs_writeback_single(struct address_space *mapping, if (!mutex_trylock(&ictx->wb_lock)) { if (wbc->sync_mode == WB_SYNC_NONE) { + /* The VFS will have undirtied the inode. */ + netfs_single_mark_inode_dirty(&ictx->inode); netfs_stat(&netfs_n_wb_lock_skip); - return 0; + return 1; } netfs_stat(&netfs_n_wb_lock_wait); mutex_lock(&ictx->wb_lock); diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index 29489a23a220..32735abfa03f 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -130,7 +130,9 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + spin_lock(&wreq->lock); list_del(&subreq->rreq_link); + spin_unlock(&wreq->lock); netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; @@ -153,8 +155,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, netfs_sreq_trace_new); trace_netfs_sreq(subreq, netfs_sreq_trace_split); + spin_lock(&wreq->lock); list_add(&subreq->rreq_link, &to->rreq_link); - to = list_next_entry(to, rreq_link); + spin_unlock(&wreq->lock); + to = subreq; trace_netfs_sreq(subreq, netfs_sreq_trace_retry); stream->sreq_max_len = len; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3134bb17f3e3..d7c399763ad9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -927,7 +927,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) } if (nfs_write_need_commit(hdr)) { struct nfs_open_context *ctx = - hdr->req->wb_lock_context->open_context; + req->wb_lock_context->open_context; /* Reset wb_nio, since the write was successful. */ req->wb_nio = 0; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 85e94c30285a..ab39ec885440 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1413,6 +1413,9 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dst, clone->cl_dst_pos, clone->cl_count, EX_ISSYNC(cstate->current_fh.fh_export)); + if (!status && (READ_ONCE(dst->nf_file->f_mode) & FMODE_NOCMTIME) != 0) + nfsd_update_cmtime_attr(dst->nf_file, 0); + nfsd_file_put(dst); nfsd_file_put(src); out: @@ -2118,8 +2121,10 @@ do_callback: set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); trace_nfsd_copy_async_done(copy); - nfsd4_send_cb_offload(copy); atomic_dec(©->cp_nn->pending_async_copies); + if (copy->cp_res.wr_bytes_written > 0 && copy->attr_update) + nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0); + nfsd4_send_cb_offload(copy); return 0; } @@ -2179,6 +2184,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) & + FMODE_NOCMTIME) != 0) + async_copy->attr_update = true; memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, cstate->session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); @@ -2197,6 +2205,10 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { status = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, true); + if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) & + FMODE_NOCMTIME) != 0 && + copy->cp_res.wr_bytes_written > 0) + nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0); } out: trace_nfsd_copy_done(copy, status); @@ -2535,10 +2547,6 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, dd = nfsd_get_dir_deleg(cstate, gdd, nf); nfsd_file_put(nf); if (IS_ERR(dd)) { - int err = PTR_ERR(dd); - - if (err != -EAGAIN) - return nfserrno(err); gdd->gddrnf_status = GDD4_UNAVAIL; return nfs_ok; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c2d13b26a687..6837b63d9864 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1221,10 +1221,6 @@ static void put_deleg_file(struct nfs4_file *fp) static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f) { - struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME | ATTR_DELEG }; - struct inode *inode = file_inode(f); - int ret; - /* don't do anything if FMODE_NOCMTIME isn't set */ if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 0) return; @@ -1242,17 +1238,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f return; /* Stamp everything to "now" */ - inode_lock(inode); - ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL); - inode_unlock(inode); - if (ret) { - struct inode *inode = file_inode(f); - - pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%llu: %d\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino, ret); - } + nfsd_update_cmtime_attr(f, ATTR_ATIME); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) @@ -1865,6 +1851,13 @@ void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) break; case SC_TYPE_LAYOUT: ls = layoutstateid(stid); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + } + spin_unlock(&clp->cl_lock); nfsd4_close_layout(ls); break; } @@ -6378,7 +6371,6 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, } open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; - dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); dp->dl_atime = stat.atime; dp->dl_ctime = stat.ctime; @@ -9429,11 +9421,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) goto out_status; + status = nfs_ok; + goto out_status; + } + if (!ncf->ncf_file_modified) { + if (ncf->ncf_initial_cinfo != ncf->ncf_cb_change) + ncf->ncf_file_modified = true; + else if (i_size_read(inode) != ncf->ncf_cb_fsize) + ncf->ncf_file_modified = true; } - if (!ncf->ncf_file_modified && - (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || - ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) - ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { int err; @@ -9560,3 +9556,31 @@ out_delegees: put_nfs4_file(fp); return ERR_PTR(status); } + +/** + * nfsd_update_cmtime_attr - update file's delegated ctime/mtime, + * and optionally other attributes (ie ATTR_ATIME). + * @f: pointer to an opened file + * @flags: any additional flags that should be updated + * + * Given upon opening a file delegated attributes were issues, update + * @f attributes to current times. + */ +void nfsd_update_cmtime_attr(struct file *f, unsigned int flags) +{ + int ret; + struct inode *inode = file_inode(f); + struct iattr attr = { + .ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_DELEG | flags, + }; + + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &attr, NULL); + inode_unlock(inode); + if (ret) + pr_notice_ratelimited("nfsd: Unable to update timestamps on " + "inode %02x:%02x:%llu: %d\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino, ret); +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 39e7012a60d8..04e3954d54bd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1594,16 +1594,27 @@ out_unlock: static int nfsd_nl_fh_key_set(const struct nlattr *attr, struct nfsd_net *nn) { siphash_key_t *fh_key = nn->fh_key; + u64 k0, k1; + bool changed; + + k0 = get_unaligned_le64(nla_data(attr)); + k1 = get_unaligned_le64(nla_data(attr) + 8); if (!fh_key) { fh_key = kmalloc(sizeof(siphash_key_t), GFP_KERNEL); - if (!fh_key) + if (!fh_key) { + trace_nfsd_ctl_fh_key_set(false, -ENOMEM); return -ENOMEM; + } nn->fh_key = fh_key; + changed = true; + } else { + changed = fh_key->key[0] != k0 || fh_key->key[1] != k1; } - fh_key->key[0] = get_unaligned_le64(nla_data(attr)); - fh_key->key[1] = get_unaligned_le64(nla_data(attr) + 8); + fh_key->key[0] = k0; + fh_key->key[1] = k1; + trace_nfsd_ctl_fh_key_set(changed, 0); return 0; } @@ -1682,7 +1693,6 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[NFSD_A_SERVER_FH_KEY]; if (attr) { ret = nfsd_nl_fh_key_set(attr, nn); - trace_nfsd_ctl_fh_key_set((const char *)nn->fh_key, ret); if (ret) goto out_unlock; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 953675eba5c3..c5ccea64c281 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -843,6 +843,7 @@ extern void nfsd4_shutdown_copy(struct nfs4_client *clp); void nfsd4_put_client(struct nfs4_client *clp); void nfsd4_async_copy_reaper(struct nfsd_net *nn); bool nfsd4_has_active_async_copies(struct nfs4_client *clp); +void nfsd_update_cmtime_attr(struct file *f, unsigned int flags); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 5ad38f50836d..b631a472222b 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -2243,23 +2243,21 @@ TRACE_EVENT(nfsd_end_grace, TRACE_EVENT(nfsd_ctl_fh_key_set, TP_PROTO( - const char *key, + bool changed, int result ), - TP_ARGS(key, result), + TP_ARGS(changed, result), TP_STRUCT__entry( - __field(u32, key_hash) + __field(bool, changed) __field(int, result) ), TP_fast_assign( - if (key) - __entry->key_hash = ~crc32_le(0xFFFFFFFF, key, 16); - else - __entry->key_hash = 0; + __entry->changed = changed; __entry->result = result; ), - TP_printk("key=0x%08x result=%d", - __entry->key_hash, __entry->result + TP_printk("key %s, result=%d", + __entry->changed ? "updated" : "unmodified", + __entry->result ) ); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 417e9ad9fbb3..9a4124c77e04 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -752,6 +752,7 @@ struct nfsd4_copy { struct nfsd_file *nf_src; struct nfsd_file *nf_dst; + bool attr_update; copy_stateid_t cp_stateid; diff --git a/fs/nsfs.c b/fs/nsfs.c index 51e8c9430477..160018c4fb36 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -266,7 +266,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, else tsk = find_task_by_pid_ns(arg, pid_ns); if (!tsk) - break; + return ret; switch (ioctl) { case NS_GET_PID_FROM_PIDNS: diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 97b660eaa00c..421c6cdcbb53 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -583,24 +583,13 @@ static u32 ntfs_resident_attr_min_value_length(const __le32 type) case AT_STANDARD_INFORMATION: return offsetof(struct standard_information, ver) + sizeof(((struct standard_information *)0)->ver.v1.reserved12); - case AT_ATTRIBUTE_LIST: - return offsetof(struct attr_list_entry, name); case AT_FILE_NAME: - return offsetof(struct file_name_attr, file_name); - case AT_OBJECT_ID: - return sizeof(struct guid); - case AT_SECURITY_DESCRIPTOR: - return sizeof(struct security_descriptor_relative); + return offsetof(struct file_name_attr, file_name) + + sizeof(__le16) * 1; case AT_VOLUME_INFORMATION: return sizeof(struct volume_information); - case AT_INDEX_ROOT: - return sizeof(struct index_root); - case AT_REPARSE_POINT: - return offsetof(struct reparse_point, reparse_data); case AT_EA_INFORMATION: return sizeof(struct ea_information); - case AT_EA: - return offsetof(struct ea_attr, ea_name) + 1; default: return 0; } @@ -672,6 +661,9 @@ static int ntfs_attr_find(const __le32 type, const __le16 *name, __le16 *upcase = vol->upcase; u32 upcase_len = vol->upcase_len; unsigned int space; + u16 name_offset; + u32 attr_len; + u32 name_size; /* * Iterate over attributes in mft record starting at @ctx->attr, or the @@ -699,6 +691,20 @@ static int ntfs_attr_find(const __le32 type, const __le16 *name, return -ENOENT; if (unlikely(!a->length)) break; + if (a->name_length) { + name_offset = le16_to_cpu(a->name_offset); + attr_len = le32_to_cpu(a->length); + name_size = a->name_length * sizeof(__le16); + + if (name_offset > attr_len || + attr_len - name_offset < name_size) { + ntfs_error(vol->sb, + "Corrupt attribute name in MFT record %llu\n", + ctx->ntfs_ino->mft_no); + break; + } + } + if (type == AT_UNUSED) return 0; if (a->type != type) @@ -712,14 +718,6 @@ static int ntfs_attr_find(const __le32 type, const __le16 *name, if (a->name_length) return -ENOENT; } else { - if (a->name_length && ((le16_to_cpu(a->name_offset) + - a->name_length * sizeof(__le16)) > - le32_to_cpu(a->length))) { - ntfs_error(vol->sb, "Corrupt attribute name in MFT record %llu\n", - ctx->ntfs_ino->mft_no); - break; - } - if (!ntfs_are_names_equal(name, name_len, (__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)), a->name_length, ic, upcase, upcase_len)) { @@ -2924,12 +2922,12 @@ int ntfs_attr_open(struct ntfs_inode *ni, const __le32 type, struct ntfs_inode *base_ni; int err; - ntfs_debug("Entering for inode %lld, attr 0x%x.\n", - (unsigned long long)ni->mft_no, type); - if (!ni || !ni->vol) return -EINVAL; + ntfs_debug("Entering for inode %lld, attr 0x%x.\n", + ni->mft_no, type); + if (NInoAttr(ni)) base_ni = ni->ext.base_ntfs_ino; else diff --git a/fs/ntfs/attrlist.c b/fs/ntfs/attrlist.c index bd501e8a628c..c2594d4c83b0 100644 --- a/fs/ntfs/attrlist.c +++ b/fs/ntfs/attrlist.c @@ -119,15 +119,14 @@ int ntfs_attrlist_entry_add(struct ntfs_inode *ni, struct attr_record *attr) struct mft_record *ni_mrec; u8 *old_al; - ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n", - (long long) ni->mft_no, - (unsigned int) le32_to_cpu(attr->type)); - if (!ni || !attr) { ntfs_debug("Invalid arguments.\n"); return -EINVAL; } + ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n", + ni->mft_no, (unsigned int) le32_to_cpu(attr->type)); + ni_mrec = map_mft_record(ni); if (IS_ERR(ni_mrec)) { ntfs_debug("Invalid arguments.\n"); diff --git a/fs/ntfs/bdev-io.c b/fs/ntfs/bdev-io.c index 67e65c88d681..27d7c2767a33 100644 --- a/fs/ntfs/bdev-io.c +++ b/fs/ntfs/bdev-io.c @@ -97,6 +97,8 @@ int ntfs_bdev_write(struct super_block *sb, void *buf, loff_t start, size_t size idx_end++; for (; idx < idx_end; idx++, from = 0) { + u32 len; + folio = read_mapping_folio(sb->s_bdev->bd_mapping, idx, NULL); if (IS_ERR(folio)) { ntfs_error(sb, "Unable to read %ld page", idx); @@ -105,9 +107,10 @@ int ntfs_bdev_write(struct super_block *sb, void *buf, loff_t start, size_t size offset = (loff_t)idx << PAGE_SHIFT; to = min_t(u32, end - offset, PAGE_SIZE); + len = to - from; - memcpy_to_folio(folio, from, buf + buf_off, to); - buf_off += to; + memcpy_to_folio(folio, from, buf + buf_off, len); + buf_off += len; folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_put(folio); diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c index a547bdcfa456..146e011c1a41 100644 --- a/fs/ntfs/index.c +++ b/fs/ntfs/index.c @@ -677,11 +677,11 @@ static int ntfs_ib_read(struct ntfs_index_context *icx, s64 vcn, struct index_bl static int ntfs_icx_parent_inc(struct ntfs_index_context *icx) { - icx->pindex++; - if (icx->pindex >= MAX_PARENT_VCN) { + if (icx->pindex >= MAX_PARENT_VCN - 1) { ntfs_error(icx->idx_ni->vol->sb, "Index is over %d level deep", MAX_PARENT_VCN); return -EOPNOTSUPP; } + icx->pindex++; return 0; } @@ -1970,6 +1970,7 @@ struct index_entry *ntfs_index_walk_down(struct index_entry *ie, struct ntfs_ind { struct index_entry *entry; struct index_block *ib; + int err; s64 vcn; entry = ie; @@ -1979,14 +1980,20 @@ struct index_entry *ntfs_index_walk_down(struct index_entry *ie, struct ntfs_ind ib = kvzalloc(ictx->block_size, GFP_NOFS); if (!ib) return ERR_PTR(-ENOMEM); - /* down from level zero */ + /* + * Descending from root index (level 0) to the first + * child level. is_in_root == true implies pindex == 0, + * so advance to level 1. + */ + ictx->pindex = 1; ictx->ir = NULL; ictx->ib = ib; - ictx->pindex = 1; ictx->is_in_root = false; } else { /* down from non-zero level */ - ictx->pindex++; + err = ntfs_icx_parent_inc(ictx); + if (err) + return ERR_PTR(err); } ictx->parent_pos[ictx->pindex] = 0; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 16890d411194..360bebd1ee3f 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2582,8 +2582,8 @@ int ntfs_inode_sync_filename(struct ntfs_inode *ni) mutex_lock_nested(&index_ni->mrec_lock, NTFS_INODE_MUTEX_PARENT); if (NInoBeingDeleted(ni)) { - iput(index_vi); mutex_unlock(&index_ni->mrec_lock); + iput(index_vi); continue; } @@ -2591,8 +2591,8 @@ int ntfs_inode_sync_filename(struct ntfs_inode *ni) if (!ictx) { ntfs_error(sb, "Failed to get index ctx, inode %llu", index_ni->mft_no); - iput(index_vi); mutex_unlock(&index_ni->mrec_lock); + iput(index_vi); continue; } @@ -2601,8 +2601,8 @@ int ntfs_inode_sync_filename(struct ntfs_inode *ni) ntfs_debug("Index lookup failed, inode %llu", index_ni->mft_no); ntfs_index_ctx_put(ictx); - iput(index_vi); mutex_unlock(&index_ni->mrec_lock); + iput(index_vi); continue; } /* Update flags and file size. */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 3f8d1640f1d5..d3f25d8e29f9 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -710,6 +710,9 @@ map_vcn: if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { vcn = rl->vcn; kvfree(empty_buf); + empty_buf = NULL; + kfree(ra); + ra = NULL; goto map_vcn; } /* If this run is not valid abort with an error. */ @@ -753,7 +756,7 @@ map_vcn: } while (start < end); } while ((++rl)->vcn < end_vcn); up_write(&log_ni->runlist.lock); - kfree(empty_buf); + kvfree(empty_buf); kfree(ra); truncate_inode_pages(log_vi->i_mapping, 0); /* Set the flag so we do not have to do it again on remount. */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 7d989267a82b..a7d10ee41b34 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -30,6 +30,8 @@ int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m, { struct attr_record *a; struct super_block *sb = vol->sb; + u16 attrs_offset; + u32 bytes_in_use; if (!ntfs_is_file_record(m->magic)) { ntfs_error(sb, "Record %llu has no FILE magic (0x%x)\n", @@ -65,7 +67,16 @@ int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m, goto err_out; } - a = (struct attr_record *)((char *)m + le16_to_cpu(m->attrs_offset)); + attrs_offset = le16_to_cpu(m->attrs_offset); + bytes_in_use = le32_to_cpu(m->bytes_in_use); + + if (attrs_offset > bytes_in_use || + bytes_in_use - attrs_offset < sizeof_field(struct attr_record, type)) { + ntfs_error(sb, "Record %llu has corrupt attribute offset\n", mft_no); + goto err_out; + } + + a = (struct attr_record *)((char *)m + attrs_offset); if ((char *)a < (char *)m || (char *)a > (char *)m + vol->mft_record_size) { ntfs_error(sb, "Record %llu is corrupt\n", mft_no); goto err_out; @@ -449,7 +460,7 @@ static void ntfs_bio_end_io(struct bio *bio) int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no, struct mft_record *m) { - u8 *kmirr = NULL; + u8 *kmirr; struct folio *folio; unsigned int folio_ofs, lcn_folio_off = 0; int err = 0; @@ -479,6 +490,7 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no, kmirr = kmap_local_folio(folio, 0) + folio_ofs; /* Copy the mst protected mft record to the mirror. */ memcpy(kmirr, m, vol->mft_record_size); + kunmap_local(kmirr); if (vol->cluster_size_bits > PAGE_SHIFT) { lcn_folio_off = folio->index << PAGE_SHIFT; @@ -490,20 +502,22 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no, NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) + lcn_folio_off + folio_ofs); - if (!bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) { + if (bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) + err = submit_bio_wait(bio); + else err = -EIO; - bio_put(bio); - goto unlock_folio; - } + bio_put(bio); - bio->bi_end_io = ntfs_bio_end_io; - submit_bio(bio); - /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* + * The in-memory mirror is now valid because we just memcpy()'d the + * mst-protected mft record into it. Mark the folio uptodate even on + * write error so a subsequent read_mapping_folio() does not refetch + * the stale on-disk mirror and overwrite this copy. The error is + * propagated to the caller via @err. + */ folio_mark_uptodate(folio); -unlock_folio: folio_unlock(folio); - kunmap_local(kmirr); folio_put(folio); if (likely(!err)) { ntfs_debug("Done."); @@ -588,20 +602,36 @@ int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int syn } /* Synchronize the mft mirror now if not @sync. */ - if (!sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); + if (!sync && ni->mft_no < vol->mftmirr_size) { + int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, + fixup_m); + if (unlikely(sub_err) && !err) + err = sub_err; + } - folio_get(folio); - bio->bi_private = folio; - bio->bi_end_io = ntfs_bio_end_io; - submit_bio(bio); + if (sync) { + int sub_err = submit_bio_wait(bio); + + bio_put(bio); + if (unlikely(sub_err) && !err) + err = sub_err; + } else { + folio_get(folio); + bio->bi_private = folio; + bio->bi_end_io = ntfs_bio_end_io; + submit_bio(bio); + } offset += vol->cluster_size; i++; } /* If @sync, now synchronize the mft mirror. */ - if (sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); + if (sync && ni->mft_no < vol->mftmirr_size) { + int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); + + if (unlikely(sub_err) && !err) + err = sub_err; + } kunmap_local(kaddr); if (unlikely(err)) { /* I/O error during writing. This is really bad! */ @@ -617,10 +647,10 @@ put_bio_out: bio_put(bio); err_out: /* - * Current state: all buffers are clean, unlocked, and uptodate. - * The caller should mark the base inode as bad so that no more i/o - * happens. ->drop_inode() will still be invoked so all extent inodes - * and other allocated memory will be freed. + * The caller should mark the base inode as bad so no more I/O + * happens. ->drop_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. ENOMEM is retried by + * redirtying the mft record below. */ if (err == -ENOMEM) { ntfs_error(vol->sb, @@ -833,7 +863,7 @@ static bool ntfs_may_write_mft_record(struct ntfs_volume *vol, const u64 mft_no, vi = igrab(mft_vi); WARN_ON(vi != mft_vi); } else { - vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na); + vi = find_inode_nowait(sb, na.mft_no, ntfs_test_inode_wb, &na); if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated) return false; } @@ -1034,7 +1064,7 @@ static s64 ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(struct ntfs_volume *vo b = ffz((unsigned long)*byte); if (b < 8 && b >= (bit & 7)) { ll = data_pos + (bit & ~7ull) + b; - if (unlikely(ll > (1ll << 32))) { + if (unlikely(ll >= (1ll << 32))) { folio_unlock(folio); kunmap_local(buf); folio_put(folio); @@ -2721,8 +2751,11 @@ static int ntfs_write_mft_block(struct folio *folio, struct writeback_control *w ntfs_debug("Entering for inode 0x%llx, attribute type 0x%x, folio index 0x%lx.", ni->mft_no, ni->type, folio->index); - if (!locked_nis || !ref_inos) + if (!locked_nis || !ref_inos) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); return -ENOMEM; + } /* We have to zero every time due to mmap-at-end-of-file. */ if (folio->index >= (i_size >> folio_shift(folio))) @@ -2840,9 +2873,13 @@ flush_bio: } prev_mft_ofs = mft_ofs; - if (mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, mft_no, + if (mft_no < vol->mftmirr_size) { + int sub_err = ntfs_sync_mft_mirror(vol, mft_no, (struct mft_record *)(kaddr + mft_ofs)); + + if (unlikely(sub_err) && !err) + err = sub_err; + } } else if (ref_inos[nr_ref_inos]) nr_ref_inos++; } diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 96c450e62efc..c4f82846c58c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -344,9 +344,9 @@ static int ntfs_sd_add_everyone(struct ntfs_inode *ni) sd_len = sizeof(struct security_descriptor_relative) + 2 * (sizeof(struct ntfs_sid) + 8) + sizeof(struct ntfs_acl) + sizeof(struct ntfs_ace) + 4; - sd = kmalloc(sd_len, GFP_NOFS); + sd = kzalloc(sd_len, GFP_NOFS); if (!sd) - return -1; + return -ENOMEM; sd->revision = 1; sd->control = SE_DACL_PRESENT | SE_SELF_RELATIVE; diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index da21dbeaaf66..e7de3d01257e 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -2056,10 +2056,11 @@ struct runlist_element *ntfs_rl_collapse_range(struct runlist_element *dst_rl, i * consists of holes. */ merge_cnt = 0; - i = new_1st_cnt == 0 ? 1 : new_1st_cnt; - if (ntfs_rle_lcn_contiguous(&new_rl[i - 1], &new_rl[i])) { - /* Merge right and left */ - s_rl = &new_rl[new_1st_cnt - 1]; + if (new_1st_cnt > 0 && + ntfs_rle_lcn_contiguous(&new_rl[new_1st_cnt - 1], + &new_rl[new_1st_cnt])) { + /* Merge right and left. */ + s_rl = &new_rl[new_1st_cnt - 1]; s_rl->length += s_rl[1].length; merge_cnt = 1; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 22dc7865eca7..9e321cc2febe 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -413,6 +413,7 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label) { struct ntfs_inode *vol_ni = NTFS_I(vol->vol_ino); struct ntfs_attr_search_ctx *ctx; + char *new_label; __le16 *uname; int uname_len, ret; @@ -425,7 +426,7 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label) return uname_len; } - if (uname_len > NTFS_MAX_LABEL_LEN) { + if (uname_len > NTFS_MAX_LABEL_LEN) { ntfs_error(vol->sb, "Volume label is too long (max %d characters).", NTFS_MAX_LABEL_LEN); @@ -433,11 +434,22 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label) return -EINVAL; } + /* + * Allocate the in-memory label copy up front. If kstrdup() fails we + * bail out before touching on-disk metadata, so the in-memory label + * and the on-disk label stay in sync. + */ + new_label = kstrdup(label, GFP_KERNEL); + if (!new_label) { + kvfree(uname); + return -ENOMEM; + } + mutex_lock(&vol_ni->mrec_lock); ctx = ntfs_attr_get_search_ctx(vol_ni, NULL); if (!ctx) { ret = -ENOMEM; - goto out; + goto out; } if (!ntfs_attr_lookup(AT_VOLUME_NAME, NULL, 0, 0, 0, NULL, 0, @@ -450,12 +462,14 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label) out: mutex_unlock(&vol_ni->mrec_lock); kvfree(uname); - mark_inode_dirty_sync(vol->vol_ino); if (ret >= 0) { kfree(vol->volume_label); - vol->volume_label = kstrdup(label, GFP_KERNEL); + vol->volume_label = new_label; + mark_inode_dirty_sync(vol->vol_ino); ret = 0; + } else { + kfree(new_label); } return ret; } @@ -979,6 +993,13 @@ mft_unmap_out: ntfs_is_baad_recordp((__le32 *)kmirr)) bytes = vol->mft_record_size; } + /* Compare the two records. */ + if (memcmp(kmft, kmirr, bytes)) { + ntfs_error(sb, + "$MFT and $MFTMirr record %i do not match. Run chkdsk.", + i); + goto mm_unmap_out; + } kmft += vol->mft_record_size; kmirr += vol->mft_record_size; } while (++i < vol->mftmirr_size); @@ -1671,7 +1692,7 @@ iput_attrdef_err_out: iput_upcase_err_out: vol->upcase_len = 0; mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { + if (vol->upcase && vol->upcase == default_upcase) { ntfs_nr_upcase_users--; vol->upcase = NULL; } @@ -1701,7 +1722,7 @@ static void ntfs_volume_free(struct ntfs_volume *vol) * the number of upcase users if we are a user. */ mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { + if (vol->upcase && vol->upcase == default_upcase) { ntfs_nr_upcase_users--; vol->upcase = NULL; } @@ -2494,7 +2515,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } vol->upcase_len = 0; mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { + if (vol->upcase && vol->upcase == default_upcase) { ntfs_nr_upcase_users--; vol->upcase = NULL; } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index bec5475de094..75e65e72c2d6 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -362,7 +362,7 @@ static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, __orangefs_setattr(dir, &iattr); out: op_release(new_op); - return ERR_PTR(ret); + return ret ? ERR_PTR(ret) : NULL; } static int orangefs_rename(struct mnt_idmap *idmap, diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 1dcc75b3a90f..e7fe29cb6028 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -838,15 +838,14 @@ static int ovl_iterate_merged(struct file *file, struct dir_context *ctx) struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_cache_entry *p; - int err = 0; + int err; if (!od->cache) { struct ovl_dir_cache *cache; cache = ovl_cache_get(dentry); - err = PTR_ERR(cache); if (IS_ERR(cache)) - return err; + return PTR_ERR(cache); od->cache = cache; ovl_seek_cursor(od, ctx->pos); @@ -869,7 +868,7 @@ static int ovl_iterate_merged(struct file *file, struct dir_context *ctx) od->cursor = p->l_node.next; ctx->pos++; } - return err; + return 0; } static bool ovl_need_adjust_d_ino(struct file *file) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7b86a6bac644..b41f4788e4f0 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1354,7 +1354,7 @@ int ovl_ensure_verity_loaded(const struct path *datapath) struct inode *inode = d_inode(datapath->dentry); struct file *filp; - if (!fsverity_active(inode) && IS_VERITY(inode)) { + if (IS_VERITY(inode) && fsverity_get_info(inode) == NULL) { /* * If this inode was not yet opened, the verity info hasn't been * loaded yet, so we need to do that here to force it into memory. diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 135fb42f6936..56bbaffe4b44 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -132,16 +132,16 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) struct qnx6_dir_entry *de; struct folio *folio; char *kaddr = qnx6_get_folio(inode, n, &folio); - char *limit; + struct qnx6_dir_entry *limit; if (IS_ERR(kaddr)) { pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_SHIFT; return PTR_ERR(kaddr); } - de = (struct qnx6_dir_entry *)(kaddr + offset); - limit = kaddr + last_entry(inode, n); - for (; (char *)de < limit; de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { + de = (struct qnx6_dir_entry *)kaddr + offset; + limit = (struct qnx6_dir_entry *)kaddr + last_entry(inode, n); + for (; de < limit; de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { int size = de->de_size; u32 no_inode = fs32_to_cpu(sbi, de->de_inode); diff --git a/fs/select.c b/fs/select.c index 75978b18f48f..bf71c9838dfe 100644 --- a/fs/select.c +++ b/fs/select.c @@ -708,6 +708,17 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; + /* + * Reject negative components before normalisation. The seconds + * sum below is performed in signed long and a crafted negative + * timeval can wrap to a positive value that passes + * timespec64_valid() and turns into an effectively-infinite + * deadline via timespec64_add_safe()'s saturation, instead of + * the -EINVAL POSIX requires for negative timeouts. + */ + if (tv.tv_sec < 0 || tv.tv_usec < 0) + return -EINVAL; + to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 02791ec3c5a1..88d5e9a32f28 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -286,6 +286,14 @@ replay_again: &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto oshr_free; + + if (oplock != SMB2_OPLOCK_LEVEL_II) { + rc = -EINVAL; + cifs_dbg(FYI, "%s: Oplock level %d not suitable for cached directory\n", + __func__, oplock); + goto oshr_free; + } + smb2_set_next_command(tcon, &rqst[0]); memset(&qi_iov, 0, sizeof(qi_iov)); diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 3a41bbada04c..44c407275680 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -8,6 +8,7 @@ */ #include <linux/list.h> +#include <linux/cred.h> #include <linux/slab.h> #include <linux/string.h> #include <keys/user-type.h> @@ -40,12 +41,27 @@ cifs_spnego_key_destroy(struct key *key) kfree(key->payload.data[0]); } +static int +cifs_spnego_key_vet_description(const char *description) +{ + /* + * cifs.spnego descriptions are authority-bearing inputs to cifs.upcall. + * They are only valid when produced by CIFS while using the private + * spnego_cred installed below. Do not let userspace create this type + * of key through request_key(2)/add_key(2), since the helper treats + * pid/uid/creduid/upcall_target as kernel-originating fields. + */ + if (current_cred() != spnego_cred) + return -EPERM; + return 0; +} /* * keytype for CIFS spnego keys */ struct key_type cifs_spnego_key_type = { .name = "cifs.spnego", + .vet_description = cifs_spnego_key_vet_description, .instantiate = cifs_spnego_key_instantiate, .destroy = cifs_spnego_key_destroy, .describe = user_describe, diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index ec5d47779304..786dbbc43c5b 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -1264,6 +1264,17 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) return 0; } +static bool dacl_offset_valid(unsigned int acl_len, __u32 dacloffset) +{ + if (acl_len < sizeof(struct smb_acl)) + return false; + + if (dacloffset < sizeof(struct smb_ntsd)) + return false; + + return dacloffset <= acl_len - sizeof(struct smb_acl); +} + /* Convert CIFS ACL to POSIX form */ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, @@ -1284,7 +1295,6 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, group_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), le32_to_cpu(pntsd->gsidoffset), @@ -1315,11 +1325,18 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, return rc; } - if (dacloffset) + if (dacloffset) { + if (!dacl_offset_valid(acl_len, dacloffset)) { + cifs_dbg(VFS, "Server returned illegal DACL offset\n"); + return -EINVAL; + } + + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr, get_mode_from_special_sid); - else + } else { cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */ + } return rc; } @@ -1342,6 +1359,11 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { + if (!dacl_offset_valid(secdesclen, dacloffset)) { + cifs_dbg(VFS, "Server returned illegal DACL offset\n"); + return -EINVAL; + } + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); rc = validate_dacl(dacl_ptr, end_of_acl); if (rc) @@ -1710,6 +1732,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, nsecdesclen = sizeof(struct smb_ntsd) + (sizeof(struct smb_sid) * 2); dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { + if (!dacl_offset_valid(secdesclen, dacloffset)) { + cifs_dbg(VFS, "Server returned illegal DACL offset\n"); + rc = -EINVAL; + goto id_mode_to_cifs_acl_exit; + } + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); rc = validate_dacl(dacl_ptr, (char *)pntsd + secdesclen); if (rc) { @@ -1732,7 +1760,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, * descriptor parameters, and security descriptor itself */ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); - pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); + pnntsd = kzalloc(nsecdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); cifs_put_tlink(tlink); @@ -1752,6 +1780,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } +id_mode_to_cifs_acl_exit: cifs_put_tlink(tlink); kfree(pnntsd); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9f76b0347fa9..ce23924f01b3 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -306,6 +306,8 @@ static void cifs_kill_sb(struct super_block *sb) /* Wait for all pending oplock breaks to complete */ flush_workqueue(cifsoplockd_wq); + /* Wait for all opened files to release */ + flush_workqueue(deferredclose_wq); /* finally release root dentry */ dput(cifs_sb->root); @@ -434,7 +436,8 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->netfs.remote_i_size = 0; + cifs_inode->netfs._remote_i_size = 0; + cifs_inode->netfs._zero_point = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -1303,7 +1306,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, struct cifsFileInfo *smb_file_src = src_file->private_data; struct cifsFileInfo *smb_file_target = dst_file->private_data; struct cifs_tcon *target_tcon, *src_tcon; - unsigned long long destend, fstart, fend, old_size, new_size; + unsigned long long i_size, new_size; + unsigned long long destend, fstart, fend; unsigned int xid; int rc; @@ -1347,7 +1351,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->netfs.remote_i_size < off + len) { + if (netfs_read_remote_i_size(src_inode) < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1368,22 +1372,24 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); if (rc) goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - old_size = target_cifsi->netfs.remote_i_size; + + spin_lock(&target_inode->i_lock); + if (fend > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, fend + 1); + i_size = target_inode->i_size; + spin_unlock(&target_inode->i_lock); /* Discard all the folios that overlap the destination region. */ cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend); truncate_inode_pages_range(&target_inode->i_data, fstart, fend); - fscache_invalidate(cifs_inode_cookie(target_inode), NULL, - i_size_read(target_inode), 0); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size, 0); rc = -EOPNOTSUPP; if (target_tcon->ses->server->ops->duplicate_extents) { rc = target_tcon->ses->server->ops->duplicate_extents(xid, smb_file_src, smb_file_target, off, len, destoff); - if (rc == 0 && new_size > old_size) { + if (rc == 0 && new_size > i_size) { truncate_setsize(target_inode, new_size); fscache_resize_cookie(cifs_inode_cookie(target_inode), new_size); @@ -1402,8 +1408,12 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, rc = -EINVAL; } } - if (rc == 0 && new_size > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = new_size; + if (rc == 0) { + spin_lock(&target_inode->i_lock); + if (new_size > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, new_size); + spin_unlock(&target_inode->i_lock); + } } /* force revalidate of size and timestamps of target file now @@ -1474,7 +1484,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->netfs.remote_i_size < off + len) { + if (netfs_read_remote_i_size(src_inode) < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1502,8 +1512,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, fscache_resize_cookie(cifs_inode_cookie(target_inode), i_size_read(target_inode)); } - if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = destoff + rc; + if (rc > 0) { + spin_lock(&target_inode->i_lock); + if (destoff + rc > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, destoff + rc); + spin_unlock(&target_inode->i_lock); + } } file_accessed(src_file); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 3990a9012264..9e27bfa7376b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1465,6 +1465,7 @@ cifs_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct cifs_io_subrequest *rdata = mid->callback_data; struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); + struct inode *inode = &ictx->inode; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1, .rq_iter = rdata->subreq.io_iter }; @@ -1538,7 +1539,7 @@ do_retry: } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && - rdata->subreq.start + trans >= ictx->remote_i_size) { + rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else if (rdata->got_bytes > 0) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 664a2c223089..b60344125f27 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2517,18 +2517,23 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result) { struct netfs_io_request *wreq = wdata->rreq; - struct netfs_inode *ictx = netfs_inode(wreq->inode); + struct inode *inode = wreq->inode; + struct netfs_inode *ictx = netfs_inode(inode); loff_t wrend; if (result > 0) { + spin_lock(&inode->i_lock); + wrend = wdata->subreq.start + wdata->subreq.transferred + result; - if (wrend > ictx->zero_point && + if (wrend > ictx->_zero_point && (wdata->rreq->origin == NETFS_UNBUFFERED_WRITE || wdata->rreq->origin == NETFS_DIO_WRITE)) - ictx->zero_point = wrend; - if (wrend > ictx->remote_i_size) + netfs_write_zero_point(inode, wrend); + if (wrend > ictx->_remote_i_size) netfs_resize_file(ictx, wrend, true); + + spin_unlock(&inode->i_lock); } netfs_write_subrequest_terminated(&wdata->subreq, result); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b63ec7ab6e51..2f86158f85d7 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -420,7 +420,7 @@ static int parse_symlink_flavor(struct fs_context *fc, char *value, #define DUP_CTX_STR(field) \ do { \ if (ctx->field) { \ - new_ctx->field = kstrdup(ctx->field, GFP_ATOMIC); \ + new_ctx->field = kstrdup(ctx->field, GFP_KERNEL); \ if (new_ctx->field == NULL) { \ smb3_cleanup_fs_context_contents(new_ctx); \ return -ENOMEM; \ @@ -736,7 +736,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data); static int smb3_get_tree(struct fs_context *fc); -static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels); +static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels); static int smb3_reconfigure(struct fs_context *fc); static const struct fs_context_operations smb3_fs_context_ops = { @@ -1010,25 +1010,34 @@ do { \ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { + char *password = NULL, *password2 = NULL; + if (ses->password && cifs_sb->ctx->password && strcmp(ses->password, cifs_sb->ctx->password)) { - kfree_sensitive(cifs_sb->ctx->password); - cifs_sb->ctx->password = kstrdup(ses->password, GFP_KERNEL); - if (!cifs_sb->ctx->password) + password = kstrdup(ses->password, GFP_KERNEL); + if (!password) return -ENOMEM; } if (ses->password2 && cifs_sb->ctx->password2 && strcmp(ses->password2, cifs_sb->ctx->password2)) { - kfree_sensitive(cifs_sb->ctx->password2); - cifs_sb->ctx->password2 = kstrdup(ses->password2, GFP_KERNEL); - if (!cifs_sb->ctx->password2) { - kfree_sensitive(cifs_sb->ctx->password); - cifs_sb->ctx->password = NULL; + password2 = kstrdup(ses->password2, GFP_KERNEL); + if (!password2) { + kfree_sensitive(password); return -ENOMEM; } } + + if (password) { + kfree_sensitive(cifs_sb->ctx->password); + cifs_sb->ctx->password = password; + } + if (password2) { + kfree_sensitive(cifs_sb->ctx->password2); + cifs_sb->ctx->password2 = password2; + } + return 0; } @@ -1041,7 +1050,7 @@ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_se * with the session's channel lock. This should be called whenever the maximum * allowed channels for a session changes (e.g., after a remount or reconfigure). */ -static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels) +static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels) { spin_lock(&ses->chan_lock); ses->chan_max = max_channels; @@ -1051,12 +1060,15 @@ static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channe static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); + struct smb3_fs_context *new_ctx = NULL; + struct smb3_fs_context *old_ctx = NULL; struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; unsigned int rsize = ctx->rsize, wsize = ctx->wsize; char *new_password = NULL, *new_password2 = NULL; bool need_recon = false; + bool need_mchan_update; int rc; if (ses->expired_pwd) @@ -1066,6 +1078,16 @@ static int smb3_reconfigure(struct fs_context *fc) if (rc) return rc; + old_ctx = kzalloc_obj(*old_ctx); + if (!old_ctx) + return -ENOMEM; + + rc = smb3_fs_context_dup(old_ctx, cifs_sb->ctx); + if (rc) { + kfree(old_ctx); + return rc; + } + /* * We can not change UNC/username/password/domainname/ * workstation_name/nodename/iocharset @@ -1075,16 +1097,22 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); + STEAL_STRING(cifs_sb, ctx, domainname); + STEAL_STRING(cifs_sb, ctx, nodename); + STEAL_STRING(cifs_sb, ctx, iocharset); - if (need_recon == false) + if (!need_recon) { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); - else { + } else { if (ctx->password) { new_password = kstrdup(ctx->password, GFP_KERNEL); - if (!new_password) - return -ENOMEM; - } else + if (!new_password) { + rc = -ENOMEM; + goto restore_ctx; + } + } else { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); + } } /* @@ -1094,11 +1122,29 @@ static int smb3_reconfigure(struct fs_context *fc) if (ctx->password2) { new_password2 = kstrdup(ctx->password2, GFP_KERNEL); if (!new_password2) { - kfree_sensitive(new_password); - return -ENOMEM; + rc = -ENOMEM; + goto restore_ctx; } - } else + } else { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password2); + } + + /* if rsize or wsize not passed in on remount, use previous values */ + ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; + ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; + + new_ctx = kzalloc_obj(*new_ctx); + if (!new_ctx) { + rc = -ENOMEM; + goto restore_ctx; + } + + rc = smb3_fs_context_dup(new_ctx, ctx); + if (rc) + goto restore_ctx; + + need_mchan_update = ctx->multichannel != cifs_sb->ctx->multichannel || + ctx->max_channels != cifs_sb->ctx->max_channels; /* * we may update the passwords in the ses struct below. Make sure we do @@ -1109,54 +1155,55 @@ static int smb3_reconfigure(struct fs_context *fc) /* * smb2_reconnect may swap password and password2 in case session setup * failed. First get ctx passwords in sync with ses passwords. It should - * be okay to do this even if this function were to return an error at a - * later stage + * be done before committing new passwords. */ rc = smb3_sync_session_ctx_passwords(cifs_sb, ses); if (rc) { mutex_unlock(&ses->session_mutex); - kfree_sensitive(new_password); - kfree_sensitive(new_password2); - return rc; + goto cleanup_new_ctx; + } + + /* + * If multichannel or max_channels has changed, update the session's channels accordingly. + * This may add or remove channels to match the new configuration. + */ + if (need_mchan_update) { + /* Prevent concurrent scaling operations */ + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { + spin_unlock(&ses->ses_lock); + mutex_unlock(&ses->session_mutex); + rc = -EINVAL; + goto cleanup_new_ctx; + } + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); } /* - * now that allocations for passwords are done, commit them + * Commit session passwords before any channel work so newly added + * channels authenticate with the new credentials. */ if (new_password) { kfree_sensitive(ses->password); ses->password = new_password; + new_password = NULL; } if (new_password2) { kfree_sensitive(ses->password2); ses->password2 = new_password2; + new_password2 = NULL; } - /* - * If multichannel or max_channels has changed, update the session's channels accordingly. - * This may add or remove channels to match the new configuration. - */ - if ((ctx->multichannel != cifs_sb->ctx->multichannel) || - (ctx->max_channels != cifs_sb->ctx->max_channels)) { - + if (need_mchan_update) { /* Synchronize ses->chan_max with the new mount context */ smb3_sync_ses_chan_max(ses, ctx->max_channels); - /* Now update the session's channels to match the new configuration */ - /* Prevent concurrent scaling operations */ - spin_lock(&ses->ses_lock); - if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { - spin_unlock(&ses->ses_lock); - mutex_unlock(&ses->session_mutex); - return -EINVAL; - } - ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; - spin_unlock(&ses->ses_lock); mutex_unlock(&ses->session_mutex); - rc = smb3_update_ses_channels(ses, ses->server, - false /* from_reconnect */, - false /* disable_mchan */); + smb3_update_ses_channels(ses, ses->server, + false /* from_reconnect */, + false /* disable_mchan */); /* Clear scaling flag after operation */ spin_lock(&ses->ses_lock); @@ -1166,16 +1213,12 @@ static int smb3_reconfigure(struct fs_context *fc) mutex_unlock(&ses->session_mutex); } - STEAL_STRING(cifs_sb, ctx, domainname); - STEAL_STRING(cifs_sb, ctx, nodename); - STEAL_STRING(cifs_sb, ctx, iocharset); - - /* if rsize or wsize not passed in on remount, use previous values */ - ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; - ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; - smb3_cleanup_fs_context_contents(cifs_sb->ctx); - rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); + memcpy(cifs_sb->ctx, new_ctx, sizeof(*new_ctx)); + kfree(new_ctx); + new_ctx = NULL; + smb3_cleanup_fs_context(old_ctx); + old_ctx = NULL; smb3_update_mnt_flags(cifs_sb); #ifdef CONFIG_CIFS_DFS_UPCALL if (!rc) @@ -1183,6 +1226,18 @@ static int smb3_reconfigure(struct fs_context *fc) #endif return rc; + +cleanup_new_ctx: + smb3_cleanup_fs_context_contents(new_ctx); +restore_ctx: + kfree(new_ctx); + kfree_sensitive(new_password); + kfree_sensitive(new_password2); + smb3_cleanup_fs_context_contents(cifs_sb->ctx); + memcpy(cifs_sb->ctx, old_ctx, sizeof(*old_ctx)); + kfree(old_ctx); + + return rc; } static int smb3_fs_context_parse_param(struct fs_context *fc, diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 16a5310155d5..9472c0a6c187 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -119,7 +119,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->netfs.remote_i_size == fattr->cf_eof) { + netfs_read_remote_i_size(inode) == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -173,12 +173,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } - if (inode_state_read_once(inode) & I_NEW) - CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; - cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); + if (inode_state_read_once(inode) & I_NEW) + netfs_write_zero_point(inode, fattr->cf_eof); + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode); fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); @@ -212,7 +212,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->netfs.remote_i_size = fattr->cf_eof; + netfs_write_remote_i_size(inode, fattr->cf_eof); /* * Can't safely change the file size here if the client is writing to * it due to potential races. @@ -2772,7 +2772,9 @@ cifs_revalidate_mapping(struct inode *inode) if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_RW_CACHE) goto skip_invalidate; - cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, netfs_inode(inode)->_remote_i_size); + spin_unlock(&inode->i_lock); rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX); if (rc) { cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n", diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 9afab3237e54..17408bb8ab65 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -296,7 +296,7 @@ search_end: break; case SMB2_ENCRYPTION_AES256_CCM: case SMB2_ENCRYPTION_AES256_GCM: - out.session_key_length = CIFS_SESS_KEY_SIZE; + out.session_key_length = ses->auth_key.len; out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE; break; default: diff --git a/fs/smb/client/netlink.c b/fs/smb/client/netlink.c index 147d9409252c..0dd10913c37a 100644 --- a/fs/smb/client/netlink.c +++ b/fs/smb/client/netlink.c @@ -33,13 +33,17 @@ static const struct nla_policy cifs_genl_policy[CIFS_GENL_ATTR_MAX + 1] = { static const struct genl_ops cifs_genl_ops[] = { { .cmd = CIFS_GENL_CMD_SWN_NOTIFY, + .flags = GENL_ADMIN_PERM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cifs_swn_notify, }, }; static const struct genl_multicast_group cifs_genl_mcgrps[] = { - [CIFS_GENL_MCGRP_SWN] = { .name = CIFS_GENL_MCGRP_SWN_NAME }, + [CIFS_GENL_MCGRP_SWN] = { + .name = CIFS_GENL_MCGRP_SWN_NAME, + .flags = GENL_MCAST_CAP_NET_ADMIN, + }, }; struct genl_family cifs_genl_family = { diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index be22bbc4a65a..e860fa08b5e3 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -143,7 +143,8 @@ retry: fattr->cf_rdev = inode->i_rdev; fattr->cf_uid = inode->i_uid; fattr->cf_gid = inode->i_gid; - fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; + fattr->cf_eof = + netfs_read_remote_i_size(inode); fattr->cf_symlink_target = NULL; } else { CIFS_I(inode)->time = 0; diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index b292aa94a593..6860eff31693 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -49,6 +49,9 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) __func__, le32_to_cpu(p->ErrorId)); len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + if (len > end - ((u8 *)p + sizeof(*p))) + return ERR_PTR(-EINVAL); + p = (struct smb2_error_context_rsp *)(p->ErrorContextData + len); } } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 286912616c73..6c9c229b91f6 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -111,7 +111,7 @@ static int check_wsl_eas(struct kvec *rsp_iov) u32 outlen, next; u16 vlen; u8 nlen; - u8 *end; + u8 *ea_end, *iov_end; outlen = le32_to_cpu(rsp->OutputBufferLength); if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE || @@ -120,15 +120,19 @@ static int check_wsl_eas(struct kvec *rsp_iov) ea = (void *)((u8 *)rsp_iov->iov_base + le16_to_cpu(rsp->OutputBufferOffset)); - end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len; + ea_end = (u8 *)ea + outlen; + iov_end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len; + if (ea_end > iov_end) + return -EINVAL; + for (;;) { - if ((u8 *)ea > end - sizeof(*ea)) + if ((u8 *)ea > ea_end - sizeof(*ea)) return -EINVAL; nlen = ea->ea_name_length; vlen = le16_to_cpu(ea->ea_value_length); if (nlen != SMB2_WSL_XATTR_NAME_LEN || - (u8 *)ea->ea_data + nlen + 1 + vlen > end) + (u8 *)ea->ea_data + nlen + 1 + vlen > ea_end) return -EINVAL; switch (vlen) { diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 973fce3c959c..2a7355ce1a07 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -241,7 +241,8 @@ smb2_check_message(char *buf, unsigned int pdu_len, unsigned int len, if (len != calc_len) { /* create failed on symlink */ if (command == SMB2_CREATE_HE && - shdr->Status == STATUS_STOPPED_ON_SYMLINK) + shdr->Status == STATUS_STOPPED_ON_SYMLINK && + len > calc_len) return 0; /* Windows 7 server returns 24 bytes more */ if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e6cb9b144530..d4875f9532b4 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3402,8 +3402,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct netfs_inode *ictx = netfs_inode(inode); - unsigned long long i_size, new_size, remote_size; + unsigned long long i_size, new_size, remote_i_size, zero_point; long rc; unsigned int xid; @@ -3414,9 +3413,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, filemap_invalidate_lock(inode->i_mapping); - i_size = i_size_read(inode); - remote_size = ictx->remote_i_size; - if (offset + len >= remote_size && offset < i_size) { + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + if (offset + len >= remote_i_size && offset < i_size) { unsigned long long top = umin(offset + len, i_size); rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); @@ -3449,9 +3447,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, new_size, true); - if (offset < cifsi->netfs.zero_point) - cifsi->netfs.zero_point = offset; + if (offset < cifsi->netfs._zero_point) + netfs_write_zero_point(inode, offset); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3474,7 +3474,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; - unsigned long long end = offset + len, i_size, remote_i_size; + unsigned long long end = offset + len, i_size, remote_i_size, zero_point; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3516,14 +3516,17 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, * that we locally hole-punch the tail of the dirty data, the proposed * EOF update will end up in the wrong place. */ - i_size = i_size_read(inode); - remote_i_size = netfs_inode(inode)->remote_i_size; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + if (end > remote_i_size && i_size > remote_i_size) { unsigned long long extend_to = umin(end, i_size); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, extend_to); - if (rc >= 0) - netfs_inode(inode)->remote_i_size = extend_to; + if (rc >= 0) { + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, extend_to); + spin_unlock(&inode->i_lock); + } } unlock: @@ -3787,7 +3790,6 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct netfs_inode *ictx = &cifsi->netfs; loff_t old_eof, new_eof; xid = get_xid(); @@ -3805,7 +3807,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); - ictx->zero_point = old_eof; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, old_eof); + spin_unlock(&inode->i_lock); netfs_wait_for_outstanding_io(inode); rc = smb2_copychunk_range(xid, cfile, cfile, off + len, @@ -3822,8 +3826,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, rc = 0; truncate_setsize(inode, new_eof); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, new_eof, true); - ictx->zero_point = new_eof; + netfs_write_zero_point(inode, new_eof); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); @@ -3866,13 +3872,17 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_setsize(inode, new_eof); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); if (rc < 0) goto out_2; - cifsi->netfs.zero_point = new_eof; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, new_eof); + spin_unlock(&inode->i_lock); rc = smb3_zero_data(file, tcon, off, len, xid); if (rc < 0) @@ -4696,9 +4706,15 @@ cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size, { for (; folioq; folioq = folioq->next) { for (int s = 0; s < folioq_count(folioq); s++) { - struct folio *folio = folioq_folio(folioq, s); - size_t fsize = folio_size(folio); - size_t n, len = umin(fsize - skip, data_size); + struct folio *folio; + size_t fsize, n, len; + + if (data_size == 0) + return 0; + + folio = folioq_folio(folioq, s); + fsize = folio_size(folio); + len = umin(fsize - skip, data_size); n = copy_folio_to_iter(folio, skip, len, iter); if (n != len) { @@ -4711,6 +4727,12 @@ cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size, } } + if (data_size != 0) { + cifs_dbg(VFS, "%s: short copy, %zu bytes missing\n", + __func__, data_size); + return smb_EIO2(smb_eio_trace_rx_copy_to_iter, 0, data_size); + } + return 0; } @@ -4721,6 +4743,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, { unsigned int data_offset; unsigned int data_len; + unsigned int end_off; unsigned int cur_off; unsigned int cur_page_idx; unsigned int pad_len; @@ -4825,7 +4848,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } /* Copy the data to the output I/O iterator. */ - rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len, + rdata->result = cifs_copy_folioq_to_iter(buffer, data_len, cur_off, &rdata->subreq.io_iter); if (rdata->result != 0) { if (is_offloaded) @@ -4834,9 +4857,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, dequeue_mid(server, mid, rdata->result); return 0; } - rdata->got_bytes = buffer_len; + rdata->got_bytes = data_len; - } else if (buf_len >= data_offset + data_len) { + } else if (!check_add_overflow(data_offset, data_len, &end_off) && + buf_len >= end_off) { /* read response payload is in buf */ WARN_ONCE(buffer, "read data can be either in buf or in buffer"); copied = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index cb61051f9af3..fbeb2156ddb6 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1713,17 +1713,30 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) is_binding = (ses->ses_status == SES_GOOD); spin_unlock(&ses->ses_lock); + /* + * Per MS-SMB2 3.2.5.3, Session.SessionKey is the first 16 bytes of the + * GSS cryptographic key, right-padded with zero bytes if shorter. + * Allocate at least SMB2_NTLMV2_SESSKEY_SIZE bytes (zeroed) so the KDF + * input buffer is always valid for HMAC-SHA256 even with deprecated + * Kerberos enctypes that return a short session key. + */ + if (unlikely(msg->sesskey_len < SMB2_NTLMV2_SESSKEY_SIZE)) + cifs_dbg(VFS, + "short GSS session key (%u bytes); zero-padding per MS-SMB2 3.2.5.3\n", + msg->sesskey_len); + kfree_sensitive(ses->auth_key.response); - ses->auth_key.response = kmemdup(msg->data, - msg->sesskey_len, - GFP_KERNEL); + ses->auth_key.len = max_t(unsigned int, msg->sesskey_len, + SMB2_NTLMV2_SESSKEY_SIZE); + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); if (!ses->auth_key.response) { cifs_dbg(VFS, "%s: can't allocate (%u bytes) memory\n", - __func__, msg->sesskey_len); + __func__, ses->auth_key.len); + ses->auth_key.len = 0; rc = -ENOMEM; goto out_put_spnego_key; } - ses->auth_key.len = msg->sesskey_len; + memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; sess_data->iov[1].iov_len = msg->secblob_len; @@ -4595,6 +4608,7 @@ smb2_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct smb2_hdr *shdr = (struct smb2_hdr *)rdata->iov[0].iov_base; + struct inode *inode = &ictx->inode; struct cifs_credits credits = { .value = 0, .instance = 0, @@ -4708,7 +4722,7 @@ do_retry: } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && - rdata->subreq.start + trans >= ictx->remote_i_size) { + rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) { __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } @@ -4941,7 +4955,7 @@ smb2_writev_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) unsigned int rreq_debug_id = wdata->rreq->debug_id; unsigned int subreq_debug_index = wdata->subreq.debug_index; ssize_t result = 0; - size_t written; + size_t written = 0; WARN_ONCE(wdata->server != server, "wdata server %p != mid server %p", diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 41009039b4cb..1143ee52470a 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -169,7 +169,9 @@ smb2_find_smb_sess_tcon_unlocked(struct cifs_ses *ses, __u32 tid) list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != tid) continue; + spin_lock(&tcon->tc_lock); ++tcon->tc_count; + spin_unlock(&tcon->tc_lock); trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count, netfs_trace_tcon_ref_get_find_sess_tcon); return tcon; @@ -251,7 +253,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) } static void generate_key(struct cifs_ses *ses, struct kvec label, - struct kvec context, __u8 *key, unsigned int key_size) + struct kvec context, __u8 *key, unsigned int key_size, + unsigned int full_key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -265,7 +268,7 @@ static void generate_key(struct cifs_ses *ses, struct kvec label, memset(key, 0x0, key_size); hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response, - SMB2_NTLMV2_SESSKEY_SIZE); + full_key_size); hmac_sha256_update(&hmac_ctx, i, 4); hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len); hmac_sha256_update(&hmac_ctx, &zero, 1); @@ -298,6 +301,7 @@ generate_smb3signingkey(struct cifs_ses *ses, struct TCP_Server_Info *server, const struct derivation_triplet *ptriplet) { + unsigned int full_key_size = SMB2_NTLMV2_SESSKEY_SIZE; bool is_binding = false; int chan_index = 0; @@ -330,12 +334,24 @@ generate_smb3signingkey(struct cifs_ses *ses, if (is_binding) { generate_key(ses, ptriplet->signing.label, ptriplet->signing.context, - ses->chans[chan_index].signkey, - SMB3_SIGN_KEY_SIZE); + ses->chans[chan_index].signkey, SMB3_SIGN_KEY_SIZE, + SMB2_NTLMV2_SESSKEY_SIZE); } else { generate_key(ses, ptriplet->signing.label, - ptriplet->signing.context, - ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE, SMB2_NTLMV2_SESSKEY_SIZE); + + /* + * Per MS-SMB2 3.2.5.3.1, signing key always uses Session.SessionKey + * (first 16 bytes). Encryption/decryption keys use + * Session.FullSessionKey when dialect is 3.1.1 and cipher is + * AES-256-CCM or AES-256-GCM, otherwise Session.SessionKey. + */ + + if (server->dialect == SMB311_PROT_ID && + (server->cipher_type == SMB2_ENCRYPTION_AES256_CCM || + server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + full_key_size = ses->auth_key.len; /* safe to access primary channel, since it will never go away */ spin_lock(&ses->chan_lock); @@ -345,10 +361,13 @@ generate_smb3signingkey(struct cifs_ses *ses, generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, - ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE); + ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE, + full_key_size); + generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, - ses->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE); + ses->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE, + full_key_size); } #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS @@ -361,7 +380,7 @@ generate_smb3signingkey(struct cifs_ses *ses, &ses->Suid); cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type); cifs_dbg(VFS, "Session Key %*ph\n", - SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response); + (int)ses->auth_key.len, ses->auth_key.response); cifs_dbg(VFS, "Signing Key %*ph\n", SMB3_SIGN_KEY_SIZE, ses->smb3signingkey); if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 75f9f91a7ec9..563ef488a225 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -9,7 +9,6 @@ #include "cifs_debug.h" #include "cifsproto.h" #include "smb2proto.h" -#include "../smbdirect/public.h" /* Port numbers for SMBD transport */ #define SMB_PORT 445 @@ -558,3 +557,5 @@ void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m) server->rdma_readwrite_threshold, m); } + +MODULE_IMPORT_NS("SMBDIRECT"); diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 287ac849213d..be205ec02077 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -12,7 +12,7 @@ #include "cifsglob.h" -#include "../smbdirect/smbdirect.h" +#include <linux/smbdirect.h> extern int rdma_readwrite_threshold; extern int smbd_max_frmr_depth; diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 05f8099047e1..fdf4e50c27ce 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1158,7 +1158,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length, len; - unsigned int data_offset, data_len; + unsigned int data_offset, data_len, end_off; struct cifs_io_subrequest *rdata = mid->callback_data; char *buf = server->smallbuf; unsigned int buflen = server->pdu_size; @@ -1256,11 +1256,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) use_rdma_mr = rdata->mr; #endif data_len = server->ops->read_data_length(buf, use_rdma_mr); - if (!use_rdma_mr && (data_offset + data_len > buflen)) { - /* data_len is corrupt -- discard frame */ - rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed, - data_offset + data_len, buflen); - return cifs_readv_discard(server, mid); + if (!use_rdma_mr) { + if (check_add_overflow(data_offset, data_len, &end_off) || + end_off > buflen) { + /* data_len is corrupt -- discard frame */ + rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed, + end_off, buflen); + return cifs_readv_discard(server, mid); + } } #ifdef CONFIG_CIFS_SMB_DIRECT diff --git a/fs/smb/common/fscc.h b/fs/smb/common/fscc.h index b4ccddca9256..bc3012cc295d 100644 --- a/fs/smb/common/fscc.h +++ b/fs/smb/common/fscc.h @@ -260,12 +260,12 @@ typedef struct { char FileName[]; } __packed FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ -/* See MS-FSCC 2.4.13 */ +/* See MS-FSCC 2.4.14 */ struct smb2_file_eof_info { /* encoding of request for level 10 */ __le64 EndOfFile; /* new end of file value */ } __packed; /* level 20 Set */ -/* See MS-FSCC 2.4.14 */ +/* See MS-FSCC 2.4.15 */ typedef struct { __le32 NextEntryOffset; __u32 FileIndex; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index a4b12eb8df81..aeb0a245c532 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1566,6 +1566,10 @@ struct validate_negotiate_info_rsp { #define FILE_STANDARD_LINK_INFORMATION 54 #define FILE_ID_INFORMATION 59 #define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */ +#define FileId64ExtdDirectoryInformation 78 /* also for QUERY_DIR */ +#define FileId64ExtdBothDirectoryInformation 79 /* also for QUERY_DIR */ +#define FileIdAllExtdDirectoryInformation 80 /* also for QUERY_DIR */ +#define FileIdAllExtdBothDirectoryInformation 81 /* also for QUERY_DIR */ /* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */ #define SMB_FIND_FILE_POSIX_INFO 0x064 diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c5aac4946cbe..8347495dbc62 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -79,6 +79,85 @@ static int create_proc_clients(void) { return 0; } static void delete_proc_clients(void) {} #endif +static struct workqueue_struct *ksmbd_conn_wq; + +int ksmbd_conn_wq_init(void) +{ + ksmbd_conn_wq = alloc_workqueue("ksmbd-conn-release", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!ksmbd_conn_wq) + return -ENOMEM; + return 0; +} + +void ksmbd_conn_wq_destroy(void) +{ + if (ksmbd_conn_wq) { + destroy_workqueue(ksmbd_conn_wq); + ksmbd_conn_wq = NULL; + } +} + +/* + * __ksmbd_conn_release_work() - perform the final, once-per-struct cleanup + * of a ksmbd_conn whose refcount has just dropped to zero. + * + * This is the common release path used by ksmbd_conn_put() for the embedded + * state that outlives the connection thread: async_ida and the attached + * transport (which owns the socket and iov for TCP). Called from a workqueue + * so that sleep-allowed teardown (sock_release -> tcp_close -> + * lock_sock_nested) never runs from an RCU softirq callback (free_opinfo_rcu) + * or any other non-sleeping putter context. + */ +static void __ksmbd_conn_release_work(struct work_struct *work) +{ + struct ksmbd_conn *conn = + container_of(work, struct ksmbd_conn, release_work); + + ida_destroy(&conn->async_ida); + conn->transport->ops->free_transport(conn->transport); + kfree(conn); +} + +/** + * ksmbd_conn_get() - take a reference on @conn and return it. + * + * @conn: connection instance to get a reference to + * + * Returns @conn unchanged so callers can write + * "fp->conn = ksmbd_conn_get(work->conn);" in one expression. Returns NULL + * if @conn is NULL. + */ +struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn) +{ + if (!conn) + return NULL; + + atomic_inc(&conn->refcnt); + return conn; +} + +/** + * ksmbd_conn_put() - drop a reference and, if it was the last, queue the + * release onto ksmbd_conn_wq so it runs from process context. + * + * @conn: connection instance to put a reference to + * + * Callable from any context including RCU softirq callbacks and non-sleeping + * locks; the actual release is deferred to the workqueue. ksmbd_conn_wq is + * created in ksmbd_server_init() before any conn can be allocated and is + * destroyed in ksmbd_server_exit() after rcu_barrier(), so it is always + * non-NULL while a conn reference is held. + */ +void ksmbd_conn_put(struct ksmbd_conn *conn) +{ + if (!conn) + return; + + if (atomic_dec_and_test(&conn->refcnt)) + queue_work(ksmbd_conn_wq, &conn->release_work); +} + /** * ksmbd_conn_free() - free resources of the connection instance * @@ -93,23 +172,19 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) hash_del(&conn->hlist); up_write(&conn_list_lock); + /* + * request_buf / preauth_info / mechToken are only ever accessed by the + * connection handler thread that owns @conn. ksmbd_conn_free() is + * called from the transport free_transport() path when that thread is + * exiting, so it is safe to release them unconditionally even when + * ksmbd_conn_put() below is not the final putter (oplock / ksmbd_file + * holders only retain the conn pointer, not these per-thread buffers). + */ xa_destroy(&conn->sessions); kvfree(conn->request_buf); kfree(conn->preauth_info); kfree(conn->mechToken); - if (atomic_dec_and_test(&conn->refcnt)) { - /* - * async_ida is embedded in struct ksmbd_conn, so pair - * ida_destroy() with the final kfree() rather than with - * the unconditional field teardown above. This keeps - * the IDA valid for the entire lifetime of the struct, - * even while other refcount holders (oplock / vfs - * durable handles) still reference the connection. - */ - ida_destroy(&conn->async_ida); - conn->transport->ops->free_transport(conn->transport); - kfree(conn); - } + ksmbd_conn_put(conn); } /** @@ -136,6 +211,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->um = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(conn->um)) conn->um = NULL; + INIT_WORK(&conn->release_work, __ksmbd_conn_release_work); atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); @@ -512,8 +588,7 @@ void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn) if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) wake_up(&conn->r_count_q); - if (atomic_dec_and_test(&conn->refcnt)) - kfree(conn); + ksmbd_conn_put(conn); } int ksmbd_conn_transport_init(void) diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index de2d46941c93..e074be942582 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -16,6 +16,7 @@ #include <linux/kthread.h> #include <linux/nls.h> #include <linux/unicode.h> +#include <linux/workqueue.h> #include "smb_common.h" #include "ksmbd_work.h" @@ -120,6 +121,7 @@ struct ksmbd_conn { bool binding; atomic_t refcnt; bool is_aapl; + struct work_struct release_work; }; struct ksmbd_conn_ops { @@ -164,6 +166,10 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); +struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn); +void ksmbd_conn_put(struct ksmbd_conn *conn); +int ksmbd_conn_wq_init(void); +void ksmbd_conn_wq_destroy(void); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index 53f44ff4d376..6f97f8d39657 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -167,7 +167,10 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, share->path = kstrndup(ksmbd_share_config_path(resp), path_len, KSMBD_DEFAULT_GFP); - if (share->path) { + if (!share->path) { + ret = -ENOMEM; + } else { + ret = 0; share->path_sz = strlen(share->path); while (share->path_sz > 1 && share->path[share->path_sz - 1] == '/') @@ -179,9 +182,10 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, share->force_directory_mode = resp->force_directory_mode; share->force_uid = resp->force_uid; share->force_gid = resp->force_gid; - ret = parse_veto_list(share, - KSMBD_SHARE_CONFIG_VETO_LIST(resp), - resp->veto_list_sz); + if (!ret) + ret = parse_veto_list(share, + KSMBD_SHARE_CONFIG_VETO_LIST(resp), + resp->veto_list_sz); if (!ret && share->path) { if (__ksmbd_override_fsids(work, share)) { kill_share(share); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index cd3f28b0e7cb..b193dde4810d 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -30,7 +30,6 @@ static DEFINE_RWLOCK(lease_list_lock); static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, u64 id, __u16 Tid) { - struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; struct oplock_info *opinfo; @@ -39,7 +38,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, return NULL; opinfo->sess = sess; - opinfo->conn = conn; + opinfo->conn = ksmbd_conn_get(work->conn); opinfo->level = SMB2_OPLOCK_LEVEL_NONE; opinfo->op_state = OPLOCK_STATE_NONE; opinfo->pending_break = 0; @@ -50,7 +49,6 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, init_waitqueue_head(&opinfo->oplock_brk); atomic_set(&opinfo->refcount, 1); atomic_set(&opinfo->breaking_cnt, 0); - atomic_inc(&opinfo->conn->refcnt); return opinfo; } @@ -132,8 +130,7 @@ static void __free_opinfo(struct oplock_info *opinfo) { if (opinfo->is_lease) free_lease(opinfo); - if (opinfo->conn && atomic_dec_and_test(&opinfo->conn->refcnt)) - kfree(opinfo->conn); + ksmbd_conn_put(opinfo->conn); kfree(opinfo); } @@ -484,8 +481,12 @@ static inline int compare_guid_key(struct oplock_info *opinfo, const char *guid1, const char *key1) { const char *guid2, *key2; + struct ksmbd_conn *conn; - guid2 = opinfo->conn->ClientGUID; + conn = READ_ONCE(opinfo->conn); + if (!conn) + return 0; + guid2 = conn->ClientGUID; key2 = opinfo->o_lease->lease_key; if (!memcmp(guid1, guid2, SMB2_CLIENT_GUID_SIZE) && !memcmp(key1, key2, SMB2_LEASE_KEY_SIZE)) @@ -710,11 +711,16 @@ out: */ static int smb2_oplock_break_noti(struct oplock_info *opinfo) { - struct ksmbd_conn *conn = opinfo->conn; + struct ksmbd_conn *conn; struct oplock_break_info *br_info; int ret = 0; - struct ksmbd_work *work = ksmbd_alloc_work_struct(); + struct ksmbd_work *work; + + conn = READ_ONCE(opinfo->conn); + if (!conn) + return 0; + work = ksmbd_alloc_work_struct(); if (!work) return -ENOMEM; @@ -814,11 +820,15 @@ out: */ static int smb2_lease_break_noti(struct oplock_info *opinfo) { - struct ksmbd_conn *conn = opinfo->conn; + struct ksmbd_conn *conn; struct ksmbd_work *work; struct lease_break_info *br_info; struct lease *lease = opinfo->o_lease; + conn = READ_ONCE(opinfo->conn); + if (!conn) + return 0; + work = ksmbd_alloc_work_struct(); if (!work) return -ENOMEM; diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 58ef02c423fc..5d799b2d4c62 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -596,8 +596,14 @@ static int __init ksmbd_server_init(void) if (ret) goto err_crypto_destroy; + ret = ksmbd_conn_wq_init(); + if (ret) + goto err_workqueue_destroy; + return 0; +err_workqueue_destroy: + ksmbd_workqueue_destroy(); err_crypto_destroy: ksmbd_crypto_destroy(); err_release_inode_hash: @@ -623,6 +629,12 @@ static void __exit ksmbd_server_exit(void) { ksmbd_server_shutdown(); rcu_barrier(); + /* + * ksmbd_conn_put() defers the final release onto ksmbd_conn_wq, + * so drain it after rcu_barrier() has fired any pending RCU + * callbacks that may have queued a release. + */ + ksmbd_conn_wq_destroy(); ksmbd_release_inode_hash(); } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 47b7af631f7b..3eb3b1711acb 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3767,8 +3767,10 @@ err_out1: err_out2: if (!rc) { - ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED); - rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); + rc = ksmbd_update_fstate(&work->sess->file_table, fp, + FP_INITED); + if (!rc) + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); } if (rc) { if (rc == -EINVAL) @@ -3802,8 +3804,19 @@ err_out2: ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status); } - if (dh_info.reconnected) - ksmbd_put_durable_fd(dh_info.fp); + if (dh_info.reconnected) { + /* + * If reconnect succeeded, fp was republished in the + * session file table. On a later error, ksmbd_fd_put() + * above drops the session reference; drop the durable + * lookup reference through the same session-aware path so + * final close removes the volatile id before freeing fp. + */ + if (rc && fp == dh_info.fp) + ksmbd_fd_put(work, dh_info.fp); + else + ksmbd_put_durable_fd(dh_info.fp); + } kfree(name); kfree(lc); @@ -7309,6 +7322,17 @@ int smb2_cancel(struct ksmbd_work *work) le64_to_cpu(hdr->Id.AsyncId)) continue; + /* + * A cancelled deferred byte-range lock frees its + * file_lock and takes the smb2_lock() early-exit that + * skips release_async_work(), so the work stays on + * conn->async_requests with a live cancel_fn pointing + * at the freed file_lock. Re-firing it on a second + * SMB2_CANCEL is a use-after-free. + */ + if (iter->state == KSMBD_WORK_CANCELLED) + break; + ksmbd_debug(SMB, "smb2 with AsyncId %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->Id.AsyncId), @@ -8189,9 +8213,20 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, int ret = 0; __le32 old_fattr; + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, "User does not have write permission\n"); + return -EACCES; + } + fp = ksmbd_lookup_fd_fast(work, id); if (!fp) return -ENOENT; + + if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_WRITE_ATTRIBUTES_LE))) { + ret = -EACCES; + goto out; + } + idmap = file_mnt_idmap(fp->filp); old_fattr = fp->f_ci->m_fattr; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 4bbc2c27e680..664b1b4a3233 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -643,8 +643,10 @@ static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, ntace = (struct smb_ace *)((char *)pndace + *size); ace_sz = fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, flags, pace->e_perm, 0777); - if (check_add_overflow(*size, ace_sz, size)) + if (check_add_overflow(*size, ace_sz, size)) { + kfree(sid); break; + } (*num_aces)++; if (pace->e_tag == ACL_USER) ntace->access_req |= @@ -655,8 +657,10 @@ static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, ntace = (struct smb_ace *)((char *)pndace + *size); ace_sz = fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, 0x03, pace->e_perm, 0777); - if (check_add_overflow(*size, ace_sz, size)) + if (check_add_overflow(*size, ace_sz, size)) { + kfree(sid); break; + } (*num_aces)++; if (pace->e_tag == ACL_USER) ntace->access_req |= @@ -698,8 +702,10 @@ posix_default_acl: ntace = (struct smb_ace *)((char *)pndace + *size); ace_sz = fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, 0x0b, pace->e_perm, 0777); - if (check_add_overflow(*size, ace_sz, size)) + if (check_add_overflow(*size, ace_sz, size)) { + kfree(sid); break; + } (*num_aces)++; if (pace->e_tag == ACL_USER) ntace->access_req |= @@ -1068,7 +1074,60 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type, ace->flags = flags; ace->access_req = access_req; smb_copy_sid(&ace->sid, sid); - ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + (sid->num_subauth * 4)); + ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + + (ace->sid.num_subauth * 4)); +} + +static int smb_append_inherited_ace(struct smb_ace **ace, int *nt_size, + u16 *ace_cnt, const struct smb_sid *sid, + u8 type, u8 flags, __le32 access_req) +{ + int ace_size; + + smb_set_ace(*ace, sid, type, flags, access_req); + ace_size = le16_to_cpu((*ace)->size); + /* pdacl->size is __le16 and includes struct smb_acl. */ + if (check_add_overflow(*nt_size, ace_size, nt_size) || + *nt_size > U16_MAX - (int)sizeof(struct smb_acl)) + return -EINVAL; + + (*ace_cnt)++; + *ace = (struct smb_ace *)((char *)*ace + ace_size); + return 0; +} + +static int smb_validate_ntsd_sid(struct smb_ntsd *pntsd, size_t pntsd_size, + unsigned int sid_offset, struct smb_sid **sid, + size_t *sid_size) +{ + size_t sid_end; + + *sid = NULL; + *sid_size = 0; + + if (!sid_offset) + return 0; + + if (sid_offset < sizeof(struct smb_ntsd) || + check_add_overflow(sid_offset, (size_t)CIFS_SID_BASE_SIZE, + &sid_end) || + sid_end > pntsd_size) + return -EINVAL; + + *sid = (struct smb_sid *)((char *)pntsd + sid_offset); + if ((*sid)->num_subauth > SID_MAX_SUB_AUTHORITIES) + return -EINVAL; + + if (check_add_overflow((size_t)CIFS_SID_BASE_SIZE, + sizeof(__le32) * (size_t)(*sid)->num_subauth, + &sid_end)) + return -EINVAL; + + if (sid_offset > pntsd_size || sid_end > pntsd_size - sid_offset) + return -EINVAL; + + *sid_size = sid_end; + return 0; } int smb_inherit_dacl(struct ksmbd_conn *conn, @@ -1083,28 +1142,28 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct dentry *parent = path->dentry->d_parent; struct mnt_idmap *idmap = mnt_idmap(path->mnt); int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size; - int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size; + int rc = 0, pntsd_type, ppntsd_size, acl_len, aces_size; unsigned int dacloffset; size_t dacl_struct_end; u16 num_aces, ace_cnt = 0; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, + ppntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, parent, &parent_pntsd); - if (pntsd_size <= 0) + if (ppntsd_size <= 0) return -ENOENT; dacloffset = le32_to_cpu(parent_pntsd->dacloffset); if (!dacloffset || check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) || - dacl_struct_end > (size_t)pntsd_size) { + dacl_struct_end > (size_t)ppntsd_size) { rc = -EINVAL; goto free_parent_pntsd; } parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset); - acl_len = pntsd_size - dacloffset; + acl_len = ppntsd_size - dacloffset; num_aces = le16_to_cpu(parent_pdacl->num_aces); pntsd_type = le16_to_cpu(parent_pntsd->type); pdacl_size = le16_to_cpu(parent_pdacl->size); @@ -1157,6 +1216,12 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, CIFS_SID_BASE_SIZE) break; + if (parent_aces->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + pace_size < offsetof(struct smb_ace, sid) + + CIFS_SID_BASE_SIZE + + sizeof(__le32) * parent_aces->sid.num_subauth) + break; + aces_size -= pace_size; flags = parent_aces->flags; @@ -1186,22 +1251,24 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, } if (is_dir && creator && flags & CONTAINER_INHERIT_ACE) { - smb_set_ace(aces, psid, parent_aces->type, inherited_flags, - parent_aces->access_req); - nt_size += le16_to_cpu(aces->size); - ace_cnt++; - aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size)); + rc = smb_append_inherited_ace(&aces, &nt_size, &ace_cnt, + psid, parent_aces->type, + inherited_flags, + parent_aces->access_req); + if (rc) + goto free_aces_base; flags |= INHERIT_ONLY_ACE; psid = creator; } else if (is_dir && !(parent_aces->flags & NO_PROPAGATE_INHERIT_ACE)) { psid = &parent_aces->sid; } - smb_set_ace(aces, psid, parent_aces->type, flags | inherited_flags, - parent_aces->access_req); - nt_size += le16_to_cpu(aces->size); - aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size)); - ace_cnt++; + rc = smb_append_inherited_ace(&aces, &nt_size, &ace_cnt, psid, + parent_aces->type, + flags | inherited_flags, + parent_aces->access_req); + if (rc) + goto free_aces_base; pass: parent_aces = (struct smb_ace *)((char *)parent_aces + pace_size); } @@ -1210,22 +1277,33 @@ pass: struct smb_ntsd *pntsd; struct smb_acl *pdacl; struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL; - int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size; - int pntsd_alloc_size; + size_t powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size; + size_t pntsd_alloc_size; - if (parent_pntsd->osidoffset) { - powner_sid = (struct smb_sid *)((char *)parent_pntsd + - le32_to_cpu(parent_pntsd->osidoffset)); - powner_sid_size = 1 + 1 + 6 + (powner_sid->num_subauth * 4); - } - if (parent_pntsd->gsidoffset) { - pgroup_sid = (struct smb_sid *)((char *)parent_pntsd + - le32_to_cpu(parent_pntsd->gsidoffset)); - pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4); - } + rc = smb_validate_ntsd_sid(parent_pntsd, ppntsd_size, + le32_to_cpu(parent_pntsd->osidoffset), + &powner_sid, &powner_sid_size); + if (rc) + goto free_aces_base; + rc = smb_validate_ntsd_sid(parent_pntsd, ppntsd_size, + le32_to_cpu(parent_pntsd->gsidoffset), + &pgroup_sid, &pgroup_sid_size); + if (rc) + goto free_aces_base; - pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size + - pgroup_sid_size + sizeof(struct smb_acl) + nt_size; + if (check_add_overflow(sizeof(struct smb_ntsd), + (size_t)powner_sid_size, + &pntsd_alloc_size) || + check_add_overflow(pntsd_alloc_size, + (size_t)pgroup_sid_size, + &pntsd_alloc_size) || + check_add_overflow(pntsd_alloc_size, sizeof(struct smb_acl), + &pntsd_alloc_size) || + check_add_overflow(pntsd_alloc_size, (size_t)nt_size, + &pntsd_alloc_size)) { + rc = -EINVAL; + goto free_aces_base; + } pntsd = kzalloc(pntsd_alloc_size, KSMBD_DEFAULT_GFP); if (!pntsd) { @@ -1368,8 +1446,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); aces_size = acl_size - sizeof(struct smb_acl); for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) { - if (offsetof(struct smb_ace, sid) + - aces_size < CIFS_SID_BASE_SIZE) + if (aces_size < offsetof(struct smb_ace, sid) + + CIFS_SID_BASE_SIZE) break; ace_size = le16_to_cpu(ace->size); if (ace_size > aces_size || @@ -1389,8 +1467,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); aces_size = acl_size - sizeof(struct smb_acl); for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) { - if (offsetof(struct smb_ace, sid) + - aces_size < CIFS_SID_BASE_SIZE) + if (aces_size < offsetof(struct smb_ace, sid) + + CIFS_SID_BASE_SIZE) break; ace_size = le16_to_cpu(ace->size); if (ace_size > aces_size || diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index a8242c00096f..b6d63ff8a8a3 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -18,7 +18,6 @@ #include "smb_common.h" #include "../common/smb2status.h" #include "transport_rdma.h" -#include "../smbdirect/public.h" #define SMB_DIRECT_PORT_IWARP 5445 @@ -540,3 +539,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .rdma_write = smb_direct_rdma_write, .free_transport = smb_direct_free_transport, }; + +MODULE_IMPORT_NS("SMBDIRECT"); diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index bde3d88aecc7..8b78917a1795 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -25,6 +25,6 @@ static inline void init_smbd_max_io_size(unsigned int sz) { } static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; } #endif -#include "../smbdirect/smbdirect.h" +#include <linux/smbdirect.h> #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 3551f01a3fa0..ba3355a6057a 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -81,7 +81,7 @@ static int proc_show_files(struct seq_file *m, void *v) read_lock(&global_ft.lock); idr_for_each_entry(global_ft.idr, fp, id) { seq_printf(m, "%#-10x %#-10llx %#-10llx %#-10x", - fp->tcon->id, + fp->tcon ? fp->tcon->id : 0, fp->persistent_id, fp->volatile_id, atomic_read(&fp->refcount)); @@ -211,13 +211,13 @@ int ksmbd_query_inode_status(struct dentry *dentry) return ret; down_read(&ci->m_lock); - if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)) + if (ci->m_flags & S_DEL_PENDING) ret = KSMBD_INODE_STATUS_PENDING_DELETE; else ret = KSMBD_INODE_STATUS_OK; up_read(&ci->m_lock); - atomic_dec(&ci->m_count); + ksmbd_inode_put(ci); return ret; } @@ -227,7 +227,7 @@ bool ksmbd_inode_pending_delete(struct ksmbd_file *fp) int ret; down_read(&ci->m_lock); - ret = (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)); + ret = (ci->m_flags & S_DEL_PENDING); up_read(&ci->m_lock); return ret; @@ -395,12 +395,20 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) } } + down_write(&ci->m_lock); + /* Promote S_DEL_ON_CLS to S_DEL_PENDING when close */ + if (ci->m_flags & S_DEL_ON_CLS) { + ci->m_flags &= ~S_DEL_ON_CLS; + ci->m_flags |= S_DEL_PENDING; + } + up_write(&ci->m_lock); + if (atomic_dec_and_test(&ci->m_count)) { bool do_unlink = false; down_write(&ci->m_lock); - if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) { - ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); + if (ci->m_flags & S_DEL_PENDING) { + ci->m_flags &= ~S_DEL_PENDING; do_unlink = true; } up_write(&ci->m_lock); @@ -418,6 +426,14 @@ static void __ksmbd_remove_durable_fd(struct ksmbd_file *fp) return; idr_remove(global_ft.idr, fp->persistent_id); + /* + * Clear persistent_id so a later __ksmbd_close_fd() that runs from a + * delayed putter (e.g. when a concurrent ksmbd_lookup_fd_inode() + * walker held the final reference) does not re-issue idr_remove() on + * an id that idr_alloc_cyclic() may have already handed out to a new + * durable handle. + */ + fp->persistent_id = KSMBD_NO_FID; } static void ksmbd_remove_durable_fd(struct ksmbd_file *fp) @@ -431,13 +447,13 @@ static void ksmbd_remove_durable_fd(struct ksmbd_file *fp) static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) { - if (!has_file_id(fp->volatile_id)) - return; - down_write(&fp->f_ci->m_lock); list_del_init(&fp->node); up_write(&fp->f_ci->m_lock); + if (!has_file_id(fp->volatile_id)) + return; + write_lock(&ft->lock); idr_remove(ft->idr, fp->volatile_id); write_unlock(&ft->lock); @@ -475,6 +491,17 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) kfree(smb_lock); } + /* + * Drop fp's strong reference on conn (taken in ksmbd_open_fd() / + * ksmbd_reopen_durable_fd()). Durable fps that reached the + * scavenger have already had fp->conn cleared by session_fd_check(), + * in which case there is nothing to drop here. + */ + if (fp->conn) { + ksmbd_conn_put(fp->conn); + fp->conn = NULL; + } + if (ksmbd_stream_fd(fp)) kfree(fp->stream.name); kfree(fp->owner.name); @@ -510,6 +537,20 @@ static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft, static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp) { + /* + * Detached durable fp -- session_fd_check() cleared fp->conn at + * preserve, so this fp is no longer tracked by any conn's + * stats.open_files_count. This happens when + * ksmbd_scavenger_dispose_dh() hands the final close off to an + * m_fp_list walker (e.g. ksmbd_lookup_fd_inode()) whose work->conn + * is unrelated to the conn that originally opened the handle; close + * via the NULL-ft path so we do not underflow that unrelated + * counter. + */ + if (!fp->conn) { + __ksmbd_close_fd(NULL, fp); + return; + } __ksmbd_close_fd(&work->sess->file_table, fp); atomic_dec(&work->conn->stats.open_files_count); } @@ -678,14 +719,14 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry) down_read(&ci->m_lock); list_for_each_entry(lfp, &ci->m_fp_list, node) { if (inode == file_inode(lfp->filp)) { - atomic_dec(&ci->m_count); lfp = ksmbd_fp_get(lfp); up_read(&ci->m_lock); + ksmbd_inode_put(ci); return lfp; } } - atomic_dec(&ci->m_count); up_read(&ci->m_lock); + ksmbd_inode_put(ci); return NULL; } @@ -752,7 +793,14 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) atomic_set(&fp->refcount, 1); fp->filp = filp; - fp->conn = work->conn; + /* + * fp owns a strong reference on fp->conn for as long as fp->conn is + * non-NULL, so session_fd_check() and __ksmbd_close_fd() never + * dereference a dangling pointer. Paired with ksmbd_conn_put() in + * session_fd_check() (durable preserve), in __ksmbd_close_fd() + * (final close), and on the error paths below. + */ + fp->conn = ksmbd_conn_get(work->conn); fp->tcon = work->tcon; fp->volatile_id = KSMBD_NO_FID; fp->persistent_id = KSMBD_NO_FID; @@ -774,19 +822,64 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) return fp; err_out: + /* fp->conn was set and refcounted before every branch here. */ + ksmbd_conn_put(fp->conn); kmem_cache_free(filp_cache, fp); return ERR_PTR(ret); } -void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, - unsigned int state) +/** + * ksmbd_update_fstate() - update an fp state under the file-table lock + * @ft: file table that publishes @fp's volatile id + * @fp: file pointer to update + * @state: new state + * + * Return: 0 on success. The FP_NEW -> FP_INITED transition is special: + * -ENOENT if teardown already unpublished @fp by advancing the state or + * clearing the volatile id. Other state updates preserve the historical + * fire-and-forget behavior. + */ +int ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state) { + int ret; + if (!fp) - return; + return -ENOENT; write_lock(&ft->lock); - fp->f_state = state; + if (state == FP_INITED && + (fp->f_state != FP_NEW || !has_file_id(fp->volatile_id))) { + ret = -ENOENT; + } else { + fp->f_state = state; + ret = 0; + } write_unlock(&ft->lock); + + return ret; +} + +/* + * ksmbd_mark_fp_closed() - mark fp closed under ft->lock and return how many + * refs the teardown path owns. + * + * FP_INITED has a normal idr-owned reference, so teardown owns both that + * reference and the transient lookup reference. FP_NEW is still owned by the + * in-flight opener/reopener, which will drop the original reference after + * ksmbd_update_fstate(..., FP_INITED) observes the cleared volatile id. + * FP_CLOSED on entry means an earlier ksmbd_close_fd() already consumed the + * idr-owned ref. + */ +static int ksmbd_mark_fp_closed(struct ksmbd_file *fp) +{ + if (fp->f_state == FP_INITED) { + set_close_state_blocked_works(fp); + fp->f_state = FP_CLOSED; + return 2; + } + + return 1; } static int @@ -794,7 +887,8 @@ __close_file_table_ids(struct ksmbd_session *sess, struct ksmbd_tree_connect *tcon, bool (*skip)(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp, - struct ksmbd_user *user)) + struct ksmbd_user *user), + bool skip_preserves_fp) { struct ksmbd_file_table *ft = &sess->file_table; struct ksmbd_file *fp; @@ -802,32 +896,120 @@ __close_file_table_ids(struct ksmbd_session *sess, int num = 0; while (1) { + int n_to_drop; + write_lock(&ft->lock); fp = idr_get_next(ft->idr, &id); if (!fp) { write_unlock(&ft->lock); break; } - - if (skip(tcon, fp, sess->user) || - !atomic_dec_and_test(&fp->refcount)) { + if (!atomic_inc_not_zero(&fp->refcount)) { id++; write_unlock(&ft->lock); continue; } - set_close_state_blocked_works(fp); - idr_remove(ft->idr, fp->volatile_id); - fp->volatile_id = KSMBD_NO_FID; - write_unlock(&ft->lock); + if (skip_preserves_fp) { + /* + * Session teardown: skip() is session_fd_check(), + * which may sleep and mutates fp->conn / fp->tcon / + * fp->volatile_id when it chooses to preserve fp + * for durable reconnect. Unpublish fp from the + * session idr here, under ft->lock, so that + * __ksmbd_lookup_fd() through this session cannot + * grant a new ksmbd_fp_get() reference to an fp + * whose fields are about to be rewritten outside + * the lock. Durable reconnect still reaches fp via + * global_ft. + */ + idr_remove(ft->idr, id); + fp->volatile_id = KSMBD_NO_FID; + write_unlock(&ft->lock); + + if (skip(tcon, fp, sess->user)) { + /* + * session_fd_check() has converted fp to + * durable-preserve state and cleared its + * per-conn fields. fp is already unpublished + * above; the original idr-owned ref keeps it + * alive for the durable scavenger. Drop only + * the transient ref. atomic_dec() is safe -- + * atomic_inc_not_zero() succeeded on a + * positive value and we added one more, so + * refcount cannot be zero here. + */ + atomic_dec(&fp->refcount); + id++; + continue; + } + /* + * Keep the close-state decision under the same lock + * observed by ksmbd_update_fstate(), which is how an + * in-flight FP_NEW opener learns that teardown has + * cleared its volatile id. + */ + write_lock(&ft->lock); + n_to_drop = ksmbd_mark_fp_closed(fp); + write_unlock(&ft->lock); + } else { + /* + * Tree teardown: skip() is tree_conn_fd_check(), a + * cheap pointer compare that doesn't sleep and has + * no side effects, so keep the skip decision plus + * the unpublish-and-mark-closed sequence atomic + * under ft->lock. fps belonging to other tree + * connects (skip() == true) stay fully published in + * the session idr with no lock window. + */ + if (skip(tcon, fp, sess->user)) { + atomic_dec(&fp->refcount); + write_unlock(&ft->lock); + id++; + continue; + } + idr_remove(ft->idr, id); + fp->volatile_id = KSMBD_NO_FID; + n_to_drop = ksmbd_mark_fp_closed(fp); + write_unlock(&ft->lock); + } + + /* + * fp->volatile_id is already cleared to prevent stale idr + * removal from a deferred final close. Remove fp from + * m_fp_list here because __ksmbd_remove_fd() will skip the + * list unlink when volatile_id is KSMBD_NO_FID. + */ down_write(&fp->f_ci->m_lock); list_del_init(&fp->node); up_write(&fp->f_ci->m_lock); - __ksmbd_close_fd(ft, fp); - - num++; + /* + * Drop the references this iteration owns: + * + * n_to_drop == 2: we observed FP_INITED and committed + * the FP_CLOSED transition ourselves, so we own the + * transient (+1) and the still-intact idr-owned ref. + * + * n_to_drop == 1: either a prior ksmbd_close_fd() + * already consumed the idr-owned ref, or fp was still + * FP_NEW and the in-flight opener/reopener must keep + * the original reference until ksmbd_update_fstate() + * observes the cleared volatile id. + * + * If we end up as the final putter, finalize fp and + * account the open_files_count decrement via the caller's + * atomic_sub(num, ...). Otherwise the remaining user's + * ksmbd_fd_put() reaches __put_fd_final(), which does its + * own atomic_dec(&open_files_count), so we must not count + * this fp here -- doing so would double-decrement the + * connection-wide counter. + */ + if (atomic_sub_and_test(n_to_drop, &fp->refcount)) { + __ksmbd_close_fd(NULL, fp); + num++; + } id++; } @@ -881,24 +1063,37 @@ static bool ksmbd_durable_scavenger_alive(void) return true; } -static void ksmbd_scavenger_dispose_dh(struct list_head *head) +static void ksmbd_scavenger_dispose_dh(struct ksmbd_file *fp) { - while (!list_empty(head)) { - struct ksmbd_file *fp; + /* + * Durable-preserved fp can remain linked on f_ci->m_fp_list for + * share-mode checks. Unlink it before final close; fp->node is not + * available as a scavenger-private list node because re-adding it to + * another list corrupts m_fp_list. + */ + down_write(&fp->f_ci->m_lock); + list_del_init(&fp->node); + up_write(&fp->f_ci->m_lock); - fp = list_first_entry(head, struct ksmbd_file, node); - list_del_init(&fp->node); + /* + * Drop both the durable lifetime reference and the transient reference + * taken by the scavenger under global_ft.lock. If a concurrent + * ksmbd_lookup_fd_inode() (or any other m_fp_list walker) snatched fp + * before the unlink above, that holder owns the final close via + * ksmbd_fd_put() -> __ksmbd_close_fd(). Otherwise the scavenger is + * the last putter and finalises fp here. + */ + if (atomic_sub_and_test(2, &fp->refcount)) __ksmbd_close_fd(NULL, fp); - } } static int ksmbd_durable_scavenger(void *dummy) { struct ksmbd_file *fp = NULL; + struct ksmbd_file *expired_fp; unsigned int id; unsigned int min_timeout = 1; bool found_fp_timeout; - LIST_HEAD(scavenger_list); unsigned long remaining_jiffies; __module_get(THIS_MODULE); @@ -908,8 +1103,6 @@ static int ksmbd_durable_scavenger(void *dummy) if (try_to_freeze()) continue; - found_fp_timeout = false; - remaining_jiffies = wait_event_timeout(dh_wq, ksmbd_durable_scavenger_alive() == false, __msecs_to_jiffies(min_timeout)); @@ -918,23 +1111,39 @@ static int ksmbd_durable_scavenger(void *dummy) else min_timeout = DURABLE_HANDLE_MAX_TIMEOUT; - write_lock(&global_ft.lock); - idr_for_each_entry(global_ft.idr, fp, id) { - if (!fp->durable_timeout) - continue; - - if (atomic_read(&fp->refcount) > 1 || - fp->conn) - continue; + do { + expired_fp = NULL; + found_fp_timeout = false; - found_fp_timeout = true; - if (fp->durable_scavenger_timeout <= - jiffies_to_msecs(jiffies)) { - __ksmbd_remove_durable_fd(fp); - list_add(&fp->node, &scavenger_list); - } else { + write_lock(&global_ft.lock); + idr_for_each_entry(global_ft.idr, fp, id) { unsigned long durable_timeout; + if (!fp->durable_timeout) + continue; + + if (atomic_read(&fp->refcount) > 1 || + fp->conn) + continue; + + found_fp_timeout = true; + if (fp->durable_scavenger_timeout <= + jiffies_to_msecs(jiffies)) { + __ksmbd_remove_durable_fd(fp); + /* + * Take a transient reference so fp + * cannot be freed by an in-flight + * ksmbd_lookup_fd_inode() that found + * it through f_ci->m_fp_list while we + * drop global_ft.lock and reach the + * m_fp_list unlink in + * ksmbd_scavenger_dispose_dh(). + */ + atomic_inc(&fp->refcount); + expired_fp = fp; + break; + } + durable_timeout = fp->durable_scavenger_timeout - jiffies_to_msecs(jiffies); @@ -942,10 +1151,11 @@ static int ksmbd_durable_scavenger(void *dummy) if (min_timeout > durable_timeout) min_timeout = durable_timeout; } - } - write_unlock(&global_ft.lock); + write_unlock(&global_ft.lock); - ksmbd_scavenger_dispose_dh(&scavenger_list); + if (expired_fp) + ksmbd_scavenger_dispose_dh(expired_fp); + } while (expired_fp); if (found_fp_timeout == false) break; @@ -1062,25 +1272,35 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon, if (!is_reconnectable(fp)) return false; + if (fp->f_state != FP_INITED) + return false; + + if (WARN_ON_ONCE(!fp->conn)) + return false; + if (ksmbd_vfs_copy_durable_owner(fp, user)) return false; + /* + * fp owns a strong reference on fp->conn (taken in ksmbd_open_fd() + * / ksmbd_reopen_durable_fd()), so conn stays valid for the whole + * body of this function regardless of any op->conn puts below. + */ conn = fp->conn; ci = fp->f_ci; down_write(&ci->m_lock); list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) { if (op->conn != conn) continue; - if (op->conn && atomic_dec_and_test(&op->conn->refcnt)) - kfree(op->conn); + ksmbd_conn_put(op->conn); op->conn = NULL; } up_write(&ci->m_lock); list_for_each_entry_safe(smb_lock, tmp_lock, &fp->lock_list, flist) { - spin_lock(&fp->conn->llist_lock); + spin_lock(&conn->llist_lock); list_del_init(&smb_lock->clist); - spin_unlock(&fp->conn->llist_lock); + spin_unlock(&conn->llist_lock); } fp->conn = NULL; @@ -1091,6 +1311,8 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon, fp->durable_scavenger_timeout = jiffies_to_msecs(jiffies) + fp->durable_timeout; + /* Drop fp's own reference on conn. */ + ksmbd_conn_put(conn); return true; } @@ -1098,7 +1320,8 @@ void ksmbd_close_tree_conn_fds(struct ksmbd_work *work) { int num = __close_file_table_ids(work->sess, work->tcon, - tree_conn_fd_check); + tree_conn_fd_check, + false); atomic_sub(num, &work->conn->stats.open_files_count); } @@ -1107,7 +1330,8 @@ void ksmbd_close_session_fds(struct ksmbd_work *work) { int num = __close_file_table_ids(work->sess, work->tcon, - session_fd_check); + session_fd_check, + true); atomic_sub(num, &work->conn->stats.open_files_count); } @@ -1166,27 +1390,45 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp) struct ksmbd_lock *smb_lock; unsigned int old_f_state; + write_lock(&global_ft.lock); if (!fp->is_durable || fp->conn || fp->tcon) { + write_unlock(&global_ft.lock); pr_err("Invalid durable fd [%p:%p]\n", fp->conn, fp->tcon); return -EBADF; } if (has_file_id(fp->volatile_id)) { + write_unlock(&global_ft.lock); pr_err("Still in use durable fd: %llu\n", fp->volatile_id); return -EBADF; } + /* + * Initialize fp's connection binding before publishing fp into the + * session's file table. If __open_id() is ordered first, a + * concurrent teardown that iterates the table can observe a valid + * volatile_id with fp->conn == NULL and preserve a + * partially-initialized fp. fp owns a strong reference on the new + * conn (see ksmbd_open_fd()); undo it on __open_id() failure. + */ + fp->conn = ksmbd_conn_get(conn); + fp->tcon = work->tcon; + write_unlock(&global_ft.lock); + old_f_state = fp->f_state; fp->f_state = FP_NEW; + __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID); if (!has_file_id(fp->volatile_id)) { + write_lock(&global_ft.lock); + fp->conn = NULL; + fp->tcon = NULL; + write_unlock(&global_ft.lock); + ksmbd_conn_put(conn); fp->f_state = old_f_state; return -EBADF; } - fp->conn = conn; - fp->tcon = work->tcon; - list_for_each_entry(smb_lock, &fp->lock_list, flist) { spin_lock(&conn->llist_lock); list_add_tail(&smb_lock->clist, &conn->lock_list); @@ -1198,8 +1440,7 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp) list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) { if (op->conn) continue; - op->conn = fp->conn; - atomic_inc(&op->conn->refcnt); + op->conn = ksmbd_conn_get(fp->conn); } up_write(&ci->m_lock); @@ -1228,7 +1469,7 @@ void ksmbd_destroy_file_table(struct ksmbd_session *sess) if (!ft->idr) return; - __close_file_table_ids(sess, NULL, session_fd_check); + __close_file_table_ids(sess, NULL, session_fd_check, true); idr_destroy(ft->idr); kfree(ft->idr); ft->idr = NULL; diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 866f32c10d4d..e6871266a94b 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -172,8 +172,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); int ksmbd_init_global_file_table(void); void ksmbd_free_global_file_table(void); void ksmbd_set_fd_limit(unsigned long limit); -void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, - unsigned int state); +int ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state); bool ksmbd_vfs_compare_durable_owner(struct ksmbd_file *fp, struct ksmbd_user *user); diff --git a/fs/smb/smbdirect/accept.c b/fs/smb/smbdirect/accept.c index 704b271af3a8..529740005838 100644 --- a/fs/smb/smbdirect/accept.c +++ b/fs/smb/smbdirect/accept.c @@ -854,4 +854,4 @@ struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, return nsc; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_accept); +EXPORT_SYMBOL_GPL(smbdirect_socket_accept); diff --git a/fs/smb/smbdirect/connect.c b/fs/smb/smbdirect/connect.c index 8addee43a381..cd726b399afe 100644 --- a/fs/smb/smbdirect/connect.c +++ b/fs/smb/smbdirect/connect.c @@ -60,7 +60,7 @@ int smbdirect_connect(struct smbdirect_socket *sc, const struct sockaddr *dst) */ return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect); +EXPORT_SYMBOL_GPL(smbdirect_connect); static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc) { @@ -922,4 +922,4 @@ int smbdirect_connect_sync(struct smbdirect_socket *sc, return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect_sync); +EXPORT_SYMBOL_GPL(smbdirect_connect_sync); diff --git a/fs/smb/smbdirect/connection.c b/fs/smb/smbdirect/connection.c index 822366718d45..8adf58097534 100644 --- a/fs/smb/smbdirect/connection.c +++ b/fs/smb/smbdirect/connection.c @@ -706,7 +706,7 @@ bool smbdirect_connection_is_connected(struct smbdirect_socket *sc) return false; return true; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_is_connected); +EXPORT_SYMBOL_GPL(smbdirect_connection_is_connected); int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc) { @@ -779,7 +779,7 @@ int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc) return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_wait_for_connected); +EXPORT_SYMBOL_GPL(smbdirect_connection_wait_for_connected); void smbdirect_connection_idle_timer_work(struct work_struct *work) { @@ -958,7 +958,7 @@ release_credit: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_batch_flush); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_batch_flush); struct smbdirect_send_batch * smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, @@ -976,7 +976,7 @@ smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, return batch; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_init_send_batch_storage); +EXPORT_SYMBOL_GPL(smbdirect_init_send_batch_storage); static int smbdirect_connection_wait_for_send_bcredit(struct smbdirect_socket *sc, struct smbdirect_send_batch *batch) @@ -1263,7 +1263,7 @@ lcredit_failed: bcredit_failed: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_single_iter); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_single_iter); int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc) { @@ -1288,7 +1288,7 @@ int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc) return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_wait_zero_pending); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_wait_zero_pending); int smbdirect_connection_send_iter(struct smbdirect_socket *sc, struct iov_iter *iter, @@ -1373,7 +1373,7 @@ int smbdirect_connection_send_iter(struct smbdirect_socket *sc, return total_count; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_iter); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_iter); static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc) { @@ -1937,7 +1937,7 @@ read_rfc1002_done: goto again; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_recvmsg); +EXPORT_SYMBOL_GPL(smbdirect_connection_recvmsg); static bool smbdirect_map_sges_single_page(struct smbdirect_map_sges *state, struct page *page, size_t off, size_t len) @@ -2168,7 +2168,7 @@ static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len, if (ret < 0) { while (state->num_sge > before) { - struct ib_sge *sge = &state->sge[state->num_sge--]; + struct ib_sge *sge = &state->sge[--state->num_sge]; ib_dma_unmap_page(state->device, sge->addr, diff --git a/fs/smb/smbdirect/debug.c b/fs/smb/smbdirect/debug.c index a66a19d4a463..3445843445bf 100644 --- a/fs/smb/smbdirect/debug.c +++ b/fs/smb/smbdirect/debug.c @@ -40,7 +40,7 @@ void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, seq_puts(m, "\n"); seq_printf(m, "Conn keep_alive_interval: %u ", - sp->keepalive_interval_msec * 1000); + sp->keepalive_interval_msec / 1000); seq_printf(m, "max_readwrite_size: %u rdma_readwrite_threshold: %u", sp->max_read_write_size, rdma_readwrite_threshold); @@ -85,4 +85,4 @@ void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, atomic_read(&sc->mr_io.ready.count), atomic_read(&sc->mr_io.used.count)); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_legacy_debug_proc_show); +EXPORT_SYMBOL_GPL(smbdirect_connection_legacy_debug_proc_show); diff --git a/fs/smb/smbdirect/devices.c b/fs/smb/smbdirect/devices.c index 44962f221c35..7adacbdfe12e 100644 --- a/fs/smb/smbdirect/devices.c +++ b/fs/smb/smbdirect/devices.c @@ -238,7 +238,7 @@ u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev) return RDMA_NODE_UNSPECIFIED; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_netdev_rdma_capable_node_type); +EXPORT_SYMBOL_GPL(smbdirect_netdev_rdma_capable_node_type); __init int smbdirect_devices_init(void) { diff --git a/fs/smb/smbdirect/internal.h b/fs/smb/smbdirect/internal.h index 2d5acf2c21bc..e9959e6dc13a 100644 --- a/fs/smb/smbdirect/internal.h +++ b/fs/smb/smbdirect/internal.h @@ -6,11 +6,11 @@ #ifndef __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ #define __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ +#define DEFAULT_SYMBOL_NAMESPACE "SMBDIRECT" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "smbdirect.h" +#include <linux/smbdirect.h> #include "pdu.h" -#include "public.h" #include <linux/mutex.h> diff --git a/fs/smb/smbdirect/listen.c b/fs/smb/smbdirect/listen.c index 143a7618d95f..2f78bcaedbf8 100644 --- a/fs/smb/smbdirect/listen.c +++ b/fs/smb/smbdirect/listen.c @@ -90,7 +90,7 @@ int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog) */ return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_listen); +EXPORT_SYMBOL_GPL(smbdirect_socket_listen); static int smbdirect_new_rdma_event_handler(struct rdma_cm_id *new_id, struct rdma_cm_event *event) diff --git a/fs/smb/smbdirect/mr.c b/fs/smb/smbdirect/mr.c index 5228e699cd5d..15c6363a2f97 100644 --- a/fs/smb/smbdirect/mr.c +++ b/fs/smb/smbdirect/mr.c @@ -269,7 +269,7 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, { const struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_mr_io *mr; - int ret, num_pages; + int ret, num_pages, num_mapped; struct ib_reg_wr *reg_wr; num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); @@ -300,19 +300,22 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, num_pages, iov_iter_count(iter), sp->max_frmr_depth); smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth); - ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); - if (!ret) { + num_mapped = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + if (!num_mapped) { smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, - "ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n", - num_pages, mr->dir, ret, SMBDIRECT_DEBUG_ERR_PTR(ret)); + "ib_dma_map_sg num_pages=%u dir=%x num_mapped=%d\n", + num_pages, mr->dir, num_mapped); + ret = -EIO; goto dma_map_error; } - ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE); - if (ret != mr->sgt.nents) { + ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, num_mapped, NULL, PAGE_SIZE); + if (ret != num_mapped) { smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, - "ib_map_mr_sg failed ret = %d nents = %u\n", - ret, mr->sgt.nents); + "ib_map_mr_sg failed ret = %d num_mapped = %u\n", + ret, num_mapped); + if (ret >= 0) + ret = -EIO; goto map_mr_error; } @@ -380,7 +383,7 @@ dma_map_error: mutex_unlock(&mr->mutex); return NULL; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io); +EXPORT_SYMBOL_GPL(smbdirect_connection_register_mr_io); void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, struct smbdirect_buffer_descriptor_v1 *v1) @@ -397,7 +400,7 @@ void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, } mutex_unlock(&mr->mutex); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor); +EXPORT_SYMBOL_GPL(smbdirect_mr_io_fill_buffer_descriptor); /* * Deregister a MR after I/O is done @@ -490,4 +493,4 @@ put_kref: if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked)) mutex_unlock(&mr->mutex); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io); +EXPORT_SYMBOL_GPL(smbdirect_connection_deregister_mr_io); diff --git a/fs/smb/smbdirect/public.h b/fs/smb/smbdirect/public.h deleted file mode 100644 index 50088155e7c3..000000000000 --- a/fs/smb/smbdirect/public.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2025, Stefan Metzmacher - */ - -#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ -#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ - -struct smbdirect_buffer_descriptor_v1; -struct smbdirect_socket_parameters; - -struct smbdirect_socket; -struct smbdirect_send_batch; -struct smbdirect_mr_io; - -#define __SMBDIRECT_EXPORT_SYMBOL__(__sym) EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd") - -#include <rdma/rw.h> - -u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev); - -bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs); - -int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc); - -int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc); - -int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, - const struct smbdirect_socket_parameters *sp); - -const struct smbdirect_socket_parameters * -smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc); - -int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, - enum ib_poll_context poll_ctx, - gfp_t gfp_mask); - -#define SMBDIRECT_LOG_ERR 0x0 -#define SMBDIRECT_LOG_INFO 0x1 - -#define SMBDIRECT_LOG_OUTGOING 0x1 -#define SMBDIRECT_LOG_INCOMING 0x2 -#define SMBDIRECT_LOG_READ 0x4 -#define SMBDIRECT_LOG_WRITE 0x8 -#define SMBDIRECT_LOG_RDMA_SEND 0x10 -#define SMBDIRECT_LOG_RDMA_RECV 0x20 -#define SMBDIRECT_LOG_KEEP_ALIVE 0x40 -#define SMBDIRECT_LOG_RDMA_EVENT 0x80 -#define SMBDIRECT_LOG_RDMA_MR 0x100 -#define SMBDIRECT_LOG_RDMA_RW 0x200 -#define SMBDIRECT_LOG_NEGOTIATE 0x400 -void smbdirect_socket_set_logging(struct smbdirect_socket *sc, - void *private_ptr, - bool (*needed)(struct smbdirect_socket *sc, - void *private_ptr, - unsigned int lvl, - unsigned int cls), - void (*vaprintf)(struct smbdirect_socket *sc, - const char *func, - unsigned int line, - void *private_ptr, - unsigned int lvl, - unsigned int cls, - struct va_format *vaf)); - -bool smbdirect_connection_is_connected(struct smbdirect_socket *sc); - -int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc); - -int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr); - -void smbdirect_socket_shutdown(struct smbdirect_socket *sc); - -void smbdirect_socket_release(struct smbdirect_socket *sc); - -int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - bool is_last); - -/* - * This is only temporary and only needed - * as long as the client still requires - * to use smbdirect_connection_send_single_iter() - */ -struct smbdirect_send_batch_storage { - union { - struct list_head __msg_list; - __aligned_u64 __space[5]; - }; -}; - -struct smbdirect_send_batch * -smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, - bool need_invalidate_rkey, - unsigned int remote_key); - -int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - struct iov_iter *iter, - unsigned int flags, - u32 remaining_data_length); - -int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc); - -int smbdirect_connection_send_iter(struct smbdirect_socket *sc, - struct iov_iter *iter, - unsigned int flags, - bool need_invalidate, - unsigned int remote_key); - -int smbdirect_connection_recvmsg(struct smbdirect_socket *sc, - struct msghdr *msg, - unsigned int flags); - -int smbdirect_connect(struct smbdirect_socket *sc, - const struct sockaddr *dst); - -int smbdirect_connect_sync(struct smbdirect_socket *sc, - const struct sockaddr *dst); - -int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog); - -struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, - long timeo, - struct proto_accept_arg *arg); - -int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc, - void *buf, size_t buf_len, - struct smbdirect_buffer_descriptor_v1 *desc, - size_t desc_len, - bool is_read); - -struct smbdirect_mr_io * -smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, - struct iov_iter *iter, - bool writing, - bool need_invalidate); - -void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, - struct smbdirect_buffer_descriptor_v1 *v1); - -void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr); - -void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, - unsigned int rdma_readwrite_threshold, - struct seq_file *m); - -#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */ diff --git a/fs/smb/smbdirect/rw.c b/fs/smb/smbdirect/rw.c index c2f46b17731e..6fe38042cfb9 100644 --- a/fs/smb/smbdirect/rw.c +++ b/fs/smb/smbdirect/rw.c @@ -252,4 +252,4 @@ free_msg: kfree(msg); goto out; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_rdma_xmit); +EXPORT_SYMBOL_GPL(smbdirect_connection_rdma_xmit); diff --git a/fs/smb/smbdirect/smbdirect.h b/fs/smb/smbdirect/smbdirect.h deleted file mode 100644 index bbab5f7f7cc9..000000000000 --- a/fs/smb/smbdirect/smbdirect.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2025 Stefan Metzmacher - */ - -#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ -#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ - -#include <linux/types.h> - -/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ -struct smbdirect_buffer_descriptor_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - -/* - * Connection parameters mostly from [MS-SMBD] 3.1.1.1 - * - * These are setup and negotiated at the beginning of a - * connection and remain constant unless explicitly changed. - * - * Some values are important for the upper layer. - */ -struct smbdirect_socket_parameters { - __u64 flags; -#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) -#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) - __u32 resolve_addr_timeout_msec; - __u32 resolve_route_timeout_msec; - __u32 rdma_connect_timeout_msec; - __u32 negotiate_timeout_msec; - __u16 initiator_depth; /* limited to U8_MAX */ - __u16 responder_resources; /* limited to U8_MAX */ - __u16 recv_credit_max; - __u16 send_credit_target; - __u32 max_send_size; - __u32 max_fragmented_send_size; - __u32 max_recv_size; - __u32 max_fragmented_recv_size; - __u32 max_read_write_size; - __u32 max_frmr_depth; - __u32 keepalive_interval_msec; - __u32 keepalive_timeout_msec; -} __packed; - -#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ - SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ - SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) - -#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */ diff --git a/fs/smb/smbdirect/socket.c b/fs/smb/smbdirect/socket.c index 1b4ab01b745e..39cca7219c4d 100644 --- a/fs/smb/smbdirect/socket.c +++ b/fs/smb/smbdirect/socket.c @@ -20,7 +20,7 @@ bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs) return false; return true; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_frwr_is_supported); +EXPORT_SYMBOL_GPL(smbdirect_frwr_is_supported); static void smbdirect_socket_cleanup_work(struct work_struct *work); @@ -107,7 +107,7 @@ init_failed: alloc_failed: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_kern); +EXPORT_SYMBOL_GPL(smbdirect_socket_create_kern); int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc) { @@ -148,7 +148,7 @@ init_failed: alloc_failed: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_accepting); +EXPORT_SYMBOL_GPL(smbdirect_socket_create_accepting); int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, const struct smbdirect_socket_parameters *sp) @@ -189,14 +189,14 @@ int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_initial_parameters); +EXPORT_SYMBOL_GPL(smbdirect_socket_set_initial_parameters); const struct smbdirect_socket_parameters * smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc) { return &sc->parameters; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_get_current_parameters); +EXPORT_SYMBOL_GPL(smbdirect_socket_get_current_parameters); int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, enum ib_poll_context poll_ctx, @@ -220,7 +220,7 @@ int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_kernel_settings); +EXPORT_SYMBOL_GPL(smbdirect_socket_set_kernel_settings); void smbdirect_socket_set_logging(struct smbdirect_socket *sc, void *private_ptr, @@ -240,7 +240,7 @@ void smbdirect_socket_set_logging(struct smbdirect_socket *sc, sc->logging.needed = needed; sc->logging.vaprintf = vaprintf; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_logging); +EXPORT_SYMBOL_GPL(smbdirect_socket_set_logging); static void smbdirect_socket_wake_up_all(struct smbdirect_socket *sc) { @@ -663,13 +663,13 @@ int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr) return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_bind); +EXPORT_SYMBOL_GPL(smbdirect_socket_bind); void smbdirect_socket_shutdown(struct smbdirect_socket *sc) { smbdirect_socket_schedule_cleanup_lvl(sc, SMBDIRECT_LOG_INFO, -ESHUTDOWN); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_shutdown); +EXPORT_SYMBOL_GPL(smbdirect_socket_shutdown); static void smbdirect_socket_release_disconnect(struct kref *kref) { @@ -712,7 +712,7 @@ void smbdirect_socket_release(struct smbdirect_socket *sc) */ kref_put(&sc->refs.destroy, smbdirect_socket_release_destroy); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_release); +EXPORT_SYMBOL_GPL(smbdirect_socket_release); int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc, enum smbdirect_socket_status expected_status, diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 182e54e575ee..4e1e4f18a166 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -188,7 +188,7 @@ static int internal_create_group(struct kobject *kobj, int update, kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { - if (grp->name) + if (grp->name && !update) kernfs_remove(kn); } kernfs_put(kn); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 80ba94f51e5c..aecbab61014c 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -382,6 +382,7 @@ xfs_dir3_data_write_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_dir3_data_hdr *datahdr3 = bp->b_addr; xfs_failaddr_t fa; fa = xfs_dir3_data_verify(bp); @@ -396,6 +397,11 @@ xfs_dir3_data_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Zero padding that may be stale from old kernels. + */ + datahdr3->pad = 0; + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } @@ -728,7 +734,6 @@ xfs_dir3_data_init( struct xfs_dir2_data_unused *dup; struct xfs_dir2_data_free *bf; int error; - int i; /* * Get the buffer set up for the block. @@ -741,13 +746,16 @@ xfs_dir3_data_init( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* - * Initialize the header. + * Initialize the whole directory header region to zero + * so that all padding, bestfree entries, and any + * future header fields are clean. */ hdr = bp->b_addr; + memset(hdr, 0, geo->data_entry_offset); + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); @@ -759,10 +767,6 @@ xfs_dir3_data_init( bf = xfs_dir2_data_bestfree_p(mp, hdr); bf[0].offset = cpu_to_be16(geo->data_entry_offset); bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset); - for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - bf[i].length = 0; - bf[i].offset = 0; - } /* * Set up an unused entry for the block's body. diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 40c7f0ff6cf3..0ec6ccd8b4dc 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1414,8 +1414,7 @@ xfs_refcount_finish_one( if (rcur == NULL) { struct xfs_perag *pag = to_perag(ri->ri_group); - error = xfs_alloc_read_agf(pag, tp, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 20e63069088b..3d40cb0b2496 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -251,6 +251,17 @@ xchk_ino_set_preen( trace_xchk_ino_preen(sc, ino, __return_address); } +/* Record a block indexed by a file fork that could be optimized. */ +void +xchk_fblock_set_preen( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, whichfork, offset, __return_address); +} + /* Record something being wrong with the filesystem primary superblock. */ void xchk_set_corrupt( diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index f2ecc68538f0..b494d747c008 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,6 +25,8 @@ bool xchk_fblock_xref_process_error(struct xfs_scrub *sc, void xchk_block_set_preen(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_fblock_set_preen(struct xfs_scrub *sc, + int whichfork, xfs_fileoff_t offset); void xchk_set_corrupt(struct xfs_scrub *sc); void xchk_block_set_corrupt(struct xfs_scrub *sc, diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index bffc4666ce60..c25716fc4fee 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -300,18 +300,15 @@ xrep_cow_find_bad( * on the debugging knob, replace everything in the CoW fork. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); - if (error) - return error; - } out_sa: xchk_ag_free(sc, &sc->sa); out_pag: xfs_perag_put(pag); - return 0; + return error; } /* @@ -385,12 +382,9 @@ xrep_cow_find_bad_rt( * CoW fork and then scan for staging extents in the refcountbt. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); - if (error) - goto out_rtg; - } out_sr: xchk_rtgroup_btcur_free(&sc->sr); diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 1a71d36898b1..c2d6ad59d03e 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -454,7 +454,12 @@ xchk_da_btree_block( } } - /* XXX: Check hdr3.pad32 once we know how to fix it. */ + if (xfs_has_crc(ip->i_mount)) { + struct xfs_da3_node_hdr *nodehdr3 = blk->bp->b_addr; + + if (nodehdr3->__pad32) + xchk_da_set_preen(ds, level); + } break; default: xchk_da_set_corrupt(ds, level); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index e09724cd3725..09715a4aa154 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -492,7 +492,12 @@ xchk_directory_data_bestfree( goto out; xchk_buffer_recheck(sc, bp); - /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + if (xfs_has_crc(sc->mp)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad) + xchk_fblock_set_preen(sc, XFS_DATA_FORK, lblk); + } if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out_buf; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 580d40a5ee57..0cea458f1353 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -472,6 +472,7 @@ xfs_buf_find_insert( /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; +retry: rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); @@ -480,8 +481,16 @@ xfs_buf_find_insert( error = PTR_ERR(bp); goto out_free_buf; } - if (bp && lockref_get_not_dead(&bp->b_lockref)) { - /* found an existing buffer */ + if (bp) { + /* + * If there is an existing buffer with a dead lockref, retry + * until the new buffer is added, or a usable buffer is found. + */ + if (!lockref_get_not_dead(&bp->b_lockref)) { + rcu_read_unlock(); + cpu_relax(); + goto retry; + } rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) @@ -820,15 +829,20 @@ xfs_buf_destroy( ASSERT(__lockref_is_dead(&bp->b_lockref)); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + if (bp->b_pag) + xfs_perag_put(bp->b_pag); + xfs_buf_free(bp); +} + +static inline void +xfs_buf_kill( + struct xfs_buf *bp) +{ + lockref_mark_dead(&bp->b_lockref); if (!xfs_buf_is_uncached(bp)) { rhashtable_remove_fast(&bp->b_target->bt_hash, &bp->b_rhash_head, xfs_buf_hash_params); - - if (bp->b_pag) - xfs_perag_put(bp->b_pag); } - - xfs_buf_free(bp); } /* @@ -851,7 +865,7 @@ xfs_buf_rele( return; kill: - lockref_mark_dead(&bp->b_lockref); + xfs_buf_kill(bp); list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); @@ -1433,7 +1447,7 @@ xfs_buftarg_drain_rele( return LRU_SKIP; } - lockref_mark_dead(&bp->b_lockref); + xfs_buf_kill(bp); list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; @@ -1545,7 +1559,7 @@ xfs_buftarg_isolate( return LRU_ROTATE; } - lockref_mark_dead(&bp->b_lockref); + xfs_buf_kill(bp); list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index beaa26ec62da..9978ac1422fc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -699,12 +699,6 @@ xfs_create( */ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, &tp); - if (error == -ENOSPC) { - /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(mp); - error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, - resblks, &tp); - } if (error) goto out_parent; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 46e234863644..96af6b62ce39 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -409,6 +409,26 @@ xfs_ioc_ag_geometry( return 0; } +static void +xfs_rtgroup_report_write_pointer( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + if (rtg->rtg_open_zone) { + rgeo->rg_writepointer = rtg->rtg_open_zone->oz_allocated; + } else { + xfs_rgblock_t highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + + if (highest_rgbno == NULLRGBLOCK) + rgeo->rg_writepointer = 0; + else + rgeo->rg_writepointer = highest_rgbno + 1; + } + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + rgeo->rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER; +} + STATIC int xfs_ioc_rtgroup_geometry( struct xfs_mount *mp, @@ -416,7 +436,6 @@ xfs_ioc_rtgroup_geometry( { struct xfs_rtgroup *rtg; struct xfs_rtgroup_geometry rgeo; - xfs_rgblock_t highest_rgbno; int error; if (copy_from_user(&rgeo, arg, sizeof(rgeo))) @@ -433,28 +452,16 @@ xfs_ioc_rtgroup_geometry( return -EINVAL; error = xfs_rtgroup_get_geometry(rtg, &rgeo); - xfs_rtgroup_put(rtg); if (error) - return error; - - if (xfs_has_zoned(mp)) { - xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); - if (rtg->rtg_open_zone) { - rgeo.rg_writepointer = rtg->rtg_open_zone->oz_allocated; - } else { - highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); - if (highest_rgbno == NULLRGBLOCK) - rgeo.rg_writepointer = 0; - else - rgeo.rg_writepointer = highest_rgbno + 1; - } - xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); - rgeo.rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER; - } + goto out_put_rtg; + if (xfs_has_zoned(mp)) + xfs_rtgroup_report_write_pointer(rtg, &rgeo); if (copy_to_user(arg, &rgeo, sizeof(rgeo))) - return -EFAULT; - return 0; + error = -EFAULT; +out_put_rtg: + xfs_rtgroup_put(rtg); + return error; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b24195f570cd..7aa51826b1ca 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1149,9 +1149,12 @@ xfs_mountfs( * blocks. */ error = xfs_fs_reserve_ag_blocks(mp); - if (error && error == -ENOSPC) + if (error) { + if (error != -ENOSPC) + goto out_rtunmount; xfs_warn(mp, - "ENOSPC reserving per-AG metadata pool, log recovery may fail."); +"ENOSPC reserving per-AG metadata pool, log recovery may fail."); + } error = xfs_log_mount_finish(mp); xfs_fs_unreserve_ag_blocks(mp); if (error) { diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 64c8afb935c2..b994ff15d5e4 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -350,7 +350,7 @@ xfs_dax_notify_dev_failure( /* * Shutdown fs from a force umount in pre-remove case which won't fail, * so errors can be ignored. Otherwise, shutdown the filesystem with - * CORRUPT flag if error occured or notify.want_shutdown was set during + * CORRUPT flag if error occurred or notify.want_shutdown was set during * RMAP querying. */ if (mf_flags & MF_MEM_PRE_REMOVE) diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 221e55887a2a..d92993367ab6 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -118,7 +118,6 @@ xfs_fs_map_blocks( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; loff_t limit; - int bmapi_flags = XFS_BMAPI_ENTIRE; int nimaps = 1; uint lock_flags; int error = 0; @@ -172,14 +171,18 @@ xfs_fs_map_blocks( offset_fsb = XFS_B_TO_FSBT(mp, offset); lock_flags = xfs_ilock_data_map_shared(ip); + /* request mappings for the specified range only */ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, bmapi_flags); + &imap, &nimaps, 0); + if (error) { + xfs_iunlock(ip, lock_flags); + goto out_unlock; + } seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); - if (!error && write && - (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) { + if (write && (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) { if (offset + length > XFS_ISIZE(ip)) end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index db23a0f231d6..251dec48f0e3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -949,16 +949,16 @@ xfs_reflink_end_cow( * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * - * If we're being called by writeback then the pages will still - * have PageWriteback set, which prevents races with reflink remapping - * and truncate. Reflink remapping prevents races with writeback by - * taking the iolock and mmaplock before flushing the pages and - * remapping, which means there won't be any further writeback or page - * cache dirtying until the reflink completes. + * If we're being called by writeback then the folios will still + * have the writeback flag set, which prevents races with reflink + * remapping and truncate. Reflink remapping prevents races with + * writeback by taking the iolock and mmaplock before flushing + * the folios and remapping, which means there won't be any further + * writeback or page cache dirtying until the reflink completes. * * We should never have two threads issuing writeback for the same file * region. There are also have post-eof checks in the writeback - * preparation code so that we don't bother writing out pages that are + * preparation code so that we don't bother writing out folios that are * about to be truncated. * * If we're being called as part of directio write completion, the dio diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bcc470f56e46..148cc32449c1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1199,10 +1199,21 @@ xfs_trans_alloc_icreate( { struct xfs_trans *tp; bool retried = false; + bool flushed = false; int error; retry: error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); + if (error == -ENOSPC && !flushed) { + /* + * Flush all delalloc blocks to reclaim space from speculative + * preallocation. This is similar to the quota retry below + * but targets FS-wide ENOSPC. + */ + xfs_flush_inodes(mp); + flushed = true; + goto retry; + } if (error) return error; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index a851b98143c0..5e297b75a85f 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1170,7 +1170,7 @@ xfs_calc_open_zones( if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { mp->m_max_open_zones = bdev_open_zones; - xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", + xfs_info(mp, "limiting open zones to %u due to hardware limit.", bdev_open_zones); } @@ -1217,7 +1217,7 @@ xfs_alloc_zone_info( return zi; out_free_bitmaps: - while (--i > 0) + while (--i >= 0) kvfree(zi->zi_used_bucket_bitmap[i]); kfree(zi); return NULL; diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index fedcc47048af..f03211e4354a 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -400,7 +400,7 @@ retry: /* * If the inode was already deleted, skip over it. */ - if (error == -ENOENT) { + if (error == -ENOENT || error == -EINVAL) { iter->rec_idx++; goto retry; } @@ -1221,7 +1221,7 @@ out_put_oz: if (data->oz) xfs_open_zone_put(data->oz); out_free_gc_data: - kfree(data); + xfs_zone_gc_data_free(data); return error; } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 9b646cb5335d..ff43d6d1ea30 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -610,10 +610,14 @@ static long zonefs_fname_to_fno(const struct qstr *fname) return c - '0'; for (i = 0, rname = name + len - 1; i < len; i++, rname--) { + long digit; + c = *rname; if (!isdigit(c)) return -ENOENT; - fno += (c - '0') * shift; + digit = (c - '0') * shift; + if (check_add_overflow(fno, digit, &fno)) + return -ENOENT; shift *= 10; } |
