diff options
| author | Trond Myklebust <trond.myklebust@hammerspace.com> | 2025-12-08 14:45:00 -0500 |
|---|---|---|
| committer | Trond Myklebust <trond.myklebust@hammerspace.com> | 2026-01-04 23:03:24 -0500 |
| commit | 857bf9056291a16785ae3be1d291026b2437fc48 (patch) | |
| tree | 5db91966162989894e4a164e099bdc02ab60c8f8 /fs | |
| parent | 3609fa95fb0f2c1b099e69e56634edb8fc03f87c (diff) | |
pNFS: Fix a deadlock when returning a delegation during open()
Ben Coddington reports seeing a hang in the following stack trace:
0 [ffffd0b50e1774e0] __schedule at ffffffff9ca05415
1 [ffffd0b50e177548] schedule at ffffffff9ca05717
2 [ffffd0b50e177558] bit_wait at ffffffff9ca061e1
3 [ffffd0b50e177568] __wait_on_bit at ffffffff9ca05cfb
4 [ffffd0b50e1775c8] out_of_line_wait_on_bit at ffffffff9ca05ea5
5 [ffffd0b50e177618] pnfs_roc at ffffffffc154207b [nfsv4]
6 [ffffd0b50e1776b8] _nfs4_proc_delegreturn at ffffffffc1506586 [nfsv4]
7 [ffffd0b50e177788] nfs4_proc_delegreturn at ffffffffc1507480 [nfsv4]
8 [ffffd0b50e1777f8] nfs_do_return_delegation at ffffffffc1523e41 [nfsv4]
9 [ffffd0b50e177838] nfs_inode_set_delegation at ffffffffc1524a75 [nfsv4]
10 [ffffd0b50e177888] nfs4_process_delegation at ffffffffc14f41dd [nfsv4]
11 [ffffd0b50e1778a0] _nfs4_opendata_to_nfs4_state at ffffffffc1503edf [nfsv4]
12 [ffffd0b50e1778c0] _nfs4_open_and_get_state at ffffffffc1504e56 [nfsv4]
13 [ffffd0b50e177978] _nfs4_do_open at ffffffffc15051b8 [nfsv4]
14 [ffffd0b50e1779f8] nfs4_do_open at ffffffffc150559c [nfsv4]
15 [ffffd0b50e177a80] nfs4_atomic_open at ffffffffc15057fb [nfsv4]
16 [ffffd0b50e177ad0] nfs4_file_open at ffffffffc15219be [nfsv4]
17 [ffffd0b50e177b78] do_dentry_open at ffffffff9c09e6ea
18 [ffffd0b50e177ba8] vfs_open at ffffffff9c0a082e
19 [ffffd0b50e177bd0] dentry_open at ffffffff9c0a0935
The issue is that the delegreturn is being asked to wait for a layout
return that cannot complete because a state recovery was initiated. The
state recovery cannot complete until the open() finishes processing the
delegations it was given.
The solution is to propagate the existing flags that indicate a
non-blocking call to the function pnfs_roc(), so that it knows not to
wait in this situation.
Reported-by: Benjamin Coddington <bcodding@hammerspace.com>
Fixes: 29ade5db1293 ("pNFS: Wait on outstanding layoutreturns to complete in pnfs_roc()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/nfs/nfs4proc.c | 6 | ||||
| -rw-r--r-- | fs/nfs/pnfs.c | 58 | ||||
| -rw-r--r-- | fs/nfs/pnfs.h | 17 |
3 files changed, 51 insertions, 30 deletions
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ec1ce593dea2..51da62ba6559 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3894,8 +3894,8 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - calldata->lr.roc = pnfs_roc(state->inode, - &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred); + calldata->lr.roc = pnfs_roc(state->inode, &calldata->lr.arg, + &calldata->lr.res, msg.rpc_cred, wait); if (calldata->lr.roc) { calldata->arg.lr_args = &calldata->lr.arg; calldata->res.lr_res = &calldata->lr.res; @@ -7005,7 +7005,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data->inode = nfs_igrab_and_active(inode); if (data->inode || issync) { data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, - cred); + cred, issync); if (data->lr.roc) { data->args.lr_args = &data->lr.arg; data->res.lr_res = &data->lr.res; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b72d7cc36766..cff225721d1c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1533,10 +1533,9 @@ static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo) PNFS_FL_LAYOUTRETURN_PRIVILEGED); } -bool pnfs_roc(struct inode *ino, - struct nfs4_layoutreturn_args *args, - struct nfs4_layoutreturn_res *res, - const struct cred *cred) +bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, const struct cred *cred, + bool sync) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_open_context *ctx; @@ -1547,7 +1546,7 @@ bool pnfs_roc(struct inode *ino, nfs4_stateid stateid; enum pnfs_iomode iomode = 0; bool layoutreturn = false, roc = false; - bool skip_read = false; + bool skip_read; if (!nfs_have_layout(ino)) return false; @@ -1560,20 +1559,14 @@ retry: lo = NULL; goto out_noroc; } - pnfs_get_layout_hdr(lo); - if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { - spin_unlock(&ino->i_lock); - rcu_read_unlock(); - wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, - TASK_UNINTERRUPTIBLE); - pnfs_put_layout_hdr(lo); - goto retry; - } /* no roc if we hold a delegation */ + skip_read = false; if (nfs4_check_delegation(ino, FMODE_READ)) { - if (nfs4_check_delegation(ino, FMODE_WRITE)) + if (nfs4_check_delegation(ino, FMODE_WRITE)) { + lo = NULL; goto out_noroc; + } skip_read = true; } @@ -1582,12 +1575,43 @@ retry: if (state == NULL) continue; /* Don't return layout if there is open file state */ - if (state->state & FMODE_WRITE) + if (state->state & FMODE_WRITE) { + lo = NULL; goto out_noroc; + } if (state->state & FMODE_READ) skip_read = true; } + if (skip_read) { + bool writes = false; + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (lseg->pls_range.iomode != IOMODE_READ) { + writes = true; + break; + } + } + if (!writes) { + lo = NULL; + goto out_noroc; + } + } + + pnfs_get_layout_hdr(lo); + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + if (!sync) { + pnfs_set_plh_return_info( + lo, skip_read ? IOMODE_RW : IOMODE_ANY, 0); + goto out_noroc; + } + spin_unlock(&ino->i_lock); + rcu_read_unlock(); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto retry; + } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { if (skip_read && lseg->pls_range.iomode == IOMODE_READ) @@ -1627,7 +1651,7 @@ retry: out_noroc: spin_unlock(&ino->i_lock); rcu_read_unlock(); - pnfs_layoutcommit_inode(ino, true); + pnfs_layoutcommit_inode(ino, sync); if (roc) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; if (ld->prepare_layoutreturn) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 91ff877185c8..3db8f13d8fe4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -303,10 +303,9 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq); int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list); -bool pnfs_roc(struct inode *ino, - struct nfs4_layoutreturn_args *args, - struct nfs4_layoutreturn_res *res, - const struct cred *cred); +bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, const struct cred *cred, + bool sync); int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, struct nfs4_layoutreturn_res **respp, int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, @@ -773,12 +772,10 @@ pnfs_layoutcommit_outstanding(struct inode *inode) return false; } - -static inline bool -pnfs_roc(struct inode *ino, - struct nfs4_layoutreturn_args *args, - struct nfs4_layoutreturn_res *res, - const struct cred *cred) +static inline bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct cred *cred, bool sync) { return false; } |
