From bf94dea7fd4e6708d1a784be23db65eff84d82f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2025 11:53:35 -0400 Subject: [PATCH 01/38] svcrdma: Release transport resources synchronously NFSD has always supported adding network listeners. The new netlink protocol now enables the removal of listeners. Olga noticed that if an RDMA listener is removed and immediately re-added, the deferred __svc_rdma_free() function might not have run yet, so some or all of the old listener's RDMA resources linger, which prevents a new listener on the same address from being created. Also, svc_xprt_free() does a module_put() just after calling ->xpo_free(). That means if there is deferred work going on, the module could be unloaded before that work is even started, resulting in a UAF. Neil asks: > What particular part of __svc_rdma_free() needs to run in order for a > subsequent registration to succeed? > Can that bit be run directory from svc_rdma_free() rather than be > delayed? > (I know almost nothing about rdma so forgive me if the answers to these > questions seems obvious) The reasons I can recall are: - Some of the transport tear-down work can sleep - Releasing a cm_id is tricky and can deadlock We might be able to mitigate the second issue with judicious application of transport reference counting. 
Reported-by: Olga Kornievskaia Closes: https://lore.kernel.org/linux-nfs/20250821204328.89218-1-okorniev@redhat.com/ Suggested-by: NeilBrown Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3d7f1413df02..b7b318ad25c4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -591,12 +591,18 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_disconnect(rdma->sc_cm_id); } -static void __svc_rdma_free(struct work_struct *work) +/** + * svc_rdma_free - Release class-specific transport resources + * @xprt: Generic svc transport object + */ +static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = - container_of(work, struct svcxprt_rdma, sc_work); + container_of(xprt, struct svcxprt_rdma, sc_xprt); struct ib_device *device = rdma->sc_cm_id->device; + might_sleep(); + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); @@ -629,15 +635,6 @@ static void __svc_rdma_free(struct work_struct *work) kfree(rdma); } -static void svc_rdma_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - INIT_WORK(&rdma->sc_work, __svc_rdma_free); - schedule_work(&rdma->sc_work); -} - static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = From 89bd77cf436bf25e448817a662ebf76515f22863 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Sep 2025 12:55:13 +1000 Subject: [PATCH 02/38] nfsd: move name lookup out of nfsd4_list_rec_dir() nfsd4_list_rec_dir() is called with two different callbacks. One of the callbacks uses vfs_rmdir() to remove the directory. The other doesn't use the dentry at all, just the name. 
As only one callback needs the dentry, this patch moves the lookup into that callback. This prepares for changes to how directory operations are locked. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4recover.c | 54 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e2b9472e5c78..e9d09541161c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -237,7 +237,7 @@ out_creds: nfs4_reset_creds(original_cred); } -typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); +typedef int (recdir_func)(struct dentry *, char *, struct nfsd_net *); struct name_list { char name[HEXDIR_LEN]; @@ -291,24 +291,14 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { - if (!status) { - struct dentry *dentry; - dentry = lookup_one(&nop_mnt_idmap, - &QSTR(entry->name), dir); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - break; - } - status = f(dir, dentry, nn); - dput(dentry); - } + if (!status) + status = f(dir, entry->name, nn); + list_del(&entry->list); kfree(entry); } - inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -406,18 +396,19 @@ out: } static int -purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +purge_old(struct dentry *parent, char *cname, struct nfsd_net *nn) { int status; + struct dentry *child; struct xdr_netobj name; - if (child->d_name.len != HEXDIR_LEN - 1) { - printk("%s: illegal name %pd in recovery directory\n", - __func__, child); + if (strlen(cname) != HEXDIR_LEN - 1) { + printk("%s: illegal name %s in recovery directory\n", + __func__, cname); /* Keep trying; maybe the others are OK: */ return 0; } - name.data = 
kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); + name.data = kstrdup(cname, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); @@ -427,10 +418,17 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); - if (status) - printk("failed to remove client recovery directory %pd\n", - child); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); + child = lookup_one(&nop_mnt_idmap, &QSTR(cname), parent); + if (!IS_ERR(child)) { + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); + if (status) + printk("failed to remove client recovery directory %pd\n", + child); + dput(child); + } + inode_unlock(d_inode(parent)); + out_free: kfree(name.data); out: @@ -461,18 +459,18 @@ out: } static int -load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +load_recdir(struct dentry *parent, char *cname, struct nfsd_net *nn) { struct xdr_netobj name; struct xdr_netobj princhash = { .len = 0, .data = NULL }; - if (child->d_name.len != HEXDIR_LEN - 1) { - printk("%s: illegal name %pd in recovery directory\n", - __func__, child); + if (strlen(cname) != HEXDIR_LEN - 1) { + printk("%s: illegal name %s in recovery directory\n", + __func__, cname); /* Keep trying; maybe the others are OK: */ return 0; } - name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); + name.data = kstrdup(cname, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); From 4552f4e3f2c96597914f07b060d5c5db84420ddd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 8 Sep 2025 11:38:33 +1000 Subject: [PATCH 03/38] nfsd: change nfs4_client_to_reclaim() to allocate data The calling convention for nfs4_client_to_reclaim() is clumsy in that the caller needs to free memory if the function fails. 
It is much cleaner if the function frees its own memory. This patch changes nfs4_client_to_reclaim() to re-allocate the .data fields to be stored in the newly allocated struct nfs4_client_reclaim, and to free everything on failure. __cld_pipe_inprogress_downcall() needs to allocate the data anyway to copy it from user-space, so now that data is allocated twice. I think that is a small price to pay for a cleaner interface. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4recover.c | 67 +++++++++++++++---------------------------- fs/nfsd/nfs4state.c | 22 ++++++++++++-- 2 files changed, 42 insertions(+), 47 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e9d09541161c..b1005abcb903 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -147,24 +147,13 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error) static void __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, - const char *dname, int len, struct nfsd_net *nn) + char *dname, struct nfsd_net *nn) { - struct xdr_netobj name; + struct xdr_netobj name = { .len = strlen(dname), .data = dname }; struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; - name.data = kmemdup(dname, len, GFP_KERNEL); - if (!name.data) { - dprintk("%s: failed to allocate memory for name.data!\n", - __func__); - return; - } - name.len = len; crp = nfs4_client_to_reclaim(name, princhash, nn); - if (!crp) { - kfree(name.data); - return; - } crp->cr_clp = clp; } @@ -223,8 +212,7 @@ out_unlock: inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) - __nfsd4_create_reclaim_record_grace(clp, dname, - HEXDIR_LEN, nn); + __nfsd4_create_reclaim_record_grace(clp, dname, nn); vfs_fsync(nn->rec_file, 0); } else { printk(KERN_ERR "NFSD: failed to write recovery record" @@ -461,7 +449,7 @@ out: static int load_recdir(struct dentry *parent, char *cname, struct nfsd_net *nn) { - struct xdr_netobj name; + 
struct xdr_netobj name = { .len = HEXDIR_LEN, .data = cname }; struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (strlen(cname) != HEXDIR_LEN - 1) { @@ -470,16 +458,7 @@ load_recdir(struct dentry *parent, char *cname, struct nfsd_net *nn) /* Keep trying; maybe the others are OK: */ return 0; } - name.data = kstrdup(cname, GFP_KERNEL); - if (!name.data) { - dprintk("%s: failed to allocate memory for name.data!\n", - __func__); - goto out; - } - name.len = HEXDIR_LEN; - if (!nfs4_client_to_reclaim(name, princhash, nn)) - kfree(name.data); -out: + nfs4_client_to_reclaim(name, princhash, nn); return 0; } @@ -777,6 +756,8 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, { uint8_t cmd, princhashlen; struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; + char *namecopy __free(kfree) = NULL; + char *princhashcopy __free(kfree) = NULL; uint16_t namelen; if (get_user(cmd, &cmsg->cm_cmd)) { @@ -794,19 +775,19 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, dprintk("%s: invalid namelen (%u)", __func__, namelen); return -EINVAL; } - name.data = memdup_user(&ci->cc_name.cn_id, namelen); - if (IS_ERR(name.data)) - return PTR_ERR(name.data); + namecopy = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR(namecopy)) + return PTR_ERR(namecopy); + name.data = namecopy; name.len = namelen; get_user(princhashlen, &ci->cc_princhash.cp_len); if (princhashlen > 0) { - princhash.data = memdup_user( - &ci->cc_princhash.cp_data, - princhashlen); - if (IS_ERR(princhash.data)) { - kfree(name.data); - return PTR_ERR(princhash.data); - } + princhashcopy = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR(princhashcopy)) + return PTR_ERR(princhashcopy); + princhash.data = princhashcopy; princhash.len = princhashlen; } else princhash.len = 0; @@ -820,9 +801,10 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, dprintk("%s: invalid namelen (%u)", __func__, namelen); return -EINVAL; } 
- name.data = memdup_user(&cnm->cn_id, namelen); - if (IS_ERR(name.data)) - return PTR_ERR(name.data); + namecopy = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR(namecopy)) + return PTR_ERR(namecopy); + name.data = namecopy; name.len = namelen; } #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING @@ -830,15 +812,12 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct cld_net *cn = nn->cld_net; name.len = name.len - 5; - memmove(name.data, name.data + 5, name.len); + name.data = name.data + 5; cn->cn_has_legacy = true; } #endif - if (!nfs4_client_to_reclaim(name, princhash, nn)) { - kfree(name.data); - kfree(princhash.data); + if (!nfs4_client_to_reclaim(name, princhash, nn)) return -EFAULT; - } return nn->client_tracking_ops->msglen; } return -EFAULT; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8a6960500217..af7a20ded1ca 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8801,9 +8801,6 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) /* * failure => all reset bets are off, nfserr_no_grace... - * - * The caller is responsible for freeing name.data if NULL is returned (it - * will be freed in nfs4_remove_reclaim_record in the normal case). 
*/ struct nfs4_client_reclaim * nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, @@ -8812,6 +8809,22 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, unsigned int strhashval; struct nfs4_client_reclaim *crp; + name.data = kmemdup(name.data, name.len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return NULL; + } + if (princhash.len) { + princhash.data = kmemdup(princhash.data, princhash.len, GFP_KERNEL); + if (!princhash.data) { + dprintk("%s: failed to allocate memory for princhash.data!\n", + __func__); + kfree(name.data); + return NULL; + } + } else + princhash.data = NULL; crp = alloc_reclaim(); if (crp) { strhashval = clientstr_hashval(name); @@ -8823,6 +8836,9 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; + } else { + kfree(name.data); + kfree(princhash.data); } return crp; } From b5fc406bc730806662429272300fb56e4e6592d8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Sep 2025 11:48:08 -0400 Subject: [PATCH 04/38] nfsd: switch the default for NFSD_LEGACY_CLIENT_TRACKING to "n" We added this Kconfig option a little over a year ago. Switch the default to "n" in preparation for its eventual removal. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index e134dce45e35..df09c5cefb7c 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -164,7 +164,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_LEGACY_CLIENT_TRACKING bool "Support legacy NFSv4 client tracking methods (DEPRECATED)" depends on NFSD_V4 - default y + default n help The NFSv4 server needs to store a small amount of information on stable storage in order to handle state recovery after reboot. 
Most From ccd608e29b7a73b7bee45b06bfeef088a97c4c92 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Sep 2025 10:31:40 -0400 Subject: [PATCH 05/38] NFSD: Add array bounds-checking in nfsd_iter_read() The *count parameter does not appear to be explicitly restricted to being smaller than rsize, so it might be possible to overrun the rq_bvec or rq_pages arrays. Rather than overrunning these arrays (damage done!) and then WARNING once, let's harden the loop so that it terminates before the end of the arrays is reached. This should result in a short read, which is OK -- clients recover by sending additional READ requests for the remaining unread bytes. Reported-by: NeilBrown Reviewed-by: Jeff Layton Reviewed-by: Mike Snitzer Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9cb20d4aeab1..ea9c2de70429 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1115,18 +1115,20 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, v = 0; total = *count; - while (total) { + while (total && v < rqstp->rq_maxpages && + rqstp->rq_next_page < rqstp->rq_page_end) { len = min_t(size_t, total, PAGE_SIZE - base); - bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++), + bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page, len, base); + total -= len; + ++rqstp->rq_next_page; ++v; base = 0; } - WARN_ON_ONCE(v > rqstp->rq_maxpages); - trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count); + trace_nfsd_read_vector(rqstp, fhp, offset, *count - total); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count - total); host_err = vfs_iocb_iter_read(file, &kiocb, &iter); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } From bfce8e4273d81a1d056ed3d79a27e1b2c4f60759 Mon Sep 17 00:00:00 2001 From: Matvey Kovalev Date: Mon, 29 Sep 2025 20:35:20 +0300 
Subject: [PATCH 06/38] nfsd: delete unreachable confusing code in nfs4_open_delegation() op_delegate_type is assigned OPEN_DELEGATE_NONE just before the if-block where condition specifies it not be equal to OPEN_DELEGATE_NONE. Compiler treats the block as unreachable and optimizes it out from the resulting executable. In that aspect commit d08d32e6e5c0 ("nfsd4: return delegation immediately if lease fails") notably makes no difference. Seems it's better to just drop this code instead of fiddling with memory barriers or atomics. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Matvey Kovalev Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index af7a20ded1ca..085f5ef12230 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6362,11 +6362,6 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, return; out_no_deleg: open->op_delegate_type = OPEN_DELEGATE_NONE; - if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && - open->op_delegate_type != OPEN_DELEGATE_NONE) { - dprintk("NFSD: WARNING: refusing delegation reclaim\n"); - open->op_recall = true; - } /* 4.1 client asking for a delegation? */ if (open->op_deleg_want) From 166274a2456e5de9dc7e63a0818f7b842f218a5c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 1 Oct 2025 09:24:31 -0400 Subject: [PATCH 07/38] NFSD: Update comment documenting unsupported fattr4 attributes TIME_CREATE has been supported since commit e377a3e698fb ("nfsd: Add support for the birth time attribute"). 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index b752433c3c2c..4b7296881f31 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -397,14 +397,13 @@ enum { #define NFSD_CB_GETATTR_TIMEOUT NFSD_DELEGRETURN_TIMEOUT /* - * The following attributes are currently not supported by the NFSv4 server: + * The following attributes are not implemented by NFSD: * ARCHIVE (deprecated anyway) * HIDDEN (unlikely to be supported any time soon) * MIMETYPE (unlikely to be supported any time soon) * QUOTA_* (will be supported in a forthcoming patch) * SYSTEM (unlikely to be supported any time soon) * TIME_BACKUP (unlikely to be supported any time soon) - * TIME_CREATE (unlikely to be supported any time soon) */ #define NFSD4_SUPPORTED_ATTRS_WORD0 \ (FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ From 566a414558aec1ab263ab8709fa783dfa2e34325 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 2 Oct 2025 10:00:52 -0400 Subject: [PATCH 08/38] svcrdma: Increase the server's default RPC/RDMA credit grant The range of commits from commit e3274026e2ec ("SUNRPC: move all of xprt handling into svc_xprt_handle()") to commit 15d39883ee7d ("SUNRPC: change the back-channel queue to lwq") enabled NFSD performance to scale better as the number of nfsd threads is increased. These commits were merged in v6.7. Now that the nfsd thread count can scale to more threads, permit individual clients to make more use of those threads. Increase the RPC/RDMA per-connection credit grant from 64 to 128 -- same as the Linux NFS client. Simple single client fio-based benchmarking so far shows only improvement, no regression. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 22704c2e5b9b..57f4fd94166a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -131,7 +131,7 @@ static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp) */ enum { RPCRDMA_LISTEN_BACKLOG = 10, - RPCRDMA_MAX_REQUESTS = 64, + RPCRDMA_MAX_REQUESTS = 128, RPCRDMA_MAX_BC_REQUESTS = 2, }; From 3524b021b0ec620a76c89aee78e9d4b4130fb711 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 3 Oct 2025 12:11:03 +0300 Subject: [PATCH 09/38] NFSD/blocklayout: Fix minlength check in proc_layoutget The extent returned by the file system may have a smaller offset than the segment offset requested by the client. In this case, the minimum segment length must be checked against the requested range. Otherwise, the client may not be able to continue the read/write operation. 
Fixes: 8650b8a05850 ("nfsd: pNFS block layout driver") Signed-off-by: Sergey Bashirov Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index fde5539cf6a6..425648565ab2 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -23,6 +23,7 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, { struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; + u64 length; u32 block_size = i_blocksize(inode); struct pnfs_block_extent *bex; struct iomap iomap; @@ -56,7 +57,8 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, goto out_error; } - if (iomap.length < args->lg_minlength) { + length = iomap.offset + iomap.length - seg->offset; + if (length < args->lg_minlength) { dprintk("pnfsd: extent smaller than minlength\n"); goto out_layoutunavailable; } From a1dce715c64d4376321b5534366ae48fd7d14bcc Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 3 Oct 2025 12:11:04 +0300 Subject: [PATCH 10/38] NFSD/blocklayout: Extract extent mapping from proc_layoutget No changes in functionality. Split the proc_layoutget function to create a helper function that maps single extent to the requested range. This helper function is then used to implement support for multiple extents per LAYOUTGET. Signed-off-by: Sergey Bashirov Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 115 ++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 49 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 425648565ab2..35a95501db63 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -17,68 +17,44 @@ #define NFSDDBG_FACILITY NFSDDBG_PNFS +/* + * Get an extent from the file system that starts at offset or below + * and may be shorter than the requested length. 
+ */ static __be32 -nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, - const struct svc_fh *fhp, struct nfsd4_layoutget *args) +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp, + u64 offset, u64 length, u32 iomode, u64 minlength, + struct pnfs_block_extent *bex) { - struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; - u64 length; - u32 block_size = i_blocksize(inode); - struct pnfs_block_extent *bex; struct iomap iomap; u32 device_generation = 0; int error; - if (locks_in_grace(SVC_NET(rqstp))) - return nfserr_grace; - - if (seg->offset & (block_size - 1)) { - dprintk("pnfsd: I/O misaligned\n"); - goto out_layoutunavailable; - } - - /* - * Some clients barf on non-zero block numbers for NONE or INVALID - * layouts, so make sure to zero the whole structure. - */ - error = -ENOMEM; - bex = kzalloc(sizeof(*bex), GFP_KERNEL); - if (!bex) - goto out_error; - args->lg_content = bex; - - error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, - &iomap, seg->iomode != IOMODE_READ, - &device_generation); + error = sb->s_export_op->map_blocks(inode, offset, length, &iomap, + iomode != IOMODE_READ, &device_generation); if (error) { if (error == -ENXIO) - goto out_layoutunavailable; - goto out_error; - } - - length = iomap.offset + iomap.length - seg->offset; - if (length < args->lg_minlength) { - dprintk("pnfsd: extent smaller than minlength\n"); - goto out_layoutunavailable; + return nfserr_layoutunavailable; + return nfserrno(error); } switch (iomap.type) { case IOMAP_MAPPED: - if (seg->iomode == IOMODE_READ) + if (iomode == IOMODE_READ) bex->es = PNFS_BLOCK_READ_DATA; else bex->es = PNFS_BLOCK_READWRITE_DATA; bex->soff = iomap.addr; break; case IOMAP_UNWRITTEN: - if (seg->iomode & IOMODE_RW) { + if (iomode & IOMODE_RW) { /* * Crack monkey special case from section 2.3.1. 
*/ - if (args->lg_minlength == 0) { + if (minlength == 0) { dprintk("pnfsd: no soup for you!\n"); - goto out_layoutunavailable; + return nfserr_layoutunavailable; } bex->es = PNFS_BLOCK_INVALID_DATA; @@ -87,7 +63,7 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, } fallthrough; case IOMAP_HOLE: - if (seg->iomode == IOMODE_READ) { + if (iomode == IOMODE_READ) { bex->es = PNFS_BLOCK_NONE_DATA; break; } @@ -95,27 +71,68 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, case IOMAP_DELALLOC: default: WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); - goto out_layoutunavailable; + return nfserr_layoutunavailable; } error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); if (error) - goto out_error; + return nfserrno(error); + bex->foff = iomap.offset; bex->len = iomap.length; + return nfs_ok; +} - seg->offset = iomap.offset; - seg->length = iomap.length; +static __be32 +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + struct pnfs_block_extent *bex; + u64 length; + u32 block_size = i_blocksize(inode); + __be32 nfserr; + + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + nfserr = nfserr_layoutunavailable; + if (seg->offset & (block_size - 1)) { + dprintk("pnfsd: I/O misaligned\n"); + goto out_error; + } + + /* + * Some clients barf on non-zero block numbers for NONE or INVALID + * layouts, so make sure to zero the whole structure. 
+ */ + nfserr = nfserrno(-ENOMEM); + bex = kzalloc(sizeof(*bex), GFP_KERNEL); + if (!bex) + goto out_error; + args->lg_content = bex; + + nfserr = nfsd4_block_map_extent(inode, fhp, seg->offset, seg->length, + seg->iomode, args->lg_minlength, bex); + if (nfserr != nfs_ok) + goto out_error; + + nfserr = nfserr_layoutunavailable; + length = bex->foff + bex->len - seg->offset; + if (length < args->lg_minlength) { + dprintk("pnfsd: extent smaller than minlength\n"); + goto out_error; + } + + seg->offset = bex->foff; + seg->length = bex->len; dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); - return 0; + return nfs_ok; out_error: seg->length = 0; - return nfserrno(error); -out_layoutunavailable: - seg->length = 0; - return nfserr_layoutunavailable; + return nfserr; } static __be32 From 0cd0d15d47f9e1a77ff64aedb2dbcf1c100e4006 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 3 Oct 2025 12:11:05 +0300 Subject: [PATCH 11/38] NFSD/blocklayout: Introduce layout content structure Add a layout content structure instead of a single extent. The ability to store and encode an array of extents is then used to implement support for multiple extents per LAYOUTGET. 
Signed-off-by: Sergey Bashirov Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 26 ++++++++++++++++++++++---- fs/nfsd/blocklayoutxdr.c | 36 +++++++++++++++++++++++++++--------- fs/nfsd/blocklayoutxdr.h | 14 ++++++++++++++ 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 35a95501db63..6d29ea5e8623 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -88,9 +88,10 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, const struct svc_fh *fhp, struct nfsd4_layoutget *args) { struct nfsd4_layout_seg *seg = &args->lg_seg; + struct pnfs_block_layout *bl; struct pnfs_block_extent *bex; u64 length; - u32 block_size = i_blocksize(inode); + u32 nr_extents_max = 1, block_size = i_blocksize(inode); __be32 nfserr; if (locks_in_grace(SVC_NET(rqstp))) @@ -102,16 +103,33 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, goto out_error; } + /* + * RFC 8881, section 3.3.17: + * The layout4 data type defines a layout for a file. + * + * RFC 8881, section 18.43.3: + * The loga_maxcount field specifies the maximum layout size + * (in bytes) that the client can handle. If the size of the + * layout structure exceeds the size specified by maxcount, + * the metadata server will return the NFS4ERR_TOOSMALL error. + */ + nfserr = nfserr_toosmall; + if (args->lg_maxcount < PNFS_BLOCK_LAYOUT4_SIZE + + PNFS_BLOCK_EXTENT_SIZE) + goto out_error; + /* * Some clients barf on non-zero block numbers for NONE or INVALID * layouts, so make sure to zero the whole structure. 
*/ nfserr = nfserrno(-ENOMEM); - bex = kzalloc(sizeof(*bex), GFP_KERNEL); - if (!bex) + bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL); + if (!bl) goto out_error; - args->lg_content = bex; + bl->nr_extents = nr_extents_max; + args->lg_content = bl; + bex = &bl->extents[0]; nfserr = nfsd4_block_map_extent(inode, fhp, seg->offset, seg->length, seg->iomode, args->lg_minlength, bex); if (nfserr != nfs_ok) diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index e50afe340737..196ef4245604 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -14,12 +14,25 @@ #define NFSDDBG_FACILITY NFSDDBG_PNFS +/** + * nfsd4_block_encode_layoutget - encode block/scsi layout extent array + * @xdr: stream for data encoding + * @lgp: layoutget content, actually an array of extents to encode + * + * Encode the opaque loc_body field in the layoutget response. Since the + * pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire are + * the same, this function is used by both layout drivers. 
+ * + * Return values: + * %nfs_ok: Success, all extents encoded into @xdr + * %nfserr_toosmall: Not enough space in @xdr to encode all the data + */ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp) { - const struct pnfs_block_extent *b = lgp->lg_content; - int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); + const struct pnfs_block_layout *bl = lgp->lg_content; + u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE; __be32 *p; p = xdr_reserve_space(xdr, sizeof(__be32) + len); @@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr, return nfserr_toosmall; *p++ = cpu_to_be32(len); - *p++ = cpu_to_be32(1); /* we always return a single extent */ + *p++ = cpu_to_be32(bl->nr_extents); - p = svcxdr_encode_deviceid4(p, &b->vol_id); - p = xdr_encode_hyper(p, b->foff); - p = xdr_encode_hyper(p, b->len); - p = xdr_encode_hyper(p, b->soff); - *p++ = cpu_to_be32(b->es); - return 0; + for (i = 0; i < bl->nr_extents; i++) { + const struct pnfs_block_extent *bex = bl->extents + i; + + p = svcxdr_encode_deviceid4(p, &bex->vol_id); + p = xdr_encode_hyper(p, bex->foff); + p = xdr_encode_hyper(p, bex->len); + p = xdr_encode_hyper(p, bex->soff); + *p++ = cpu_to_be32(bex->es); + } + + return nfs_ok; } static int diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index 7d25ef689671..2e0c6c7d2b42 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -8,6 +8,15 @@ struct iomap; struct xdr_stream; +/* On the wire size of the layout4 struct with zero number of extents */ +#define PNFS_BLOCK_LAYOUT4_SIZE \ + (sizeof(__be32) * 2 + /* offset4 */ \ + sizeof(__be32) * 2 + /* length4 */ \ + sizeof(__be32) + /* layoutiomode4 */ \ + sizeof(__be32) + /* layouttype4 */ \ + sizeof(__be32) + /* number of bytes */ \ + sizeof(__be32)) /* number of extents */ + struct pnfs_block_extent { struct nfsd4_deviceid vol_id; u64 foff; @@ -21,6 +30,11 @@ struct pnfs_block_range { u64 len; }; 
+struct pnfs_block_layout { + u32 nr_extents; + struct pnfs_block_extent extents[] __counted_by(nr_extents); +}; + /* * Random upper cap for the uuid length to avoid unbounded allocation. * Not actually limited by the protocol. From cc6c40e09d7b1c559bdf42f0fe99b16eb7cfc5e3 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 3 Oct 2025 12:11:06 +0300 Subject: [PATCH 12/38] NFSD/blocklayout: Support multiple extents per LAYOUTGET Allow the pNFS server to respond with multiple extents to a LAYOUTGET request, thereby avoiding unnecessary load on the server and improving performance for the client. The number of LAYOUTGET requests is significantly reduced for various file access patterns, including random and parallel writes. Additionally, this change allows the client to request layouts with the loga_minlength value greater than the minimum possible length of a single extent in XFS. We use this functionality to fix a livelock in the client. Signed-off-by: Sergey Bashirov Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 47 +++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 6d29ea5e8623..101cccbee4a3 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -89,9 +89,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, { struct nfsd4_layout_seg *seg = &args->lg_seg; struct pnfs_block_layout *bl; - struct pnfs_block_extent *bex; - u64 length; - u32 nr_extents_max = 1, block_size = i_blocksize(inode); + struct pnfs_block_extent *first_bex, *last_bex; + u64 offset = seg->offset, length = seg->length; + u32 i, nr_extents_max, block_size = i_blocksize(inode); __be32 nfserr; if (locks_in_grace(SVC_NET(rqstp))) @@ -118,6 +118,13 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, PNFS_BLOCK_EXTENT_SIZE) goto out_error; + /* + * Limit the maximum layout size to avoid 
allocating + * a large buffer on the server for each layout request. + */ + nr_extents_max = (min(args->lg_maxcount, PAGE_SIZE) - + PNFS_BLOCK_LAYOUT4_SIZE) / PNFS_BLOCK_EXTENT_SIZE; + /* * Some clients barf on non-zero block numbers for NONE or INVALID * layouts, so make sure to zero the whole structure. @@ -129,23 +136,37 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, bl->nr_extents = nr_extents_max; args->lg_content = bl; - bex = &bl->extents[0]; - nfserr = nfsd4_block_map_extent(inode, fhp, seg->offset, seg->length, - seg->iomode, args->lg_minlength, bex); - if (nfserr != nfs_ok) - goto out_error; + for (i = 0; i < bl->nr_extents; i++) { + struct pnfs_block_extent *bex = bl->extents + i; + u64 bex_length; + + nfserr = nfsd4_block_map_extent(inode, fhp, offset, length, + seg->iomode, args->lg_minlength, bex); + if (nfserr != nfs_ok) + goto out_error; + + bex_length = bex->len - (offset - bex->foff); + if (bex_length >= length) { + bl->nr_extents = i + 1; + break; + } + + offset = bex->foff + bex->len; + length -= bex_length; + } + + first_bex = bl->extents; + last_bex = bl->extents + bl->nr_extents - 1; nfserr = nfserr_layoutunavailable; - length = bex->foff + bex->len - seg->offset; + length = last_bex->foff + last_bex->len - seg->offset; if (length < args->lg_minlength) { dprintk("pnfsd: extent smaller than minlength\n"); goto out_error; } - seg->offset = bex->foff; - seg->length = bex->len; - - dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); + seg->offset = first_bex->foff; + seg->length = last_bex->foff - first_bex->foff + last_bex->len; return nfs_ok; out_error: From 803bc849f0039291f546ba0e2237faebeb5c073e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 8 Oct 2025 09:52:28 -0400 Subject: [PATCH 13/38] NFSD: pass nfsd_file to nfsd_iter_read() Prepare for nfsd_iter_read() to use the DIO alignment stored in nfsd_file by passing the nfsd_file to nfsd_iter_read() rather than just the file which is associated
with the nfsd_file. This means nfsd4_encode_readv() now also needs the nfsd_file rather than the file. Instead of changing the file arg to be the nfsd_file, we discard the file arg as the nfsd_file (and indeed the file) is already available via the "read" argument. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 8 ++++---- fs/nfsd/vfs.c | 7 ++++--- fs/nfsd/vfs.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 67bb9c0b9fcb..494a703e0570 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4472,7 +4472,7 @@ out_err: static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read, - struct file *file, unsigned long maxcount) + unsigned long maxcount) { struct xdr_stream *xdr = resp->xdr; unsigned int base = xdr->buf->page_len & ~PAGE_MASK; @@ -4483,7 +4483,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, if (xdr_reserve_space_vec(xdr, maxcount) < 0) return nfserr_resource; - nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, + nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, read->rd_nf, read->rd_offset, &maxcount, base, &read->rd_eof); read->rd_length = maxcount; @@ -4530,7 +4530,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, if (file->f_op->splice_read && splice_ok) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else - nfserr = nfsd4_encode_readv(resp, read, file, maxcount); + nfserr = nfsd4_encode_readv(resp, read, maxcount); if (nfserr) { xdr_truncate_encode(xdr, eof_offset); return nfserr; @@ -5426,7 +5426,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, if (file->f_op->splice_read && splice_ok) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else - nfserr = nfsd4_encode_readv(resp, read, file, maxcount); + nfserr = nfsd4_encode_readv(resp, read, maxcount); if 
(nfserr) return nfserr; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ea9c2de70429..406fe62de219 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1078,7 +1078,7 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, * nfsd_iter_read - Perform a VFS read using an iterator * @rqstp: RPC transaction context * @fhp: file handle of file to be read - * @file: opened struct file of file to be read + * @nf: opened struct nfsd_file of file to be read * @offset: starting byte offset * @count: IN: requested number of bytes; OUT: number of bytes read * @base: offset in first page of read buffer @@ -1091,9 +1091,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, * returned. */ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count, + struct nfsd_file *nf, loff_t offset, unsigned long *count, unsigned int base, u32 *eof) { + struct file *file = nf->nf_file; unsigned long v, total; struct iov_iter iter; struct kiocb kiocb; @@ -1336,7 +1337,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp)) err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof); + err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof); nfsd_file_put(nf); trace_nfsd_read_done(rqstp, fhp, offset, *count); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 0c0292611c6d..fa46f8b5f132 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -121,7 +121,7 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof); __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, + struct nfsd_file *nf, loff_t offset, unsigned long *count, unsigned int base, u32 *eof); bool nfsd_read_splice_ok(struct svc_rqst *rqstp); From d7de37d6d7ccfac9321d8cc4f36fc85dfadad54a Mon Sep 17 00:00:00 2001 
From: Chuck Lever Date: Wed, 8 Oct 2025 09:52:29 -0400 Subject: [PATCH 14/38] NFSD: Relocate the xdr_reserve_space_vec() call site In order to detect when a direct READ is possible, we need the send buffer's .page_len to be zero when there is nothing in the buffer's .pages array yet. However, when xdr_reserve_space_vec() extends the size of the xdr_stream to accommodate a READ payload, it adds to the send buffer's .page_len. It should be safe to reserve the stream space /after/ the VFS read operation completes. This is, for example, how an NFSv3 READ works: the VFS read goes into the rq_bvec, and is then added to the send xdr_stream later by svcxdr_encode_opaque_pages(). Now that xdr_reserve_space_vec() uses the number of bytes actually read, the xdr_truncate_encode() call is no longer necessary. Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 494a703e0570..30ce5851fe4c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4480,18 +4480,30 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, __be32 zero = xdr_zero; __be32 nfserr; - if (xdr_reserve_space_vec(xdr, maxcount) < 0) - return nfserr_resource; - nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, read->rd_nf, read->rd_offset, &maxcount, base, &read->rd_eof); read->rd_length = maxcount; if (nfserr) return nfserr; + + /* + * svcxdr_encode_opaque_pages() is not used here because + * we don't want to encode subsequent results in this + * COMPOUND into the xdr->buf's tail, but rather those + * results should follow the NFS READ payload in the + * buf's pages. + */ + if (xdr_reserve_space_vec(xdr, maxcount) < 0) + return nfserr_resource; + + /* + * Mark the buffer location of the NFS READ payload so that + * direct placement-capable transports send only the + * payload bytes out-of-band. 
+ */ if (svc_encode_result_payload(resp->rqstp, starting_len, maxcount)) return nfserr_io; - xdr_truncate_encode(xdr, starting_len + xdr_align_size(maxcount)); write_bytes_to_xdr_buf(xdr->buf, starting_len + maxcount, &zero, xdr_pad_size(maxcount)); From d686e64e931c594af8b27597f6bf04944c857ed7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Oct 2025 09:52:30 -0400 Subject: [PATCH 15/38] NFSD: Implement NFSD_IO_DIRECT for NFS READ Add an experimental option that forces NFS READ operations to use direct I/O instead of reading through the NFS server's page cache. There is already at least one other layer of read caching: the page cache on NFS clients. The server's page cache, in many cases, is unlikely to provide additional benefit. Some benchmarks have demonstrated that the server's page cache is actively detrimental for workloads whose working set is larger than the server's available physical memory. For instance, on small NFS servers, cached NFS file content can squeeze out local memory consumers. For large sequential workloads, an enormous amount of data flows into and out of the page cache and is consumed by NFS clients exactly once -- caching that data is expensive to do and totally valueless. For now this is a hidden option that can be enabled on test systems for benchmarking. In the longer term, this option might be enabled persistently or per-export. When the exported file system does not support direct I/O, NFSD falls back to using either DONTCACHE or buffered I/O to fulfill NFS READ requests. 
Suggested-by: Mike Snitzer Reviewed-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/debugfs.c | 2 ++ fs/nfsd/nfsd.h | 1 + fs/nfsd/trace.h | 1 + fs/nfsd/vfs.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index ed2b9e066206..00eb1ecef6ac 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); * Contents: * %0: NFS READ will use buffered IO * %1: NFS READ will use dontcache (buffered IO w/ dropbehind) + * %2: NFS READ will use direct IO * * This setting takes immediate effect for all NFS versions, * all exports, and in all NFSD net namespaces. @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val) nfsd_io_cache_read = NFSD_IO_BUFFERED; break; case NFSD_IO_DONTCACHE: + case NFSD_IO_DIRECT: /* * Must disable splice_read when enabling * NFSD_IO_DONTCACHE. 
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4b7296881f31..e4263326ca4a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -160,6 +160,7 @@ enum { /* Any new NFSD_IO enum value must be added at the end */ NFSD_IO_BUFFERED, NFSD_IO_DONTCACHE, + NFSD_IO_DIRECT, }; extern u64 nfsd_io_cache_read __read_mostly; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 6e2c8e2aab10..bfd41236aff2 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ DEFINE_NFSD_IO_EVENT(read_start); DEFINE_NFSD_IO_EVENT(read_splice); DEFINE_NFSD_IO_EVENT(read_vector); +DEFINE_NFSD_IO_EVENT(read_direct); DEFINE_NFSD_IO_EVENT(read_io_done); DEFINE_NFSD_IO_EVENT(read_done); DEFINE_NFSD_IO_EVENT(write_start); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 406fe62de219..f537a7b4ee01 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1074,6 +1074,83 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } +/* + * The byte range of the client's READ request is expanded on both ends + * until it meets the underlying file system's direct I/O alignment + * requirements. After the internal read is complete, the byte range of + * the NFS READ payload is reduced to the byte range that was originally + * requested. + * + * Note that a direct read can be done only when the xdr_buf containing + * the NFS READ reply does not already have contents in its .pages array. + * This is due to potentially restrictive alignment requirements on the + * read buffer. When .page_len and @base are zero, the .pages array is + * guaranteed to be page-aligned. 
+ */ +static noinline_for_stack __be32 +nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, unsigned long *count, + u32 *eof) +{ + u64 dio_start, dio_end; + unsigned long v, total; + struct iov_iter iter; + struct kiocb kiocb; + ssize_t host_err; + size_t len; + + init_sync_kiocb(&kiocb, nf->nf_file); + kiocb.ki_flags |= IOCB_DIRECT; + + /* Read a properly-aligned region of bytes into rq_bvec */ + dio_start = round_down(offset, nf->nf_dio_read_offset_align); + dio_end = round_up((u64)offset + *count, nf->nf_dio_read_offset_align); + + kiocb.ki_pos = dio_start; + + v = 0; + total = dio_end - dio_start; + while (total && v < rqstp->rq_maxpages && + rqstp->rq_next_page < rqstp->rq_page_end) { + len = min_t(size_t, total, PAGE_SIZE); + bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page, + len, 0); + + total -= len; + ++rqstp->rq_next_page; + ++v; + } + + trace_nfsd_read_direct(rqstp, fhp, offset, *count - total); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, + dio_end - dio_start - total); + + host_err = vfs_iocb_iter_read(nf->nf_file, &kiocb, &iter); + if (host_err >= 0) { + unsigned int pad = offset - dio_start; + + /* The returned payload starts after the pad */ + rqstp->rq_res.page_base = pad; + + /* Compute the count of bytes to be returned */ + if (host_err > pad + *count) + host_err = *count; + else if (host_err > pad) + host_err -= pad; + else + host_err = 0; + } else if (unlikely(host_err == -EINVAL)) { + struct inode *inode = d_inode(fhp->fh_dentry); + + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + host_err = -ESERVERFAULT; + } + + return nfsd_finish_read(rqstp, fhp, nf->nf_file, offset, count, + eof, host_err); +} + /** * nfsd_iter_read - Perform a VFS read using an iterator * @rqstp: RPC transaction context @@ -1106,6 +1183,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (nfsd_io_cache_read) { case 
NFSD_IO_BUFFERED: break; + case NFSD_IO_DIRECT: + /* When dio_read_offset_align is zero, dio is not supported */ + if (nf->nf_dio_read_offset_align && !rqstp->rq_res.page_len) + return nfsd_direct_read(rqstp, fhp, nf, offset, + count, eof); + fallthrough; case NFSD_IO_DONTCACHE: if (file->f_op->fop_flags & FOP_DONTCACHE) kiocb.ki_flags = IOCB_DONTCACHE; From ebd3330d1ca8844b0a0dba060d223523a186a5f9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Oct 2025 11:39:56 -0400 Subject: [PATCH 16/38] SUNRPC: Improve "fragment too large" warning Including the client IP address that generated the overrun traffic seems like it would be helpful. The message now reads: kernel: svc: nfsd oversized RPC fragment (1064958 octets) from 100.64.0.11:45866 Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7b90abc5cf0e..0cb9c4d45745 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1062,9 +1062,10 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, return svc_sock_reclen(svsk); err_too_large: - net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", - __func__, svsk->sk_xprt.xpt_server->sv_name, - svc_sock_reclen(svsk)); + net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n", + svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk), + (struct sockaddr *)&svsk->sk_xprt.xpt_remote); svc_xprt_deferred_close(&svsk->sk_xprt); err_short: return -EAGAIN; From 6b3b697d65d46a0f640216a3f6c72856c159c567 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 13 Oct 2025 09:54:53 -0400 Subject: [PATCH 17/38] sunrpc: allocate a separate bvec array for socket sends svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of rq_bvec as the start, but doesn't reduce the array length by one, which could lead to an array overrun. 
Also, rq_bvec is always rq_maxpages in length, which can be too short in some cases, since the TCP record marker consumes a slot. Fix both problems by adding a separate bvec array to the svc_sock that is specifically for sending. For TCP, make this array one slot longer than rq_maxpages, to account for the record marker. For UDP, only allocate as large an array as we need since it's limited to 64k of payload. Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 3 ++ net/sunrpc/svcsock.c | 55 +++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 963bbe251e52..de37069aba90 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -26,6 +26,9 @@ struct svc_sock { void (*sk_odata)(struct sock *); void (*sk_owspace)(struct sock *); + /* For sends (protected by xpt_mutex) */ + struct bio_vec *sk_bvec; + /* private TCP part */ /* On-the-wire fragment header: */ __be32 sk_marker; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0cb9c4d45745..93de79020a2d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -68,6 +68,17 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + /* To-do: to avoid tying up an nfsd thread while waiting for a * handshake request, the request could instead be deferred. 
*/ @@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); } @@ -1236,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, int ret; /* The stream record marker is copied into a temporary page - * fragment buffer so that it can be included in rq_bvec. + * fragment buffer so that it can be included in sk_bvec. */ buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &marker, sizeof(marker)); - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, &rqstp->rq_res); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); @@ -1393,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return 
SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} + /* * Initialize socket for RPC use and create svc_sock struct */ @@ -1403,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int sendpages; unsigned long pages; + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + pages = svc_serv_maxpages(serv); svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + + if (sendpages) { + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1420,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { + kfree(svsk->sk_bvec); kfree(svsk); return ERR_PTR(err); } @@ -1637,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(sock); page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } From 3a1ce35030e1e0e35bc38db5e0be0165945f7e7f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Oct 2025 11:09:58 -0400 Subject: [PATCH 18/38] NFSD: Add a subsystem policy document Steer contributors to NFSD's patchworks instance, list our patch submission preferences, and more. The new document is based on the existing netdev and xfs subsystem policy documents. This is an attempt to add transparency to the process of accepting contributions to NFSD and getting them merged upstream. Suggested-by: "Darrick J. Wong" Cc: Luis Chamberlain Cc: Martin K. 
Petersen Reviewed-by: NeilBrown [ cel: Hand-edits to address review comments ] Signed-off-by: Chuck Lever --- .../nfs/nfsd-maintainer-entry-profile.rst | 547 ++++++++++++++++++ .../maintainer/maintainer-entry-profile.rst | 1 + MAINTAINERS | 1 + 3 files changed, 549 insertions(+) create mode 100644 Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst diff --git a/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst b/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst new file mode 100644 index 000000000000..cebbbdad8710 --- /dev/null +++ b/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst @@ -0,0 +1,547 @@ +NFSD Maintainer Entry Profile +============================= + +A Maintainer Entry Profile supplements the top-level process +documents (found in Documentation/process/) with customs that are +specific to a subsystem and its maintainers. A contributor may use +this document to set their expectations and avoid common mistakes. +A maintainer may use these profiles to look across subsystems for +opportunities to converge on best common practices. + +Overview +-------- +The Network File System (NFS) is a standardized family of network +protocols that enable access to files across a set of network- +connected peer hosts. Applications on NFS clients access files that +reside on file systems that are shared by NFS servers. A single +network peer can act as both an NFS client and an NFS server. + +NFSD refers to the NFS server implementation included in the Linux +kernel. An in-kernel NFS server has fast access to files stored +in file systems local to that server. NFSD can share files stored +on most of the file system types native to Linux, including xfs, +ext4, btrfs, and tmpfs. + +Mailing list +------------ +The linux-nfs@vger.kernel.org mailing list is a public list. Its +purpose is to enable collaboration among developers working on the +Linux NFS stack, both client and server. 
It is not a place for +conversations that are not related directly to the Linux NFS stack. + +The linux-nfs mailing list is archived on `lore.kernel.org <https://lore.kernel.org/linux-nfs/>`_. + +The Linux NFS community does not have any chat room. + +Reporting bugs +-------------- +If you experience an NFSD-related bug on a distribution-built +kernel, please start by working with your Linux distributor. + +Bug reports against upstream Linux code bases are welcome on the +linux-nfs@vger.kernel.org mailing list, where some active triage +can be done. NFSD bugs may also be reported in the Linux kernel +community's bugzilla at: + + https://bugzilla.kernel.org + +Please file NFSD-related bugs under the "Filesystems/NFSD" +component. In general, including as much detail as possible is a +good start, including pertinent system log messages from both +the client and server. + +User space software related to NFSD, such as mountd or the exportfs +command, is contained in the nfs-utils package. Report problems +with those components to linux-nfs@vger.kernel.org. You might be +directed to move the report to a specific bug tracker. + +Contributor's Guide +------------------- + +Standards compliance +~~~~~~~~~~~~~~~~~~~~ +The priority is for NFSD to interoperate fully with the Linux NFS +client. We also test against other popular NFS client implementa- +tions regularly at NFS bake-a-thon events (also known as plug- +fests). Non-Linux NFS clients are not part of upstream NFSD CI/CD. + +The NFSD community strives to provide an NFS server implementation +that interoperates with all standards-compliant NFS client +implementations. This is done by staying as close as is sensible to +the normative mandates in the IETF's published NFS, RPC, and GSS-API +standards. + +It is always useful to reference an RFC and section number in a code +comment where behavior deviates from the standard (and even when the +behavior is compliant but the implementation is obfuscatory).
+ +On the rare occasion when a deviation from standard-mandated +behavior is needed, brief documentation of the use case or +deficiencies in the standard is a required part of in-code +documentation. + +Care must always be taken to avoid leaking local error codes (i.e., +errnos) to clients of NFSD. A proper NFS status code is always +required in NFS protocol replies. + +NFSD administrative interfaces +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +NFSD administrative interfaces include: + +- an NFSD or SUNRPC module parameter + +- export options in /etc/exports + +- files under /proc/fs/nfsd/ or /proc/sys/sunrpc/ + +- the NFSD netlink protocol + +Frequently, a request is made to introduce or modify one of NFSD's +traditional administrative interfaces. Certainly it is technically +easy to introduce a new administrative setting. However, there are +good reasons why the NFSD maintainers prefer to leave that as a last +resort: + +- As with any API, administrative interfaces are difficult to get + right. + +- Once they are documented and have a legacy of use, administrative + interfaces become difficult to modify or remove. + +- Every new administrative setting multiplies the NFSD test matrix. + +- The cost of one administrative interface is incremental, but costs + add up across all of the existing interfaces. + +It is often better for everyone if effort is made up front to +understand the underlying requirement of the new setting, and +then to try to make it tune itself (or to become otherwise +unnecessary). + +If a new setting is indeed necessary, first consider adding it to +the NFSD netlink protocol. Or if it doesn't need to be a reliable +long term user space feature, it can be added to NFSD's menagerie of +experimental settings which reside under /sys/kernel/debug/nfsd/ . + +Field observability +~~~~~~~~~~~~~~~~~~~ +NFSD employs several different mechanisms for observing operation, +including counters, printks, WARNings, and static trace points.
Each +has its strengths and weaknesses. Contributors should select the +most appropriate tool for their task. + +- BUG must be avoided if at all possible, as it will frequently + result in a full system crash. + +- WARN is appropriate only when a full stack trace is useful. + +- printk can show detailed information. These must not be used + in code paths where they can be triggered repeatedly by remote + users. + +- dprintk can show detailed information, but can be enabled only + in pre-set groups. The overhead of emitting output makes dprintk + inappropriate for frequent operations like I/O. + +- Counters are always on, but provide little information about + individual events other than how frequently they occur. + +- static trace points can be enabled individually or in groups + (via a glob). These are generally low overhead, and thus are + favored for use in hot paths. + +- dynamic tracing, such as kprobes or eBPF, is quite flexible but + cannot be used in certain environments (e.g., full kernel lock- + down). + +Testing +~~~~~~~ +The kdevops project + + https://github.com/linux-kdevops/kdevops + +contains several NFS-specific workflows, as well as the community +standard fstests suite. These workflows are based on open source +testing tools such as ltp and fio. Contributors are encouraged either to +use these tools without kdevops, or to install and +use kdevops themselves to verify their patches before submission. + +Coding style +~~~~~~~~~~~~ +Follow the coding style preferences described in + + Documentation/process/coding-style.rst + +with the following exceptions: + +- Add new local variables to a function in reverse Christmas tree + order + +- Use the kdoc comment style for + + non-static functions + + static inline functions + + static functions that are callbacks/virtual functions + +- All new function names start with "nfsd_" for non-NFS-version- + specific functions.
+ +- New function names that are specific to NFSv2 or NFSv3, or are + used by all minor versions of NFSv4, use "nfsdN_" where N is + the version. + +- New function names specific to an NFSv4 minor version can be + named with "nfsd4M_" where M is the minor version. + +Patch preparation +~~~~~~~~~~~~~~~~~ +Read and follow all guidelines in + + Documentation/process/submitting-patches.rst + +Use tagging to identify all patch authors. However, reviewers and +testers should be added by replying to the email patch submission. +Email is extensively used in order to publicly archive review and +testing attributions. These tags are automatically inserted into +your patches when they are applied. + +The code in the body of the diff already shows /what/ is being +changed. Thus it is not necessary to repeat that in the patch +description. Instead, the description should contain one or more +of: + +- A brief problem statement ("what is this patch trying to fix?") + with a root-cause analysis. + +- End-user visible symptoms or items that a support engineer might + use to search for the patch, like stack traces. + +- A brief explanation of why the patch is the best way to address + the problem. + +- Any context that reviewers might need to understand the changes + made by the patch. + +- Any relevant benchmarking results, and/or functional test results. + +As detailed in Documentation/process/submitting-patches.rst, +identify the point in history that the issue being addressed was +introduced by using a Fixes: tag. + +Mention in the patch description if that point in history cannot be +determined -- that is, no Fixes: tag can be provided. In this case, +please make it clear to maintainers whether an LTS backport is +needed even though there is no Fixes: tag. + +The NFSD maintainers prefer to add stable tagging themselves, after +public discussion in response to the patch submission. 
Contributors +may suggest stable tagging, but be aware that many version +management tools add such stable Cc's when you post your patches. +Don't add "Cc: stable" unless you are absolutely sure the patch +needs to go to stable during the initial submission process. + +Patch submission +~~~~~~~~~~~~~~~~ +Patches to NFSD are submitted via the kernel's email-based review +process that is common to most other kernel subsystems. + +Just before each submission, rebase your patch or series on the +nfsd-testing branch at + + https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git + +The NFSD subsystem is maintained separately from the Linux in-kernel +NFS client. The NFSD maintainers do not normally take submissions +for client changes, nor can they respond authoritatively to bug +reports or feature requests for NFS client code. + +This means that contributors might be asked to resubmit patches if +they were emailed to the incorrect set of maintainers and reviewers. +This is not a rejection, but simply a correction of the submission +process. + +When in doubt, consult the NFSD entry in the MAINTAINERS file to +see which files and directories fall under the NFSD subsystem. + +The proper set of email addresses for NFSD patches are: + +To: the NFSD maintainers and reviewers listed in MAINTAINERS +Cc: linux-nfs@vger.kernel.org and optionally linux-kernel@ + +If there are other subsystems involved in the patches (for example +MM or RDMA) their primary mailing list address can be included in +the Cc: field. Other contributors and interested parties may be +included there as well. + +In general we prefer that contributors use common patch email tools +such as "git send-email" or "stg email format/send", which tend to +get the details right without a lot of fuss. + +A series consisting of a single patch is not required to have a +cover letter. 
However, a cover letter can be included if there is +substantial context that is not appropriate to include in the +patch description. + +Please note that, with an e-mail based submission process, series +cover letters are not part of the work that is committed to the +kernel source code base or its commit history. Therefore always try +to keep pertinent information in the patch descriptions. + +Design documentation is welcome, but as cover letters are not +preserved, a perhaps better option is to include a patch that adds +such documentation under Documentation/filesystems/nfs/. + +Reviewers will ask about test coverage and what use cases the +patches are expected to address. Please be prepared to answer these +questions. + +Review comments from maintainers might be politely stated, but in +general, these are not optional to address when they are actionable. +If necessary, the maintainers retain the right to not apply patches +when contributors refuse to address reasonable requests. + +Post changes to kernel source code and user space source code as +separate series. You can connect the two series with comments in +your cover letters. + +Generally the NFSD maintainers ask for a repost even for simple +modifications in order to publicly archive the request and the +resulting repost before it is pulled into the NFSD trees. This +also enables us to rebuild a patch series quickly without missing +changes that might have been discussed via email. + +Avoid frequently reposting large series with only small changes. As +a rule of thumb, posting substantial changes more than once a week +will result in reviewer overload. + +Remember, there are only a handful of subsystem maintainers and +reviewers, but potentially many sources of contributions. The +maintainers and reviewers, therefore, are always the less scalable +resource. Be kind to your friendly neighborhood maintainer.
+ +Patch Acceptance +~~~~~~~~~~~~~~~~ +There isn't a formal review process for NFSD, but we like to see +at least two Reviewed-by: notices for patches that are more than +simple clean-ups. Reviews are done in public on +linux-nfs@vger.kernel.org and are archived on lore.kernel.org. + +Currently the NFSD patch queues are maintained in branches here: + + https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git + +The NFSD maintainers apply patches initially to the nfsd-testing +branch, which is always open to new submissions. Patches can be +applied while review is ongoing. nfsd-testing is a topic branch, +so it can change frequently, it will be rebased, and your patch +might get dropped if there is a problem with it. + +Generally a script-generated "thank you" email will indicate when +your patch has been added to the nfsd-testing branch. You can track +the progress of your patch using the linux-nfs patchworks instance: + + https://patchwork.kernel.org/project/linux-nfs/list/ + +While your patch is in nfsd-testing, it is exposed to a variety of +test environments, including community zero-day bots, static +analysis tools, and NFSD continuous integration testing. The soak +period is three to four weeks. + +Each patch that survives in nfsd-testing for the soak period without +changes is moved to the nfsd-next branch. + +The nfsd-next branch is automatically merged into linux-next and +fs-next on a nightly basis. + +Patches that survive in nfsd-next are included in the next NFSD +merge window pull request. These windows typically occur once every +63 days (nine weeks). + +When the upstream merge window closes, the nfsd-next branch is +renamed nfsd-fixes, and a new nfsd-next branch is created, based on +the upstream -rc1 tag. + +Fixes that are destined for an upstream -rc release also run the +nfsd-testing gauntlet, but are then applied to the nfsd-fixes +branch. That branch is made available for Linus to pull after a +short time. 
In order to limit the risk of introducing regressions, +we limit such fixes to emergency situations or fixes to breakage +that occurred during the most recent upstream merge. + +Please make it clear when submitting an emergency patch that +immediate action (either application to -rc or LTS backport) is +needed. + +Sensitive patch submissions and bug reports +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +CVEs are generated by specific members of the Linux kernel community +and several external entities. The Linux NFS community does not emit +or assign CVEs. CVEs are assigned after an issue and its fix are +known. + +However, the NFSD maintainers sometimes receive sensitive security +reports, and at times these are significant enough to need to be +embargoed. In such rare cases, fixes can be developed and reviewed +out of the public eye. + +Please be aware that many version management tools add the stable +Cc's when you post your patches. This is generally a nuisance, but +it can result in outing an embargoed security issue accidentally. +Don't add "Cc: stable" unless you are absolutely sure the patch +needs to go to stable@ during the initial submission process. + +Patches that are merged without ever appearing on any list, and +which carry a Reported-by: or Fixes: tag are detected as suspicious +by security-focused people. We encourage that, after any private +review, security-sensitive patches should be posted to linux-nfs@ +for the usual public review, archiving, and test period. + +LLM-generated submissions +~~~~~~~~~~~~~~~~~~~~~~~~~ +The Linux kernel community as a whole is still exploring the new +world of LLM-generated code. The NFSD maintainers will entertain +submission of patches that are partially or wholly generated by +LLM-based development tools. Such submissions are held to the +same standards as submissions created entirely by human authors: + +- The human contributor identifies themselves via a Signed-off-by: + tag. This tag counts as a DoC. 
+ +- The human contributor is solely responsible for code provenance + and any contamination by inadvertently-included code with a + conflicting license, as usual. + +- The human contributor must be able to answer and address review + questions. A patch description such as "This fixed my problem + but I don't know why" is not acceptable. + +- The contribution is subjected to the same test regimen as all + other submissions. + +- An indication (via a Generated-by: tag or otherwise) that the + contribution is LLM-generated is not required. + +It is easy to address review comments and fix requests in LLM +generated code. So easy, in fact, that it becomes tempting to repost +refreshed code immediately. Please resist that temptation. + +As always, please avoid reposting series revisions more than once +every 24 hours. + +Clean-up patches +~~~~~~~~~~~~~~~~ +The NFSD maintainers discourage patches which perform simple clean- +ups, which are not in the context of other work. For example: + +* Addressing ``checkpatch.pl`` warnings after merge +* Addressing :ref:`Local variable ordering` issues +* Addressing long-standing whitespace damage + +This is because it is felt that the churn that such changes produce +comes at a greater cost than the value of such clean-ups. + +Conversely, spelling and grammar fixes are encouraged. + +Stable and LTS support +---------------------- +Upstream NFSD continuous integration testing runs against LTS trees +whenever they are updated. + +Please indicate when a patch containing a fix needs to be considered +for LTS kernels, either via a Fixes: tag or explicit mention. + +Feature requests +---------------- +There is no one way to make an official feature request, but +discussion about the request should eventually make its way to +the linux-nfs@vger.kernel.org mailing list for public review by +the community. + +Subsystem boundaries +~~~~~~~~~~~~~~~~~~~~ +NFSD itself is not much more than a protocol engine. 
This means its +primary responsibility is to translate the NFS protocol into API +calls in the Linux kernel. For example, NFSD is not responsible for +knowing exactly how bytes or file attributes are managed on a block +device. It relies on other kernel subsystems for that. + +If the subsystems on which NFSD relies do not implement a particular +feature, even if the standard NFS protocols do support that feature, +that usually means NFSD cannot provide that feature without +substantial development work in other areas of the kernel. + +Specificity +~~~~~~~~~~~ +Feature requests can come from anywhere, and thus can often be +nebulous. A requester might not understand what a "use case" or +"user story" is. These descriptive paradigms are often used by +developers and architects to understand what is required of a +design, but are terms of art in the software trade, not used in +the everyday world. + +In order to prevent contributors and maintainers from becoming +overwhelmed, we won't be afraid of saying "no" politely to +underspecified requests. + +Community roles and their authority +----------------------------------- +The purpose of Linux subsystem communities is to provide expertise +and active stewardship of a narrow set of source files in the Linux +kernel. This can include managing user space tooling as well. + +To contextualize the structure of the Linux NFS community that +is responsible for stewardship of the NFS server code base, we +define the community roles here. + +- **Contributor** : Anyone who submits a code change, bug fix, + recommendation, documentation fix, and so on. A contributor can + submit regularly or infrequently. + +- **Outside Contributor** : A contributor who is not a regular actor + in the Linux NFS community. This can mean someone who contributes + to other parts of the kernel, or someone who just noticed a + misspelling in a comment and sent a patch. 
+ +- **Reviewer** : Someone who is named in the MAINTAINERS file as a + reviewer is an area expert who can request changes to contributed + code, and expects that contributors will address the request. + +- **External Reviewer** : Someone who is not named in the + MAINTAINERS file as a reviewer, but who is an area expert. + Examples include Linux kernel contributors with networking, + security, or persistent storage expertise, or developers who + contribute primarily to other NFS implementations. + +One or more people will take on the following roles. These people +are often generically referred to as "maintainers", and are +identified in the MAINTAINERS file with the "M:" tag under the NFSD +subsystem. + +- **Upstream Release Manager** : This role is responsible for + curating contributions into a branch, reviewing test results, and + then sending a pull request during merge windows. There is a + trust relationship between the release manager and Linus. + +- **Bug Triager** : Someone who is a first responder to bug reports + submitted to the linux-nfs mailing list or bug trackers, and helps + troubleshoot and identify next steps. + +- **Security Lead** : The security lead handles contacts from the + security community to resolve immediate issues, as well as dealing + with long-term security issues such as supply chain concerns. For + upstream, that's usually whether contributions violate licensing + or other intellectual property agreements. + +- **Testing Lead** : The testing lead builds and runs the test + infrastructure for the subsystem. The testing lead may ask for + patches to be dropped because of ongoing high defect rates. + +- **LTS Maintainer** : The LTS maintainer is responsible for managing + the Fixes: and Cc: stable annotations on patches, and seeing that + patches that cannot be automatically applied to LTS kernels get + proper manual backports as necessary. 
+ +- **Community Manager** : This umpire role can be asked to call balls + and strikes during conflicts, but is also responsible for ensuring + the health of the relationships within the community and for + facilitating discussions on long-term topics such as how to manage + growing technical debt. diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index d36dd892a78a..6020d188e13d 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -110,5 +110,6 @@ to do something different in the near future. ../process/maintainer-netdev ../driver-api/vfio-pci-device-specific-driver-acceptance ../nvme/feature-and-quirk-policy + ../filesystems/nfs/nfsd-maintainer-entry-profile ../filesystems/xfs/xfs-maintainer-entry-profile ../mm/damon/maintainer-profile diff --git a/MAINTAINERS b/MAINTAINERS index e64b94e6b5a9..a19fd4d8beaf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13560,6 +13560,7 @@ R: Dai Ngo R: Tom Talpey L: linux-nfs@vger.kernel.org S: Supported +P: Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst B: https://bugzilla.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git F: Documentation/filesystems/nfs/ From 8320b75b2b8bf94d4d4f1b59f75ec8dd7188dc76 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 17 Nov 2025 17:24:17 +0700 Subject: [PATCH 19/38] NFS: nfsd-maintainer-entry-profile: Inline function name prefixes Sphinx reports htmldocs warnings: Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst:185: ERROR: Unknown target name: "nfsd". [docutils] Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst:188: ERROR: Unknown target name: "nfsdn". [docutils] Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst:192: ERROR: Unknown target name: "nfsd4m". [docutils] These are due to Sphinx confusing function name prefixes for external link syntax. 
Fix the warnings by inlining the prefixes. Fixes: 3a1ce35030e1e0 ("NFSD: Add a subsystem policy document") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20251117174218.29365f30@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Chuck Lever --- .../filesystems/nfs/nfsd-maintainer-entry-profile.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst b/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst index cebbbdad8710..4d6b57dbab2a 100644 --- a/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst +++ b/Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst @@ -182,15 +182,15 @@ with the following exceptions: + static inline functions + static functions that are callbacks/virtual functions -- All new function names start with "nfsd_" for non-NFS-version- +- All new function names start with ``nfsd_`` for non-NFS-version- specific functions. - New function names that are specific to NFSv2 or NFSv3, or are - used by all minor versions of NFSv4, use "nfsdN_" where N is + used by all minor versions of NFSv4, use ``nfsdN_`` where N is the version. - New function names specific to an NFSv4 minor version can be - named with "nfsd4M_" where M is the minor version. + named with ``nfsd4M_`` where M is the minor version. Patch preparation ~~~~~~~~~~~~~~~~~ From fceb8734e7f1e5dd698c03403ff500923e0fd612 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 16 Oct 2025 09:49:58 -0400 Subject: [PATCH 20/38] nfsd: stop pretending that we cache the SEQUENCE reply. nfsd does not cache the reply to a SEQUENCE. As the comment above nfsd4_replay_cache_entry() says: * The sequence operation is not cached because we can use the slot and * session values. The comment above nfsd4_cache_this() suggests otherwise. * The session reply cache only needs to cache replies that the client * actually asked us to. 
But it's almost free for us to cache compounds * consisting of only a SEQUENCE op, so we may as well cache those too. * Also, the protocol doesn't give us a convenient response in the case * of a replay of a solo SEQUENCE op that wasn't cached The code in nfsd4_store_cache_entry() makes it clear that only responses beyond 'cstate.data_offset' are actually cached, and data_offset is set at the end of nfsd4_encode_sequence() *after* the sequence response has been encoded. This patch simplifies code and removes the confusing comments. - nfsd4_is_solo_sequence() is discarded as not-useful. - nfsd4_cache_this() is now trivial so it too is discarded with the code placed in-line at the one call-site in nfsd4_store_cache_entry(). - nfsd4_enc_sequence_replay() is open-coded into nfsd4_replay_cache_entry(), and then simplified to (hopefully) make the process of replaying a reply clearer. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 58 ++++++++++++++------------------------------- fs/nfsd/xdr4.h | 21 ---------------- 2 files changed, 18 insertions(+), 61 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 085f5ef12230..35004568d43e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3508,7 +3508,7 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) free_svc_cred(&slot->sl_cred); copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); - if (!nfsd4_cache_this(resp)) { + if (!(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } @@ -3522,41 +3522,6 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) return; } -/* - * Encode the replay sequence operation from the slot values. - * If cachethis is FALSE encode the uncached rep error on the next - * operation which sets resp->p and increments resp->opcnt for - * nfs4svc_encode_compoundres.
- * - */ -static __be32 -nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, - struct nfsd4_compoundres *resp) -{ - struct nfsd4_op *op; - struct nfsd4_slot *slot = resp->cstate.slot; - - /* Encode the replayed sequence operation */ - op = &args->ops[resp->opcnt - 1]; - nfsd4_encode_operation(resp, op); - - if (slot->sl_flags & NFSD4_SLOT_CACHED) - return op->status; - if (args->opcnt == 1) { - /* - * The original operation wasn't a solo sequence--we - * always cache those--so this retry must not match the - * original: - */ - op->status = nfserr_seq_false_retry; - } else { - op = &args->ops[resp->opcnt++]; - op->status = nfserr_retry_uncached_rep; - nfsd4_encode_operation(resp, op); - } - return op->status; -} - /* * The sequence operation is not cached because we can use the slot and * session values. @@ -3565,17 +3530,30 @@ static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_slot *slot = resp->cstate.slot; struct xdr_stream *xdr = resp->xdr; __be32 *p; - __be32 status; dprintk("--> %s slot %p\n", __func__, slot); - status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); - if (status) - return status; + /* Always encode the SEQUENCE response. */ + nfsd4_encode_operation(resp, &args->ops[0]); + if (args->opcnt == 1) + /* A solo SEQUENCE - nothing was cached */ + return args->ops[0].status; + if (!(slot->sl_flags & NFSD4_SLOT_CACHED)) { + /* We weren't asked to cache this. 
*/ + struct nfsd4_op *op; + + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + return op->status; + } + + /* return reply from cache */ p = xdr_reserve_space(xdr, slot->sl_datalen); if (!p) { WARN_ON_ONCE(1); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 1ce8e12ae335..ae75846b3cd7 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -924,27 +924,6 @@ struct nfsd4_compoundres { struct nfsd4_compound_state cstate; }; -static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) -{ - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; - return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; -} - -/* - * The session reply cache only needs to cache replies that the client - * actually asked us to. But it's almost free for us to cache compounds - * consisting of only a SEQUENCE op, so we may as well cache those too. - * Also, the protocol doesn't give us a convenient response in the case - * of a replay of a solo SEQUENCE op that wasn't cached - * (RETRY_UNCACHED_REP can only be returned in the second op of a - * compound). - */ -static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp) -{ - return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) - || nfsd4_is_solo_sequence(resp); -} - static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = rqstp->rq_resp; From 78cd170d035fe9b0372c0527f1613ddde8296667 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 16 Oct 2025 11:15:34 -0700 Subject: [PATCH 21/38] nfsd: Use MD5 library instead of crypto_shash Update NFSD's support for "legacy client tracking" (which uses MD5) to use the MD5 library instead of crypto_shash. This has several benefits: - Simpler code. Notably, much of the error-handling code is no longer needed, since the library functions can't fail. - Improved performance due to reduced overhead. 
A microbenchmark of nfs4_make_rec_clidname() shows a speedup from 1455 cycles to 425. - The MD5 code can now safely be built as a loadable module when nfsd is built as a loadable module. (Previously, nfsd forced the MD5 code to built-in, presumably to work around the unreliability of the name-based loading.) Thus select MD5 from the tristate option NFSD if NFSD_LEGACY_CLIENT_TRACKING, instead of from the bool option NFSD_V4. - Fixes a bug where legacy client tracking was not supported on kernels booted with "fips=1", due to crypto_shash not allowing MD5 to be used. This particular use of MD5 is not for a cryptographic purpose, though, so it is acceptable even when fips=1 (see https://lore.kernel.org/r/dae495a93cbcc482f4ca23c3a0d9360a1fd8c3a8.camel@redhat.com/). Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Acked-by: Jeff Layton Reviewed-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 4 +-- fs/nfsd/nfs4recover.c | 76 +++++-------------------------------------- 2 files changed, 11 insertions(+), 69 deletions(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index df09c5cefb7c..0b5c1a0bf1cf 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -5,6 +5,7 @@ config NFSD depends on FILE_LOCKING depends on FSNOTIFY select CRC32 + select CRYPTO_LIB_MD5 if NFSD_LEGACY_CLIENT_TRACKING select CRYPTO_LIB_SHA256 if NFSD_V4 select LOCKD select SUNRPC @@ -77,8 +78,7 @@ config NFSD_V4 depends on NFSD && PROC_FS select FS_POSIX_ACL select RPCSEC_GSS_KRB5 - select CRYPTO - select CRYPTO_MD5 + select CRYPTO # required by RPCSEC_GSS_KRB5 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b1005abcb903..aa15b30f9dbf 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -32,7 +32,7 @@ * */ -#include +#include #include #include #include @@ -92,57 +92,18 @@ nfs4_reset_creds(const struct cred *original) put_cred(revert_creds(original)); } -static int +static void 
nfs4_make_rec_clidname(char dname[HEXDIR_LEN], const struct xdr_netobj *clname) { u8 digest[MD5_DIGEST_SIZE]; - struct crypto_shash *tfm; - int status; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); - tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(tfm)) { - status = PTR_ERR(tfm); - goto out_no_tfm; - } - status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, - digest); - if (status) - goto out; + md5(clname->data, clname->len, digest); static_assert(HEXDIR_LEN == 2 * MD5_DIGEST_SIZE + 1); sprintf(dname, "%*phN", MD5_DIGEST_SIZE, digest); - - status = 0; -out: - crypto_free_shash(tfm); -out_no_tfm: - return status; -} - -/* - * If we had an error generating the recdir name for the legacy tracker - * then warn the admin. If the error doesn't appear to be transient, - * then disable recovery tracking. - */ -static void -legacy_recdir_name_error(struct nfs4_client *clp, int error) -{ - printk(KERN_ERR "NFSD: unable to generate recoverydir " - "name (%d).\n", error); - - /* - * if the algorithm just doesn't exist, then disable the recovery - * tracker altogether. The crypto libs will generally return this if - * FIPS is enabled as well. - */ - if (error == -ENOENT) { - printk(KERN_ERR "NFSD: disabling legacy clientid tracking. 
" - "Reboot recovery will not function correctly!\n"); - nfsd4_client_tracking_exit(clp->net); - } } static void @@ -171,9 +132,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!nn->rec_file) return; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) - return legacy_recdir_name_error(clp, status); + nfs4_make_rec_clidname(dname, &clp->cl_name); status = nfs4_save_creds(&original_cred); if (status < 0) @@ -354,9 +313,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) - return legacy_recdir_name_error(clp, status); + nfs4_make_rec_clidname(dname, &clp->cl_name); status = mnt_want_write_file(nn->rec_file); if (status) @@ -636,7 +593,6 @@ nfs4_recoverydir(void) static int nfsd4_check_legacy_client(struct nfs4_client *clp) { - int status; char dname[HEXDIR_LEN]; struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -646,11 +602,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) { - legacy_recdir_name_error(clp, status); - return status; - } + nfs4_make_rec_clidname(dname, &clp->cl_name); /* look for it in the reclaim hashtable otherwise */ name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); @@ -1243,13 +1195,10 @@ nfsd4_cld_check(struct nfs4_client *clp) #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (nn->cld_net->cn_has_legacy) { - int status; char dname[HEXDIR_LEN]; struct xdr_netobj name; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) - return -ENOENT; + nfs4_make_rec_clidname(dname, &clp->cl_name); name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { @@ -1294,11 +1243,8 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (cn->cn_has_legacy) { struct xdr_netobj name; char 
dname[HEXDIR_LEN]; - int status; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) - return -ENOENT; + nfs4_make_rec_clidname(dname, &clp->cl_name); name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { @@ -1671,11 +1617,7 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) return NULL; } - copied = nfs4_make_rec_clidname(result + copied, name); - if (copied) { - kfree(result); - return NULL; - } + nfs4_make_rec_clidname(result + copied, name); return result; } From f6dcad1d748e192d8cd01d76736131ae913787af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Oct 2025 13:45:30 +0200 Subject: [PATCH 22/38] MAINTAINERS: add a nfsd blocklayout reviewer Add a minimal entry for the block layout driver to make sure Christoph who wrote the code gets Cced on all patches. The actual maintenance stays with the nfsd maintainer team. Signed-off-by: Christoph Hellwig Acked-by: Jeff Layton Signed-off-by: Chuck Lever --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a19fd4d8beaf..5b5980a68091 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13580,6 +13580,10 @@ F: include/uapi/linux/sunrpc/ F: net/sunrpc/ F: tools/net/sunrpc/ +KERNEL NFSD BLOCK and SCSI LAYOUT DRIVER +R: Christoph Hellwig +F: fs/nfsd/blocklayout* + KERNEL PACMAN PACKAGING (in addition to generic KERNEL BUILD) M: Thomas Weißschuh R: Christian Heusel From 898f94465205e33295c29333a82a249b8f90aa74 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 Oct 2025 09:12:39 -0400 Subject: [PATCH 23/38] lockd: don't allow locking on reexported NFSv2/3 Since commit 9254c8ae9b81 ("nfsd: disallow file locking and delegations for NFSv4 reexport"), file locking when reexporting an NFS mount via NFSv4 is expressly prohibited by nfsd. Do the same in lockd: Add a new nlmsvc_file_cannot_lock() helper that will test whether file locking is allowed for a given file, and return nlm_lck_denied_nolocks if it isn't. 
Signed-off-by: Jeff Layton Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 12 ++++++++++++ fs/lockd/svcshare.c | 6 ++++++ include/linux/lockd/lockd.h | 9 ++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index a31dc9588eb8..3a3d05cfe09a 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -495,6 +495,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); + if (nlmsvc_file_cannot_lock(file)) + return nlm_lck_denied_nolocks; + if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) { async_block = wait; wait = 0; @@ -621,6 +624,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (nlmsvc_file_cannot_lock(file)) + return nlm_lck_denied_nolocks; + if (locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; @@ -678,6 +684,9 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (nlmsvc_file_cannot_lock(file)) + return nlm_lck_denied_nolocks; + /* First, cancel any lock that might be there */ nlmsvc_cancel_blocked(net, file, lock); @@ -715,6 +724,9 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (nlmsvc_file_cannot_lock(file)) + return nlm_lck_denied_nolocks; + if (locks_in_grace(net)) return nlm_lck_denied_grace_period; diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index ade4931b2da2..88c81ce1148d 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -32,6 +32,9 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, struct xdr_netobj *oh = &argp->lock.oh; u8 *ohdata; + if (nlmsvc_file_cannot_lock(file)) + return nlm_lck_denied_nolocks; + for (share = file->f_shares; share; share = share->s_next) { if 
(share->s_host == host && nlm_cmp_owner(share, oh)) goto update; @@ -72,6 +75,9 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, struct nlm_share *share, **shpp; struct xdr_netobj *oh = &argp->lock.oh; + if (nlmsvc_file_cannot_lock(file)) + return nlm_lck_denied_nolocks; + for (shpp = &file->f_shares; (share = *shpp) != NULL; shpp = &share->s_next) { if (share->s_host == host && nlm_cmp_owner(share, oh)) { diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c8f0f9458f2c..330e38776bb2 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -12,6 +12,7 @@ /* XXX: a lot of this should really be under fs/lockd. */ +#include #include #include #include @@ -307,7 +308,7 @@ void nlmsvc_invalidate_all(void); int nlmsvc_unlock_all_by_sb(struct super_block *sb); int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); -static inline struct file *nlmsvc_file_file(struct nlm_file *file) +static inline struct file *nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; @@ -318,6 +319,12 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) return file_inode(nlmsvc_file_file(file)); } +static inline bool +nlmsvc_file_cannot_lock(const struct nlm_file *file) +{ + return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op); +} + static inline int __nlm_privileged_request4(const struct sockaddr *sap) { const struct sockaddr_in *sin = (struct sockaddr_in *)sap; From 75a9b40f3b14d1cc3771c463d32b71cf4e558246 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Oct 2025 09:56:31 -0400 Subject: [PATCH 24/38] xdrgen: Generalize/harden pathname construction Use Python's built-in Path constructor to find the Jinja templates. This provides better error checking, proper use of path component separators, and more reliable location of the template files. 
Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py index b98574a36a4a..e22632cf38fb 100644 --- a/tools/net/sunrpc/xdrgen/generators/__init__.py +++ b/tools/net/sunrpc/xdrgen/generators/__init__.py @@ -2,7 +2,7 @@ """Define a base code generator class""" -import sys +from pathlib import Path from jinja2 import Environment, FileSystemLoader, Template from xdr_ast import _XdrAst, Specification, _RpcProgram, _XdrTypeSpecifier @@ -14,8 +14,11 @@ def create_jinja2_environment(language: str, xdr_type: str) -> Environment: """Open a set of templates based on output language""" match language: case "C": + templates_dir = ( + Path(__file__).parent.parent / "templates" / language / xdr_type + ) environment = Environment( - loader=FileSystemLoader(sys.path[0] + "/templates/C/" + xdr_type + "/"), + loader=FileSystemLoader(templates_dir), trim_blocks=True, lstrip_blocks=True, ) @@ -48,9 +51,7 @@ def find_xdr_program_name(root: Specification) -> str: def header_guard_infix(filename: str) -> str: """Extract the header guard infix from the specification filename""" - basename = filename.split("/")[-1] - program = basename.replace(".x", "") - return program.upper() + return Path(filename).stem.upper() def kernel_c_type(spec: _XdrTypeSpecifier) -> str: From 3bd937b49a2e0d45450c9326e288c8d1612e8ecd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Oct 2025 09:56:32 -0400 Subject: [PATCH 25/38] xdrgen: Make the xdrgen script location-independent The @pythondir@ placeholder is meant for build-time substitution, such as with autoconf. autoconf is not used in the kernel. Let's replace that mechanism with one that better enables the xdrgen script to be run from any directory. 
Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/xdrgen | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen index 43762be39252..3afd0547d67c 100755 --- a/tools/net/sunrpc/xdrgen/xdrgen +++ b/tools/net/sunrpc/xdrgen/xdrgen @@ -10,8 +10,13 @@ __license__ = "GPL-2.0 only" __version__ = "0.2" import sys +from pathlib import Path import argparse +_XDRGEN_DIR = Path(__file__).resolve().parent +if str(_XDRGEN_DIR) not in sys.path: + sys.path.insert(0, str(_XDRGEN_DIR)) + from subcmds import definitions from subcmds import declarations from subcmds import lint From 42ba5bd2e28b1f9e86303e4d176ae0809a53f0b6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Oct 2025 09:56:33 -0400 Subject: [PATCH 26/38] xdrgen: Fix the variable-length opaque field decoder template Ensure that variable-length opaques are decoded into the named field, and do not overwrite the structure itself. Signed-off-by: Chuck Lever --- .../xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 index 9a814de54ae8..65698e20d8cd 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 @@ -2,5 +2,5 @@ {% if annotate %} /* member {{ name }} (variable-length opaque) */ {% endif %} - if (!xdrgen_decode_opaque(xdr, (opaque *)ptr, {{ maxsize }})) + if (!xdrgen_decode_opaque(xdr, &ptr->{{ name }}, {{ maxsize }})) return false; From b0f8e1f1f5e8427ea1d955c48bddb6408f354421 Mon Sep 17 00:00:00 2001 From: Khushal Chitturi Date: Wed, 29 Oct 2025 11:42:36 +0530 Subject: [PATCH 27/38] xdrgen: handle _XdrString in union encoder/decoder Running xdrgen on xdrgen/tests/test.x fails when generating encoder or 
decoder functions for union members of type _XdrString. It was because _XdrString does not have a spec attribute like _XdrBasic, leading to AttributeError. This patch updates emit_union_case_spec_definition and emit_union_case_spec_decoder/encoder to handle _XdrString by assigning type_name = "char *" and avoiding referencing to spec. Testing: Fixed xdrgen tool was run on originally failing test file (tools/net/sunrpc/xdrgen/tests/test.x) and now completes without AttributeError. Modified xdrgen tool was also run against nfs4_1.x (Documentation/sunrpc/xdr/nfs4_1.x). The output header file matches with nfs4_1.h (include/linux/sunrpc/xdrgen/nfs4_1.h). This validates the patch for all XDR input files currently within the kernel. Changes since v2: - Moved the shebang to the first line - Removed SPDX header to match style of current xdrgen files Changes since v1: - Corrected email address in Signed-off-by. - Wrapped patch description lines to 72 characters. Signed-off-by: Khushal Chitturi Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/generators/union.py | 34 ++++++++++++++----- .../templates/C/union/encoder/string.j2 | 6 ++++ 2 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py index 2cca00e279cd..ad1f214ef22a 100644 --- a/tools/net/sunrpc/xdrgen/generators/union.py +++ b/tools/net/sunrpc/xdrgen/generators/union.py @@ -8,7 +8,7 @@ from jinja2 import Environment from generators import SourceGenerator from generators import create_jinja2_environment, get_jinja2_template -from xdr_ast import _XdrBasic, _XdrUnion, _XdrVoid, get_header_name +from xdr_ast import _XdrBasic, _XdrUnion, _XdrVoid, _XdrString, get_header_name from xdr_ast import _XdrDeclaration, _XdrCaseSpec, public_apis, big_endian @@ -40,13 +40,20 @@ def emit_union_case_spec_definition( """Emit a definition for an XDR union's 
case arm""" if isinstance(node.arm, _XdrVoid): return - assert isinstance(node.arm, _XdrBasic) + if isinstance(node.arm, _XdrString): + type_name = "char *" + classifier = "" + else: + type_name = node.arm.spec.type_name + classifier = node.arm.spec.c_classifier + + assert isinstance(node.arm, (_XdrBasic, _XdrString)) template = get_jinja2_template(environment, "definition", "case_spec") print( template.render( name=node.arm.name, - type=node.arm.spec.type_name, - classifier=node.arm.spec.c_classifier, + type=type_name, + classifier=classifier, ) ) @@ -84,6 +91,12 @@ def emit_union_case_spec_decoder( if isinstance(node.arm, _XdrVoid): return + if isinstance(node.arm, _XdrString): + type_name = "char *" + classifier = "" + else: + type_name = node.arm.spec.type_name + classifier = node.arm.spec.c_classifier if big_endian_discriminant: template = get_jinja2_template(environment, "decoder", "case_spec_be") @@ -92,13 +105,13 @@ def emit_union_case_spec_decoder( for case in node.values: print(template.render(case=case)) - assert isinstance(node.arm, _XdrBasic) + assert isinstance(node.arm, (_XdrBasic, _XdrString)) template = get_jinja2_template(environment, "decoder", node.arm.template) print( template.render( name=node.arm.name, - type=node.arm.spec.type_name, - classifier=node.arm.spec.c_classifier, + type=type_name, + classifier=classifier, ) ) @@ -169,7 +182,10 @@ def emit_union_case_spec_encoder( if isinstance(node.arm, _XdrVoid): return - + if isinstance(node.arm, _XdrString): + type_name = "char *" + else: + type_name = node.arm.spec.type_name if big_endian_discriminant: template = get_jinja2_template(environment, "encoder", "case_spec_be") else: @@ -181,7 +197,7 @@ def emit_union_case_spec_encoder( print( template.render( name=node.arm.name, - type=node.arm.spec.type_name, + type=type_name, ) ) diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 new file mode 100644 index 
000000000000..2f035a64f1f4 --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 @@ -0,0 +1,6 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +{% if annotate %} + /* member {{ name }} (variable-length string) */ +{% endif %} + if (!xdrgen_encode_string(xdr, ptr->u.{{ name }}, {{ maxsize }})) + return false; From 14282cc3cfa25b7c137fb2f63ea0db61311d45e3 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 3 Nov 2025 12:57:34 -0500 Subject: [PATCH 28/38] NFSD: don't start nfsd if sv_permsocks is empty Previously, while trying to create a server instance, if no listening sockets were present then default parameter udp and tcp listeners were created. It's unclear what the purpose of starting these listeners was and how this could have been triggered by the userland setup. This patch proposes to ensure the reverse: that we never end up in a situation where no listener sockets are created and we are trying to create nfsd threads. The problem it solves is: when nfs.conf only has tcp=n (and nothing else for the choice of transports), nfsdctl would still start the server and create udp and tcp listeners. 
Signed-off-by: Olga Kornievskaia Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7057ddd7a0a8..b08ae85d53ef 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -249,27 +249,6 @@ int nfsd_nrthreads(struct net *net) return rv; } -static int nfsd_init_socks(struct net *net, const struct cred *cred) -{ - int error; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - if (!list_empty(&nn->nfsd_serv->sv_permsocks)) - return 0; - - error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, - SVC_SOCK_DEFAULTS, cred); - if (error < 0) - return error; - - error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, - SVC_SOCK_DEFAULTS, cred); - if (error < 0) - return error; - - return 0; -} - static int nfsd_users = 0; static int nfsd_startup_generic(void) @@ -377,9 +356,12 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) ret = nfsd_startup_generic(); if (ret) return ret; - ret = nfsd_init_socks(net, cred); - if (ret) + + if (list_empty(&nn->nfsd_serv->sv_permsocks)) { + pr_warn("NFSD: Failed to start, no listeners configured.\n"); + ret = -EIO; goto out_socks; + } if (nfsd_needs_lockd(nn) && !nn->lockd_up) { ret = lockd_up(net, cred); From f7cb94fad4e6cec354a3ea779f91fe5560fb72b6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 5 Nov 2025 10:26:06 -0500 Subject: [PATCH 29/38] xdrgen: Fix union declarations Add a missing template file. This file is used when a union is defined as a public API (ie, "pragma public ;"). 
Signed-off-by: Chuck Lever --- .../net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 new file mode 100644 index 000000000000..816291184e8c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 @@ -0,0 +1,4 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, struct {{ name }} *ptr); +bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const struct {{ name }} *value); From 1c873a2fd1109302a7687524d541ed815c13c026 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 5 Nov 2025 10:26:07 -0500 Subject: [PATCH 30/38] xdrgen: Don't generate unnecessary semicolon The Jinja2 templates add a semicolon at the end of every function. The C language does not require this punctuation. 
Signed-off-by: Chuck Lever --- tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 | 2 +- .../xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 | 2 +- .../xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 | 2 +- .../xdrgen/templates/C/typedef/decoder/variable_length_array.j2 | 2 +- .../templates/C/typedef/decoder/variable_length_opaque.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 | 2 +- .../xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 | 2 +- .../xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 | 2 +- .../xdrgen/templates/C/typedef/encoder/variable_length_array.j2 | 2 +- .../templates/C/typedef/encoder/variable_length_opaque.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 | 2 +- tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 +++ 
b/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 index da4709403dc9..b215e157dfa7 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr) /* (basic) */ {% endif %} return xdrgen_decode_{{ type }}(xdr, ptr); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 index d7c80e472fe3..c8953719e626 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 @@ -22,4 +22,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr return false; } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 
b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 index bdc7bd24ffb1..c854fc8c74e3 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr /* (fixed-length opaque) */ {% endif %} return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) == 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 index 56c5a17d6a70..bcbc1758aae9 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr /* (variable-length string) */ {% endif %} return xdrgen_decode_string(xdr, ptr, {{ maxsize }}); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 index e74ffdd98463..a59cc1f38eed 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 @@ -23,4 +23,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr if (!xdrgen_decode_{{ type }}(xdr, &ptr->element[i])) return false; return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 index f28f8b228ad5..eb05f53e1041 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct 
xdr_stream *xdr, {{ classifier }}{{ name }} *ptr /* (variable-length opaque) */ {% endif %} return xdrgen_decode_opaque(xdr, ptr, {{ maxsize }}); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 index 35effe67e4ef..0d21dd0b723a 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 @@ -18,4 +18,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (basic) */ {% endif %} return xdrgen_encode_{{ type }}(xdr, value); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 index 95202ad5ad2d..ec8cd6509514 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 @@ -22,4 +22,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } return false; } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 index 9c66a11b9912..b53fa87e1858 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (fixed-length opaque) */ {% endif %} return xdr_stream_encode_opaque_fixed(xdr, value, {{ size }}) >= 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 index 3d490ff180d0..28b81f1d0bd6 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 +++ 
b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 @@ -14,4 +14,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (variable-length string) */ {% endif %} return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 index 2d2384f64918..ff093c281d51 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 @@ -27,4 +27,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } {% endif %} return false; return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 index 8508f13c95b9..2e89592fa702 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (variable-length opaque) */ {% endif %} return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 index fdc2dfd1843b..39d8d6c5094d 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 @@ -1,4 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 index fdc2dfd1843b..39d8d6c5094d 100644 --- 
a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 @@ -1,4 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} } return true; -}; +} From 6f52063db9aabdaabea929b1e998af98c2e8d917 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 5 Nov 2025 12:45:54 -0800 Subject: [PATCH 31/38] NFSD: use correct reservation type in nfsd4_scsi_fence_client The reservation type argument for the pr_preempt call should match the one used in nfsd4_block_get_device_info_scsi. Fixes: f99d4fbdae67 ("nfsd: add SCSI layout support") Cc: stable@vger.kernel.org Signed-off-by: Dai Ngo Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 101cccbee4a3..06769576e309 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -400,7 +400,8 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, - nfsd4_scsi_pr_key(clp), 0, true); + nfsd4_scsi_pr_key(clp), + PR_EXCLUSIVE_ACCESS_REG_ONLY, true); } const struct nfsd4_layout_ops scsi_layout_ops = { From 99f5aa14f03e364c43a3d1b5459a021b0201f5c5 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 5 Nov 2025 12:45:55 -0800 Subject: [PATCH 32/38] NFSD: Add trace point for SCSI fencing operation. Add trace point to print client IP address, net namespace number, device name and status of SCSI pr_preempt command. 
Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 5 ++++- fs/nfsd/trace.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 06769576e309..afa16d7a8013 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -13,6 +13,7 @@ #include "pnfs.h" #include "filecache.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS @@ -398,10 +399,12 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; + int status; - bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, + status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), PR_EXCLUSIVE_ACCESS_REG_ONLY, true); + trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); } const struct nfsd4_layout_ops scsi_layout_ops = { diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index bfd41236aff2..85a1521ad757 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -2614,6 +2614,44 @@ DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \ DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr); DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs); +DECLARE_EVENT_CLASS(nfsd_pnfs_class, + TP_PROTO( + const struct nfs4_client *clp, + const char *dev, + int error + ), + TP_ARGS(clp, dev, error), + TP_STRUCT__entry( + __sockaddr(addr, sizeof(struct sockaddr_in6)) + __field(unsigned int, netns_ino) + __string(dev, dev) + __field(int, error) + ), + TP_fast_assign( + __assign_sockaddr(addr, &clp->cl_addr, + sizeof(struct sockaddr_in6)); + __entry->netns_ino = clp->net->ns.inum; + __assign_str(dev); + __entry->error = error; + ), + TP_printk("client=%pISpc nn=%d dev=%s error=%d", + __get_sockaddr(addr), + __entry->netns_ino, + __get_str(dev), + __entry->error + ) +); + +#define 
DEFINE_NFSD_PNFS_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_pnfs_class, nfsd_pnfs_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const char *dev, \ + int error \ + ), \ + TP_ARGS(clp, dev, error)) + +DEFINE_NFSD_PNFS_ERR_EVENT(fence); #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH From e3e8e176ca4876e6212582022ad80835dddc9de4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 11 Nov 2025 09:59:30 -0500 Subject: [PATCH 33/38] NFSD: Make FILE_SYNC WRITEs comply with spec Mike noted that when NFSD responds to an NFS_FILE_SYNC WRITE, it does not also persist file time stamps. To wit, Section 18.32.3 of RFC 8881 mandates: > The client specifies with the stable parameter the method of how > the data is to be processed by the server. If stable is > FILE_SYNC4, the server MUST commit the data written plus all file > system metadata to stable storage before returning results. This > corresponds to the NFSv2 protocol semantics. Any other behavior > constitutes a protocol violation. If stable is DATA_SYNC4, then > the server MUST commit all of the data to stable storage and > enough of the metadata to retrieve the data before returning. Commit 3f3503adb332 ("NFSD: Use vfs_iocb_iter_write()") replaced: - flags |= RWF_SYNC; with: + kiocb.ki_flags |= IOCB_DSYNC; which appears to be correct given: if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; in kiocb_set_rw_flags(). However the author of that commit did not appreciate that the previous line in kiocb_set_rw_flags() results in IOCB_SYNC also being set: kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); RWF_SUPPORTED contains RWF_SYNC, and RWF_SYNC is the same bit as IOCB_SYNC. Reviewers at the time did not catch the omission. 
Reported-by: Mike Snitzer Closes: https://lore.kernel.org/linux-nfs/20251018005431.3403-1-cel@kernel.org/T/#t Fixes: 3f3503adb332 ("NFSD: Use vfs_iocb_iter_write()") Cc: stable@vger.kernel.org Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f537a7b4ee01..5333d49910d9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1314,8 +1314,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, stable = NFS_UNSTABLE; init_sync_kiocb(&kiocb, file); kiocb.ki_pos = offset; - if (stable && !fhp->fh_use_wgather) - kiocb.ki_flags |= IOCB_DSYNC; + if (likely(!fhp->fh_use_wgather)) { + switch (stable) { + case NFS_FILE_SYNC: + /* persist data and timestamps */ + kiocb.ki_flags |= IOCB_DSYNC | IOCB_SYNC; + break; + case NFS_DATA_SYNC: + /* persist data only */ + kiocb.ki_flags |= IOCB_DSYNC; + break; + } + } nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); From 06c5c97293e3fca99ce15da157068edf45a7c6e4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 11 Nov 2025 09:59:31 -0500 Subject: [PATCH 34/38] NFSD: Implement NFSD_IO_DIRECT for NFS WRITE When NFSD_IO_DIRECT is selected via the /sys/kernel/debug/nfsd/io_cache_write experimental tunable, split incoming unaligned NFS WRITE requests into a prefix, middle and suffix segment, as needed. The middle segment is now DIO-aligned and the prefix and/or suffix are unaligned. Synchronous buffered IO is used for the unaligned segments, and IOCB_DIRECT is used for the middle DIO-aligned extent. Although IOCB_DIRECT avoids the use of the page cache, by itself it doesn't guarantee data durability. For UNSTABLE WRITE requests, durability is obtained by a subsequent NFS COMMIT request. 
Signed-off-by: Mike Snitzer Co-developed-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/debugfs.c | 1 + fs/nfsd/trace.h | 2 + fs/nfsd/vfs.c | 145 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 144 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index 00eb1ecef6ac..7f44689e0a53 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val) switch (val) { case NFSD_IO_BUFFERED: case NFSD_IO_DONTCACHE: + case NFSD_IO_DIRECT: nfsd_io_cache_write = val; break; default: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 85a1521ad757..5ae2a611e57f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -469,6 +469,8 @@ DEFINE_NFSD_IO_EVENT(read_io_done); DEFINE_NFSD_IO_EVENT(read_done); DEFINE_NFSD_IO_EVENT(write_start); DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_direct); +DEFINE_NFSD_IO_EVENT(write_vector); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); DEFINE_NFSD_IO_EVENT(commit_start); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5333d49910d9..ab46301da4ae 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1254,6 +1254,136 @@ static int wait_for_concurrent_writes(struct file *file) return err; } +struct nfsd_write_dio_seg { + struct iov_iter iter; + int flags; +}; + +static unsigned long +iov_iter_bvec_offset(const struct iov_iter *iter) +{ + return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset); +} + +static void +nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment, + struct bio_vec *bvec, unsigned int nvecs, + unsigned long total, size_t start, size_t len, + struct kiocb *iocb) +{ + iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total); + if (start) + iov_iter_advance(&segment->iter, start); + iov_iter_truncate(&segment->iter, len); + segment->flags = iocb->ki_flags; +} + +static unsigned int +nfsd_write_dio_iters_init(struct nfsd_file 
*nf, struct bio_vec *bvec, + unsigned int nvecs, struct kiocb *iocb, + unsigned long total, + struct nfsd_write_dio_seg segments[3]) +{ + u32 offset_align = nf->nf_dio_offset_align; + loff_t prefix_end, orig_end, middle_end; + u32 mem_align = nf->nf_dio_mem_align; + size_t prefix, middle, suffix; + loff_t offset = iocb->ki_pos; + unsigned int nsegs = 0; + + /* + * Check if direct I/O is feasible for this write request. + * If alignments are not available, the write is too small, + * or no alignment can be found, fall back to buffered I/O. + */ + if (unlikely(!mem_align || !offset_align) || + unlikely(total < max(offset_align, mem_align))) + goto no_dio; + + prefix_end = round_up(offset, offset_align); + orig_end = offset + total; + middle_end = round_down(orig_end, offset_align); + + prefix = prefix_end - offset; + middle = middle_end - prefix_end; + suffix = orig_end - middle_end; + + if (!middle) + goto no_dio; + + if (prefix) + nfsd_write_dio_seg_init(&segments[nsegs++], bvec, + nvecs, total, 0, prefix, iocb); + + nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs, + total, prefix, middle, iocb); + + /* + * Check if the bvec iterator is aligned for direct I/O. + * + * bvecs generated from RPC receive buffers are contiguous: After + * the first bvec, all subsequent bvecs start at bv_offset zero + * (page-aligned). Therefore, only the first bvec is checked. + */ + if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1)) + goto no_dio; + segments[nsegs].flags |= IOCB_DIRECT; + nsegs++; + + if (suffix) + nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total, + prefix + middle, suffix, iocb); + + return nsegs; + +no_dio: + /* No DIO alignment possible - pack into single non-DIO segment. 
*/ + nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0, + total, iocb); + return 1; +} + +static noinline_for_stack int +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, unsigned int nvecs, + unsigned long *cnt, struct kiocb *kiocb) +{ + struct nfsd_write_dio_seg segments[3]; + struct file *file = nf->nf_file; + unsigned int nsegs, i; + ssize_t host_err; + + nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs, + kiocb, *cnt, segments); + + *cnt = 0; + for (i = 0; i < nsegs; i++) { + kiocb->ki_flags = segments[i].flags; + if (kiocb->ki_flags & IOCB_DIRECT) + trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos, + segments[i].iter.count); + else { + trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos, + segments[i].iter.count); + /* + * Mark the I/O buffer as evict-able to reduce + * memory contention. + */ + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE) + kiocb->ki_flags |= IOCB_DONTCACHE; + } + + host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter); + if (host_err < 0) + return host_err; + *cnt += host_err; + if (host_err < segments[i].iter.count) + break; /* partial write */ + } + + return 0; +} + /** * nfsd_vfs_write - write data to an already-open file * @rqstp: RPC execution context @@ -1328,25 +1458,32 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, } nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); + since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); switch (nfsd_io_cache_write) { - case NFSD_IO_BUFFERED: + case NFSD_IO_DIRECT: + host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs, + cnt, &kiocb); break; case NFSD_IO_DONTCACHE: if (file->f_op->fop_flags & FOP_DONTCACHE) kiocb.ki_flags |= IOCB_DONTCACHE; + fallthrough; + case NFSD_IO_BUFFERED: + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); + host_err = vfs_iocb_iter_write(file, &kiocb, &iter); + if 
(host_err < 0) + break; + *cnt = host_err; break; } - host_err = vfs_iocb_iter_write(file, &kiocb, &iter); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; } - *cnt = host_err; nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); host_err = filemap_check_wb_err(file->f_mapping, since); From fa8d4e6784d1b6a6eaa3911bac993181631d2856 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 11 Nov 2025 09:59:32 -0500 Subject: [PATCH 35/38] NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst This document details the NFSD IO modes that are configurable using NFSD's experimental debugfs interfaces: /sys/kernel/debug/nfsd/io_cache_read /sys/kernel/debug/nfsd/io_cache_write This document will evolve as NFSD's interfaces do (e.g. if/when NFSD's debugfs interfaces are replaced with per-export controls). Future updates will provide more specific guidance and howto information to help others use and evaluate NFSD's IO modes: BUFFERED, DONTCACHE and DIRECT. Signed-off-by: Mike Snitzer Signed-off-by: Chuck Lever --- .../filesystems/nfs/nfsd-io-modes.rst | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 Documentation/filesystems/nfs/nfsd-io-modes.rst diff --git a/Documentation/filesystems/nfs/nfsd-io-modes.rst b/Documentation/filesystems/nfs/nfsd-io-modes.rst new file mode 100644 index 000000000000..e3a522d09766 --- /dev/null +++ b/Documentation/filesystems/nfs/nfsd-io-modes.rst @@ -0,0 +1,144 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +NFSD IO MODES +============= + +Overview +======== + +NFSD has historically always used buffered IO when servicing READ and +WRITE operations. BUFFERED is NFSD's default IO mode, but it is possible +to override that default to use either DONTCACHE or DIRECT IO modes. + +Experimental NFSD debugfs interfaces are available to allow the NFSD IO +mode used for READ and WRITE to be configured independently. 
See both: +- /sys/kernel/debug/nfsd/io_cache_read +- /sys/kernel/debug/nfsd/io_cache_write + +The default value for both io_cache_read and io_cache_write reflects +NFSD's default IO mode (which is NFSD_IO_BUFFERED=0). + +Based on the configured settings, NFSD's IO will either be: +- cached using page cache (NFSD_IO_BUFFERED=0) +- cached but removed from page cache on completion (NFSD_IO_DONTCACHE=1) +- not cached stable_how=NFS_UNSTABLE (NFSD_IO_DIRECT=2) + +To set an NFSD IO mode, write a supported value (0 - 2) to the +corresponding IO operation's debugfs interface, e.g.: + echo 2 > /sys/kernel/debug/nfsd/io_cache_read + echo 2 > /sys/kernel/debug/nfsd/io_cache_write + +To check which IO mode NFSD is using for READ or WRITE, simply read the +corresponding IO operation's debugfs interface, e.g.: + cat /sys/kernel/debug/nfsd/io_cache_read + cat /sys/kernel/debug/nfsd/io_cache_write + +If you experiment with NFSD's IO modes on a recent kernel and have +interesting results, please report them to linux-nfs@vger.kernel.org + +NFSD DONTCACHE +============== + +DONTCACHE offers a hybrid approach to servicing IO that aims to offer +the benefits of using DIRECT IO without any of the strict alignment +requirements that DIRECT IO imposes. To achieve this, buffered IO is used +but the IO is flagged to "drop behind" (meaning associated pages are +dropped from the page cache) when IO completes. + +DONTCACHE aims to avoid what has proven to be a fairly significant +limitation of Linux's memory management subsystem if/when large amounts of +data are infrequently accessed (e.g. read once _or_ written once but not +read until much later). Such use-cases are particularly problematic +because the page cache will eventually become a bottleneck to servicing +new IO requests.
+ +For more context on DONTCACHE, please see these Linux commit headers: +- Overview: 9ad6344568cc3 ("mm/filemap: change filemap_create_folio() + to take a struct kiocb") +- for READ: 8026e49bff9b1 ("mm/filemap: add read support for + RWF_DONTCACHE") +- for WRITE: 974c5e6139db3 ("xfs: flag as supporting FOP_DONTCACHE") + +NFSD_IO_DONTCACHE will fall back to NFSD_IO_BUFFERED if the underlying +filesystem doesn't indicate support by setting FOP_DONTCACHE. + +NFSD DIRECT +=========== + +DIRECT IO doesn't make use of the page cache, as such it is able to +avoid the Linux memory management's page reclaim scalability problems +without resorting to the hybrid use of page cache that DONTCACHE does. + +Some workloads benefit from NFSD avoiding the page cache, particularly +those with a working set that is significantly larger than available +system memory. The pathological worst-case workload that NFSD DIRECT has +proven to help most is: NFS client issuing large sequential IO to a file +that is 2-3 times larger than the NFS server's available system memory. +The reason for such improvement is NFSD DIRECT eliminates a lot of work +that the memory management subsystem would otherwise be required to +perform (e.g. page allocation, dirty writeback, page reclaim). When +using NFSD DIRECT, kswapd and kcompactd are no longer commanding CPU +time trying to find adequate free pages so that forward IO progress can +be made. + +The performance win associated with using NFSD DIRECT was previously +discussed on linux-nfs, see: +https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/ +But in summary: +- NFSD DIRECT can significantly reduce memory requirements +- NFSD DIRECT can reduce CPU load by avoiding costly page reclaim work +- NFSD DIRECT can offer more deterministic IO performance + +As always, your mileage may vary and so it is important to carefully +consider if/when it is beneficial to make use of NFSD DIRECT. 
When +assessing comparative performance of your workload please be sure to log +relevant performance metrics during testing (e.g. memory usage, cpu +usage, IO performance). Using perf to collect perf data that may be used +to generate a "flamegraph" for work Linux must perform on behalf of your +test is a really meaningful way to compare the relative health of the +system and how switching NFSD's IO mode changes what is observed. + +If NFSD_IO_DIRECT is specified by writing 2 to +NFSD's debugfs interfaces, ideally the IO will be aligned relative to +the underlying block device's logical_block_size. Also the memory buffer +used to store the READ or WRITE payload must be aligned relative to the +underlying block device's dma_alignment. + +But NFSD DIRECT does handle misaligned IO in terms of O_DIRECT as best +it can: + +Misaligned READ: + If NFSD_IO_DIRECT is used, expand any misaligned READ to the next + DIO-aligned block (on either end of the READ). The expanded READ is + verified to have proper offset/len (logical_block_size) and + dma_alignment checking. + +Misaligned WRITE: + If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start, + middle and end as needed. The large middle segment is DIO-aligned + and the start and/or end are misaligned. Buffered IO is used for the + misaligned segments and O_DIRECT is used for the middle DIO-aligned + segment. The misaligned segments are written with DONTCACHE + buffered IO when the underlying filesystem sets FOP_DONTCACHE, + and with normal buffered IO otherwise. + +Tracing: + The nfsd_read_direct trace event shows how NFSD expands any + misaligned READ to the next DIO-aligned block (on either end of the + original READ, as needed).
+ + This combination of trace events is useful for READs: + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_direct/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable + echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable + + The nfsd_write_direct trace event shows how NFSD splits a given + misaligned WRITE into a DIO-aligned middle segment. + + This combination of trace events is useful for WRITEs: + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_direct/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable + echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable From 21478b6ecaa443ee5a89ae744559583ffbe50f30 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 3 Dec 2025 08:09:09 +0700 Subject: [PATCH 36/38] NFSD: Add toctree entry for NFSD IO modes docs Commit fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst") adds documentation for NFSD I/O modes, but it forgets to add toctree entry for it. Hence, Sphinx reports: Documentation/filesystems/nfs/nfsd-io-modes.rst: WARNING: document isn't included in any toctree [toc.not_included] Add the entry. 
Fixes: fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20251202152506.7a2d2d41@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Reviewed-by: Jeff Layton Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Chuck Lever --- Documentation/filesystems/nfs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/filesystems/nfs/index.rst b/Documentation/filesystems/nfs/index.rst index 95c2c009874c..a29a212b5b4d 100644 --- a/Documentation/filesystems/nfs/index.rst +++ b/Documentation/filesystems/nfs/index.rst @@ -13,5 +13,6 @@ NFS rpc-cache rpc-server-gss nfs41-server + nfsd-io-modes knfsd-stats reexport From 4fcf9952fb3137c64e32edb5fcd03da6febe4724 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 3 Dec 2025 08:09:10 +0700 Subject: [PATCH 37/38] NFSD: nfsd-io-modes: Wrap shell snippets in literal code blocks Sphinx reports htmldocs indentation warnings: Documentation/filesystems/nfs/nfsd-io-modes.rst:29: ERROR: Unexpected indentation. [docutils] Documentation/filesystems/nfs/nfsd-io-modes.rst:34: ERROR: Unexpected indentation. [docutils] Fix these by wrapping shell snippets in literal code blocks. 
Fixes: fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20251202152506.7a2d2d41@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Reviewed-by: Jeff Layton Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Chuck Lever --- .../filesystems/nfs/nfsd-io-modes.rst | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/nfs/nfsd-io-modes.rst b/Documentation/filesystems/nfs/nfsd-io-modes.rst index e3a522d09766..fa47c4d3dfb9 100644 --- a/Documentation/filesystems/nfs/nfsd-io-modes.rst +++ b/Documentation/filesystems/nfs/nfsd-io-modes.rst @@ -25,12 +25,14 @@ Based on the configured settings, NFSD's IO will either be: - not cached stable_how=NFS_UNSTABLE (NFSD_IO_DIRECT=2) To set an NFSD IO mode, write a supported value (0 - 2) to the -corresponding IO operation's debugfs interface, e.g.: +corresponding IO operation's debugfs interface, e.g.:: + echo 2 > /sys/kernel/debug/nfsd/io_cache_read echo 2 > /sys/kernel/debug/nfsd/io_cache_write To check which IO mode NFSD is using for READ or WRITE, simply read the -corresponding IO operation's debugfs interface, e.g.: +corresponding IO operation's debugfs interface, e.g.:: + cat /sys/kernel/debug/nfsd/io_cache_read cat /sys/kernel/debug/nfsd/io_cache_write @@ -128,17 +130,19 @@ Tracing: misaligned READ to the next DIO-aligned block (on either end of the original READ, as needed). 
- This combination of trace events is useful for READs: - echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable - echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_direct/enable - echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable - echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable + This combination of trace events is useful for READs:: + + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_direct/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable + echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable The nfsd_write_direct trace event shows how NFSD splits a given misaligned WRITE into a DIO-aligned middle segment. - This combination of trace events is useful for WRITEs: - echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable - echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_direct/enable - echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable - echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable + This combination of trace events is useful for WRITEs:: + + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_direct/enable + echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable + echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable From df8c841dd92a7f262ad4fa649aa493b181e02812 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 3 Dec 2025 08:09:11 +0700 Subject: [PATCH 38/38] NFSD: nfsd-io-modes: Separate lists Sphinx reports htmldocs indentation warnings: Documentation/filesystems/nfs/nfsd-io-modes.rst:58: ERROR: Unexpected indentation. [docutils] Documentation/filesystems/nfs/nfsd-io-modes.rst:59: WARNING: Block quote ends without a blank line; unexpected unindent. 
[docutils] These caused the lists to be shown as long running paragraphs merged with their previous paragraphs. Fix these by separating the lists with a blank line. Fixes: fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20251202152506.7a2d2d41@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Reviewed-by: Jeff Layton Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Chuck Lever --- Documentation/filesystems/nfs/nfsd-io-modes.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/filesystems/nfs/nfsd-io-modes.rst b/Documentation/filesystems/nfs/nfsd-io-modes.rst index fa47c4d3dfb9..0fd6e82478fe 100644 --- a/Documentation/filesystems/nfs/nfsd-io-modes.rst +++ b/Documentation/filesystems/nfs/nfsd-io-modes.rst @@ -13,6 +13,7 @@ to override that default to use either DONTCACHE or DIRECT IO modes. Experimental NFSD debugfs interfaces are available to allow the NFSD IO mode used for READ and WRITE to be configured independently. See both: + - /sys/kernel/debug/nfsd/io_cache_read - /sys/kernel/debug/nfsd/io_cache_write @@ -20,6 +21,7 @@ The default value for both io_cache_read and io_cache_write reflects NFSD's default IO mode (which is NFSD_IO_BUFFERED=0). Based on the configured settings, NFSD's IO will either be: + - cached using page cache (NFSD_IO_BUFFERED=0) - cached but removed from page cache on completion (NFSD_IO_DONTCACHE=1) - not cached stable_how=NFS_UNSTABLE (NFSD_IO_DIRECT=2) @@ -56,6 +58,7 @@ because the page cache will eventually become a bottleneck to servicing new IO requests. For more context on DONTCACHE, please see these Linux commit headers: + - Overview: 9ad6344568cc3 ("mm/filemap: change filemap_create_folio() to take a struct kiocb") - for READ: 8026e49bff9b1 ("mm/filemap: add read support for @@ -87,7 +90,9 @@ be made. 
The performance win associated with using NFSD DIRECT was previously discussed on linux-nfs, see: https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/ + But in summary: + - NFSD DIRECT can significantly reduce memory requirements - NFSD DIRECT can reduce CPU load by avoiding costly page reclaim work - NFSD DIRECT can offer more deterministic IO performance