NFSD: Implement NFSD_IO_DIRECT for NFS WRITE

When NFSD_IO_DIRECT is selected via the
/sys/kernel/debug/nfsd/io_cache_write experimental tunable, split
each incoming unaligned NFS WRITE request into prefix, middle, and
suffix segments, as needed. The middle segment is DIO-aligned; the
prefix and/or suffix are not. Synchronous buffered IO is used for
the unaligned segments, and IOCB_DIRECT is used for the DIO-aligned
middle extent.

Although IOCB_DIRECT avoids the use of the page cache, by itself it
doesn't guarantee data durability. For UNSTABLE WRITE requests,
durability is obtained by a subsequent NFS COMMIT request.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
This commit is contained in:
Mike Snitzer
2025-11-11 09:59:31 -05:00
committed by Chuck Lever
parent e3e8e176ca
commit 06c5c97293
3 changed files with 144 additions and 4 deletions

View File

@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
switch (val) {
case NFSD_IO_BUFFERED:
case NFSD_IO_DONTCACHE:
case NFSD_IO_DIRECT:
nfsd_io_cache_write = val;
break;
default:

View File

@@ -469,6 +469,8 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
DEFINE_NFSD_IO_EVENT(read_done);
DEFINE_NFSD_IO_EVENT(write_start);
DEFINE_NFSD_IO_EVENT(write_opened);
/*
 * NOTE(review): presumably backs trace_nfsd_write_direct(), which
 * nfsd_direct_write() calls before writing an IOCB_DIRECT segment.
 */
DEFINE_NFSD_IO_EVENT(write_direct);
/*
 * NOTE(review): presumably backs trace_nfsd_write_vector(), which
 * nfsd_direct_write() calls before writing a segment without IOCB_DIRECT.
 */
DEFINE_NFSD_IO_EVENT(write_vector);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
DEFINE_NFSD_IO_EVENT(commit_start);

View File

@@ -1254,6 +1254,136 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
/*
 * One piece of an NFS WRITE payload that has been split for direct I/O.
 * Instances are filled in by nfsd_write_dio_seg_init() and consumed, in
 * array order, by nfsd_direct_write().
 */
struct nfsd_write_dio_seg {
/* Iterator over this segment's part of the payload bvec array. */
struct iov_iter iter;
/*
 * kiocb flags to use when writing this segment. Starts as a copy of the
 * caller's ki_flags; IOCB_DIRECT is OR-ed in for the aligned middle
 * segment.
 */
int flags;
};
static unsigned long
iov_iter_bvec_offset(const struct iov_iter *iter)
{
return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset);
}
/*
 * Set up @segment to cover @len bytes of the payload, beginning @start
 * bytes into it.
 *
 * @bvec/@nvecs/@total describe the whole payload. The iterator is built
 * over all of it, advanced past the first @start bytes, then truncated to
 * @len bytes. The segment's flags start out as a copy of @iocb->ki_flags.
 */
static void
nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment,
			struct bio_vec *bvec, unsigned int nvecs,
			unsigned long total, size_t start, size_t len,
			struct kiocb *iocb)
{
	struct iov_iter *it = &segment->iter;

	segment->flags = iocb->ki_flags;

	iov_iter_bvec(it, ITER_SOURCE, bvec, nvecs, total);
	if (start != 0)
		iov_iter_advance(it, start);
	iov_iter_truncate(it, len);
}
static unsigned int
nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec,
unsigned int nvecs, struct kiocb *iocb,
unsigned long total,
struct nfsd_write_dio_seg segments[3])
{
u32 offset_align = nf->nf_dio_offset_align;
loff_t prefix_end, orig_end, middle_end;
u32 mem_align = nf->nf_dio_mem_align;
size_t prefix, middle, suffix;
loff_t offset = iocb->ki_pos;
unsigned int nsegs = 0;
/*
* Check if direct I/O is feasible for this write request.
* If alignments are not available, the write is too small,
* or no alignment can be found, fall back to buffered I/O.
*/
if (unlikely(!mem_align || !offset_align) ||
unlikely(total < max(offset_align, mem_align)))
goto no_dio;
prefix_end = round_up(offset, offset_align);
orig_end = offset + total;
middle_end = round_down(orig_end, offset_align);
prefix = prefix_end - offset;
middle = middle_end - prefix_end;
suffix = orig_end - middle_end;
if (!middle)
goto no_dio;
if (prefix)
nfsd_write_dio_seg_init(&segments[nsegs++], bvec,
nvecs, total, 0, prefix, iocb);
nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs,
total, prefix, middle, iocb);
/*
* Check if the bvec iterator is aligned for direct I/O.
*
* bvecs generated from RPC receive buffers are contiguous: After
* the first bvec, all subsequent bvecs start at bv_offset zero
* (page-aligned). Therefore, only the first bvec is checked.
*/
if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1))
goto no_dio;
segments[nsegs].flags |= IOCB_DIRECT;
nsegs++;
if (suffix)
nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total,
prefix + middle, suffix, iocb);
return nsegs;
no_dio:
/* No DIO alignment possible - pack into single non-DIO segment. */
nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0,
total, iocb);
return 1;
}
static noinline_for_stack int
nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_file *nf, unsigned int nvecs,
unsigned long *cnt, struct kiocb *kiocb)
{
struct nfsd_write_dio_seg segments[3];
struct file *file = nf->nf_file;
unsigned int nsegs, i;
ssize_t host_err;
nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs,
kiocb, *cnt, segments);
*cnt = 0;
for (i = 0; i < nsegs; i++) {
kiocb->ki_flags = segments[i].flags;
if (kiocb->ki_flags & IOCB_DIRECT)
trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos,
segments[i].iter.count);
else {
trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos,
segments[i].iter.count);
/*
* Mark the I/O buffer as evict-able to reduce
* memory contention.
*/
if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
kiocb->ki_flags |= IOCB_DONTCACHE;
}
host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter);
if (host_err < 0)
return host_err;
*cnt += host_err;
if (host_err < segments[i].iter.count)
break; /* partial write */
}
return 0;
}
/**
* nfsd_vfs_write - write data to an already-open file
* @rqstp: RPC execution context
@@ -1328,25 +1458,32 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
switch (nfsd_io_cache_write) {
case NFSD_IO_BUFFERED:
case NFSD_IO_DIRECT:
host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs,
cnt, &kiocb);
break;
case NFSD_IO_DONTCACHE:
if (file->f_op->fop_flags & FOP_DONTCACHE)
kiocb.ki_flags |= IOCB_DONTCACHE;
fallthrough;
case NFSD_IO_BUFFERED:
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0)
break;
*cnt = host_err;
break;
}
host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
*cnt = host_err;
nfsd_stats_io_write_add(nn, exp, *cnt);
fsnotify_modify(file);
host_err = filemap_check_wb_err(file->f_mapping, since);