Merge branch 'zcrx-updates-6.19' into for-6.19/io_uring

Merge zcrx updates from Pavel:

"Zcrx updates for 6.19. It includes a bunch of small patches,
 IORING_REGISTER_ZCRX_CTRL with refill queue (RQ) flushing (patches 4-5),
 and David's work on sharing zcrx between multiple io_uring instances."

Link: https://lore.kernel.org/io-uring/cover.1763029704.git.asml.silence@gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* zcrx-updates-6.19:
  io_uring/zcrx: share an ifq between rings
  io_uring/zcrx: add io_fill_zcrx_offsets()
  io_uring/zcrx: export zcrx via a file
  io_uring/zcrx: move io_zcrx_scrub() and dependencies up
  io_uring/zcrx: count zcrx users
  io_uring/zcrx: add sync refill queue flushing
  io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
  io_uring/zcrx: elide passing msg flags
  io_uring/zcrx: use folio_nr_pages() instead of shift operation
  io_uring/zcrx: convert to use netmem_desc
Jens Axboe, 2025-11-13 11:20:19 -07:00
5 changed files with 317 additions and 61 deletions

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h

@@ -697,6 +697,9 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY = 35,
 
+	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+	IORING_REGISTER_ZCRX_CTRL = 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
@@ -1060,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
 	__u64	__resv2[2];
 };
 
+enum zcrx_reg_flags {
+	ZCRX_REG_IMPORT			= 1,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
@@ -1078,6 +1085,33 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+enum zcrx_ctrl_op {
+	ZCRX_CTRL_FLUSH_RQ,
+	ZCRX_CTRL_EXPORT,
+
+	__ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl_flush_rq {
+	__u64	__resv[6];
+};
+
+struct zcrx_ctrl_export {
+	__u32	zcrx_fd;
+	__u32	__resv1[11];
+};
+
+struct zcrx_ctrl {
+	__u32	zcrx_id;
+	__u32	op;		/* see enum zcrx_ctrl_op */
+	__u64	__resv[2];
+
+	union {
+		struct zcrx_ctrl_export		zc_export;
+		struct zcrx_ctrl_flush_rq	zc_flush;
+	};
+};
+
 #ifdef __cplusplus
 }
 #endif

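For illustration, the new opcode is driven through a plain io_uring_register(2) call. The sketch below is not part of the patch set: it assumes uapi headers new enough to carry the definitions above, uses the raw syscall since liburing may not have a wrapper yet, and takes a ring_fd/zcrx_id pair from an earlier IORING_REGISTER_ZCRX_IFQ registration.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* nr_args must be 0; io_zcrx_ctrl() rejects anything else */
static int zcrx_ctrl(int ring_fd, struct zcrx_ctrl *ctrl)
{
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_CTRL, ctrl, 0);
}

/* Synchronously drain the refill queue (ZCRX_CTRL_FLUSH_RQ). */
static int zcrx_flush_rq(int ring_fd, unsigned zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));	/* reserved fields must be zero */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_FLUSH_RQ;
	return zcrx_ctrl(ring_fd, &ctrl);
}

/* Export the ifq as a file descriptor (ZCRX_CTRL_EXPORT). */
static int zcrx_export_fd(int ring_fd, unsigned zcrx_id)
{
	struct zcrx_ctrl ctrl;
	int ret;

	memset(&ctrl, 0, sizeof(ctrl));	/* zc_export must be all zero */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_EXPORT;
	ret = zcrx_ctrl(ring_fd, &ctrl);
	if (ret)
		return ret;
	/* the kernel copied the new fd back into the argument */
	return ctrl.zc_export.zcrx_fd;
}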
diff --git a/io_uring/net.c b/io_uring/net.c

@@ -110,7 +110,6 @@ enum sr_retry_flags {
 
 struct io_recvzc {
 	struct file		*file;
-	unsigned		msg_flags;
 	u16			flags;
 	u32			len;
 	struct io_zcrx_ifq	*ifq;
@@ -1253,8 +1252,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	zc->len = READ_ONCE(sqe->len);
 	zc->flags = READ_ONCE(sqe->ioprio);
-	zc->msg_flags = READ_ONCE(sqe->msg_flags);
-	if (zc->msg_flags)
+	if (READ_ONCE(sqe->msg_flags))
 		return -EINVAL;
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
 		return -EINVAL;
@@ -1283,8 +1281,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
 		return -ENOTSOCK;
 
 	len = zc->len;
-	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
-			   issue_flags, &zc->len);
+	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
 
 	if (len && zc->len == 0) {
 		io_req_set_res(req, 0, 0);

diff --git a/io_uring/register.c b/io_uring/register.c

@@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c

@@ -8,6 +8,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
 
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
@@ -170,7 +171,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
 		if (folio == last_folio)
 			continue;
 		last_folio = folio;
-		res += 1UL << folio_order(folio);
+		res += folio_nr_pages(folio);
 	}
 	return res;
 }
@@ -344,6 +345,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
 	atomic_inc(io_get_user_counter(niov));
 }
 
+static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
+{
+	offsets->head = offsetof(struct io_uring, head);
+	offsets->tail = offsetof(struct io_uring, tail);
+	offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+}
+
 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 				 struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
@@ -355,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 	void *ptr;
 	int ret;
 
-	off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+	io_fill_zcrx_offsets(&reg->offsets);
+	off = reg->offsets.rqes;
 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
 	if (size > rd->size)
 		return -EINVAL;
@@ -371,9 +380,6 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 	ifq->rq_ring = (struct io_uring *)ptr;
 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
-	reg->offsets.head = offsetof(struct io_uring, head);
-	reg->offsets.tail = offsetof(struct io_uring, tail);
-	reg->offsets.rqes = off;
 	return 0;
 }
@@ -482,6 +488,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	spin_lock_init(&ifq->rq_lock);
 	mutex_init(&ifq->pp_lock);
 	refcount_set(&ifq->refs, 1);
+	refcount_set(&ifq->user_refs, 1);
 	return ifq;
 }
@@ -543,6 +550,57 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
 		io_zcrx_ifq_free(ifq);
 }
 
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	if (!niov->desc.pp) {
+		/* copy fallback allocated niovs */
+		io_zcrx_return_niov_freelist(niov);
+		return;
+	}
+	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->user_refs)) {
+		io_close_queue(ifq);
+		io_zcrx_scrub(ifq);
+	}
+	io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -553,6 +611,112 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 	return ifq ? &ifq->region : NULL;
 }
 
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+	struct io_zcrx_ifq *ifq = file->private_data;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EFAULT;
+
+	zcrx_unregister(ifq);
+	return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+	.owner = THIS_MODULE,
+	.release = zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+		       struct zcrx_ctrl *ctrl, void __user *arg)
+{
+	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+	struct file *file;
+	int fd = -1;
+
+	if (!mem_is_zero(ce, sizeof(*ce)))
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	ce->zcrx_fd = fd;
+	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+					 ifq, O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		zcrx_unregister(ifq);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	return 0;
+}
+
+static int import_zcrx(struct io_ring_ctx *ctx,
+		       struct io_uring_zcrx_ifq_reg __user *arg,
+		       struct io_uring_zcrx_ifq_reg *reg)
+{
+	struct io_zcrx_ifq *ifq;
+	struct file *file;
+	int fd, ret;
+	u32 id;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+		return -EINVAL;
+	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+		return -EINVAL;
+
+	fd = reg->if_idx;
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	file = fd_file(f);
+	if (file->f_op != &zcrx_box_fops || !file->private_data)
+		return -EBADF;
+
+	ifq = file->private_data;
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	reg->zcrx_id = id;
+	io_fill_zcrx_offsets(&reg->offsets);
+	if (copy_to_user(arg, reg, sizeof(*reg))) {
+		ret = -EFAULT;
+		goto err_xa_erase;
+	}
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err_xa_erase;
+	}
+	return 0;
+
+err_xa_erase:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+err:
+	zcrx_unregister(ifq);
+	return ret;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -578,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-		return -EFAULT;
 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
 	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
+	if (reg.flags & ZCRX_REG_IMPORT)
+		return import_zcrx(ctx, arg, &reg);
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
@@ -683,48 +849,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
 	return &area->nia.niovs[niov_idx];
 }
 
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
-	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-	spin_lock_bh(&area->freelist_lock);
-	area->freelist[area->free_count++] = net_iov_idx(niov);
-	spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
-	netmem_ref netmem = net_iov_to_netmem(niov);
-
-	if (!niov->pp) {
-		/* copy fallback allocated niovs */
-		io_zcrx_return_niov_freelist(niov);
-		return;
-	}
-	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
-	struct io_zcrx_area *area = ifq->area;
-	int i;
-
-	if (!area)
-		return;
-
-	/* Reclaim back all buffers given to the user space. */
-	for (i = 0; i < area->nia.num_niovs; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		int nr;
-
-		if (!atomic_read(io_get_user_counter(niov)))
-			continue;
-		nr = atomic_xchg(io_get_user_counter(niov), 0);
-		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
-			io_zcrx_return_niov(niov);
-	}
-}
-
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;
@@ -741,10 +865,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		}
 		if (!ifq)
 			break;
-		io_close_queue(ifq);
-		io_zcrx_scrub(ifq);
-		io_put_zcrx_ifq(ifq);
+		zcrx_unregister(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
@@ -815,7 +936,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
 		if (!page_pool_unref_and_test(netmem))
 			continue;
 
-		if (unlikely(niov->pp != pp)) {
+		if (unlikely(niov->desc.pp != pp)) {
 			io_zcrx_return_niov(niov);
 			continue;
 		}
@@ -941,6 +1062,97 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.uninstall		= io_pp_uninstall,
 };
 
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+			      struct io_zcrx_ifq *zcrx)
+{
+	unsigned int mask = zcrx->rq_entries - 1;
+	unsigned int i;
+
+	guard(spinlock_bh)(&zcrx->rq_lock);
+
+	nr = min(nr, io_zcrx_rqring_entries(zcrx));
+	for (i = 0; i < nr; i++) {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+		struct net_iov *niov;
+
+		if (!io_parse_rqe(rqe, zcrx, &niov))
+			break;
+		netmem_array[i] = net_iov_to_netmem(niov);
+	}
+
+	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+	return i;
+}
+
+#define ZCRX_FLUSH_BATCH	32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		netmem_ref netmem = netmems[i];
+		struct net_iov *niov = netmem_to_net_iov(netmem);
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		if (!page_pool_unref_and_test(netmem))
+			continue;
+		io_zcrx_return_niov(niov);
+	}
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+			 struct zcrx_ctrl *ctrl)
+{
+	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+	netmem_ref netmems[ZCRX_FLUSH_BATCH];
+	unsigned total = 0;
+	unsigned nr;
+
+	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+		return -EINVAL;
+
+	do {
+		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+		zcrx_return_buffers(netmems, nr);
+		total += nr;
+
+		if (fatal_signal_pending(current))
+			break;
+		cond_resched();
+	} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+	return 0;
+}
+
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+	struct zcrx_ctrl ctrl;
+	struct io_zcrx_ifq *zcrx;
+
+	if (nr_args)
+		return -EINVAL;
+	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+		return -EFAULT;
+	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+		return -EFAULT;
+
+	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+	if (!zcrx)
+		return -ENXIO;
+
+	switch (ctrl.op) {
+	case ZCRX_CTRL_FLUSH_RQ:
+		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	case ZCRX_CTRL_EXPORT:
+		return zcrx_export(ctx, zcrx, &ctrl, arg);
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 			      struct io_zcrx_ifq *ifq, int off, int len)
 {
@@ -1082,13 +1294,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			     const skb_frag_t *frag, int off, int len)
 {
 	struct net_iov *niov;
+	struct page_pool *pp;
 
 	if (unlikely(!skb_frag_is_net_iov(frag)))
 		return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
 	niov = netmem_to_net_iov(frag->netmem);
-	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    io_pp_to_ifq(niov->pp) != ifq)
+	pp = niov->desc.pp;
+	if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))

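Putting export and import together: one ring exports its ifq as an anonymous fd, and a second ring passes that fd back in through IORING_REGISTER_ZCRX_IFQ with ZCRX_REG_IMPORT set, reusing the if_idx field to carry the fd. Per import_zcrx() above, the importing ring must be set up with IORING_SETUP_DEFER_TASKRUN and one of IORING_SETUP_CQE32/IORING_SETUP_CQE_MIXED, and every other registration field must be zero. A hedged userspace sketch under those assumptions, reusing zcrx_export_fd() from the earlier example:

/* Import an exported zcrx instance into a second ring. */
static int zcrx_import(int ring_b_fd, int zcrx_box_fd, unsigned *zcrx_id)
{
	struct io_uring_zcrx_ifq_reg reg;
	int ret;

	memset(&reg, 0, sizeof(reg));	/* if_rxq, rq_entries etc. must be 0 */
	reg.flags = ZCRX_REG_IMPORT;
	reg.if_idx = zcrx_box_fd;	/* if_idx carries the exported fd */
	ret = syscall(__NR_io_uring_register, ring_b_fd,
		      IORING_REGISTER_ZCRX_IFQ, &reg, 1);
	if (ret)
		return ret;
	*zcrx_id = reg.zcrx_id;		/* id within the importing ring */
	return 0;
}

/* e.g.:	fd = zcrx_export_fd(ring_a_fd, id_a);
 *		zcrx_import(ring_b_fd, fd, &id_b);
 *		close(fd);	// both rings now hold user_refs on the ifq
 */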
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h

@@ -55,6 +55,8 @@ struct io_zcrx_ifq {
 	struct net_device		*netdev;
 	netdevice_tracker		netdev_tracker;
 	refcount_t			refs;
+	/* counts userspace facing users like io_uring */
+	refcount_t			user_refs;
 
 	/*
 	 * Page pool and net configuration lock, can be taken deeper in the
@@ -65,6 +67,7 @@
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -93,6 +96,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 {
 	return NULL;
 }
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+			       void __user *arg, unsigned nr_arg)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
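
Finally, the offsets that io_fill_zcrx_offsets() now reports identically on both the register and import paths are consumed by userspace roughly as below — a minimal sketch assuming rq_ptr points at the start of the mapped refill-queue region described by region_ptr:

struct zcrx_rq {
	__u32 *khead;		/* head, advanced by the kernel as it consumes */
	__u32 *ktail;		/* tail, advanced by userspace as it refills */
	struct io_uring_zcrx_rqe *rqes;
	unsigned ring_entries;
};

static void zcrx_map_rq(void *rq_ptr, const struct io_uring_zcrx_ifq_reg *reg,
			struct zcrx_rq *rq)
{
	rq->khead = (__u32 *)((char *)rq_ptr + reg->offsets.head);
	rq->ktail = (__u32 *)((char *)rq_ptr + reg->offsets.tail);
	/* rqes start cacheline-aligned past struct io_uring */
	rq->rqes = (struct io_uring_zcrx_rqe *)((char *)rq_ptr +
						reg->offsets.rqes);
	rq->ring_entries = reg->rq_entries;
}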