diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 0413dcd9ef69d..28300fee22bfc 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -382,17 +382,17 @@ Zero copy --------- ublk zero copy relies on io_uring's fixed kernel buffer, which provides -two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`. +two APIs: `io_buffer_register_request()` and `io_buffer_unregister()`. ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call -`io_buffer_register_bvec()` for ublk server to register client request +`io_buffer_register_request()` for ublk server to register client request buffer into io_uring buffer table, then ublk server can submit io_uring IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF` -calls `io_buffer_unregister_bvec()` to unregister the buffer, which is -guaranteed to be live between calling `io_buffer_register_bvec()` and -`io_buffer_unregister_bvec()`. Any io_uring operation which supports this -kind of kernel buffer will grab one reference of the buffer until the -operation is completed. +calls `io_buffer_unregister()` to unregister the buffer, which is guaranteed +to be live between calling `io_buffer_register_request()` and +`io_buffer_unregister()`. Any io_uring operation which supports this kind of +kernel buffer will grab one reference of the buffer until the operation is +completed. 
ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and be trusted, because it is ublk server's responsibility to make sure IO buffer diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ef8a0705e68b5..72b63928d59ec 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1690,8 +1690,8 @@ ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req, { int ret; - ret = io_buffer_register_bvec(cmd, req, ublk_io_release, - io->buf.auto_reg.index, issue_flags); + ret = io_buffer_register_request(cmd, req, ublk_io_release, + io->buf.auto_reg.index, issue_flags); if (ret) { if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { ublk_auto_buf_reg_fallback(ubq, req->tag); @@ -1897,7 +1897,7 @@ static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq, ublk_io_unlock(io); if (index != -1) - io_buffer_unregister_bvec(data->cmd, index, + io_buffer_unregister(data->cmd, index, data->issue_flags); } @@ -3171,8 +3171,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, if (!req) return -EINVAL; - ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, - issue_flags); + ret = io_buffer_register_request(cmd, req, ublk_io_release, index, + issue_flags); if (ret) { ublk_put_req_ref(io, req); return ret; @@ -3203,8 +3203,8 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req)) return -EINVAL; - ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, - issue_flags); + ret = io_buffer_register_request(cmd, req, ublk_io_release, index, + issue_flags); if (ret) return ret; @@ -3219,7 +3219,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; - return io_buffer_unregister_bvec(cmd, index, issue_flags); + return io_buffer_unregister(cmd, index, issue_flags); } static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) @@ 
-3360,7 +3360,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, goto out; /* - * io_buffer_unregister_bvec() doesn't access the ubq or io, + * io_buffer_unregister() doesn't access the ubq or io, * so no need to validate the q_id, tag, or task */ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) @@ -3427,7 +3427,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, req = ublk_fill_io_cmd(io, cmd); ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); if (buf_idx != UBLK_INVALID_BUF_IDX) - io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + io_buffer_unregister(cmd, buf_idx, issue_flags); compl = ublk_need_complete_req(ub, io); if (req_op(req) == REQ_OP_ZONE_APPEND) @@ -3762,7 +3762,7 @@ static int ublk_batch_commit_io(struct ublk_queue *ubq, } if (buf_idx != UBLK_INVALID_BUF_IDX) - io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + io_buffer_unregister(data->cmd, buf_idx, data->issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = ublk_batch_zone_lba(uc, elem); if (compl) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 331dcbefe72f1..42801f0b6456e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -91,6 +91,15 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, struct io_br_sel *sel, unsigned int issue_flags); +int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs, + unsigned int nr_bvecs, void (*release)(void *), + void *priv, u8 dir, unsigned int index, + unsigned int issue_flags); +int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -133,6 
+142,29 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, { return true; } +static inline int io_buffer_register_request(struct io_uring_cmd *cmd, + struct request *rq, + void (*release)(void *), + unsigned int index, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} +static inline int io_buffer_register_bvec(struct io_uring_cmd *cmd, + const struct bio_vec *bvs, + unsigned int nr_bvecs, + void (*release)(void *), void *priv, + u8 dir, unsigned int index, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} +static inline int io_buffer_unregister(struct io_uring_cmd *cmd, + unsigned int index, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) @@ -182,10 +214,4 @@ static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true); } -int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, - void (*release)(void *), unsigned int index, - unsigned int issue_flags); -int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags); - #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 244392026c6d5..7aee83e5ea0ee 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -44,6 +44,11 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +enum { + IO_BUF_DEST = 1 << ITER_DEST, + IO_BUF_SOURCE = 1 << ITER_SOURCE, +}; + struct iou_loop_params; struct io_wq_work_node { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index dd6326dc5f88f..6068448a5aaa3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3233,7 +3233,7 @@ static int __init io_uring_init(void) io_uring_optable_init(); /* imu->dir is u8 */ - BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + BUILD_BUG_ON((IO_BUF_DEST | 
IO_BUF_SOURCE) > U8_MAX); /* * Allow user copy in the per-command field, which starts after the diff --git a/io_uring/poll.c b/io_uring/poll.c index 74eef78841596..6834e2db937ec 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { - if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) + if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fd36e0e319a25..c4a7a77d1ee97 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -820,7 +820,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->release = io_release_ubuf; imu->priv = imu; imu->flags = 0; - imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; + imu->dir = IO_BUF_DEST | IO_BUF_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); @@ -924,72 +924,125 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, - void (*release)(void *), unsigned int index, - unsigned int issue_flags) +static struct io_mapped_ubuf *io_kernel_buffer_init(struct io_ring_ctx *ctx, + unsigned int nr_bvecs, + unsigned int total_bytes, + u8 dir, + void (*release)(void *), + void *priv, + unsigned int index) { - struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; - struct req_iterator rq_iter; struct io_mapped_ubuf *imu; struct io_rsrc_node *node; - struct bio_vec bv; - unsigned int nr_bvecs = 0; - int ret = 0; - io_ring_submit_lock(ctx, issue_flags); - if (index >= data->nr) { - ret = -EINVAL; - goto unlock; - } + if (index >= data->nr) + return ERR_PTR(-EINVAL); index = array_index_nospec(index, 
data->nr); - if (data->nodes[index]) { - ret = -EBUSY; - goto unlock; - } + if (data->nodes[index]) + return ERR_PTR(-EBUSY); node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); - if (!node) { - ret = -ENOMEM; - goto unlock; - } + if (!node) + return ERR_PTR(-ENOMEM); - /* - * blk_rq_nr_phys_segments() may overestimate the number of bvecs - * but avoids needing to iterate over the bvecs - */ - imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); + imu = io_alloc_imu(ctx, nr_bvecs); if (!imu) { io_cache_free(&ctx->node_cache, node); - ret = -ENOMEM; - goto unlock; + return ERR_PTR(-ENOMEM); } imu->ubuf = 0; - imu->len = blk_rq_bytes(rq); + imu->len = total_bytes; imu->acct_pages = 0; imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; refcount_set(&imu->refs, 1); imu->release = release; - imu->priv = rq; + imu->priv = priv; + imu->dir = dir; imu->flags = IO_REGBUF_F_KBUF; - imu->dir = 1 << rq_data_dir(rq); + node->buf = imu; + data->nodes[index] = node; + + return imu; +} + +int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct bio_vec bv; + /* + * blk_rq_nr_phys_segments() may overestimate the number of bvecs + * but avoids needing to iterate over the bvecs + */ + unsigned int nr_bvecs = blk_rq_nr_phys_segments(rq); + unsigned int total_bytes = blk_rq_bytes(rq); + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, + 1 << rq_data_dir(rq), release, rq, index); + if (IS_ERR(imu)) { + ret = PTR_ERR(imu); + goto unlock; + } + + nr_bvecs = 0; rq_for_each_bvec(bv, rq, rq_iter) imu->bvec[nr_bvecs++] = bv; imu->nr_bvecs = nr_bvecs; - node->buf = imu; - data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} 
+EXPORT_SYMBOL_GPL(io_buffer_register_request); + +/* + * bvs is copied internally. caller may free it on return. + */ +int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs, + unsigned int nr_bvecs, void (*release)(void *), + void *priv, u8 dir, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_mapped_ubuf *imu; + struct bio_vec *bvec; + unsigned int i, total_bytes = 0; + int ret = 0; + + for (i = 0; i < nr_bvecs; i++) + total_bytes += bvs[i].bv_len; + + io_ring_submit_lock(ctx, issue_flags); + imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, dir, release, + priv, index); + if (IS_ERR(imu)) { + ret = PTR_ERR(imu); + goto unlock; + } + + bvec = imu->bvec; + for (i = 0; i < nr_bvecs; i++) + bvec[i] = bvs[i]; + unlock: io_ring_submit_unlock(ctx, issue_flags); return ret; } EXPORT_SYMBOL_GPL(io_buffer_register_bvec); -int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags) +int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) { struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; @@ -1019,7 +1072,7 @@ int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, io_ring_submit_unlock(ctx, issue_flags); return ret; } -EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); +EXPORT_SYMBOL_GPL(io_buffer_unregister); static int validate_fixed_range(u64 buf_addr, size_t len, const struct io_mapped_ubuf *imu) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index cff0f8834c353..8d48195faf9d3 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -23,11 +23,6 @@ struct io_rsrc_node { }; }; -enum { - IO_IMU_DEST = 1 << ITER_DEST, - IO_IMU_SOURCE = 1 << ITER_SOURCE, -}; - enum { IO_REGBUF_F_KBUF = 1, }; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 61533f30494f5..80366320276dd 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ 
-171,7 +171,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) } if (!current->io_uring) { err_free: - io_wq_put_and_exit(tctx->io_wq); + if (tctx->io_wq) { + io_wq_exit_start(tctx->io_wq); + io_wq_put_and_exit(tctx->io_wq); + } percpu_counter_destroy(&tctx->inflight); kfree(tctx); }