Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions Documentation/block/ublk.rst
Original file line number Diff line number Diff line change
Expand Up @@ -382,17 +382,17 @@ Zero copy
---------

ublk zero copy relies on io_uring's fixed kernel buffer, which provides
two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`.
two APIs: `io_buffer_register_request()` and `io_buffer_unregister()`.

ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call
`io_buffer_register_bvec()` for ublk server to register client request
`io_buffer_register_request()` for ublk server to register client request
buffer into io_uring buffer table, then ublk server can submit io_uring
IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF`
calls `io_buffer_unregister_bvec()` to unregister the buffer, which is
guaranteed to be live between calling `io_buffer_register_bvec()` and
`io_buffer_unregister_bvec()`. Any io_uring operation which supports this
kind of kernel buffer will grab one reference of the buffer until the
operation is completed.
calls `io_buffer_unregister()` to unregister the buffer, which is guaranteed
to be live between calling `io_buffer_register_request()` and
`io_buffer_unregister()`. Any io_uring operation which supports this kind of
kernel buffer will grab one reference of the buffer until the operation is
completed.

ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and
be trusted, because it is ublk server's responsibility to make sure IO buffer
Expand Down
22 changes: 11 additions & 11 deletions drivers/block/ublk_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1690,8 +1690,8 @@ ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
{
int ret;

ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
io->buf.auto_reg.index, issue_flags);
ret = io_buffer_register_request(cmd, req, ublk_io_release,
io->buf.auto_reg.index, issue_flags);
if (ret) {
if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
ublk_auto_buf_reg_fallback(ubq, req->tag);
Expand Down Expand Up @@ -1897,7 +1897,7 @@ static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
ublk_io_unlock(io);

if (index != -1)
io_buffer_unregister_bvec(data->cmd, index,
io_buffer_unregister(data->cmd, index,
data->issue_flags);
}

Expand Down Expand Up @@ -3171,8 +3171,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
if (!req)
return -EINVAL;

ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
ret = io_buffer_register_request(cmd, req, ublk_io_release, index,
issue_flags);
if (ret) {
ublk_put_req_ref(io, req);
return ret;
Expand Down Expand Up @@ -3203,8 +3203,8 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
return -EINVAL;

ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
ret = io_buffer_register_request(cmd, req, ublk_io_release, index,
issue_flags);
if (ret)
return ret;

Expand All @@ -3219,7 +3219,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
return -EINVAL;

return io_buffer_unregister_bvec(cmd, index, issue_flags);
return io_buffer_unregister(cmd, index, issue_flags);
}

static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
Expand Down Expand Up @@ -3360,7 +3360,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
goto out;

/*
* io_buffer_unregister_bvec() doesn't access the ubq or io,
* io_buffer_unregister() doesn't access the ubq or io,
* so no need to validate the q_id, tag, or task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
Expand Down Expand Up @@ -3427,7 +3427,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
req = ublk_fill_io_cmd(io, cmd);
ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
io_buffer_unregister(cmd, buf_idx, issue_flags);
compl = ublk_need_complete_req(ub, io);

if (req_op(req) == REQ_OP_ZONE_APPEND)
Expand Down Expand Up @@ -3762,7 +3762,7 @@ static int ublk_batch_commit_io(struct ublk_queue *ubq,
}

if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
io_buffer_unregister(data->cmd, buf_idx, data->issue_flags);
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ublk_batch_zone_lba(uc, elem);
if (compl)
Expand Down
38 changes: 32 additions & 6 deletions include/linux/io_uring/cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
struct io_br_sel *sel, unsigned int issue_flags);

/*
 * Kernel-buffer registration API for io_uring command providers.
 *
 * io_buffer_register_request() installs the data of block request @rq as a
 * kernel buffer in slot @index of the ring's buffer table. The direction is
 * taken from the request and @rq is kept as the buffer's private pointer
 * alongside @release.
 *
 * io_buffer_register_bvec() does the same for a caller-supplied bvec array:
 * @bvs is copied, the buffer length is the sum of the bv_len values, and
 * @dir, @release and @priv are stored on the buffer as given.
 *
 * Both return 0 on success, -EINVAL if @index is outside the table, -EBUSY
 * if the slot is already occupied, or -ENOMEM on allocation failure.
 *
 * io_buffer_unregister() removes the buffer at @index and returns 0 or a
 * negative errno.
 *
 * NOTE(review): @release is presumably invoked with the private pointer once
 * the last reference to the buffer is dropped - confirm against rsrc.c.
 */
int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags);
int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs,
unsigned int nr_bvecs, void (*release)(void *),
void *priv, u8 dir, unsigned int index,
unsigned int issue_flags);
int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
Expand Down Expand Up @@ -133,6 +142,29 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
{
return true;
}
/*
 * Fallback for the #else side of this header's config guard: request
 * buffers cannot be registered, so every call reports -EOPNOTSUPP.
 */
static inline int
io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
			   void (*release)(void *), unsigned int index,
			   unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
/*
 * Fallback for the #else side of this header's config guard: bvec arrays
 * cannot be registered, so every call reports -EOPNOTSUPP and none of the
 * arguments are examined.
 */
static inline int
io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs,
			unsigned int nr_bvecs, void (*release)(void *),
			void *priv, u8 dir, unsigned int index,
			unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
/*
 * Fallback for the #else side of this header's config guard: there is no
 * buffer table to remove anything from, so every call reports -EOPNOTSUPP.
 */
static inline int
io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
		     unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
#endif

static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
Expand Down Expand Up @@ -182,10 +214,4 @@ static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags);
int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags);

#endif /* _LINUX_IO_URING_CMD_H */
5 changes: 5 additions & 0 deletions include/linux/io_uring_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ enum io_uring_cmd_flags {
IO_URING_F_COMPAT = (1 << 12),
};

/*
 * Direction bits for a registered buffer's ->dir mask. Each bit is
 * 1 << the matching ITER_* direction value; a buffer usable in both
 * directions sets both bits. The mask is stored in a u8, and
 * io_uring_init() has a BUILD_BUG_ON() checking that the combined value fits.
 */
enum {
IO_BUF_DEST = 1 << ITER_DEST,
IO_BUF_SOURCE = 1 << ITER_SOURCE,
};

struct iou_loop_params;

struct io_wq_work_node {
Expand Down
2 changes: 1 addition & 1 deletion io_uring/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -3233,7 +3233,7 @@ static int __init io_uring_init(void)
io_uring_optable_init();

/* imu->dir is u8 */
BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);
BUILD_BUG_ON((IO_BUF_DEST | IO_BUF_SOURCE) > U8_MAX);

/*
* Allow user copy in the per-command field, which starts after the
Expand Down
2 changes: 1 addition & 1 deletion io_uring/poll.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
*/
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
return io_poll_get_ownership_slowpath(req);
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
Expand Down
127 changes: 90 additions & 37 deletions io_uring/rsrc.c
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
imu->release = io_release_ubuf;
imu->priv = imu;
imu->flags = 0;
imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
imu->dir = IO_BUF_DEST | IO_BUF_SOURCE;
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
Expand Down Expand Up @@ -924,72 +924,125 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags)
static struct io_mapped_ubuf *io_kernel_buffer_init(struct io_ring_ctx *ctx,
unsigned int nr_bvecs,
unsigned int total_bytes,
u8 dir,
void (*release)(void *),
void *priv,
unsigned int index)
{
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
struct io_rsrc_data *data = &ctx->buf_table;
struct req_iterator rq_iter;
struct io_mapped_ubuf *imu;
struct io_rsrc_node *node;
struct bio_vec bv;
unsigned int nr_bvecs = 0;
int ret = 0;

io_ring_submit_lock(ctx, issue_flags);
if (index >= data->nr) {
ret = -EINVAL;
goto unlock;
}
if (index >= data->nr)
return ERR_PTR(-EINVAL);
index = array_index_nospec(index, data->nr);

if (data->nodes[index]) {
ret = -EBUSY;
goto unlock;
}
if (data->nodes[index])
return ERR_PTR(-EBUSY);

node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!node) {
ret = -ENOMEM;
goto unlock;
}
if (!node)
return ERR_PTR(-ENOMEM);

/*
* blk_rq_nr_phys_segments() may overestimate the number of bvecs
* but avoids needing to iterate over the bvecs
*/
imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
imu = io_alloc_imu(ctx, nr_bvecs);
if (!imu) {
io_cache_free(&ctx->node_cache, node);
ret = -ENOMEM;
goto unlock;
return ERR_PTR(-ENOMEM);
}

imu->ubuf = 0;
imu->len = blk_rq_bytes(rq);
imu->len = total_bytes;
imu->acct_pages = 0;
imu->folio_shift = PAGE_SHIFT;
imu->nr_bvecs = nr_bvecs;
refcount_set(&imu->refs, 1);
imu->release = release;
imu->priv = rq;
imu->priv = priv;
imu->dir = dir;
imu->flags = IO_REGBUF_F_KBUF;
imu->dir = 1 << rq_data_dir(rq);

node->buf = imu;
data->nodes[index] = node;

return imu;
}

int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
struct req_iterator rq_iter;
struct io_mapped_ubuf *imu;
struct bio_vec bv;
/*
* blk_rq_nr_phys_segments() may overestimate the number of bvecs
* but avoids needing to iterate over the bvecs
*/
unsigned int nr_bvecs = blk_rq_nr_phys_segments(rq);
unsigned int total_bytes = blk_rq_bytes(rq);
int ret = 0;

io_ring_submit_lock(ctx, issue_flags);

imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes,
1 << rq_data_dir(rq), release, rq, index);
if (IS_ERR(imu)) {
ret = PTR_ERR(imu);
goto unlock;
}

nr_bvecs = 0;
rq_for_each_bvec(bv, rq, rq_iter)
imu->bvec[nr_bvecs++] = bv;
imu->nr_bvecs = nr_bvecs;

node->buf = imu;
data->nodes[index] = node;
unlock:
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_request);

/*
 * Register a caller-supplied bvec array as a kernel buffer in slot @index of
 * the ring's buffer table.
 *
 * @cmd:         uring_cmd whose ring owns the buffer table
 * @bvs:         array of @nr_bvecs segments; copied internally, so the caller
 *               may free it on return
 * @nr_bvecs:    number of entries in @bvs
 * @release:     callback stored on the buffer
 * @priv:        opaque pointer stored on the buffer alongside @release
 * @dir:         direction mask stored on the buffer (IO_BUF_DEST/IO_BUF_SOURCE)
 * @index:       buffer table slot to populate
 * @issue_flags: passed through to the ring submit lock helpers
 *
 * The buffer length is the sum of all bv_len values. That sum is carried in
 * an unsigned int, so it is checked for wraparound up front rather than
 * registering a buffer whose length does not match its segments.
 *
 * Returns 0 on success, -EOVERFLOW if the summed length does not fit in an
 * unsigned int, or the error reported by io_kernel_buffer_init().
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs,
			    unsigned int nr_bvecs, void (*release)(void *),
			    void *priv, u8 dir, unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_mapped_ubuf *imu;
	unsigned int i, total_bytes = 0;
	int ret = 0;

	/* Nothing is locked or allocated yet, so a plain return is fine here */
	for (i = 0; i < nr_bvecs; i++) {
		unsigned int sum = total_bytes + bvs[i].bv_len;

		/* unsigned wraparound: the true length exceeds what we can store */
		if (sum < total_bytes)
			return -EOVERFLOW;
		total_bytes = sum;
	}

	io_ring_submit_lock(ctx, issue_flags);
	imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, dir, release,
				    priv, index);
	if (IS_ERR(imu)) {
		ret = PTR_ERR(imu);
		goto unlock;
	}

	/* Copy the segments so the caller's array need not outlive this call */
	for (i = 0; i < nr_bvecs; i++)
		imu->bvec[i] = bvs[i];

unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags)
int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
struct io_rsrc_data *data = &ctx->buf_table;
Expand Down Expand Up @@ -1019,7 +1072,7 @@ int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
EXPORT_SYMBOL_GPL(io_buffer_unregister);

static int validate_fixed_range(u64 buf_addr, size_t len,
const struct io_mapped_ubuf *imu)
Expand Down
5 changes: 0 additions & 5 deletions io_uring/rsrc.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,6 @@ struct io_rsrc_node {
};
};

enum {
IO_IMU_DEST = 1 << ITER_DEST,
IO_IMU_SOURCE = 1 << ITER_SOURCE,
};

enum {
IO_REGBUF_F_KBUF = 1,
};
Expand Down
5 changes: 4 additions & 1 deletion io_uring/tctx.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
}
if (!current->io_uring) {
err_free:
io_wq_put_and_exit(tctx->io_wq);
if (tctx->io_wq) {
io_wq_exit_start(tctx->io_wq);
io_wq_put_and_exit(tctx->io_wq);
}
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
}
Expand Down
Loading