diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 900b3fc4c72d0..bec5e04085daa 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -239,6 +239,30 @@ Description: last zone of the device which may be smaller. +What: /sys/block/<disk>/queue/copy_max_bytes +Date: May 2026 +Contact: linux-block@vger.kernel.org +Description: + [RW] This is the maximum number of bytes that the block layer + will allow for a copy request. This is always smaller than or + equal to the maximum size allowed by the block driver. + Any value higher than 'copy_max_hw_bytes' will be reduced to + 'copy_max_hw_bytes'. Writing '0' to this attribute will disable + copy offloading for this block device. If copy offloading is + disabled, copy requests will be translated into read and write + requests. + + +What: /sys/block/<disk>/queue/copy_max_hw_bytes +Date: May 2026 +Contact: linux-block@vger.kernel.org +Description: + [RO] This is the maximum number of bytes that is allowed for + a single data copy request. Set by the block driver. The value + zero indicates that the block device does not support copy + offloading. + + What: /sys/block/<disk>/queue/crypto/ Date: February 2022 Contact: linux-block@vger.kernel.org diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst index 4dd78f24d10af..ea0616dbf7f3b 100644 --- a/Documentation/block/null_blk.rst +++ b/Documentation/block/null_blk.rst @@ -149,3 +149,7 @@ zone_size=[MB]: Default: 256 zone_nr_conv=[nr_conv]: Default: 0 The number of conventional zones to create when block device is zoned. If zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1. + +max_copy_bytes=[size in bytes]: Default: UINT_MAX + A module and configfs parameter which can be used to set the maximum + copy offload limit supported by the hardware/driver. diff --git a/block/Makefile b/block/Makefile index 7dce2e44276c4..d99e8d4fda7d6 100644 --- a/block/Makefile +++ b/block/Makefile @@ -6,7 +6,7 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \ - blk-mq-tag.o blk-mq-dma.o blk-stat.o \ + blk-mq-tag.o blk-mq-dma.o blk-stat.o blk-copy.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ disk-events.o blk-ia-ranges.o early-lookup.o diff --git a/block/bio.c b/block/bio.c index b8972dba68a09..51480c9be27b6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -852,6 +852,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; + bio->bi_copy_ctx = bio_src->bi_copy_ctx; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && diff --git a/block/blk-copy.c b/block/blk-copy.c new file mode 100644 index 0000000000000..f49a5f835b4ac --- /dev/null +++ b/block/blk-copy.c @@ -0,0 +1,631 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Offloaded and onloaded data copying support.
+ */ +#include +#include +#include +#include + +static struct bio *__blk_next_copy_bio(struct request *rq, struct bio *prev_bio, + enum req_op op) +{ + struct bio *bio; + + if (prev_bio) { + bio = prev_bio->bi_next; + } else { + struct bio_copy_offload_ctx *copy_ctx = rq->bio->bi_copy_ctx; + + bio = copy_ctx->bios; + } + + for (; bio && bio_op(bio) != op; bio = bio->bi_next) + ; + return bio; +} + +struct bio *blk_first_copy_bio(struct request *rq, enum req_op op) +{ + struct bio *bio = rq->bio; + + if (bio_op(bio) == op) + return bio; + + return __blk_next_copy_bio(rq, NULL, op); +} +EXPORT_SYMBOL_GPL(blk_first_copy_bio); + +struct bio *blk_next_copy_bio(struct bio *bio) +{ + return __blk_next_copy_bio(NULL, bio, bio_op(bio)); +} +EXPORT_SYMBOL_GPL(blk_next_copy_bio); + +unsigned int blk_copy_bio_count(struct request *rq, enum req_op op) +{ + unsigned int count = 0; + + for (struct bio *bio = blk_first_copy_bio(rq, op); bio; + bio = blk_next_copy_bio(bio)) + count++; + + return count; +} +EXPORT_SYMBOL_GPL(blk_copy_bio_count); + +/** + * struct blkdev_copy_onload_ctx - state of a single onloaded copy operation. + * @params: Data copy parameters. + * @read_work: For scheduling read work. + * @write_work: For scheduling write work. + * @buf: Data buffer. + * @buf_len: Length in bytes of @buf. + * @offset: Current copying offset. Range: [0, @params->len). + * @chunk: Size in bytes of the chunk of data that is being copied. + */ +struct blkdev_copy_onload_ctx { + struct blk_copy_params *params; + struct work_struct read_work; + struct work_struct write_work; + void *buf; + size_t buf_len; + loff_t offset; + loff_t chunk; +}; + +/* End all bios in the @ctx->bios list with status @ctx->status. */ +static void blkdev_end_bios(struct bio_copy_offload_ctx *ctx) +{ + struct bio *bio, *next; + + bio = ctx->bios; + ctx->bios = NULL; + for (; bio; bio = next) { + next = bio->bi_next; + bio->bi_status = ctx->status; + bio_endio(bio); + } +} + +/* + * Called after LBA translation has finished for all bios associated with copy + * context @ctx. + */ +static void blkdev_translation_complete(struct bio_copy_offload_ctx *ctx) +{ + struct module *owner = NULL; + struct bio *bio; + + WARN_ON_ONCE(ctx->phase != BLKDEV_TRANSLATE_LBAS); + ctx->phase = BLKDEV_COPY; + + /* Check whether all bios are associated with the same block driver. */ + for (bio = ctx->bios; bio; bio = bio->bi_next) { + if (!owner) { + owner = bio->bi_bdev->bd_disk->fops->owner; + } else if (owner != bio->bi_bdev->bd_disk->fops->owner) { + ctx->status = BLK_STS_INVAL; + break; + } + } + + /* Remove the first bio from the bio list and submit it. */ + bio = ctx->bios; + ctx->bios = bio->bi_next; + bio->bi_next = NULL; + if (ctx->biotail == bio) + ctx->biotail = NULL; + if (ctx->status == BLK_STS_OK) + submit_bio(bio); + else + bio_endio(bio); +} + +/* REQ_OP_COPY_* completion handler.
*/ +static void blkdev_req_op_copy_done(struct bio *bio) +{ + struct bio_copy_offload_ctx *ctx = bio->bi_copy_ctx; + struct blk_copy_params *params = ctx->params; + blk_status_t status; + + switch (ctx->phase) { + case BLKDEV_TRANSLATE_LBAS: + scoped_guard(spinlock_irqsave, &ctx->lock) + if (!ctx->status) + ctx->status = bio->bi_status; + break; + case BLKDEV_COPY: + status = bio->bi_status ?: ctx->status; + ctx->phase = BLKDEV_COPY_DONE; + blkdev_end_bios(ctx); + kfree(ctx); + scoped_guard(spinlock_irqsave, &params->lock) { + if (!params->status) + params->status = status; + } + if (atomic_dec_and_test(&params->copy_ctx_count)) + params->end_io(params); + break; + case BLKDEV_COPY_DONE: + break; + } +} + +/* + * Check that all LBA offsets are aligned with both the source and the destination + * logical block sizes. Compare input and output length. Store the number of bytes + * to be transferred in *@len. + */ +static int blkdev_copy_check_params(const struct blk_copy_params *params, + loff_t *len) +{ + const unsigned int mask = + max(bdev_logical_block_size(params->in_bdev), + bdev_logical_block_size(params->out_bdev)) - 1; + loff_t in_len = 0, out_len = 0; + unsigned int i; + + for (i = 0; i < params->in_nseg; i++) { + if ((params->in_segs[i].pos | params->in_segs[i].len) & mask) + return -EINVAL; + in_len += params->in_segs[i].len; + } + + for (i = 0; i < params->out_nseg; i++) { + if ((params->out_segs[i].pos | params->out_segs[i].len) & mask) + return -EINVAL; + out_len += params->out_segs[i].len; + } + + if (in_len != out_len) + return -EINVAL; + + *len = in_len; + + return 0; +} + +/* + * Calculate the number of bytes in the max_copy_src_segments input segments + * starting from input segment @in_idx. + */ +static loff_t blk_max_src_len(const struct blk_copy_params *params, + unsigned int in_idx) +{ + uint16_t max_src_segments = + params->in_bdev->bd_queue->limits.max_copy_src_segments; + unsigned int max_i = min(params->in_nseg, in_idx + max_src_segments); + loff_t len = 0; + + for (uint32_t i = in_idx; i < max_i; i++) + len += params->in_segs[i].len; + + return len; +} + +/* + * Calculate the number of bytes in the max_copy_dst_segments output segments + * starting from output segment @out_idx. + */ +static loff_t blk_max_dst_len(const struct blk_copy_params *params, + unsigned int out_idx) +{ + uint16_t max_dst_segments = + params->out_bdev->bd_queue->limits.max_copy_dst_segments; + unsigned int max_i = min(params->out_nseg, out_idx + max_dst_segments); + loff_t len = 0; + + for (uint32_t i = out_idx; i < max_i; i++) + len += params->out_segs[i].len; + + return len; +} + +struct blkdev_copy_sync_ctx { + struct completion compl; + blk_status_t status; +}; + +static void blkdev_end_copy_sync(const struct blk_copy_params *params) +{ + struct blkdev_copy_sync_ctx *ctx = params->private; + + ctx->status = params->status; + complete(&ctx->compl); +} + +static int blkdev_copy_sync(struct blk_copy_params *params) +{ + struct blkdev_copy_sync_ctx ctx = { + .compl = COMPLETION_INITIALIZER_ONSTACK(ctx.compl), + }; + int ret; + + WARN_ON_ONCE(params->end_io || params->private); + params->end_io = blkdev_end_copy_sync; + params->private = &ctx; + + ret = blkdev_copy_offload(params); + if (ret && ret != -EIOCBQUEUED) + return ret; + + wait_for_completion(&ctx.compl); + return blk_status_to_errno(ctx.status); +} + +/** + * blkdev_copy_chunk() - submit a single copy offload operation + * @params: Copy offload input parameters. + * @in_idx: Index of the input segment at which copying starts.
+ * @out_idx: Index of the output segment at which copying starts. + * @in_offset: Offset in bytes from the start of input segment @in_idx. + * @out_offset: Offset in bytes from the start of output segment @out_idx. + * @chunk: Maximum number of bytes to copy. + * + * Return: the number of bytes covered by the submitted copy operation or a + * negative error number. + */ +static loff_t blkdev_copy_chunk(struct blk_copy_params *params, u32 *in_idx, + u32 *out_idx, loff_t *in_offset, + loff_t *out_offset, loff_t chunk) +{ + struct bio_copy_offload_ctx *ctx; + u32 bio_count; + + ctx = kzalloc_obj(*ctx); + if (!ctx) + return -ENOMEM; + + spin_lock_init(&ctx->lock); + ctx->params = params; + ctx->phase = BLKDEV_TRANSLATE_LBAS; + ctx->translation_complete = blkdev_translation_complete; + /* + * Initialized to one to prevent that ctx->translation_complete() is + * called before bio submission has finished. + */ + ctx->bio_count = 1; + /* Dropped once per context by blkdev_req_op_copy_done(). */ + atomic_inc(&params->copy_ctx_count); + + WARN_ON_ONCE(chunk <= 0); + chunk = min(chunk, blk_max_src_len(params, *in_idx) - *in_offset); + WARN_ON_ONCE(chunk <= 0); + chunk = min(chunk, blk_max_dst_len(params, *out_idx) - *out_offset); + WARN_ON_ONCE(chunk <= 0); + ctx->len = chunk; + for (loff_t bytes, remaining_in = chunk; remaining_in > 0; + remaining_in -= bytes) { + struct bio *src_bio; + + src_bio = bio_alloc(params->in_bdev, 0, REQ_OP_COPY_SRC, + GFP_NOIO); + if (!src_bio) { + if (remaining_in == chunk) + goto free_ctx; + else + goto enomem; + } + scoped_guard(spinlock_irqsave, &ctx->lock) + ctx->bio_count++; + bytes = min(remaining_in, params->in_segs[*in_idx].len - + *in_offset); + src_bio->bi_iter.bi_size = bytes; + src_bio->bi_iter.bi_sector = (params->in_segs[*in_idx].pos + + *in_offset) >> SECTOR_SHIFT; + src_bio->bi_copy_ctx = ctx; + src_bio->bi_end_io = blkdev_req_op_copy_done; + *in_offset += bytes; + if (*in_offset >= params->in_segs[*in_idx].len) { + *in_offset -= params->in_segs[*in_idx].len; + (*in_idx)++; + } + submit_bio(src_bio); + } + for (loff_t bytes, remaining_out = chunk; remaining_out; + remaining_out -= bytes) { + struct bio *dst_bio; + + dst_bio = bio_alloc(params->out_bdev, 0, REQ_OP_COPY_DST, + GFP_NOIO); + if (!dst_bio) + goto enomem; + scoped_guard(spinlock_irqsave, &ctx->lock) + ctx->bio_count++; + bytes = min(remaining_out, params->out_segs[*out_idx].len - + *out_offset); + dst_bio->bi_iter.bi_size = bytes; + dst_bio->bi_iter.bi_sector = (params->out_segs[*out_idx].pos + + *out_offset) >> SECTOR_SHIFT; + dst_bio->bi_copy_ctx = ctx; + dst_bio->bi_end_io = blkdev_req_op_copy_done; + *out_offset += bytes; + if (*out_offset >= params->out_segs[*out_idx].len) { + *out_offset -= params->out_segs[*out_idx].len; + (*out_idx)++; + } + submit_bio(dst_bio); + } + +dec_bio_count: + scoped_guard(spinlock_irqsave, &ctx->lock) + bio_count = --ctx->bio_count; + if (bio_count == 0) + ctx->translation_complete(ctx); + return chunk; + +enomem: + scoped_guard(spinlock_irqsave, &ctx->lock) + if (!ctx->status) + ctx->status = BLK_STS_RESOURCE; + chunk = -ENOMEM; + goto dec_bio_count; + +free_ctx: + atomic_dec(&params->copy_ctx_count); + kfree(ctx); + return -ENOMEM; +} + +/** + * blkdev_copy_offload() - copy data and offload copying if possible. + * @params: Source and destination block device, data ranges and completion + * callback. + * + * If @params->end_io != NULL, data is copied asynchronously. If @params->end_io + * == NULL, this function only returns after data copying has finished.
+ * + * Return: 0 upon success; -EIOCBQUEUED if the completion callback function will + * be called or has already been called; -EOPNOTSUPP if copy offloading is + * not supported by the block device or if the source or destination + * address ranges span more than one dm device. + */ +int blkdev_copy_offload(struct blk_copy_params *params) +{ + loff_t in_offset = 0, out_offset = 0; + u32 in_idx = 0, out_idx = 0; + loff_t len, chunk, max_chunk; + int ret; + + might_sleep(); + + if (!params->end_io) + return blkdev_copy_sync(params); + + spin_lock_init(&params->lock); + + if (!bdev_max_copy_sectors(params->in_bdev) || + !bdev_max_copy_sectors(params->out_bdev)) + return -EOPNOTSUPP; + + ret = blkdev_copy_check_params(params, &len); + if (ret) + return ret; + + params->len = len; + + max_chunk = (u64)min(bdev_max_copy_sectors(params->in_bdev), + bdev_max_copy_sectors(params->out_bdev)) + << SECTOR_SHIFT; + + atomic_set(&params->copy_ctx_count, 1); + + for (loff_t offset = 0; offset < len; offset += chunk) { + chunk = min(len - offset, max_chunk); + chunk = blkdev_copy_chunk(params, &in_idx, &out_idx, &in_offset, + &out_offset, chunk); + if (chunk < 0) { + scoped_guard(spinlock_irqsave, &params->lock) + if (!params->status) + params->status = BLK_STS_RESOURCE; + break; + } + } + + if (atomic_dec_and_test(&params->copy_ctx_count)) + params->end_io(params); + + return -EIOCBQUEUED; +} +EXPORT_SYMBOL_GPL(blkdev_copy_offload); + +static void *blkdev_copy_alloc_buf(size_t req_size, size_t *alloc_size) +{ + unsigned int min_size = PAGE_SIZE; + char *buf; + + while (req_size >= min_size) { + buf = kmalloc(req_size, GFP_NOIO | __GFP_NOWARN); + if (buf) { + *alloc_size = req_size; + return buf; + } + req_size >>= 1; + } + + return NULL; +} + +static struct bio *bio_map_buf(void *buf, unsigned int len) +{ + struct page *page; + struct bio *bio; + static const uint16_t nr_vecs = 1; + + bio = bio_kmalloc(nr_vecs, GFP_NOIO); + if (!bio) + return NULL; + bio_init_inline(bio, /*bdev=*/NULL, /*max_vecs=*/nr_vecs, /*opf=*/0); + + page = virt_to_page(buf); + if (bio_add_page(bio, page, len, offset_in_page(buf)) < len) { + /* we don't support partial mappings */ + bio_uninit(bio); + kfree(bio); + WARN_ON_ONCE(true); + return NULL; + } + + return bio; +} + +static void blkdev_write_done(struct bio *bio) +{ + struct blkdev_copy_onload_ctx *ctx = bio->bi_copy_ctx; + struct blk_copy_params *params = ctx->params; + blk_status_t sts = bio->bi_status; + + kfree(bio); + + if (sts) { + params->status = sts; + kfree(ctx->buf); + kfree(ctx); + params->end_io(params); + return; + } + + ctx->offset += ctx->chunk; + + schedule_work(&ctx->read_work); +} + +static sector_t blkdev_offset_to_out_pos(const struct blk_copy_params *params, + loff_t offset) +{ + for (int i = 0; i < params->out_nseg; i++) { + loff_t rem = params->out_segs[i].len - offset; + + if (rem > 0) + return params->out_segs[i].pos + offset; + offset -= params->out_segs[i].len; + } + return 0; +} + +static void blkdev_write_work(struct work_struct *work) +{ + struct blkdev_copy_onload_ctx *ctx = + container_of(work, typeof(*ctx), write_work); + struct blk_copy_params *params = ctx->params; + struct bio *bio; + loff_t out_pos; + + out_pos = blkdev_offset_to_out_pos(params, ctx->offset); + + bio = bio_map_buf(ctx->buf, ctx->buf_len); + if (!bio) { + params->status = BLK_STS_AGAIN; + kfree(ctx->buf); + kfree(ctx); + params->end_io(params); + return; + } + bio->bi_opf = REQ_OP_WRITE; + bio_set_dev(bio, params->out_bdev); + bio->bi_iter.bi_sector = out_pos >> SECTOR_SHIFT; + bio->bi_iter.bi_size = ctx->chunk; + bio->bi_end_io = blkdev_write_done; + bio->bi_copy_ctx = ctx; + submit_bio(bio); +} + +static void blkdev_read_done(struct bio *bio) +{ + struct blkdev_copy_onload_ctx *ctx
= bio->bi_copy_ctx; + struct blk_copy_params *params = ctx->params; + blk_status_t sts = bio->bi_status; + + kfree(bio); + + if (sts) { + params->status = sts; + kfree(ctx->buf); + kfree(ctx); + params->end_io(params); + return; + } + + schedule_work(&ctx->write_work); +} + +static sector_t blkdev_offset_to_in_pos(const struct blk_copy_params *params, + loff_t offset, loff_t *chunk) +{ + for (int i = 0; i < params->in_nseg; i++) { + loff_t rem = params->in_segs[i].len - offset; + + if (rem > 0) { + if (*chunk > rem) + *chunk = rem; + return params->in_segs[i].pos + offset; + } + offset -= params->in_segs[i].len; + } + *chunk = 0; + return 0; +} + +static void blkdev_read_work(struct work_struct *work) +{ + struct blkdev_copy_onload_ctx *ctx = + container_of(work, typeof(*ctx), read_work); + struct blk_copy_params *params = ctx->params; + loff_t offset = ctx->offset; + sector_t in_pos; + struct bio *bio; + + ctx->chunk = min_t(loff_t, ctx->buf_len, params->len - offset); + if (ctx->chunk) + in_pos = blkdev_offset_to_in_pos(params, offset, &ctx->chunk); + if (ctx->chunk == 0) { + kfree(ctx->buf); + kfree(ctx); + params->end_io(params); + return; + } + + bio = bio_map_buf(ctx->buf, ctx->buf_len); + if (!bio) { + params->status = BLK_STS_AGAIN; + kfree(ctx->buf); + kfree(ctx); + params->end_io(params); + return; + } + bio->bi_opf = REQ_OP_READ; + bio_set_dev(bio, params->in_bdev); + bio->bi_iter.bi_sector = in_pos >> SECTOR_SHIFT; + bio->bi_iter.bi_size = ctx->chunk; + bio->bi_end_io = blkdev_read_done; + bio->bi_copy_ctx = ctx; + submit_bio(bio); +} + +/** + * blkdev_copy_onload - asynchronously copy data between two block devices using + * read and write operations. + * @params: Input and output block devices, input and output ranges and + * completion callback pointer. + * Return: 0 upon success; -EIOCBQUEUED if the completion callback function will + * be called or has already been called. + */ +int blkdev_copy_onload(struct blk_copy_params *params) +{ + loff_t max_hw_bytes = + min(queue_max_hw_sectors(params->in_bdev->bd_queue), + queue_max_hw_sectors(params->out_bdev->bd_queue)) << + SECTOR_SHIFT; + struct blkdev_copy_onload_ctx *ctx; + loff_t len; + int ret; + + ret = blkdev_copy_check_params(params, &len); + if (ret) + return ret; + + params->len = len; + + ctx = kzalloc_obj(*ctx); + if (!ctx) + return -ENOMEM; + + INIT_WORK(&ctx->read_work, blkdev_read_work); + INIT_WORK(&ctx->write_work, blkdev_write_work); + ctx->params = params; + + ctx->buf = blkdev_copy_alloc_buf(min(max_hw_bytes, len), &ctx->buf_len); + if (!ctx->buf) + goto err; + + blkdev_read_work(&ctx->read_work); + + return -EIOCBQUEUED; + +err: + kfree(ctx); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blkdev_copy_onload); diff --git a/block/blk-core.c b/block/blk-core.c index 17450058ea6d8..37c01e717202f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,8 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_ZEROES), + REQ_OP_NAME(COPY_SRC), + REQ_OP_NAME(COPY_DST), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; @@ -782,6 +785,8 @@ void submit_bio_noacct(struct bio *bio) struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; + struct bio_copy_offload_ctx *copy_ctx; + int bio_count; might_sleep(); @@ -875,6 +880,39 @@ void submit_bio_noacct(struct bio *bio) * requests.
*/ fallthrough; + case REQ_OP_COPY_SRC: + case REQ_OP_COPY_DST: + copy_ctx = bio->bi_copy_ctx; + WARN_ON_ONCE(copy_ctx->phase == BLKDEV_COPY_DONE); + if (copy_ctx->phase == BLKDEV_COPY) + break; + /* If copy offloading is not supported, fail the bio. */ + if (!q->limits.max_copy_sectors) { + scoped_guard(spinlock_irqsave, ©_ctx->lock) + copy_ctx->bio_count--; + goto not_supported; + } + /* + * If the block driver is a stacking driver that supports copy + * offloading, submit the bio. + */ + if (q->limits.features & BLK_FEAT_STACKING_COPY_OFFL) + break; + /* + * Append the bio at the end of the bio->bi_copy_ctx->bios list. + */ + scoped_guard(spinlock_irqsave, ©_ctx->lock) { + if (copy_ctx->biotail) + copy_ctx->biotail->bi_next = bio; + else + copy_ctx->bios = bio; + copy_ctx->biotail = bio; + bio_count = --copy_ctx->bio_count; + } + WARN_ON_ONCE(bio_count < 0); + if (bio_count == 0) + copy_ctx->translation_complete(copy_ctx); + return; default: goto not_supported; } diff --git a/block/blk-merge.c b/block/blk-merge.c index fcf09325b22e3..4678131650d2b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -207,6 +207,19 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, return __bio_split_discard(bio, lim, nsegs, max_sectors); } +struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim, + unsigned int *nsegs) +{ + *nsegs = 1; + if (bio_sectors(bio) <= lim->max_copy_sectors) + return bio; + + /* Splitting a REQ_OP_COPY_* bio is not supported. */ + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return NULL; +} + static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, bool is_atomic) { diff --git a/block/blk-settings.c b/block/blk-settings.c index 78c83817b9d36..cb846ff2926e8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -57,6 +57,11 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; lim->atomic_write_hw_max = UINT_MAX; + + lim->max_user_copy_sectors = UINT_MAX; + lim->max_copy_hw_sectors = UINT_MAX; + lim->max_copy_src_segments = U16_MAX; + lim->max_copy_dst_segments = U16_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -333,6 +338,21 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) lim->atomic_write_unit_max = 0; } +/* + * Check whether max_copy_hw_sectors and max_copy_{src,dst}_segments are + * either all nonzero or all zero. + */ +static int blk_validate_copy_limits(const struct queue_limits *lim) +{ + if (lim->max_copy_hw_sectors && lim->max_copy_src_segments && + lim->max_copy_dst_segments) + return 0; + if (!lim->max_copy_hw_sectors && !lim->max_copy_src_segments && + !lim->max_copy_dst_segments) + return 0; + return -EINVAL; +} + /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. 
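(Reader aid, not part of the patch: blk_validate_copy_limits() above requires that a block driver sets max_copy_hw_sectors, max_copy_src_segments and max_copy_dst_segments either all to zero or all to nonzero values. A minimal sketch of a driver-side limits setup that satisfies this rule; the function name and the size/segment values are made up for illustration:)

	static void example_config_copy(struct queue_limits *lim, bool copy_supported)
	{
		if (!copy_supported) {
			/* All three limits must be zero or blk_validate_limits() fails. */
			lim->max_copy_hw_sectors = 0;
			lim->max_copy_src_segments = 0;
			lim->max_copy_dst_segments = 0;
			return;
		}
		/* Hypothetical device: 1 MiB per copy, 128 source ranges, 1 destination range. */
		lim->max_copy_hw_sectors = SZ_1M >> SECTOR_SHIFT;
		lim->max_copy_src_segments = 128;
		lim->max_copy_dst_segments = 1;
	}

(blk_validate_limits() then derives max_copy_sectors as the minimum of max_copy_hw_sectors and the sysfs-controlled max_user_copy_sectors, as shown in the next hunk.)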
@@ -510,6 +530,13 @@ int blk_validate_limits(struct queue_limits *lim) err = blk_validate_integrity_limits(lim); if (err) return err; + + err = blk_validate_copy_limits(lim); + if (err) + return err; + lim->max_copy_sectors = + min(lim->max_copy_hw_sectors, lim->max_user_copy_sectors); + return blk_validate_zoned_limits(lim); } EXPORT_SYMBOL_GPL(blk_validate_limits); @@ -528,6 +555,7 @@ int blk_set_default_limits(struct queue_limits *lim) */ lim->max_user_discard_sectors = UINT_MAX; lim->max_user_wzeroes_unmap_sectors = UINT_MAX; + lim->max_user_copy_sectors = UINT_MAX; return blk_validate_limits(lim); } @@ -829,6 +857,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + t->max_copy_hw_sectors = + min(t->max_copy_hw_sectors, b->max_copy_hw_sectors); + t->max_copy_src_segments = + min(t->max_copy_src_segments, b->max_copy_src_segments); + t->max_copy_dst_segments = + min(t->max_copy_dst_segments, b->max_copy_dst_segments); + t->max_copy_sectors = min(t->max_copy_sectors, b->max_copy_sectors); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f22c1f253eb3a..8e1e14d1682d5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -325,6 +325,36 @@ queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, return 0; } +static ssize_t queue_copy_hw_max_show(struct gendisk *disk, char *page) +{ + return queue_var_show((unsigned long) + disk->queue->limits.max_copy_hw_sectors << SECTOR_SHIFT, page); +} + +static ssize_t queue_copy_max_show(struct gendisk *disk, char *page) +{ + return queue_var_show((unsigned long) + disk->queue->limits.max_copy_sectors << SECTOR_SHIFT, page); +} + +static int queue_copy_max_store(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim) +{ + unsigned long max_copy_bytes; + ssize_t ret; + + ret = queue_var_store(&max_copy_bytes, page, count); + if (ret < 0) + return ret; + + if ((max_copy_bytes >> SECTOR_SHIFT) > UINT_MAX) + return -EINVAL; + + lim->max_user_copy_sectors = max_copy_bytes >> SECTOR_SHIFT; + + return 0; +} + static ssize_t queue_feature_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim, blk_features_t feature) { @@ -652,6 +682,9 @@ QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); +QUEUE_LIM_RO_ENTRY(queue_copy_hw_max, "copy_max_hw_bytes"); +QUEUE_LIM_RW_ENTRY(queue_copy_max, "copy_max_bytes"); + QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); @@ -760,6 +793,8 @@ static const struct attribute *const queue_attrs[] = { &queue_max_hw_wzeroes_unmap_sectors_entry.attr, &queue_max_wzeroes_unmap_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, + &queue_copy_hw_max_entry.attr, + &queue_copy_max_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, diff --git a/block/blk.h b/block/blk.h index b998a7761faf3..274c226e87eea 100644 --- a/block/blk.h +++ b/block/blk.h @@ -379,6 +379,8 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); +struct bio
*bio_split_copy(struct bio *bio, const struct queue_limits *lim, + unsigned int *nsegs); /* * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. @@ -435,6 +437,9 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: return bio_split_write_zeroes(bio, lim, nr_segs); + case REQ_OP_COPY_SRC: + case REQ_OP_COPY_DST: + return bio_split_copy(bio, lim, nr_segs); default: /* other operations can't be split */ *nr_segs = 0; diff --git a/block/fops.c b/block/fops.c index bb6642b45937c..f438503f1b777 100644 --- a/block/fops.c +++ b/block/fops.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) @@ -861,6 +862,58 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } +static ssize_t blkdev_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct block_device *in_bdev = I_BDEV(bdev_file_inode(file_in)); + struct block_device *out_bdev = I_BDEV(bdev_file_inode(file_out)); + loff_t in_end, out_end; + int err; + + if (check_add_overflow(pos_in, len, &in_end) || + PAGE_ALIGN(in_end) < in_end || + check_add_overflow(pos_out, len, &out_end) || + PAGE_ALIGN(out_end) < out_end) + return -EINVAL; + + /* + * filemap_write_and_wait_range() and filemap_invalidate_inode() expect + * that the 'end' argument is rounded up to the next multiple of + * PAGE_SIZE. + */ + in_end = PAGE_ALIGN(in_end); + out_end = PAGE_ALIGN(out_end); + + if (bdev_max_copy_sectors(in_bdev) && bdev_max_copy_sectors(out_bdev) && + file_in->f_iocb_flags & file_out->f_iocb_flags & IOCB_DIRECT) { + struct blk_copy_seg in_seg = { .pos = pos_in, .len = len }; + struct blk_copy_seg out_seg = { .pos = pos_out, .len = len }; + struct blk_copy_params params = { + .in_bdev = in_bdev, + .out_bdev = out_bdev, + .in_nseg = 1, + .in_segs = &in_seg, + .out_nseg = 1, + .out_segs = &out_seg, + }; + err = filemap_write_and_wait_range(file_in->f_mapping, pos_in, + in_end); + if (err) + return err; + err = filemap_invalidate_inode(bdev_file_inode(file_out), + /*flush=*/false, + pos_out, out_end); + if (err) + return err; + if (blkdev_copy_offload(¶ms) == 0) + return len; + /* If copy offloading fails, fall back to onloading. 
*/ + } + + return splice_copy_file_range(file_in, pos_in, file_out, pos_out, len); +} + #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) @@ -967,6 +1020,7 @@ const struct file_operations def_blk_fops = { .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, + .copy_file_range = blkdev_copy_file_range, }; static __init int blkdev_init(void) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index f8c0fd57e041a..87a2f3536b50b 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "null_blk.h" #undef pr_fmt @@ -169,6 +170,10 @@ static int g_max_sectors; module_param_named(max_sectors, g_max_sectors, int, 0444); MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); +static unsigned long g_max_copy_bytes = UINT_MAX; +module_param_named(max_copy_bytes, g_max_copy_bytes, ulong, 0444); +MODULE_PARM_DESC(max_copy_bytes, "Maximum size of a copy command (in bytes)"); + static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -450,6 +455,7 @@ NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); NULLB_DEVICE_ATTR(max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(max_copy_bytes, ulong, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); @@ -601,6 +607,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_blocksize, &nullb_device_attr_cache_size, &nullb_device_attr_completion_nsec, + &nullb_device_attr_max_copy_bytes, &nullb_device_attr_discard, &nullb_device_attr_fua, &nullb_device_attr_home_node, @@ -805,6 +812,7 @@ static struct nullb_device *null_alloc_dev(void) dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; dev->max_sectors = g_max_sectors; + dev->max_copy_bytes = g_max_copy_bytes; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; @@ -1275,6 +1283,96 @@ static blk_status_t null_transfer(struct nullb *nullb, struct page *page, return err; } +static ssize_t nullb_copy_sector(struct nullb *nullb, sector_t sector_in, + sector_t sector_out, ssize_t rem, bool is_fua) +{ + struct nullb_page *t_page_in, *t_page_out; + loff_t offset_in, offset_out; + void *in, *out; + ssize_t chunk; + + chunk = min_t(size_t, nullb->dev->blocksize, rem); + offset_in = (sector_in & SECTOR_MASK) << SECTOR_SHIFT; + offset_out = (sector_out & SECTOR_MASK) << SECTOR_SHIFT; + + guard(spinlock_irq)(&nullb->lock); + + if (null_cache_active(nullb) && !is_fua) + null_make_cache_space(nullb, PAGE_SIZE); + + t_page_in = null_insert_page(nullb, sector_in, + !null_cache_active(nullb)); + if (!t_page_in) + return -1; + t_page_out = null_insert_page(nullb, sector_out, + !null_cache_active(nullb) || is_fua); + if (!t_page_out) + return -1; + + in = kmap_local_page(t_page_in->page); + out = kmap_local_page(t_page_out->page); + memcpy(out + offset_out, in + offset_in, chunk); + kunmap_local(out); + kunmap_local(in); + + __set_bit(sector_out & SECTOR_MASK, t_page_out->bitmap); + + if (is_fua) + null_free_sector(nullb, sector_out, true); + + return chunk; +} + +static blk_status_t nullb_do_copy(struct nullb *nullb, struct request *rq) +{ + sector_t sector_in, sector_in_end, sector_out,
sector_out_end; + struct bio_copy_offload_ctx *copy_ctx = rq->bio->bi_copy_ctx; + ssize_t chunk, rem = copy_ctx->len; + struct bio *src_bio, *dst_bio; + + src_bio = blk_first_copy_bio(rq, REQ_OP_COPY_SRC); + dst_bio = blk_first_copy_bio(rq, REQ_OP_COPY_DST); + + if (WARN_ON_ONCE(!src_bio || !dst_bio)) + return BLK_STS_IOERR; + + sector_in = src_bio->bi_iter.bi_sector; + sector_in_end = sector_in + (src_bio->bi_iter.bi_size >> SECTOR_SHIFT); + sector_out = dst_bio->bi_iter.bi_sector; + sector_out_end = sector_out + (dst_bio->bi_iter.bi_size >> SECTOR_SHIFT); + + while (rem > 0) { + chunk = nullb_copy_sector(nullb, sector_in, sector_out, rem, + rq->cmd_flags & REQ_FUA); + if (chunk < 0) + return BLK_STS_IOERR; + rem -= chunk; + if (!rem) + break; + sector_in += chunk >> SECTOR_SHIFT; + if (sector_in >= sector_in_end) { + src_bio = blk_next_copy_bio(src_bio); + if (WARN_ON_ONCE(!src_bio)) + return BLK_STS_IOERR; + sector_in = src_bio->bi_iter.bi_sector; + sector_in_end = sector_in + + (src_bio->bi_iter.bi_size >> SECTOR_SHIFT); + } + sector_out += chunk >> SECTOR_SHIFT; + if (sector_out >= sector_out_end) { + dst_bio = blk_next_copy_bio(dst_bio); + if (WARN_ON_ONCE(!dst_bio)) + return BLK_STS_IOERR; + sector_out = dst_bio->bi_iter.bi_sector; + sector_out_end = sector_out + + (dst_bio->bi_iter.bi_size >> SECTOR_SHIFT); + } + cond_resched(); + } + + return BLK_STS_OK; +} + /* * Transfer data for the given request. The transfer size is capped with the * nr_sectors argument. @@ -1292,6 +1390,9 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, struct req_iterator iter; struct bio_vec bvec; + if (op_is_copy(req_op(rq))) + return nullb_do_copy(nullb, rq); + spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; @@ -1806,6 +1907,13 @@ static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) lim->max_hw_discard_sectors = UINT_MAX >> 9; } +static void null_config_copy(struct nullb *nullb, struct queue_limits *lim) +{ + lim->max_copy_hw_sectors = nullb->dev->max_copy_bytes >> SECTOR_SHIFT; + lim->max_copy_src_segments = nullb->dev->max_copy_bytes ? 
U16_MAX : 0; + lim->max_copy_dst_segments = lim->max_copy_src_segments; +} + static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, @@ -1922,6 +2030,9 @@ static int null_validate_conf(struct nullb_device *dev) return -EINVAL; } + if (dev->queue_mode == NULL_Q_BIO) + dev->max_copy_bytes = 0; + return 0; } @@ -1989,6 +2100,8 @@ static int null_add_dev(struct nullb_device *dev) if (dev->virt_boundary) lim.virt_boundary_mask = PAGE_SIZE - 1; null_config_discard(nullb, &lim); + null_config_copy(nullb, &lim); + if (dev->zoned) { rv = null_init_zoned_dev(dev, &lim); if (rv) diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 6c4c4bbe7dadc..c15c319ed91ba 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -93,6 +93,7 @@ struct nullb_device { unsigned int queue_mode; /* block interface */ unsigned int blocksize; /* block size */ unsigned int max_sectors; /* Max sectors per command */ + unsigned long max_copy_bytes; /* Max copy offload length in bytes */ unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 38c17846deb0a..3de8bf5f11fbc 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -119,6 +119,11 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } +static void linear_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + limits->features |= BLK_FEAT_STACKING_COPY_OFFL; +} + static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, unsigned int cmd, unsigned long arg, bool *forward) @@ -211,6 +216,7 @@ static struct target_type linear_target = { .dtr = linear_dtr, .map = linear_map, .status = linear_status, + .io_hints = linear_io_hints, .prepare_ioctl = linear_prepare_ioctl, .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index dc2eff6b739df..888c5bdca5f11 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1816,6 +1816,14 @@ int dm_calculate_queue_limits(struct dm_table *t, return -EINVAL; combine_limits: + if (!(ti_limits.features & BLK_FEAT_STACKING_COPY_OFFL)) { + ti_limits.max_copy_hw_sectors = 0; + ti_limits.max_copy_src_segments = 0; + ti_limits.max_copy_dst_segments = 0; + ti_limits.max_user_copy_sectors = 0; + ti_limits.max_copy_sectors = 0; + } + /* * Merge this target's queue limits into the overall limits * for the table. 
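(Reader aid, not part of the patch: before the NVMe wiring below, this is roughly what an in-kernel caller of the new copy API looks like. Leaving end_io set to NULL makes blkdev_copy_offload() complete synchronously through blkdev_copy_sync() and return 0 or a negative errno; segment positions and lengths must be logical-block aligned, as enforced by blkdev_copy_check_params(). The devices, offsets and length below are assumptions for illustration:)

	static int example_copy_one_extent(struct block_device *src_bdev,
					   struct block_device *dst_bdev)
	{
		struct blk_copy_seg in_seg = { .pos = 0, .len = SZ_1M };
		struct blk_copy_seg out_seg = { .pos = SZ_2M, .len = SZ_1M };
		struct blk_copy_params params = {
			.in_bdev = src_bdev,
			.in_segs = &in_seg,
			.in_nseg = 1,
			.out_bdev = dst_bdev,
			.out_segs = &out_seg,
			.out_nseg = 1,
			/* .end_io == NULL: copy synchronously. */
		};

		return blkdev_copy_offload(&params);
	}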
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index dc90df9e13a21..b80c7c7fb629e 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -19,6 +19,7 @@ static const char * const nvme_ops[] = { [nvme_cmd_resv_report] = "Reservation Report", [nvme_cmd_resv_acquire] = "Reservation Acquire", [nvme_cmd_resv_release] = "Reservation Release", + [nvme_cmd_copy] = "Copy Offload", [nvme_cmd_zone_mgmt_send] = "Zone Management Send", [nvme_cmd_zone_mgmt_recv] = "Zone Management Receive", [nvme_cmd_zone_append] = "Zone Append", diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1e33af94c24b9..6f3c1fde112fa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -821,6 +822,87 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } +/* + * Translate REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios into an NVMe Copy command. + * The NVMe copy command supports multiple source LBA ranges, a single + * destination LBA range, and also supports copying across NVMe namespaces. This + * implementation supports all these features except copying across NVMe + * namespaces. + */ +static inline blk_status_t nvme_setup_copy_offload(struct nvme_ns *ns, + struct request *req, + struct nvme_command *cmnd) +{ + const u32 nr_range = blk_copy_bio_count(req, REQ_OP_COPY_SRC); + struct nvme_ns *src_ns, *dst_ns; + struct bio *src_bio = NULL, *dst_bio; + struct nvme_copy_range *range; + u16 control = 0; + u64 dlba; + + dst_bio = blk_first_copy_bio(req, REQ_OP_COPY_DST); + + if (WARN_ON_ONCE(!dst_bio)) + return BLK_STS_IOERR; + + /* TO DO: derive dst_ns from dst_bio. */ + dst_ns = ns; + dlba = nvme_sect_to_lba(dst_ns->head, dst_bio->bi_iter.bi_sector); + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + + if (req->cmd_flags & REQ_FAILFAST_DEV) + control |= NVME_RW_LR; + + *cmnd = (typeof(*cmnd)){ + .copy = { + .opcode = nvme_cmd_copy, + .nsid = cpu_to_le32(dst_ns->head->ns_id), + .control = cpu_to_le16(control), + .sdlba = cpu_to_le64(dlba), + .desfmt_prinfor = 2, /* DESFMT=2 */ + .nr_range = nr_range - 1, /* 0's based */ + } + }; + + range = kmalloc_array(nr_range, sizeof(*range), + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN); + if (!range) + return BLK_STS_RESOURCE; + + for (unsigned int i = 0; i < nr_range; i++) { + u64 slba; + u32 nslb; + + if (!src_bio) + src_bio = blk_first_copy_bio(req, REQ_OP_COPY_SRC); + else + src_bio = blk_next_copy_bio(src_bio); + if (WARN_ON_ONCE(!src_bio)) + goto free_range; + /* TO DO: derive src_ns from src_bio. 
*/ + src_ns = ns; + slba = nvme_sect_to_lba(src_ns->head, + src_bio->bi_iter.bi_sector); + nslb = src_bio->bi_iter.bi_size >> src_ns->head->lba_shift; + range[i].nsid = cpu_to_le32(src_ns->head->ns_id); /* requires DESFMT=2 */ + range[i].slba = cpu_to_le64(slba); + range[i].nlb = cpu_to_le16(nslb - 1); + } + + req->special_vec.bv_page = virt_to_page(range); + req->special_vec.bv_offset = offset_in_page(range); + req->special_vec.bv_len = sizeof(*range) * nr_range; + req->rq_flags |= RQF_SPECIAL_PAYLOAD; + + return BLK_STS_OK; + +free_range: + kfree(range); + return BLK_STS_IOERR; +} + static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { @@ -1122,6 +1204,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) case REQ_OP_ZONE_APPEND: ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; + case REQ_OP_COPY_DST: + case REQ_OP_COPY_SRC: + ret = nvme_setup_copy_offload(ns, req, cmd); + break; default: WARN_ON_ONCE(1); return BLK_STS_IOERR; @@ -1884,6 +1970,21 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, return true; } +static void nvme_config_copy(struct nvme_ns *ns, struct nvme_id_ns *id, + struct queue_limits *lim) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + + if (!(ctrl->oncs & NVME_CTRL_ONCS_COPY)) { + lim->max_copy_hw_sectors = 0; + return; + } + lim->max_copy_hw_sectors = nvme_lba_to_sect(ns->head, + le16_to_cpu(id->mssrl)); + lim->max_copy_src_segments = 256; + lim->max_copy_dst_segments = 1; +} + static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -2416,6 +2517,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_update_disk_info(ns, id, nvm, &lim)) capacity = 0; + nvme_config_copy(ns, id, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); @@ -2542,6 +2644,9 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) lim.physical_block_size = ns_lim->physical_block_size; lim.io_min = ns_lim->io_min; lim.io_opt = ns_lim->io_opt; + lim.max_copy_hw_sectors = UINT_MAX; + lim.max_copy_src_segments = U16_MAX; + lim.max_copy_dst_segments = U16_MAX; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); if (unsupported) @@ -5368,6 +5473,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64); BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_copy_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64); diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index ad25ad1e40412..fd49363f8516e 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -143,7 +143,7 @@ static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) u16 length = get_unaligned_le16(cdw10 + 8); u16 control = get_unaligned_le16(cdw10 + 10); u32 dsmgmt = get_unaligned_le32(cdw10 + 12); - u32 reftag = get_unaligned_le32(cdw10 + 16); + u32 reftag = get_unaligned_le32(cdw10 + 16); trace_seq_printf(p, "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", @@ -153,6 +153,23 @@ static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_copy(struct trace_seq *p, u8 *cdw10) +{ + 
const char *ret = trace_seq_buffer_ptr(p); + u64 sdlba = get_unaligned_le64(cdw10); + u8 nr_range = cdw10[8]; + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "sdlba=%llu, nr_range=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + sdlba, nr_range, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -386,6 +403,8 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, return nvme_trace_resv_rel(p, cdw10); case nvme_cmd_resv_report: return nvme_trace_resv_report(p, cdw10); + case nvme_cmd_copy: + return nvme_trace_copy(p, cdw10); default: return nvme_trace_common(p, cdw10); } diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e4fd1caadfb00..1e404df6ad843 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -733,8 +733,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES | - NVME_CTRL_ONCS_RESERVATIONS); - + NVME_CTRL_ONCS_RESERVATIONS | NVME_CTRL_ONCS_COPY); /* XXX: don't report vwc if the underlying device is write through */ id->vwc = NVME_CTRL_VWC_PRESENT; @@ -797,6 +796,27 @@ nvmet_execute_identify_ctrl(struct nvmet_req *req) nvmet_req_complete(req, status); } +static void nvmet_set_copy_limits(struct nvme_id_ns *id) +{ + /* + * MSRC = Maximum Source Range Count - the maximum number of + * source ranges that may be used to specify source data in a + * Copy command. 0's based. + */ + id->msrc = 256 - 1; + /* + * MSSRL = Maximum Single Source Range Length - the maximum number + * of logical blocks that may be specified in the Number of Logical + * Blocks field in each valid Source Range Entries Descriptor. + */ + id->mssrl = cpu_to_le16(U16_MAX); + /* + * MCL = Maximum Copy Length - the maximum number of logical + * blocks that may be specified in a Copy command. + */ + id->mcl = cpu_to_le32(U32_MAX); +} + static void nvmet_execute_identify_ns(struct nvmet_req *req) { struct nvme_id_ns *id; @@ -845,6 +865,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) if (req->ns->bdev) nvmet_bdev_set_limits(req->ns->bdev, id); + nvmet_set_copy_limits(id); + /* * We just provide a single LBA format that matches what the * underlying device reports. diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index f2d9e8901df4e..4196f10b02ab3 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -451,6 +451,83 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) } } +static void nvmet_bdev_copy_endio(const struct blk_copy_params *params) +{ + struct nvmet_req *rq = params->private; + blk_status_t status = params->status; + + /* + * From the NVM Command Set Specification section about the Copy + * Command: "If the command completes with failure (i.e., completes with + * a status code other than Successful Completion), then: [ ... ] Dword + * 0 of the completion queue entry contains the number of the lowest + * numbered Source Range entry that was not successfully copied". Since + * that information is not available, clear Dword 0.
+ */ + rq->cqe->result.u32 = cpu_to_le32(0); + + nvmet_req_complete(rq, blk_to_nvme_status(rq, status)); +} + +static void nvmet_bdev_execute_copy(struct nvmet_req *rq) +{ + u32 i, nr_range = (u32)rq->cmd->copy.nr_range + 1; + struct blk_copy_seg *in_segs __free(kfree) = NULL; + struct nvme_command *cmd = rq->cmd; + struct nvme_copy_range range; + u64 src_len, copy_len = 0; + loff_t dst_pos, src_pos; + u16 status; + int ret; + + status = NVME_SC_INTERNAL; + in_segs = kmalloc_array(nr_range, sizeof(*in_segs), GFP_KERNEL); + if (!in_segs) + goto err_rq_complete; + + for (i = 0; i < nr_range; i++) { + status = nvmet_copy_from_sgl(rq, i * sizeof(range), &range, + sizeof(range)); + if (WARN_ON_ONCE(status)) + goto err_rq_complete; + /* + * TO DO: implement support for different source and destination namespace + * IDs. + */ + status = errno_to_nvme_status(rq, -EIO); + if (le32_to_cpu(range.nsid) != rq->ns->nsid) + goto err_rq_complete; + src_pos = le64_to_cpu(range.slba) << rq->ns->blksize_shift; + src_len = (le16_to_cpu(range.nlb) + 1) << rq->ns->blksize_shift; + in_segs[i] = + (struct blk_copy_seg){ .pos = src_pos, .len = src_len }; + copy_len += src_len; + } + + dst_pos = le64_to_cpu(cmd->copy.sdlba) << rq->ns->blksize_shift; + struct blk_copy_seg out_seg = { .pos = dst_pos, .len = copy_len }; + struct blk_copy_params params = { + .in_bdev = rq->ns->bdev, + .in_segs = in_segs, + .in_nseg = nr_range, + .out_bdev = rq->ns->bdev, + .out_segs = &out_seg, + .out_nseg = 1, + .end_io = nvmet_bdev_copy_endio, + .private = rq, + }; + ret = blkdev_copy_offload(&params); + if (ret == -EIOCBQUEUED) + return; + if (ret) + ret = blkdev_copy_onload(&params); + if (ret == -EIOCBQUEUED) + return; + + rq->cqe->result.u32 = cpu_to_le32(ret == 0); + status = errno_to_nvme_status(rq, ret); +err_rq_complete: + nvmet_req_complete(rq, status); +} + u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) { switch (req->cmd->common.opcode) { @@ -469,6 +546,9 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) case nvme_cmd_write_zeroes: req->execute = nvmet_bdev_execute_write_zeroes; return 0; + case nvme_cmd_copy: + req->execute = nvmet_bdev_execute_copy; + return 0; default: return nvmet_report_invalid_opcode(req); } diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 0b22d183f9279..5e8738b45d52d 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -321,6 +321,50 @@ static void nvmet_file_dsm_work(struct work_struct *w) } } +static void nvmet_file_copy_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + u32 id, nr_range = req->cmd->copy.nr_range + 1; + loff_t dst_pos; + ssize_t ret; + u16 status; + + status = errno_to_nvme_status(req, -ENOSPC); + dst_pos = le64_to_cpu(req->cmd->copy.sdlba) << req->ns->blksize_shift; + + for (id = 0; id < nr_range; id++) { + struct nvme_copy_range range; + loff_t src_pos, src_len; + + status = nvmet_copy_from_sgl(req, id * sizeof(range), &range, + sizeof(range)); + if (status) + goto
out; + /* + * TO DO: implement support for different source and destination namespace + * IDs. + */ + status = errno_to_nvme_status(req, -EIO); + if (le32_to_cpu(range.nsid) != req->ns->nsid) + goto out; + src_pos = le64_to_cpu(range.slba) << (req->ns->blksize_shift); + src_len = (le16_to_cpu(range.nlb) + 1) << req->ns->blksize_shift; + ret = vfs_copy_file_range(req->ns->file, src_pos, req->ns->file, + dst_pos, src_len, COPY_FILE_SPLICE); + if (ret != src_len) { + req->cqe->result.u32 = cpu_to_le32(id); + status = errno_to_nvme_status(req, ret < 0 ? ret : -EIO); + goto out; + } + dst_pos += ret; + } + + status = 0; + +out: + nvmet_req_complete(req, status); +} + static void nvmet_file_execute_dsm(struct nvmet_req *req) { if (!nvmet_check_data_len_lte(req, nvmet_dsm_len(req))) @@ -329,6 +369,12 @@ static void nvmet_file_execute_dsm(struct nvmet_req *req) queue_work(nvmet_wq, &req->f.work); } +static void nvmet_file_execute_copy(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_copy_work); + queue_work(nvmet_wq, &req->f.work); +} + static void nvmet_file_write_zeroes_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); @@ -375,6 +421,9 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) case nvme_cmd_write_zeroes: req->execute = nvmet_file_execute_write_zeroes; return 0; + case nvme_cmd_copy: + req->execute = nvmet_file_execute_copy; + return 0; default: return nvmet_report_invalid_opcode(req); } diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 6dbc7036f2e4b..2baef72944919 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -92,6 +92,23 @@ static const char *nvmet_trace_dsm(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvmet_trace_copy(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 sdlba = get_unaligned_le64(cdw10); + u8 nr_range = cdw10[8]; + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "sdlba=%llu, nr_range=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + sdlba, nr_range, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_common(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -303,6 +320,8 @@ const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, return nvmet_trace_resv_rel(p, cdw10); case nvme_cmd_resv_report: return nvmet_trace_resv_report(p, cdw10); + case nvme_cmd_copy: + return nvmet_trace_copy(p, cdw10); default: return nvmet_trace_common(p, cdw10); } diff --git a/fs/read_write.c b/fs/read_write.c index 50bff7edc91f3..d6fba5afff946 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1484,8 +1484,8 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; uint64_t count = *req_count; loff_t size_in; int ret; @@ -1791,7 +1791,9 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) /* Don't copy dirs, pipes, sockets...
*/ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + if (!S_ISREG(inode_in->i_mode) && !S_ISBLK(inode_in->i_mode)) + return -EINVAL; + if ((inode_in->i_mode & S_IFMT) != (inode_out->i_mode & S_IFMT)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || diff --git a/include/linux/blk-copy.h b/include/linux/blk-copy.h new file mode 100644 index 0000000000000..4c84353127529 --- /dev/null +++ b/include/linux/blk-copy.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_BLK_COPY_H +#define __LINUX_BLK_COPY_H + +#include +#include +#include +#include +#include + +struct blk_copy_params; +struct request; + +enum blkdev_copy_phase { + BLKDEV_TRANSLATE_LBAS, + BLKDEV_COPY, + BLKDEV_COPY_DONE, +}; + +/** + * struct bio_copy_offload_ctx - context information for blkdev_copy_offload() + * @params: Input parameters passed to blkdev_copy_offload(). + * @len: Number of bytes associated with this copy context. + * @phase: Copy offload phase: either translating LBAs or copying data. + * @lock: Protects @bios, @biotail and @bio_count. + * @bios: List with REQ_OP_COPY_* bios for which LBA translation completed. + * @biotail: Last element in the @bios list. + * @bio_count: Number of bios for which LBA translation has not yet completed. + * @status: bio completion status. + * @translation_complete: Called after LBA translation has completed. + * LBA translation has completed once bio_count drops to zero. + */ +struct bio_copy_offload_ctx { + struct blk_copy_params *params; + loff_t len; + enum blkdev_copy_phase phase; + spinlock_t lock; + struct bio *bios __guarded_by(&lock); + struct bio *biotail __guarded_by(&lock); + u32 bio_count __guarded_by(&lock); + blk_status_t status __guarded_by(&lock); + void (*translation_complete)(struct bio_copy_offload_ctx *ctx); +}; + +struct bio *blk_first_copy_bio(struct request *rq, enum req_op op); +struct bio *blk_next_copy_bio(struct bio *bio); +unsigned int blk_copy_bio_count(struct request *rq, enum req_op op); + +#endif /* __LINUX_BLK_COPY_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8808ee76e73c0..27a0f92fc2cbe 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -284,6 +284,8 @@ struct bio { atomic_t __bi_cnt; /* pin count */ struct bio_set *bi_pool; + + void *bi_copy_ctx; /* context for REQ_OP_COPY_* bios */ }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) @@ -370,6 +372,10 @@ enum req_op { /** @REQ_OP_ZONE_RESET_ALL: reset all the zone present on the device */ REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19, + /* copy offload source and destination operations */ + REQ_OP_COPY_SRC = (__force blk_opf_t)20, + REQ_OP_COPY_DST = (__force blk_opf_t)21, + /* Driver private requests */ /* private: */ REQ_OP_DRV_IN = (__force blk_opf_t)34, @@ -461,6 +467,17 @@ static inline bool op_is_write(blk_opf_t op) return !!(op & (__force blk_opf_t)1); } +static inline bool op_is_copy(blk_opf_t op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_COPY_DST: + case REQ_OP_COPY_SRC: + return true; + default: + return false; + } +} + /* * Check if the bio or request is one that needs special treatment in the * flush state machine. @@ -518,4 +535,44 @@ struct blk_rq_stat { u64 batch; }; +/* A single input or output segment descriptor. */ +struct blk_copy_seg { + loff_t pos; + loff_t len; +}; + +/** + * struct blk_copy_params - input parameters and internal parameters for copy + * operations. + * @in_bdev: Input block device.
+ * @in_segs: Input LBA ranges. + * @in_nseg: Number of elements in @in_segs. + * @out_bdev: Output block device. + * @out_segs: Output LBA ranges. + * @out_nseg: Number of elements in @out_segs. + * @end_io: Called after copying data has finished. If %NULL, copying data + * happens synchronously instead of asynchronously. + * @private: May be used by @end_io. Not used directly. + * @len: Total number of bytes to copy. Set by blkdev_copy_offload() or + * blkdev_copy_onload(). + * @copy_ctx_count: Number of in-flight copy contexts associated with copy + * offload operations. + * @lock: Protects @status updates. + * @status: I/O completion status. + */ +struct blk_copy_params { + struct block_device *in_bdev; + struct blk_copy_seg *in_segs; + unsigned int in_nseg; + struct block_device *out_bdev; + struct blk_copy_seg *out_segs; + unsigned int out_nseg; + void (*end_io)(const struct blk_copy_params *params); + void *private; + loff_t len; + atomic_t copy_ctx_count; + spinlock_t lock; + blk_status_t status; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 890128cdea1ce..817eeba2f2071 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -353,13 +353,17 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) +/* block driver is a stacking block driver that supports copy offloading */ +#define BLK_FEAT_STACKING_COPY_OFFL ((__force blk_features_t)(1u << 16)) + /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \ - BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE | \ + BLK_FEAT_STACKING_COPY_OFFL) /* internal flags in queue_limits.flags */ typedef unsigned int __bitwise blk_flags_t; @@ -415,6 +419,13 @@ struct queue_limits { unsigned int atomic_write_hw_unit_max; unsigned int atomic_write_unit_max; + /* copy offloading limits */ + unsigned int max_copy_hw_sectors; /* set by block driver */ + uint16_t max_copy_src_segments; /* set by block driver */ + uint16_t max_copy_dst_segments; /* set by block driver */ + unsigned int max_user_copy_sectors; /* set via sysfs */ + unsigned int max_copy_sectors; /* min() of the above */ + unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; @@ -1272,6 +1283,8 @@ void __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp); +int blkdev_copy_offload(struct blk_copy_params *params); +int blkdev_copy_onload(struct blk_copy_params *params); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ @@ -1454,6 +1467,11 @@ static inline unsigned int bdev_discard_granularity(struct block_device *bdev) return bdev_limits(bdev)->discard_granularity; } +static inline unsigned int bdev_max_copy_sectors(struct block_device *bdev) +{ + return bdev_get_queue(bdev)->limits.max_copy_sectors; +} + static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 041f30931a908..c6325aeb13a03 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -376,7 +376,7 @@ struct nvme_id_ctrl {
__u8 nvscc; __u8 nwpc; __le16 acwu; - __u8 rsvd534[2]; + __le16 ocfs; __le32 sgls; __le32 mnan; __u8 rsvd544[224]; @@ -404,6 +404,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_ONCS_RESERVATIONS = 1 << 5, NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, + NVME_CTRL_ONCS_COPY = 1 << 8, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_NS_MNGT_SUPP = 1 << 3, @@ -458,7 +459,10 @@ struct nvme_id_ns { __le16 npdg; __le16 npda; __le16 nows; - __u8 rsvd74[18]; + __le16 mssrl; + __le32 mcl; + __u8 msrc; + __u8 rsvd81[11]; __le32 anagrpid; __u8 rsvd96[3]; __u8 nsattr; @@ -967,6 +971,7 @@ enum nvme_opcode { nvme_cmd_resv_acquire = 0x11, nvme_cmd_io_mgmt_recv = 0x12, nvme_cmd_resv_release = 0x15, + nvme_cmd_copy = 0x19, nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, nvme_cmd_zone_append = 0x7d, @@ -991,7 +996,8 @@ enum nvme_opcode { nvme_opcode_name(nvme_cmd_resv_release), \ nvme_opcode_name(nvme_cmd_zone_mgmt_send), \ nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \ - nvme_opcode_name(nvme_cmd_zone_append)) + nvme_opcode_name(nvme_cmd_zone_append), \ + nvme_opcode_name(nvme_cmd_copy)) @@ -1169,6 +1175,39 @@ struct nvme_dsm_range { __le64 slba; }; +struct nvme_copy_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 sdlba; + __u8 nr_range; + __u8 desfmt_prinfor; + __le16 control; + __le16 rsvd13; + __le16 dspec; + __le32 ilbrt; + __le16 lbat; + __le16 lbatm; +}; + +struct nvme_copy_range { + __le32 nsid; /* DESFMT=2 only */ + __le32 rsvd1; + __le64 slba; + __le16 nlb; + __le16 rsvd18; + __le32 rsvd20; + __le32 eilbrt; + __le16 elbat; + __le16 elbatm; +}; + +static_assert(sizeof(struct nvme_copy_range) == 32); + struct nvme_write_zeroes_cmd { __u8 opcode; __u8 flags; @@ -2001,6 +2040,7 @@ struct nvme_command { struct nvme_download_firmware dlfw; struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; + struct nvme_copy_command copy; struct nvme_write_zeroes_cmd write_zeroes; struct nvme_zone_mgmt_send_cmd zms; struct nvme_zone_mgmt_recv_cmd zmr; @@ -2180,6 +2220,7 @@ enum { NVME_SC_PMR_SAN_PROHIBITED = 0x123, NVME_SC_ANA_GROUP_ID_INVALID = 0x124, NVME_SC_ANA_ATTACH_FAILED = 0x125, + NVME_SC_COMMAND_SIZE_LIMIT_EXC = 0x183, /* * I/O Command Set Specific - NVM commands: