From bce0edd52ba39e20cb02adb1cb9ee906b5b2a434 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:06 -0700 Subject: [PATCH 1/8] blk-mq-dma: introduce blk_map_iter Create a type that fully captures the lower level physical address iteration. Signed-off-by: Keith Busch --- block/blk-mq-dma.c | 81 +++++++++++++++++--------------------- include/linux/blk-mq-dma.h | 9 ++++- 2 files changed, 44 insertions(+), 46 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ad283017caef2..ff4c9a7e19d83 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -5,13 +5,7 @@ #include #include "blk.h" -struct phys_vec { - phys_addr_t paddr; - u32 len; -}; - -static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, - struct phys_vec *vec) +static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter) { unsigned int max_size; struct bio_vec bv; @@ -19,8 +13,8 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { if (!iter->bio) return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; + iter->paddr = bvec_phys(&req->special_vec); + iter->len = req->special_vec.bv_len; iter->bio = NULL; return true; } @@ -29,8 +23,8 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, return false; bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); - vec->paddr = bvec_phys(&bv); - max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + iter->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, iter->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); @@ -58,7 +52,7 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); } - vec->len = bv.bv_len; + iter->len = bv.bv_len; return true; } @@ -77,29 +71,29 @@ static inline bool blk_can_dma_map_iova(struct request *req, dma_get_merge_boundary(dma_dev)); } -static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) +static bool blk_dma_map_bus(struct blk_dma_iter *iter) { - iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); - iter->len = vec->len; + iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, iter->iter.paddr); + iter->len = iter->iter.len; return true; } static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, - struct blk_dma_iter *iter, struct phys_vec *vec) + struct blk_dma_iter *iter) { - iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), - offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); + iter->addr = dma_map_page(dma_dev, phys_to_page(iter->iter.paddr), + offset_in_page(iter->iter.paddr), iter->iter.len, + rq_dma_dir(req)); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; } - iter->len = vec->len; + iter->len = iter->iter.len; return true; } static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter, - struct phys_vec *vec) + struct dma_iova_state *state, struct blk_dma_iter *iter) { enum dma_data_direction dir = rq_dma_dir(req); unsigned int mapped = 0; @@ -109,12 +103,12 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, iter->len = dma_iova_size(state); do { - error = dma_iova_link(dma_dev, state, vec->paddr, mapped, - vec->len, dir, 0); + error = dma_iova_link(dma_dev, state, iter->iter.paddr, mapped, + iter->iter.len, dir, 0); if (error) break; - mapped += vec->len; - } while (blk_map_iter_next(req, &iter->iter, vec)); + mapped += iter->iter.len; + } while (blk_map_iter_next(req, &iter->iter)); error = dma_iova_sync(dma_dev, state, 0, mapped); if (error) { @@ -151,7 +145,6 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { unsigned int total_len = blk_rq_payload_bytes(req); - struct phys_vec vec; iter->iter.bio = req->bio; iter->iter.iter = req->bio->bi_iter; @@ -162,14 +155,14 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, * Grab the first segment ASAP because we'll need it to check for P2P * transfers. */ - if (!blk_map_iter_next(req, &iter->iter, &vec)) + if (!blk_map_iter_next(req, &iter->iter)) return false; if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(vec.paddr))) { + phys_to_page(iter->iter.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter, &vec); + return blk_dma_map_bus(iter); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* * P2P transfers through the host bridge are treated the @@ -184,9 +177,9 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, } if (blk_can_dma_map_iova(req, dma_dev) && - dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) - return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); - return blk_dma_map_direct(req, dma_dev, iter, &vec); + dma_iova_try_alloc(dma_dev, state, iter->iter.paddr, total_len)) + return blk_rq_dma_map_iova(req, dma_dev, state, iter); + return blk_dma_map_direct(req, dma_dev, iter); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); @@ -211,14 +204,12 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { - struct phys_vec vec; - - if (!blk_map_iter_next(req, &iter->iter, &vec)) + if (!blk_map_iter_next(req, &iter->iter)) return false; if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - return blk_dma_map_bus(iter, &vec); - return blk_dma_map_direct(req, dma_dev, iter, &vec); + return blk_dma_map_bus(iter); + return blk_dma_map_direct(req, dma_dev, iter); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); @@ -246,20 +237,20 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct req_iterator iter = { - .bio = rq->bio, + struct bio *bio = rq->bio; + struct blk_map_iter iter = { + .bio = bio, }; - struct phys_vec vec; int nsegs = 0; /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; + if (bio) + iter.iter = bio->bi_iter; - while (blk_map_iter_next(rq, &iter, &vec)) { + while (blk_map_iter_next(rq, &iter)) { *last_sg = blk_next_sg(last_sg, sglist); - sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, - offset_in_page(vec.paddr)); + sg_set_page(*last_sg, phys_to_page(iter.paddr), iter.len, + offset_in_page(iter.paddr)); nsegs++; } diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index c26a01aeae006..1e5988afdb978 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -5,6 +5,13 @@ #include #include +struct blk_map_iter { + phys_addr_t paddr; + u32 len; + struct bvec_iter iter; + struct bio *bio; +}; + struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; @@ -14,7 +21,7 @@ struct blk_dma_iter { blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ - struct req_iterator iter; + struct blk_map_iter iter; struct pci_p2pdma_map_state p2pdma; }; From 7558332aa0a26da751f1504d8d7eeb807bc75320 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:07 -0700 Subject: [PATCH 2/8] blk-mq-dma: provide the bio_vec list being iterated This will make it easier to add different sources of the bvec table, like for upcoming integrity support, rather than assume to use the bio's bi_io_vec. It also makes iterating "special" payloads more in common with iterating normal payloads. Signed-off-by: Keith Busch --- block/blk-mq-dma.c | 56 ++++++++++++++++++++++---------------- include/linux/blk-mq-dma.h | 1 + 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ff4c9a7e19d83..4a013703bcba5 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -10,23 +10,14 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter) unsigned int max_size; struct bio_vec bv; - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - iter->paddr = bvec_phys(&req->special_vec); - iter->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - if (!iter->iter.bi_size) return false; - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + bv = mp_bvec_iter_bvec(iter->bvec, iter->iter); iter->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, iter->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + bvec_iter_advance_single(iter->bvec, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next @@ -37,19 +28,20 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter) struct bio_vec next; if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) + if (!iter->bio || !iter->bio->bi_next) break; iter->bio = iter->bio->bi_next; iter->iter = iter->bio->bi_iter; + iter->bvec = iter->bio->bi_io_vec; } - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + next = mp_bvec_iter_bvec(iter->bvec, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + bvec_iter_advance_single(iter->bvec, &iter->iter, next.bv_len); } iter->len = bv.bv_len; @@ -119,6 +111,30 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, return true; } +static struct blk_map_iter blk_rq_map_iter(struct request *rq) +{ + struct bio *bio = rq->bio; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + return (struct blk_map_iter) { + .bvec = &rq->special_vec, + .iter = { + .bi_size = rq->special_vec.bv_len, + } + }; + } + + /* the internal flush request may not have bio attached */ + if (!bio) + return (struct blk_map_iter) {}; + + return (struct blk_map_iter) { + .bio = bio, + .bvec = bio->bi_io_vec, + .iter = bio->bi_iter, + }; +} + /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map @@ -146,10 +162,9 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, { unsigned int total_len = blk_rq_payload_bytes(req); - iter->iter.bio = req->bio; - iter->iter.iter = req->bio->bi_iter; memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; + iter->iter = blk_rq_map_iter(req); /* * Grab the first segment ASAP because we'll need it to check for P2P @@ -237,16 +252,9 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct bio *bio = rq->bio; - struct blk_map_iter iter = { - .bio = bio, - }; + struct blk_map_iter iter = blk_rq_map_iter(rq); int nsegs = 0; - /* the internal flush request may not have bio attached */ - if (bio) - iter.iter = bio->bi_iter; - while (blk_map_iter_next(rq, &iter)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(iter.paddr), iter.len, diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 1e5988afdb978..c82f880dee914 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -8,6 +8,7 @@ struct blk_map_iter { phys_addr_t paddr; u32 len; + struct bio_vec *bvec; struct bvec_iter iter; struct bio *bio; }; From e841693029a2183090312bb24850397a5a99e721 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:08 -0700 Subject: [PATCH 3/8] blk-mq-dma: require unmap caller provide p2p map type In preparing for integrity dma mappings, we can't rely on the request flag because data and metadata may have different mapping types. Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 9 ++++++++- include/linux/blk-mq-dma.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2c6d9506b1725..111b6bc6c93eb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -261,6 +261,9 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + + /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_P2P_BUS_ADDR = 1U << 3, }; struct nvme_dma_vec { @@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + iod->flags & IOD_P2P_BUS_ADDR)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req); else @@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_P2P_BUS_ADDR; + if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index c82f880dee914..aef8d784952ff 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -49,14 +49,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap + * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) + struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { - if (req->cmd_flags & REQ_P2PDMA) + if (is_p2p) return true; if (dma_use_iova(state)) { From c69143f16673c395a904fd820c80d55020343f67 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:09 -0700 Subject: [PATCH 4/8] blk-mq: remove REQ_P2PDMA flag It's not serving any particular purpose. pci_p2pdma_state() already has all the appropriate checks, so the config and flag checks are not guarding anything. Signed-off-by: Keith Busch --- block/bio.c | 2 +- block/blk-mq-dma.c | 30 ++++++++++++++---------------- include/linux/blk_types.h | 1 - 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3b371a5da159e..44c43b9703875 100644 --- a/block/bio.c +++ b/block/bio.c @@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) - bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 4a013703bcba5..988c27667df67 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -173,22 +173,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, if (!blk_map_iter_next(req, &iter->iter)) return false; - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { - switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(iter->iter.paddr))) { - case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter); - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * P2P transfers through the host bridge are treated the - * same as non-P2P transfers below and during unmap. - */ - req->cmd_flags &= ~REQ_P2PDMA; - break; - default: - iter->status = BLK_STS_INVAL; - return false; - } + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(iter->iter.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + case PCI_P2PDMA_MAP_NONE: + break; + default: + iter->status = BLK_STS_INVAL; + return false; } if (blk_can_dma_map_iova(req, dma_dev) && diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09b99d52fd365..0a29b20939d17 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -419,7 +419,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) From 9d5f9f7bd3f027998d20de8048cd20f1bbf36964 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:10 -0700 Subject: [PATCH 5/8] blk-mq-dma: move common dma start code to a helper In preparing for dma mapping integrity metadata, move the common dma setup to a helper. Signed-off-by: Keith Busch --- block/blk-mq-dma.c | 60 +++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 988c27667df67..bc694fecb39dc 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -135,36 +135,12 @@ static struct blk_map_iter blk_rq_map_iter(struct request *rq) }; } -/** - * blk_rq_dma_map_iter_start - map the first DMA segment for a request - * @req: request to map - * @dma_dev: device to map to - * @state: DMA IOVA state - * @iter: block layer DMA iterator - * - * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the - * caller and don't need to be initialized. @state needs to be stored for use - * at unmap time, @iter is only needed at map time. - * - * Returns %false if there is no segment to map, including due to an error, or - * %true ft it did map a segment. - * - * If a segment was mapped, the DMA address for it is returned in @iter.addr and - * the length in @iter.len. If no segment was mapped the status code is - * returned in @iter.status. - * - * The caller can call blk_rq_dma_map_coalesce() to check if further segments - * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() - * to try to map the following segments. - */ -bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter) +static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + unsigned int total_len) { - unsigned int total_len = blk_rq_payload_bytes(req); - memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; - iter->iter = blk_rq_map_iter(req); /* * Grab the first segment ASAP because we'll need it to check for P2P @@ -194,6 +170,36 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, return blk_rq_dma_map_iova(req, dma_dev, state, iter); return blk_dma_map_direct(req, dma_dev, iter); } + +/** + * blk_rq_dma_map_iter_start - map the first DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the + * caller and don't need to be initialized. @state needs to be stored for use + * at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + iter->iter = blk_rq_map_iter(req); + return blk_dma_map_iter_start(req, dma_dev, state, iter, + blk_rq_payload_bytes(req)); +} EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); /** From ead44191b1d647e6d035374497093a7c4778bb3b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:11 -0700 Subject: [PATCH 6/8] blk-mq-dma: add support for mapping integrity metadata Provide integrity metadata helpers equivalent to the data payload helpers for iterating a request for dma setup. Signed-off-by: Keith Busch --- block/blk-mq-dma.c | 60 +++++++++++++++++++++++++++++++---- include/linux/blk-integrity.h | 17 ++++++++++ include/linux/blk-mq-dma.h | 1 + 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index bc694fecb39dc..faa36ff6465ee 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,9 +2,28 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include #include #include "blk.h" +static bool __blk_map_iter_next(struct blk_map_iter *iter) +{ + if (iter->iter.bi_size) + return true; + if (!iter->bio || !iter->bio->bi_next) + return false; + + iter->bio = iter->bio->bi_next; + if (iter->is_integrity) { + iter->iter = iter->bio->bi_integrity->bip_iter; + iter->bvec = iter->bio->bi_integrity->bip_vec; + } else { + iter->iter = iter->bio->bi_iter; + iter->bvec = iter->bio->bi_io_vec; + } + return true; +} + static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter) { unsigned int max_size; @@ -27,13 +46,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter) while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; - if (!iter->iter.bi_size) { - if (!iter->bio || !iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - iter->bvec = iter->bio->bi_io_vec; - } + if (!__blk_map_iter_next(iter)) + break; next = mp_bvec_iter_bvec(iter->bvec, iter->iter); if (bv.bv_len + next.bv_len > max_size || @@ -232,6 +246,38 @@ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); +#ifdef CONFIG_BLK_DEV_INTEGRITY +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + unsigned len = bio_integrity_bytes(&req->q->limits.integrity, + blk_rq_sectors(req)); + struct bio *bio = req->bio; + + iter->iter = (struct blk_map_iter) { + .bio = bio, + .iter = bio->bi_integrity->bip_iter, + .bvec = bio->bi_integrity->bip_vec, + .is_integrity = true, + }; + return blk_dma_map_iter_start(req, dma_dev, state, iter, len); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); + +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + if (!blk_map_iter_next(req, &iter->iter)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter); + return blk_dma_map_direct(req, dma_dev, iter); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); +#endif + static inline struct scatterlist * blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e67a2b6e8f111..78fe2459e6612 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -4,6 +4,7 @@ #include #include +#include struct request; @@ -31,6 +32,11 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter); +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -115,6 +121,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq, { return -EINVAL; } +static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + return false; +} +static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + return false; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index aef8d784952ff..4b7b85f9e23e6 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -11,6 +11,7 @@ struct blk_map_iter { struct bio_vec *bvec; struct bvec_iter iter; struct bio *bio; + bool is_integrity; }; struct blk_dma_iter { From f2dbe985ef5b4f46d6997b07d0293deaaed66ae0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:12 -0700 Subject: [PATCH 7/8] nvme-pci: create common sgl unmapping helper This can be reused by metadata sgls once that starts using the blk-mq dma api. Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 111b6bc6c93eb..decb3ad1508a7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -693,25 +693,34 @@ static void nvme_free_prps(struct request *req) mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); } + +static void __nvme_free_sgls(struct device *dma_dev, struct nvme_sgl_desc *sge, + struct nvme_sgl_desc *sg_list, enum dma_data_direction dir) +{ + unsigned int len = le32_to_cpu(sge->length); + unsigned int i, nr_entries; + + if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) { + dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir); + return; + } + + nr_entries = len / sizeof(*sg_list); + for (i = 0; i < nr_entries; i++) + dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir); +} + static void nvme_free_sgls(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct device *dma_dev = nvmeq->dev->dev; - dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr); - unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length); struct nvme_sgl_desc *sg_list = iod->descriptors[0]; + struct nvme_sgl_desc *sge = &iod->cmd.common.dptr.sgl; enum dma_data_direction dir = rq_dma_dir(req); - if (iod->nr_descriptors) { - unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i; - - for (i = 0; i < nr_entries; i++) - dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), - le32_to_cpu(sg_list[i].length), dir); - } else { - dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir); - } + __nvme_free_sgls(dma_dev, sge, sg_list, dir); } static void nvme_unmap_data(struct request *req) From 7e7d7d01094fcf55083350564e39950b15b95e3b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jul 2025 08:05:13 -0700 Subject: [PATCH 8/8] nvme-pci: convert metadata mapping to dma iter Aligns data and metadata to the similar dma mapping scheme and removes one more user of the scatter-gather dma mapping. Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 152 +++++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index decb3ad1508a7..4a23729f399ac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -172,9 +172,7 @@ struct nvme_dev { u32 last_ps; bool hmb; struct sg_table *hmb_sgt; - mempool_t *dmavec_mempool; - mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ __le32 *dbbuf_dbs; @@ -264,6 +262,12 @@ enum nvme_iod_flags { /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ IOD_P2P_BUS_ADDR = 1U << 3, + + /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_META_P2P_BUS_ADDR = 1U << 4, + + /* Metadata using non-coalesced MPTR */ + IOD_META_MPTR = 1U << 5, }; struct nvme_dma_vec { @@ -281,13 +285,14 @@ struct nvme_iod { u8 nr_descriptors; unsigned int total_len; + unsigned int meta_total_len; struct dma_iova_state dma_state; + struct dma_iova_state meta_dma_state; void *descriptors[NVME_MAX_NR_DESCRIPTORS]; struct nvme_dma_vec *dma_vecs; unsigned int nr_dma_vecs; dma_addr_t meta_dma; - struct sg_table meta_sgt; struct nvme_sgl_desc *meta_descriptor; }; @@ -644,6 +649,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, return nvmeq->descriptor_pools.large; } +static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd) +{ + return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG; +} + static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) { return cmd->common.flags & @@ -1023,70 +1033,96 @@ static blk_status_t nvme_map_data(struct request *req) return nvme_pci_setup_data_prp(req, &iter); } -static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge, - struct scatterlist *sg) +static void nvme_free_meta_sgls(struct nvme_iod *iod, struct device *dma_dev, + enum dma_data_direction dir) { - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); - sge->type = NVME_SGL_FMT_DATA_DESC << 4; + struct nvme_sgl_desc *sge = iod->meta_descriptor; + + __nvme_free_sgls(dma_dev, sge, &sge[1], dir); +} + +static void nvme_unmap_metadata(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum dma_data_direction dir = rq_dma_dir(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct device *dma_dev = nvmeq->dev->dev; + + if (iod->flags & IOD_META_MPTR) { + dma_unmap_page(dma_dev, iod->meta_dma, + rq_integrity_vec(req).bv_len, + rq_dma_dir(req)); + return; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len, + iod->flags & IOD_META_P2P_BUS_ADDR)) { + if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) + nvme_free_meta_sgls(iod, dma_dev, dir); + else + dma_unmap_page(dma_dev, iod->meta_dma, + iod->meta_total_len, dir); + } + + if (iod->meta_descriptor) + dma_pool_free(nvmeq->descriptor_pools.small, + iod->meta_descriptor, iod->meta_dma); } static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; + unsigned int entries = req->nr_integrity_segments; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = nvmeq->dev; struct nvme_sgl_desc *sg_list; - struct scatterlist *sgl, *sg; - unsigned int entries; + struct blk_dma_iter iter; dma_addr_t sgl_dma; - int rc, i; - - iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); - if (!iod->meta_sgt.sgl) - return BLK_STS_RESOURCE; + int i = 0; - sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); - iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, - iod->meta_sgt.sgl); - if (!iod->meta_sgt.orig_nents) - goto out_free_sg; + if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev, + &iod->meta_dma_state, &iter)) + return iter.status; - rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) - goto out_free_sg; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_META_P2P_BUS_ADDR; + else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + entries = 1; + + if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) { + iod->cmd.common.metadata = cpu_to_le64(iter.addr); + iod->meta_total_len = iter.len; + iod->meta_dma = iter.addr; + iod->meta_descriptor = NULL; + return BLK_STS_OK; + } sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, &sgl_dma); if (!sg_list) - goto out_unmap_sg; + return BLK_STS_RESOURCE; - entries = iod->meta_sgt.nents; iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; - iod->cmd.common.flags = NVME_CMD_SGL_METASEG; iod->cmd.common.metadata = cpu_to_le64(sgl_dma); - - sgl = iod->meta_sgt.sgl; if (entries == 1) { - nvme_pci_sgl_set_data_sg(sg_list, sgl); + iod->meta_total_len = iter.len; + nvme_pci_sgl_set_data(sg_list, &iter); return BLK_STS_OK; } sgl_dma += sizeof(*sg_list); - nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); - for_each_sg(sgl, sg, entries, i) - nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg); - - return BLK_STS_OK; + do { + nvme_pci_sgl_set_data(&sg_list[++i], &iter); + iod->meta_total_len += iter.len; + } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter)); -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); - return BLK_STS_RESOURCE; + nvme_pci_sgl_set_seg(sg_list, sgl_dma, i); + if (unlikely(iter.status)) + nvme_unmap_metadata(req); + return iter.status; } static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) @@ -1099,6 +1135,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); + iod->flags |= IOD_META_MPTR; return BLK_STS_OK; } @@ -1120,7 +1157,6 @@ static blk_status_t nvme_prep_rq(struct request *req) iod->flags = 0; iod->nr_descriptors = 0; iod->total_len = 0; - iod->meta_sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret) @@ -1231,25 +1267,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } -static __always_inline void nvme_unmap_metadata(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - - if (!iod->meta_sgt.nents) { - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req).bv_len, - rq_dma_dir(req)); - return; - } - - dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, - iod->meta_dma); - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); -} - static __always_inline void nvme_pci_unmap_rq(struct request *req) { if (blk_integrity_rq(req)) @@ -3055,7 +3072,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; dev->dmavec_mempool = mempool_create_node(1, @@ -3064,17 +3080,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->dmavec_mempool) return -ENOMEM; - - dev->iod_meta_mempool = mempool_create_node(1, - mempool_kmalloc, mempool_kfree, - (void *)meta_size, GFP_KERNEL, - dev_to_node(dev->dev)); - if (!dev->iod_meta_mempool) - goto free; return 0; -free: - mempool_destroy(dev->dmavec_mempool); - return -ENOMEM; } static void nvme_free_tagset(struct nvme_dev *dev) @@ -3524,7 +3530,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_free_queues(dev, 0); out_release_iod_mempool: mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: @@ -3588,7 +3593,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl);