From 4263042dc3dc917102473a04611c674a57cd9961 Mon Sep 17 00:00:00 2001
From: James Rizzo
Date: Mon, 20 Apr 2026 17:08:37 +0530
Subject: [PATCH 1/3] scsi: scan: allocate sdev and starget on the NUMA node of the host adapter

When a host adapter is attached to a specific NUMA node, allocating
scsi_device and scsi_target via kzalloc() may place them on a remote
node. All hot-path I/O accesses to these structures then cross the NUMA
interconnect, adding latency and consuming inter-node bandwidth.

Use kzalloc_node() with dev_to_node(shost->dma_dev) so allocations land
on the same node as the HBA, reducing cross-node traffic and improving
I/O performance on NUMA systems.

Signed-off-by: James Rizzo
Signed-off-by: Sumit Saxena
---
 drivers/scsi/scsi_scan.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index ef22a4228b85..9749a8dbe964 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -286,9 +287,10 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
         int display_failure_msg = 1, ret;
         struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
         struct queue_limits lim;
+        int node = dev_to_node(shost->dma_dev);

-        sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
-                       GFP_KERNEL);
+        sdev = kzalloc_node(sizeof(*sdev) + shost->transportt->device_size,
+                            GFP_KERNEL, node);
         if (!sdev)
                 goto out;
@@ -501,8 +503,9 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
         struct scsi_target *starget;
         struct scsi_target *found_target;
         int error, ref_got;
+        int node = dev_to_node(shost->dma_dev);

-        starget = kzalloc(size, GFP_KERNEL);
+        starget = kzalloc_node(size, GFP_KERNEL, node);
         if (!starget) {
                 printk(KERN_ERR "%s: allocation failure\n", __func__);
                 return NULL;
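
[Note: the sketch below is illustrative only and is not part of this series. It shows the same
node-local allocation pattern in a hypothetical LLD; struct my_ctx and my_alloc_ctx are made-up
names used purely for the example.]

/* Illustrative sketch: allocate a driver-private context on the NUMA node
 * of the HBA, following the same pattern the patch above applies to
 * scsi_device and scsi_target. my_ctx/my_alloc_ctx are hypothetical names.
 */
#include <linux/slab.h>
#include <linux/device.h>
#include <scsi/scsi_host.h>

struct my_ctx {
        unsigned int queue_depth;
        void *private_data;
};

static struct my_ctx *my_alloc_ctx(struct Scsi_Host *shost)
{
        /* NUMA node the adapter sits on; NUMA_NO_NODE when unknown. */
        int node = dev_to_node(shost->dma_dev);

        /* kzalloc_node() falls back to a normal allocation for NUMA_NO_NODE. */
        return kzalloc_node(sizeof(struct my_ctx), GFP_KERNEL, node);
}

dev_to_node() already returns NUMA_NO_NODE on !CONFIG_NUMA builds or for devices with no node
affinity, so callers need no extra ifdeffery.
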
From fa0fcaced56def2445e4ff12f8b8130548983051 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Mon, 20 Apr 2026 17:08:38 +0530
Subject: [PATCH 2/3] block: drop shared-tag fairness throttling

Original patch [1] by Bart Van Assche; this version is rebased onto the
current tree. In testing it improves IOPS by roughly 16-18% by removing
the fair-sharing throttle on shared tag queues.

This patch removes the following code and structure members:
- The function hctx_may_queue().
- blk_mq_hw_ctx.nr_active and request_queue.nr_active_requests_shared_tags
  and also all the code that modifies these two member variables.

[1]: https://lore.kernel.org/linux-block/20240529213921.3166462-1-bvanassche@acm.org/

Signed-off-by: Bart Van Assche
Signed-off-by: Sumit Saxena
---
 block/blk-core.c       |   2 -
 block/blk-mq-debugfs.c |  22 ++++-
 block/blk-mq-tag.c     |   4 --
 block/blk-mq.c         |  17 +------
 block/blk-mq.h         | 100 -----------------------------------------
 include/linux/blk-mq.h |   6 ---
 include/linux/blkdev.h |   2 -
 7 files changed, 22 insertions(+), 131 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 474700ffaa1c..430907b26fc4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -421,8 +421,6 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
         q->node = node_id;

-        atomic_set(&q->nr_active_requests_shared_tags, 0);
-
         timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
         INIT_WORK(&q->timeout_work, blk_timeout_work);
         INIT_LIST_HEAD(&q->icq_list);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..8b85a7f8e987 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -468,11 +468,31 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
         return 0;
 }

+struct count_active_params {
+        struct blk_mq_hw_ctx *hctx;
+        int *active;
+};
+
+static bool hctx_count_active(struct request *rq, void *data)
+{
+        const struct count_active_params *params = data;
+
+        if (rq->mq_hctx == params->hctx)
+                (*params->active)++;
+
+        return true;
+}
+
 static int hctx_active_show(void *data, struct seq_file *m)
 {
         struct blk_mq_hw_ctx *hctx = data;
+        int active = 0;
+        struct count_active_params params = { .hctx = hctx, .active = &active };
+
+        blk_mq_all_tag_iter(hctx->sched_tags ?: hctx->tags, hctx_count_active,
+                            &params);

-        seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
+        seq_printf(m, "%d\n", active);
         return 0;
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..bfd27cc6249b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -109,10 +109,6 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                             struct sbitmap_queue *bt)
 {
-        if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
-            !hctx_may_queue(data->hctx, bt))
-                return BLK_MQ_NO_TAG;
-
         if (data->shallow_depth)
                 return sbitmap_queue_get_shallow(bt, data->shallow_depth);
         else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c5c16cce4f8..bbac59a06044 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -489,8 +489,6 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
                 }
         } while (data->nr_tags > nr);

-        if (!(data->rq_flags & RQF_SCHED_TAGS))
-                blk_mq_add_active_requests(data->hctx, nr);
         /* caller already holds a reference, add for remainder */
         percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
         data->nr_tags -= nr;
@@ -587,8 +585,6 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
                 goto retry;
         }

-        if (!(data->rq_flags & RQF_SCHED_TAGS))
-                blk_mq_inc_active_requests(data->hctx);
         rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
         blk_mq_rq_time_init(rq, alloc_time_ns);
         return rq;
@@ -763,8 +759,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
         tag = blk_mq_get_tag(&data);
         if (tag == BLK_MQ_NO_TAG)
                 goto out_queue_exit;
-        if (!(data.rq_flags & RQF_SCHED_TAGS))
-                blk_mq_inc_active_requests(data.hctx);
         rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
         blk_mq_rq_time_init(rq, alloc_time_ns);
         rq->__data_len = 0;
@@ -807,10 +801,8 @@ static void __blk_mq_free_request(struct request *rq)
         blk_pm_mark_last_busy(rq);
         rq->mq_hctx = NULL;

-        if (rq->tag != BLK_MQ_NO_TAG) {
-                blk_mq_dec_active_requests(hctx);
+        if (rq->tag != BLK_MQ_NO_TAG)
                 blk_mq_put_tag(hctx->tags, ctx, rq->tag);
-        }
         if (sched_tag != BLK_MQ_NO_TAG)
                 blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
         blk_mq_sched_restart(hctx);
@@ -1188,8 +1180,6 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
 {
         struct request_queue *q = hctx->queue;

-        blk_mq_sub_active_requests(hctx, nr_tags);
-
         blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
         percpu_ref_put_many(&q->q_usage_counter, nr_tags);
 }
@@ -1875,9 +1865,6 @@ bool __blk_mq_alloc_driver_tag(struct request *rq)
         if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
                 bt = &rq->mq_hctx->tags->breserved_tags;
                 tag_offset = 0;
-        } else {
-                if (!hctx_may_queue(rq->mq_hctx, bt))
-                        return false;
         }

         tag = __sbitmap_queue_get(bt);
@@ -1885,7 +1872,6 @@ bool __blk_mq_alloc_driver_tag(struct request *rq)
                 return false;

         rq->tag = tag + tag_offset;
-        blk_mq_inc_active_requests(rq->mq_hctx);
         return true;
 }
@@ -4058,7 +4044,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
         if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
                 goto free_hctx;

-        atomic_set(&hctx->nr_active, 0);
         if (node == NUMA_NO_NODE)
                 node = set->numa_node;
         hctx->numa_node = node;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index aa15d31aaae9..8dfb67c55f5d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -291,70 +291,9 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
         return -1;
 }

-static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
-                                                int val)
-{
-        if (blk_mq_is_shared_tags(hctx->flags))
-                atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
-        else
-                atomic_add(val, &hctx->nr_active);
-}
-
-static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-        __blk_mq_add_active_requests(hctx, 1);
-}
-
-static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
-                                                int val)
-{
-        if (blk_mq_is_shared_tags(hctx->flags))
-                atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
-        else
-                atomic_sub(val, &hctx->nr_active);
-}
-
-static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-        __blk_mq_sub_active_requests(hctx, 1);
-}
-
-static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
-                                              int val)
-{
-        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-                __blk_mq_add_active_requests(hctx, val);
-}
-
-static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-                __blk_mq_inc_active_requests(hctx);
-}
-
-static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
-                                              int val)
-{
-        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-                __blk_mq_sub_active_requests(hctx, val);
-}
-
-static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-                __blk_mq_dec_active_requests(hctx);
-}
-
-static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-        if (blk_mq_is_shared_tags(hctx->flags))
-                return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
-        return atomic_read(&hctx->nr_active);
-}
 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq)
 {
-        blk_mq_dec_active_requests(hctx);
         blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
         rq->tag = BLK_MQ_NO_TAG;
 }
@@ -396,45 +335,6 @@ static inline void blk_mq_free_requests(struct list_head *list)
         }
 }

-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
-                                  struct sbitmap_queue *bt)
-{
-        unsigned int depth, users;
-
-        if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-                return true;
-
-        /*
-         * Don't try dividing an ant
-         */
-        if (bt->sb.depth == 1)
-                return true;
-
-        if (blk_mq_is_shared_tags(hctx->flags)) {
-                struct request_queue *q = hctx->queue;
-
-                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
-                        return true;
-        } else {
-                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-                        return true;
-        }
-
-        users = READ_ONCE(hctx->tags->active_queues);
-        if (!users)
-                return true;
-
-        /*
-         * Allow at least some tags
-         */
-        depth = max((bt->sb.depth + users - 1) / users, 4U);
-        return __blk_mq_active_requests(hctx) < depth;
-}
-
 /* run the code block in @dispatch_ops with rcu/srcu read lock held */
 #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops)        \
 do {                                                                   \
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..ccbb07559402 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -432,12 +432,6 @@ struct blk_mq_hw_ctx {
         /** @queue_num: Index of this hardware queue. */
         unsigned int queue_num;

-        /**
-         * @nr_active: Number of active requests. Only used when a tag set is
-         * shared across request queues.
-         */
-        atomic_t nr_active;
-
         /** @cpuhp_online: List to store request if CPU is going to die */
         struct hlist_node cpuhp_online;
         /** @cpuhp_dead: List to store request if some CPU die. */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..95525b1d7b74 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -567,8 +567,6 @@ struct request_queue {
         struct timer_list timeout;
         struct work_struct timeout_work;

-        atomic_t nr_active_requests_shared_tags;
-
         struct blk_mq_tags *sched_shared_tags;

         struct list_head icq_list;
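
[Note: the small userspace program below is illustrative only and is not part of this series. It
computes the per-queue tag limit that the removed hctx_may_queue() enforced; the depth of 1024 is
an arbitrary example value.]

/* Each of `users` active queues was allowed at most ceil(depth / users)
 * driver tags, but never fewer than 4 ("allow at least some tags").
 */
#include <stdio.h>

static unsigned int fair_share_depth(unsigned int depth, unsigned int users)
{
        unsigned int share = (depth + users - 1) / users;       /* round up */

        return share < 4 ? 4 : share;
}

int main(void)
{
        const unsigned int depth = 1024;        /* hypothetical shared tag set depth */
        unsigned int users;

        for (users = 1; users <= 64; users *= 2)
                printf("%2u active queues -> at most %4u tags each\n",
                       users, fair_share_depth(depth, users));
        return 0;
}

With the throttle gone, a single busy queue may consume the entire shared tag space instead of
being capped at its share, which is where the IOPS gain cited in the commit message comes from.
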
From 41afac9e8f11e7e739b3b8ead4b0ab1ae626d3ea Mon Sep 17 00:00:00 2001
From: Sumit Saxena
Date: Mon, 20 Apr 2026 17:08:39 +0530
Subject: [PATCH 3/3] scsi: use percpu counters for iorequest_cnt and iodone_cnt

iorequest_cnt and iodone_cnt are updated on every command dispatch and
completion, often from different CPUs on high queue depth workloads.
Using adjacent atomic_t fields caused cache line contention between the
submission and completion paths.

Represent these statistics with struct percpu_counter so increments are
mostly local to each CPU, avoiding false sharing without growing struct
scsi_device further for cache-line padding.

Suggested-by: Bart Van Assche
Signed-off-by: Sumit Saxena
---
 drivers/scsi/scsi_error.c  |  2 +-
 drivers/scsi/scsi_lib.c    |  8 ++++----
 drivers/scsi/scsi_scan.c   |  9 +++++++++
 drivers/scsi/scsi_sysfs.c  | 27 +++++++++++++++++++++++----
 include/scsi/scsi_device.h |  5 +++--
 5 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 147127fb4db9..c7424ce92f3e 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -370,7 +370,7 @@ enum blk_eh_timer_return scsi_timeout(struct request *req)
          */
         if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state))
                 return BLK_EH_DONE;
-        atomic_inc(&scmd->device->iodone_cnt);
+        percpu_counter_inc(&scmd->device->iodone_cnt);
         if (scsi_abort_command(scmd) != SUCCESS) {
                 set_host_byte(scmd, DID_TIME_OUT);
                 scsi_eh_scmd_add(scmd);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 6e8c7a42603e..0b05cb63f630 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1554,7 +1554,7 @@ static void scsi_complete(struct request *rq)

         INIT_LIST_HEAD(&cmd->eh_entry);

-        atomic_inc(&cmd->device->iodone_cnt);
+        percpu_counter_inc(&cmd->device->iodone_cnt);
         if (cmd->result)
                 atomic_inc(&cmd->device->ioerr_cnt);
@@ -1592,7 +1592,7 @@ static enum scsi_qc_status scsi_dispatch_cmd(struct scsi_cmnd *cmd)
         struct Scsi_Host *host = cmd->device->host;
         int rtn = 0;

-        atomic_inc(&cmd->device->iorequest_cnt);
+        percpu_counter_inc(&cmd->device->iorequest_cnt);

         /* check if the device is still usable */
         if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
@@ -1614,7 +1614,7 @@ static enum scsi_qc_status scsi_dispatch_cmd(struct scsi_cmnd *cmd)
                  */
                 SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                         "queuecommand : device blocked\n"));
-                atomic_dec(&cmd->device->iorequest_cnt);
+                percpu_counter_dec(&cmd->device->iorequest_cnt);
                 return SCSI_MLQUEUE_DEVICE_BUSY;
         }
@@ -1647,7 +1647,7 @@ static enum scsi_qc_status scsi_dispatch_cmd(struct scsi_cmnd *cmd)
         trace_scsi_dispatch_cmd_start(cmd);
         rtn = host->hostt->queuecommand(host, cmd);
         if (rtn) {
-                atomic_dec(&cmd->device->iorequest_cnt);
+                percpu_counter_dec(&cmd->device->iorequest_cnt);
                 trace_scsi_dispatch_cmd_error(cmd, rtn);
                 if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
                     rtn != SCSI_MLQUEUE_TARGET_BUSY)
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 9749a8dbe964..0b4fa89149af 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -351,6 +351,15 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,

         scsi_sysfs_device_initialize(sdev);

+        ret = percpu_counter_init(&sdev->iorequest_cnt, 0, GFP_KERNEL);
+        if (ret)
+                goto out_device_destroy;
+        ret = percpu_counter_init(&sdev->iodone_cnt, 0, GFP_KERNEL);
+        if (ret) {
+                percpu_counter_destroy(&sdev->iorequest_cnt);
+                goto out_device_destroy;
+        }
+
         if (scsi_device_is_pseudo_dev(sdev))
                 return sdev;
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index dfc3559e7e04..1f5b2dc156a8 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -516,6 +516,10 @@ static void scsi_device_dev_release(struct device *dev)
         if (vpd_pgb7)
                 kfree_rcu(vpd_pgb7, rcu);
         kfree(sdev->inquiry);
+        if (percpu_counter_initialized(&sdev->iodone_cnt))
+                percpu_counter_destroy(&sdev->iodone_cnt);
+        if (percpu_counter_initialized(&sdev->iorequest_cnt))
+                percpu_counter_destroy(&sdev->iorequest_cnt);
         kfree(sdev);

         if (parent)
@@ -936,11 +940,26 @@ static ssize_t
 show_iostat_counterbits(struct device *dev,
                         struct device_attribute *attr, char *buf)
 {
-        return snprintf(buf, 20, "%d\n", (int)sizeof(atomic_t) * 8);
+        /*
+         * iorequest_cnt and iodone_cnt are per-CPU sums (s64); ioerr_cnt and
+         * iotmo_cnt remain atomic_t. Report the widest counter for tools.
+         */
+        return snprintf(buf, 20, "%zu\n", sizeof(s64) * 8);
 }

 static DEVICE_ATTR(iocounterbits, S_IRUGO, show_iostat_counterbits, NULL);

+#define show_sdev_iostat_percpu(field)                                  \
+static ssize_t                                                          \
+show_iostat_##field(struct device *dev, struct device_attribute *attr, \
+                    char *buf)                                          \
+{                                                                       \
+        struct scsi_device *sdev = to_scsi_device(dev);                 \
+        unsigned long long count = percpu_counter_sum(&sdev->field);    \
+        return snprintf(buf, 20, "0x%llx\n", count);                    \
+}                                                                       \
+static DEVICE_ATTR(field, 0444, show_iostat_##field, NULL)
+
 #define show_sdev_iostat(field)                                         \
 static ssize_t                                                          \
 show_iostat_##field(struct device *dev, struct device_attribute *attr, \
@@ -950,10 +969,10 @@ show_iostat_##field(struct device *dev, struct device_attribute *attr, \
         unsigned long long count = atomic_read(&sdev->field);           \
         return snprintf(buf, 20, "0x%llx\n", count);                    \
 }                                                                       \
-static DEVICE_ATTR(field, S_IRUGO, show_iostat_##field, NULL)
+static DEVICE_ATTR(field, 0444, show_iostat_##field, NULL)

-show_sdev_iostat(iorequest_cnt);
-show_sdev_iostat(iodone_cnt);
+show_sdev_iostat_percpu(iorequest_cnt);
+show_sdev_iostat_percpu(iodone_cnt);
 show_sdev_iostat(ioerr_cnt);
 show_sdev_iostat(iotmo_cnt);
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 9c2a7bbe5891..ad80b500ced9 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include

 struct bsg_device;
@@ -271,8 +272,8 @@ struct scsi_device {
         unsigned int max_device_blocked; /* what device_blocked counts down from */
 #define SCSI_DEFAULT_DEVICE_BLOCKED     3

-        atomic_t iorequest_cnt;
-        atomic_t iodone_cnt;
+        struct percpu_counter iorequest_cnt;
+        struct percpu_counter iodone_cnt;
         atomic_t ioerr_cnt;
         atomic_t iotmo_cnt;
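
[Note: the sketch below is illustrative only and is not part of this series. It shows the
percpu_counter lifecycle the patch applies to iorequest_cnt and iodone_cnt; struct my_stats and
the function names are made up.]

/* Cheap CPU-local increments on the hot path, a summed (and therefore
 * slower but exact) read only on the rarely used sysfs path.
 */
#include <linux/percpu_counter.h>
#include <linux/gfp.h>

struct my_stats {
        struct percpu_counter issued;
};

static int my_stats_init(struct my_stats *s)
{
        /* Allocates the per-CPU portion; can fail, hence the int return. */
        return percpu_counter_init(&s->issued, 0, GFP_KERNEL);
}

static void my_stats_fast_path(struct my_stats *s)
{
        /* CPU-local increment: no shared cache line bouncing on the I/O path. */
        percpu_counter_inc(&s->issued);
}

static s64 my_stats_read_slow(struct my_stats *s)
{
        /* Exact value: folds every CPU's delta into the shared count. */
        return percpu_counter_sum(&s->issued);
}

static void my_stats_destroy(struct my_stats *s)
{
        percpu_counter_destroy(&s->issued);
}

percpu_counter_sum() walks every CPU and is comparatively expensive, which is why the patch keeps
it out of the dispatch/completion paths and only calls it from the sysfs show functions.
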