From d01fd805fe943f9ca58cfa543c7985bade2f462a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 30 Jul 2025 13:16:07 +0530 Subject: [PATCH 1/3] block: move elevator queue allocation logic into blk_mq_init_sched In preparation for allocating sched_tags before freezing the request queue and acquiring ->elevator_lock, move the elevator queue allocation logic from the elevator ops ->init_sched callback into blk_mq_init_sched. As elevator_alloc is now only invoked from block layer core, we don't need to export it, so unexport elevator_alloc function. This refactoring provides a centralized location for elevator queue initialization, which makes it easier to store pre-allocated sched_tags in the struct elevator_queue during later changes. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff --- block/bfq-iosched.c | 13 +++---------- block/blk-mq-sched.c | 11 ++++++++--- block/elevator.c | 1 - block/elevator.h | 2 +- block/kyber-iosched.c | 11 ++--------- block/mq-deadline.c | 14 ++------------ 6 files changed, 16 insertions(+), 36 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0cb1e9873aabb..fd26dc1901b08 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7232,22 +7232,16 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->sched_data.bfq_class_idle_last_service = jiffies; } -static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct bfq_data *bfqd; - struct elevator_queue *eq; unsigned int i; struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { - kobject_put(&eq->kobj); + if (!bfqd) return -ENOMEM; - } + eq->elevator_data = bfqd; spin_lock_irq(&q->queue_lock); @@ -7405,7 +7399,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) out_free: kfree(bfqd); - kobject_put(&eq->kobj); return -ENOMEM; } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55a0fd1051479..359e0704e09be 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -475,10 +475,14 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_DEFAULT_RQ); + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + if (blk_mq_is_shared_tags(flags)) { ret = blk_mq_init_sched_shared_tags(q); if (ret) - return ret; + goto err_put_elevator; } queue_for_each_hw_ctx(q, hctx, i) { @@ -487,7 +491,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) goto err_free_map_and_rqs; } - ret = e->ops.init_sched(q, e); + ret = e->ops.init_sched(q, eq); if (ret) goto err_free_map_and_rqs; @@ -508,7 +512,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) err_free_map_and_rqs: blk_mq_sched_free_rqs(q); blk_mq_sched_tags_teardown(q, flags); - +err_put_elevator: + kobject_put(&eq->kobj); q->elevator = NULL; return ret; } diff --git a/block/elevator.c b/block/elevator.c index 88f8f36bed981..939b0c590fbe9 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -148,7 +148,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, return eq; } -EXPORT_SYMBOL(elevator_alloc); static void elevator_release(struct kobject *kobj) { diff --git a/block/elevator.h b/block/elevator.h index a07ce773a38f7..a4de5f9ad7907 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -24,7 +24,7 @@ struct blk_mq_alloc_data; struct blk_mq_hw_ctx; struct elevator_mq_ops { - int (*init_sched)(struct request_queue *, struct elevator_type *); + int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4dba8405bd015..7b6832cb3a8d5 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -402,20 +402,13 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) return ERR_PTR(ret); } -static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) +static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct kyber_queue_data *kqd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; kqd = kyber_queue_data_alloc(q); - if (IS_ERR(kqd)) { - kobject_put(&eq->kobj); + if (IS_ERR(kqd)) return PTR_ERR(kqd); - } blk_stat_enable_accounting(q); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 2edf1cac06d5b..7b6caf30e00ae 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -568,20 +568,14 @@ static void dd_exit_sched(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). */ -static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct deadline_data *dd; - struct elevator_queue *eq; enum dd_prio prio; - int ret = -ENOMEM; - - eq = elevator_alloc(q, e); - if (!eq) - return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) - goto put_eq; + return -ENOMEM; eq->elevator_data = dd; @@ -608,10 +602,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) q->elevator = eq; return 0; - -put_eq: - kobject_put(&eq->kobj); - return ret; } /* From 6bcd451cf146a684b1f96ca97955938fdf3126ee Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 30 Jul 2025 13:16:08 +0530 Subject: [PATCH 2/3] block: fix lockdep warning caused by lock dependency in elv_iosched_store Recent lockdep reports [1] have revealed a potential deadlock caused by a lock dependency between the percpu allocator lock and the elevator lock. This issue can be avoided by ensuring that the allocation and release of scheduler tags (sched_tags) are performed outside the elevator lock. Furthermore, the queue does not need to be remain frozen during these operations. To address this, move all sched_tags allocations and deallocations outside of both the ->elevator_lock and the ->freeze_lock. Since the lifetime of the elevator queue and its associated sched_tags is closely tied, the allocated sched_tags are now stored in the elevator queue structure. Then, during the actual elevator switch (which runs under ->freeze_lock and ->elevator_lock), the pre-allocated sched_tags are assigned to the appropriate q->hctx. Once the elevator switch is complete and the locks are released, the old elevator queue and its associated sched_tags are freed. This commit specifically addresses the allocation/deallocation of sched_ tags during elevator switching. Note that sched_tags may also be allocated in other contexts, such as during nr_hw_queues updates. Supporting that use case will require batch allocation/deallocation, which will be handled in a follow-up patch. This restructuring ensures that sched_tags memory management occurs entirely outside of the ->elevator_lock and ->freeze_lock context, eliminating the lock dependency problem seen during scheduler updates. [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ Reported-by: Stefan Haberland Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff --- block/blk-mq-sched.c | 155 +++++++++++++++++++++++-------------------- block/blk-mq-sched.h | 8 ++- block/elevator.c | 40 +++++++++-- block/elevator.h | 14 +++- 4 files changed, 136 insertions(+), 81 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 359e0704e09be..2d6d1ebdd8fbe 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - if (blk_mq_is_shared_tags(q->tag_set->flags)) { - hctx->sched_tags = q->sched_shared_tags; - return 0; - } - - hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, - q->nr_requests); - - if (!hctx->sched_tags) - return -ENOMEM; - return 0; -} - -static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) -{ - blk_mq_free_rq_map(queue->sched_shared_tags); - queue->sched_shared_tags = NULL; -} - /* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; unsigned long i; - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) { - if (!blk_mq_is_shared_tags(flags)) - blk_mq_free_rq_map(hctx->sched_tags); - hctx->sched_tags = NULL; - } - } + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = NULL; if (blk_mq_is_shared_tags(flags)) - blk_mq_exit_sched_shared_tags(q); -} - -static int blk_mq_init_sched_shared_tags(struct request_queue *queue) -{ - struct blk_mq_tag_set *set = queue->tag_set; - - /* - * Set initial depth at max so that we don't need to reallocate for - * updating nr_requests. - */ - queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, - BLK_MQ_NO_HCTX_IDX, - MAX_SCHED_RQ); - if (!queue->sched_shared_tags) - return -ENOMEM; - - blk_mq_tag_update_sched_shared_tags(queue); - - return 0; + q->sched_shared_tags = NULL; } void blk_mq_sched_reg_debugfs(struct request_queue *q) @@ -458,8 +411,75 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q) mutex_unlock(&q->debugfs_mutex); } +void blk_mq_free_sched_tags(struct elevator_tags *et, + struct blk_mq_tag_set *set) +{ + unsigned long i; + + /* Shared tags are stored at index 0 in @tags. */ + if (blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX); + else { + for (i = 0; i < et->nr_hw_queues; i++) + blk_mq_free_map_and_rqs(set, et->tags[i], i); + } + + kfree(et); +} + +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, + unsigned int nr_hw_queues) +{ + unsigned int nr_tags; + int i; + struct elevator_tags *et; + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + + if (blk_mq_is_shared_tags(set->flags)) + nr_tags = 1; + else + nr_tags = nr_hw_queues; + + et = kmalloc(sizeof(struct elevator_tags) + + nr_tags * sizeof(struct blk_mq_tags *), gfp); + if (!et) + return NULL; + /* + * Default to double of smaller one between hw queue_depth and + * 128, since we don't split into sync/async like the old code + * did. Additionally, this is a per-hw queue depth. + */ + et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, + BLKDEV_DEFAULT_RQ); + et->nr_hw_queues = nr_hw_queues; + + if (blk_mq_is_shared_tags(set->flags)) { + /* Shared tags are stored at index 0 in @tags. */ + et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!et->tags[0]) + goto out; + } else { + for (i = 0; i < et->nr_hw_queues; i++) { + et->tags[i] = blk_mq_alloc_map_and_rqs(set, i, + et->nr_requests); + if (!et->tags[i]) + goto out_unwind; + } + } + + return et; +out_unwind: + while (--i >= 0) + blk_mq_free_map_and_rqs(set, et->tags[i], i); +out: + kfree(et); + return NULL; +} + /* caller must have a reference to @e, will grab another one if successful */ -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *et) { unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; @@ -467,40 +487,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) unsigned long i; int ret; - /* - * Default to double of smaller one between hw queue_depth and 128, - * since we don't split into sync/async like the old code did. - * Additionally, this is a per-hw queue depth. - */ - q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_DEFAULT_RQ); - - eq = elevator_alloc(q, e); + eq = elevator_alloc(q, e, et); if (!eq) return -ENOMEM; + q->nr_requests = et->nr_requests; + if (blk_mq_is_shared_tags(flags)) { - ret = blk_mq_init_sched_shared_tags(q); - if (ret) - goto err_put_elevator; + /* Shared tags are stored at index 0 in @et->tags. */ + q->sched_shared_tags = et->tags[0]; + blk_mq_tag_update_sched_shared_tags(q); } queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); - if (ret) - goto err_free_map_and_rqs; + if (blk_mq_is_shared_tags(flags)) + hctx->sched_tags = q->sched_shared_tags; + else + hctx->sched_tags = et->tags[i]; } ret = e->ops.init_sched(q, eq); if (ret) - goto err_free_map_and_rqs; + goto out; queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { - eq = q->elevator; - blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -509,10 +522,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) } return 0; -err_free_map_and_rqs: - blk_mq_sched_free_rqs(q); +out: blk_mq_sched_tags_teardown(q, flags); -err_put_elevator: kobject_put(&eq->kobj); q->elevator = NULL; return ret; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 1326526bb7338..0cde00cd1c471 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,10 +18,16 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *et); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, + unsigned int nr_hw_queues); +void blk_mq_free_sched_tags(struct elevator_tags *et, + struct blk_mq_tag_set *set); + static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) diff --git a/block/elevator.c b/block/elevator.c index 939b0c590fbe9..e9dc837b7b704 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -54,6 +54,8 @@ struct elv_change_ctx { struct elevator_queue *old; /* for registering new elevator */ struct elevator_queue *new; + /* holds sched tags data */ + struct elevator_tags *et; }; static DEFINE_SPINLOCK(elv_list_lock); @@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name) static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, - struct elevator_type *e) + struct elevator_type *e, struct elevator_tags *et) { struct elevator_queue *eq; @@ -145,6 +147,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->et = et; return eq; } @@ -165,7 +168,6 @@ static void elevator_exit(struct request_queue *q) lockdep_assert_held(&q->elevator_lock); ioc_clear_queue(q); - blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); @@ -591,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) } if (new_e) { - ret = blk_mq_init_sched(q, new_e); + ret = blk_mq_init_sched(q, new_e, ctx->et); if (ret) goto out_unfreeze; ctx->new = q->elevator; @@ -626,8 +628,10 @@ static void elv_exit_and_release(struct request_queue *q) elevator_exit(q); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - if (e) + if (e) { + blk_mq_free_sched_tags(e->et, q->tag_set); kobject_put(&e->kobj); + } } static int elevator_change_done(struct request_queue *q, @@ -640,6 +644,7 @@ static int elevator_change_done(struct request_queue *q, &ctx->old->flags); elv_unregister_queue(q, ctx->old); + blk_mq_free_sched_tags(ctx->old->et, q->tag_set); kobject_put(&ctx->old->kobj); if (enable_wbt) wbt_enable_default(q->disk); @@ -658,9 +663,16 @@ static int elevator_change_done(struct request_queue *q, static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) { unsigned int memflags; + struct blk_mq_tag_set *set = q->tag_set; int ret = 0; - lockdep_assert_held(&q->tag_set->update_nr_hwq_lock); + lockdep_assert_held(&set->update_nr_hwq_lock); + + if (strncmp(ctx->name, "none", 4)) { + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + if (!ctx->et) + return -ENOMEM; + } memflags = blk_mq_freeze_queue(q); /* @@ -680,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) blk_mq_unfreeze_queue(q, memflags); if (!ret) ret = elevator_change_done(q, ctx); + /* + * Free sched tags if it's allocated but we couldn't switch elevator. + */ + if (ctx->et && !ctx->new) + blk_mq_free_sched_tags(ctx->et, set); return ret; } @@ -690,6 +707,7 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) */ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) { + struct blk_mq_tag_set *set = q->tag_set; struct elv_change_ctx ctx = {}; int ret = -ENODEV; @@ -697,15 +715,25 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { ctx.name = e->elevator_name; - + ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + if (!ctx.et) { + WARN_ON_ONCE(1); + goto unfreeze; + } mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ ret = elevator_switch(q, &ctx); mutex_unlock(&q->elevator_lock); } +unfreeze: blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) WARN_ON_ONCE(elevator_change_done(q, &ctx)); + /* + * Free sched tags if it's allocated but we couldn't switch elevator. + */ + if (ctx.et && !ctx.new) + blk_mq_free_sched_tags(ctx.et, set); } /* diff --git a/block/elevator.h b/block/elevator.h index a4de5f9ad7907..adc5c157e17e5 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -23,6 +23,15 @@ enum elv_merge { struct blk_mq_alloc_data; struct blk_mq_hw_ctx; +struct elevator_tags { + /* num. of hardware queues for which tags are allocated */ + unsigned int nr_hw_queues; + /* depth used while allocating tags */ + unsigned int nr_requests; + /* shared tag is stored at index 0 */ + struct blk_mq_tags *tags[]; +}; + struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); @@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); struct elevator_queue { struct elevator_type *type; + struct elevator_tags *et; void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; @@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); -extern struct elevator_queue *elevator_alloc(struct request_queue *, - struct elevator_type *); +struct elevator_queue *elevator_alloc(struct request_queue *, + struct elevator_type *, struct elevator_tags *); /* * Helper functions. From 7fec760536b87fdc7a9f45f86e3ff9b73577d44b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 30 Jul 2025 13:16:09 +0530 Subject: [PATCH 3/3] block: fix potential deadlock while running nr_hw_queue update Move scheduler tags (sched_tags) allocation and deallocation outside both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues. This change breaks the dependency chain from the percpu allocator lock to the elevator lock, helping to prevent potential deadlocks, as observed in the reported lockdep splat[1]. This commit introduces batch allocation and deallocation helpers for sched_tags, which are now used from within __blk_mq_update_nr_hw_queues routine while iterating through the tagset. With this change, all sched_tags memory management is handled entirely outside the ->elevator_lock and the ->freeze_lock context, thereby eliminating the lock dependency that could otherwise manifest during nr_hw_queues updates. [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ Reported-by: Stefan Haberland Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff --- block/blk-mq-sched.c | 65 ++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sched.h | 4 +++ block/blk-mq.c | 16 +++++++---- block/blk.h | 4 ++- block/elevator.c | 15 ++++------ 5 files changed, 89 insertions(+), 15 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2d6d1ebdd8fbe..e2ce4a28e6c9e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -427,6 +427,32 @@ void blk_mq_free_sched_tags(struct elevator_tags *et, kfree(et); } +void blk_mq_free_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set) +{ + struct request_queue *q; + struct elevator_tags *et; + + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* + * Accessing q->elevator without holding q->elevator_lock is + * safe because we're holding here set->update_nr_hwq_lock in + * the writer context. So, scheduler update/switch code (which + * acquires the same lock but in the reader context) can't run + * concurrently. + */ + if (q->elevator) { + et = xa_load(et_table, q->id); + if (unlikely(!et)) + WARN_ON_ONCE(1); + else + blk_mq_free_sched_tags(et, set); + } + } +} + struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues) { @@ -477,6 +503,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, return NULL; } +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set, unsigned int nr_hw_queues) +{ + struct request_queue *q; + struct elevator_tags *et; + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* + * Accessing q->elevator without holding q->elevator_lock is + * safe because we're holding here set->update_nr_hwq_lock in + * the writer context. So, scheduler update/switch code (which + * acquires the same lock but in the reader context) can't run + * concurrently. + */ + if (q->elevator) { + et = blk_mq_alloc_sched_tags(set, nr_hw_queues); + if (!et) + goto out_unwind; + if (xa_insert(et_table, q->id, et, gfp)) + goto out_free_tags; + } + } + return 0; +out_free_tags: + blk_mq_free_sched_tags(et, set); +out_unwind: + list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { + if (q->elevator) { + et = xa_load(et_table, q->id); + if (et) + blk_mq_free_sched_tags(et, set); + } + } + return -ENOMEM; +} + /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, struct elevator_tags *et) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0cde00cd1c471..b554e1d559508 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues); +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set, unsigned int nr_hw_queues); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); +void blk_mq_free_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set); static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 9692fa4c3ef29..b67d6c02ecebd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4974,12 +4974,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, - struct xarray *elv_tbl) + struct xarray *elv_tbl, struct xarray *et_tbl) { struct elevator_type *e = xa_load(elv_tbl, q->id); + struct elevator_tags *t = xa_load(et_tbl, q->id); /* The elv_update_nr_hw_queues unfreezes the queue. */ - elv_update_nr_hw_queues(q, e); + elv_update_nr_hw_queues(q, e, t); /* Drop the reference acquired in blk_mq_elv_switch_none. */ if (e) @@ -5031,7 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; - struct xarray elv_tbl; + struct xarray elv_tbl, et_tbl; lockdep_assert_held(&set->tag_list_lock); @@ -5044,6 +5045,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, memflags = memalloc_noio_save(); + xa_init(&et_tbl); + if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) + goto out_memalloc_restore; + xa_init(&elv_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -5087,7 +5092,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_elv_switch_back(q, &elv_tbl); + blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); @@ -5098,7 +5103,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, } xa_destroy(&elv_tbl); - + xa_destroy(&et_tbl); +out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ diff --git a/block/blk.h b/block/blk.h index 76901a39997f8..0a2eccf28ca41 100644 --- a/block/blk.h +++ b/block/blk.h @@ -12,6 +12,7 @@ #include "blk-crypto-internal.h" struct elevator_type; +struct elevator_tags; /* * Default upper limit for the software max_sectors limit used for regular I/Os. @@ -330,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e); +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *t); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index e9dc837b7b704..fe96c6f4753ca 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -705,7 +705,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) * The I/O scheduler depends on the number of hardware queues, this forces a * reattachment when nr_hw_queues changes. */ -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *t) { struct blk_mq_tag_set *set = q->tag_set; struct elv_change_ctx ctx = {}; @@ -715,25 +716,21 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { ctx.name = e->elevator_name; - ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); - if (!ctx.et) { - WARN_ON_ONCE(1); - goto unfreeze; - } + ctx.et = t; + mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ ret = elevator_switch(q, &ctx); mutex_unlock(&q->elevator_lock); } -unfreeze: blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) WARN_ON_ONCE(elevator_change_done(q, &ctx)); /* * Free sched tags if it's allocated but we couldn't switch elevator. */ - if (ctx.et && !ctx.new) - blk_mq_free_sched_tags(ctx.et, set); + if (t && !ctx.new) + blk_mq_free_sched_tags(t, set); } /*