diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..1a993bcea5c9 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/debugfs.h>
+#include <linux/percpu.h>
 
 #include "blk.h"
 #include "blk-mq.h"
@@ -484,6 +485,54 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+/**
+ * hctx_wait_on_hw_tag_show - display hardware tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of physical hardware driver tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	unsigned long count = 0;
+	int cpu;
+
+	if (hctx->wait_on_hw_tag) {
+		for_each_possible_cpu(cpu)
+			count += *per_cpu_ptr(hctx->wait_on_hw_tag, cpu);
+	}
+	seq_printf(m, "%lu\n", count);
+	return 0;
+}
+
+/**
+ * hctx_wait_on_sched_tag_show - display scheduler tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of software scheduler tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	unsigned long count = 0;
+	int cpu;
+
+	if (hctx->wait_on_sched_tag) {
+		for_each_possible_cpu(cpu)
+			count += *per_cpu_ptr(hctx->wait_on_sched_tag, cpu);
+	}
+	seq_printf(m, "%lu\n", count);
+	return 0;
+}
+
 #define CTX_RQ_SEQ_OPS(name, type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
 	__acquires(&ctx->lock)						\
@@ -599,6 +648,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{"type", 0400, hctx_type_show},
+	{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
+	{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
 	{},
 };
 
@@ -815,3 +866,61 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 	debugfs_remove_recursive(hctx->sched_debugfs_dir);
 	hctx->sched_debugfs_dir = NULL;
 }
+
+/**
+ * blk_mq_debugfs_alloc_hctx_stats - Allocate per-cpu starvation statistics
+ * @hctx: hardware context associated with the tag allocation
+ * @gfp: memory allocation flags
+ *
+ * Allocates the per-cpu memory for tracking hardware and scheduler tag
+ * starvation.
+ */
+void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx, gfp_t gfp)
+{
+	if (!hctx->wait_on_hw_tag)
+		hctx->wait_on_hw_tag = alloc_percpu_gfp(unsigned long,
+							gfp);
+	if (!hctx->wait_on_sched_tag)
+		hctx->wait_on_sched_tag = alloc_percpu_gfp(unsigned long,
+							   gfp);
+}
+
+/**
+ * blk_mq_debugfs_free_hctx_stats - Free per-cpu starvation statistics
+ * @hctx: hardware context associated with the tag allocation
+ *
+ * Frees the per-cpu memory used for tracking hardware and scheduler tag
+ * starvation. This must only be called during hardware queue teardown when
+ * the queue is safely frozen and no active I/O submissions can race to
+ * increment the statistics.
+ */
+void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx)
+{
+	free_percpu(hctx->wait_on_hw_tag);
+	hctx->wait_on_hw_tag = NULL;
+	free_percpu(hctx->wait_on_sched_tag);
+	hctx->wait_on_sched_tag = NULL;
+}
+
+/**
+ * blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
+ * @hctx: hardware context associated with the tag allocation
+ * @is_sched: true if the starved pool is the software scheduler
+ *
+ * Evaluates the exhausted tag pool and safely increments the appropriate
+ * per-cpu debugfs starvation counter.
+ *
+ * Note: The per-cpu pointers are explicitly checked to prevent a NULL
+ * pointer dereference in the event that the system was under heavy memory
+ * pressure and the initial per-cpu allocation failed.
+ */
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched)
+{
+	unsigned long __percpu *tags = is_sched ?
+		READ_ONCE(hctx->wait_on_sched_tag) :
+		READ_ONCE(hctx->wait_on_hw_tag);
+
+	if (likely(tags))
+		this_cpu_inc(*tags);
+}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 49bb1aaa83dc..7a7c0f376a2b 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -17,6 +17,8 @@ struct blk_mq_debugfs_attr {
 	const struct seq_operations *seq_ops;
 };
 
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched);
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
 int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
 
@@ -26,6 +28,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 				  struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_register_hctxs(struct request_queue *q);
 void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx,
+				     gfp_t gfp);
+void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_register_sched(struct request_queue *q);
 void blk_mq_debugfs_unregister_sched(struct request_queue *q);
@@ -35,6 +40,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
 
 #else
+static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+						bool is_sched)
+{
+}
+
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
 }
@@ -56,6 +66,15 @@ static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
 {
 }
 
+static inline void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx,
+						   gfp_t gfp)
+{
+}
+
+static inline void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx)
+{
+}
+
 static inline void blk_mq_debugfs_register_sched(struct request_queue *q)
 {
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..3cc6a97a87a0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,9 +13,11 @@
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
 
 /*
  * Recalculate wakeup batch when tag is shared by hctx.
  */
@@ -187,6 +189,12 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS);
+
+		blk_mq_debugfs_inc_wait_tags(data->hctx,
+					     data->rq_flags & RQF_SCHED_TAGS);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c5c16cce4f8..cd52bf6f82ce 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3991,6 +3991,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 				blk_free_flush_queue_callback);
 	hctx->fq = NULL;
 
+	blk_mq_debugfs_free_hctx_stats(hctx);
+
 	spin_lock(&q->unused_hctx_lock);
 	list_add(&hctx->hctx_list, &q->unused_hctx_list);
 	spin_unlock(&q->unused_hctx_lock);
@@ -4016,6 +4018,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
+	blk_mq_debugfs_alloc_hctx_stats(hctx, gfp);
+
 	hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
 	if (!hctx->fq)
 		goto fail;
@@ -4041,6 +4045,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	blk_free_flush_queue(hctx->fq);
 	hctx->fq = NULL;
 fail:
+	blk_mq_debugfs_free_hctx_stats(hctx);
 	return -1;
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..41d61488d683 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -453,6 +453,18 @@ struct blk_mq_hw_ctx {
 	struct dentry *debugfs_dir;
 	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
 	struct dentry *sched_debugfs_dir;
+	/**
+	 * @wait_on_hw_tag: Cumulative per-cpu counter incremented each
+	 * time a submitting context is forced to block due to physical
+	 * hardware tag exhaustion.
+	 */
+	unsigned long __percpu *wait_on_hw_tag;
+	/**
+	 * @wait_on_sched_tag: Cumulative per-cpu counter incremented each
+	 * time a submitting context is forced to block due to software
+	 * scheduler tag exhaustion.
+	 */
+	unsigned long __percpu *wait_on_sched_tag;
 #endif
 
 	/**
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..7c1026d1cb35 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,49 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver tags
+ * or software scheduler tags). This trace point indicates that the context
+ * will be placed into an uninterruptible state via io_schedule() until an
+ * active request completes and relinquishes its assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
+
+	TP_ARGS(q, hctx, is_sched_tag),
+
+	TP_STRUCT__entry(
+		__field( dev_t, dev )
+		__field( u32, hctx_id )
+		__field( u32, nr_tags )
+		__field( bool, is_sched_tag )
+	),
+
+	TP_fast_assign(
+		__entry->dev = q->disk ? disk_devt(q->disk) : 0;
+		__entry->hctx_id = hctx->queue_num;
+		__entry->is_sched_tag = is_sched_tag;
+
+		if (is_sched_tag)
+			__entry->nr_tags = hctx->sched_tags->nr_tags;
+		else
+			__entry->nr_tags = hctx->tags->nr_tags;
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq:		block IO operation request