Skip to content

Commit 50dc16e

Browse files
aarontomlin authored and
kawasaki committed
blk-mq: expose tag starvation counts via debugfs
In high-performance storage environments, particularly when utilising RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe latency spikes can occur when fast devices are starved of available tags. This patch introduces two new debugfs attributes for each block hardware queue: - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag These files expose atomic counters that increment each time a submitting context is forced into an uninterruptible sleep via io_schedule() due to the complete exhaustion of physical driver tags or software scheduler tags, respectively. To ensure negligible performance overhead even in production environments where CONFIG_BLK_DEBUG_FS is actively enabled, this tracking logic utilises dynamically allocated per-CPU counters. When this configuration is disabled, the tracking logic compiles down to a safe no-op. Signed-off-by: Aaron Tomlin <[email protected]>
1 parent 6ae38d8 commit 50dc16e

5 files changed

Lines changed: 149 additions & 0 deletions

File tree

block/blk-mq-debugfs.c

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/blkdev.h>
88
#include <linux/build_bug.h>
99
#include <linux/debugfs.h>
10+
#include <linux/percpu.h>
1011

1112
#include "blk.h"
1213
#include "blk-mq.h"
@@ -484,6 +485,54 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
484485
return 0;
485486
}
486487

488+
/**
489+
* hctx_wait_on_hw_tag_show - display hardware tag starvation count
490+
* @data: generic pointer to the associated hardware context (hctx)
491+
* @m: seq_file pointer for debugfs output formatting
492+
*
493+
* Prints the cumulative number of times a submitting context was forced
494+
* to block due to the exhaustion of physical hardware driver tags.
495+
*
496+
* Return: 0 on success.
497+
*/
498+
static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
499+
{
500+
struct blk_mq_hw_ctx *hctx = data;
501+
unsigned long count = 0;
502+
int cpu;
503+
504+
if (hctx->wait_on_hw_tag) {
505+
for_each_possible_cpu(cpu)
506+
count += *per_cpu_ptr(hctx->wait_on_hw_tag, cpu);
507+
}
508+
seq_printf(m, "%lu\n", count);
509+
return 0;
510+
}
511+
512+
/**
513+
* hctx_wait_on_sched_tag_show - display scheduler tag starvation count
514+
* @data: generic pointer to the associated hardware context (hctx)
515+
* @m: seq_file pointer for debugfs output formatting
516+
*
517+
* Prints the cumulative number of times a submitting context was forced
518+
* to block due to the exhaustion of software scheduler tags.
519+
*
520+
* Return: 0 on success.
521+
*/
522+
static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
523+
{
524+
struct blk_mq_hw_ctx *hctx = data;
525+
unsigned long count = 0;
526+
int cpu;
527+
528+
if (hctx->wait_on_sched_tag) {
529+
for_each_possible_cpu(cpu)
530+
count += *per_cpu_ptr(hctx->wait_on_sched_tag, cpu);
531+
}
532+
seq_printf(m, "%lu\n", count);
533+
return 0;
534+
}
535+
487536
#define CTX_RQ_SEQ_OPS(name, type) \
488537
static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
489538
__acquires(&ctx->lock) \
@@ -599,6 +648,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
599648
{"active", 0400, hctx_active_show},
600649
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
601650
{"type", 0400, hctx_type_show},
651+
{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
652+
{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
602653
{},
603654
};
604655

@@ -815,3 +866,61 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
815866
debugfs_remove_recursive(hctx->sched_debugfs_dir);
816867
hctx->sched_debugfs_dir = NULL;
817868
}
869+
870+
/**
871+
* blk_mq_debugfs_alloc_hctx_stats - Allocate per-cpu starvation statistics
872+
* @hctx: hardware context associated with the tag allocation
873+
* @gfp: memory allocation flags
874+
*
875+
* Allocates the per-cpu memory for tracking hardware and scheduler tag
876+
* starvation.
877+
*/
878+
void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx, gfp_t gfp)
879+
{
880+
if (!hctx->wait_on_hw_tag)
881+
hctx->wait_on_hw_tag = alloc_percpu_gfp(unsigned long,
882+
gfp);
883+
if (!hctx->wait_on_sched_tag)
884+
hctx->wait_on_sched_tag = alloc_percpu_gfp(unsigned long,
885+
gfp);
886+
}
887+
888+
/**
889+
* blk_mq_debugfs_free_hctx_stats - Free per-cpu starvation statistics
890+
* @hctx: hardware context associated with the tag allocation
891+
*
892+
* Frees the per-cpu memory used for tracking hardware and scheduler tag
893+
* starvation. This must only be called during hardware queue teardown when
894+
* the queue is safely frozen and no active I/O submissions can race to
895+
* increment the statistics.
896+
*/
897+
void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx)
898+
{
899+
free_percpu(hctx->wait_on_hw_tag);
900+
hctx->wait_on_hw_tag = NULL;
901+
free_percpu(hctx->wait_on_sched_tag);
902+
hctx->wait_on_sched_tag = NULL;
903+
}
904+
905+
/**
906+
* blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
907+
* @hctx: hardware context associated with the tag allocation
908+
* @is_sched: true if the starved pool is the software scheduler
909+
*
910+
* Evaluates the exhausted tag pool and safely increments the appropriate
911+
* per-cpu debugfs starvation counter.
912+
*
913+
* Note: The per-cpu pointers are explicitly checked to prevent a NULL
914+
* pointer dereference in the event that the system was under heavy memory
915+
* pressure and the initial per-cpu allocation failed.
916+
*/
917+
void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
918+
bool is_sched)
919+
{
920+
unsigned long __percpu *tags = is_sched ?
921+
READ_ONCE(hctx->wait_on_sched_tag) :
922+
READ_ONCE(hctx->wait_on_hw_tag);
923+
924+
if (likely(tags))
925+
this_cpu_inc(*tags);
926+
}

block/blk-mq-debugfs.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ struct blk_mq_debugfs_attr {
1717
const struct seq_operations *seq_ops;
1818
};
1919

20+
void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
21+
bool is_sched);
2022
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
2123
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
2224

@@ -26,6 +28,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
2628
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
2729
void blk_mq_debugfs_register_hctxs(struct request_queue *q);
2830
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
31+
void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx,
32+
gfp_t gfp);
33+
void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx);
2934

3035
void blk_mq_debugfs_register_sched(struct request_queue *q);
3136
void blk_mq_debugfs_unregister_sched(struct request_queue *q);
@@ -35,6 +40,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
3540

3641
void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
3742
#else
43+
static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
44+
bool is_sched)
45+
{
46+
}
47+
3848
static inline void blk_mq_debugfs_register(struct request_queue *q)
3949
{
4050
}
@@ -56,6 +66,15 @@ static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
5666
{
5767
}
5868

69+
static inline void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx,
70+
gfp_t gfp)
71+
{
72+
}
73+
74+
static inline void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx)
75+
{
76+
}
77+
5978
static inline void blk_mq_debugfs_register_sched(struct request_queue *q)
6079
{
6180
}

block/blk-mq-tag.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "blk.h"
1818
#include "blk-mq.h"
1919
#include "blk-mq-sched.h"
20+
#include "blk-mq-debugfs.h"
2021

2122
/*
2223
* Recalculate wakeup batch when tag is shared by hctx.
@@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
191192
trace_block_rq_tag_wait(data->q, data->hctx,
192193
data->rq_flags & RQF_SCHED_TAGS);
193194

195+
blk_mq_debugfs_inc_wait_tags(data->hctx,
196+
data->rq_flags & RQF_SCHED_TAGS);
197+
194198
bt_prev = bt;
195199
io_schedule();
196200

block/blk-mq.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3991,6 +3991,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
39913991
blk_free_flush_queue_callback);
39923992
hctx->fq = NULL;
39933993

3994+
blk_mq_debugfs_free_hctx_stats(hctx);
3995+
39943996
spin_lock(&q->unused_hctx_lock);
39953997
list_add(&hctx->hctx_list, &q->unused_hctx_list);
39963998
spin_unlock(&q->unused_hctx_lock);
@@ -4016,6 +4018,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
40164018
{
40174019
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
40184020

4021+
blk_mq_debugfs_alloc_hctx_stats(hctx, gfp);
4022+
40194023
hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
40204024
if (!hctx->fq)
40214025
goto fail;
@@ -4041,6 +4045,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
40414045
blk_free_flush_queue(hctx->fq);
40424046
hctx->fq = NULL;
40434047
fail:
4048+
blk_mq_debugfs_free_hctx_stats(hctx);
40444049
return -1;
40454050
}
40464051

include/linux/blk-mq.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,18 @@ struct blk_mq_hw_ctx {
453453
struct dentry *debugfs_dir;
454454
/** @sched_debugfs_dir: debugfs directory for the scheduler. */
455455
struct dentry *sched_debugfs_dir;
456+
/**
457+
* @wait_on_hw_tag: Cumulative per-cpu counter incremented each
458+
* time a submitting context is forced to block due to physical
459+
* hardware tag exhaustion.
460+
*/
461+
unsigned long __percpu *wait_on_hw_tag;
462+
/**
463+
* @wait_on_sched_tag: Cumulative per-cpu counter incremented each
464+
* time a submitting context is forced to block due to software
465+
* scheduler tag exhaustion.
466+
*/
467+
unsigned long __percpu *wait_on_sched_tag;
456468
#endif
457469

458470
/**

0 commit comments

Comments
 (0)