Skip to content

Commit 5e1c43e

Browse files
aarontomlin authored and kawasaki committed
blk-mq: expose tag starvation counts via debugfs
In high-performance storage environments, particularly when utilising RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe latency spikes can occur when fast devices are starved of available tags.

This patch introduces two new debugfs attributes for each block hardware queue:

- /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
- /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag

These files expose cumulative counters that increment each time a submitting context is forced into an uninterruptible sleep via io_schedule() due to the complete exhaustion of physical driver tags or software scheduler tags, respectively.

To ensure negligible performance overhead even in production environments where CONFIG_BLK_DEBUG_FS is actively enabled, this tracking logic utilises dynamically allocated per-CPU counters. When this configuration is disabled, the tracking logic compiles down to a safe no-op.

Signed-off-by: Aaron Tomlin <[email protected]>
1 parent 52a6edc commit 5e1c43e

4 files changed

Lines changed: 107 additions & 0 deletions

File tree

block/blk-mq-debugfs.c

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/blkdev.h>
88
#include <linux/build_bug.h>
99
#include <linux/debugfs.h>
10+
#include <linux/percpu.h>
1011

1112
#include "blk.h"
1213
#include "blk-mq.h"
@@ -484,6 +485,54 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
484485
return 0;
485486
}
486487

488+
/**
489+
* hctx_wait_on_hw_tag_show - display hardware tag starvation count
490+
* @data: generic pointer to the associated hardware context (hctx)
491+
* @m: seq_file pointer for debugfs output formatting
492+
*
493+
* Prints the cumulative number of times a submitting context was forced
494+
* to block due to the exhaustion of physical hardware driver tags.
495+
*
496+
* Return: 0 on success.
497+
*/
498+
static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
499+
{
500+
struct blk_mq_hw_ctx *hctx = data;
501+
unsigned long count = 0;
502+
int cpu;
503+
504+
if (hctx->wait_on_hw_tag) {
505+
for_each_possible_cpu(cpu)
506+
count += *per_cpu_ptr(hctx->wait_on_hw_tag, cpu);
507+
}
508+
seq_printf(m, "%lu\n", count);
509+
return 0;
510+
}
511+
512+
/**
513+
* hctx_wait_on_sched_tag_show - display scheduler tag starvation count
514+
* @data: generic pointer to the associated hardware context (hctx)
515+
* @m: seq_file pointer for debugfs output formatting
516+
*
517+
* Prints the cumulative number of times a submitting context was forced
518+
* to block due to the exhaustion of software scheduler tags.
519+
*
520+
* Return: 0 on success.
521+
*/
522+
static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
523+
{
524+
struct blk_mq_hw_ctx *hctx = data;
525+
unsigned long count = 0;
526+
int cpu;
527+
528+
if (hctx->wait_on_sched_tag) {
529+
for_each_possible_cpu(cpu)
530+
count += *per_cpu_ptr(hctx->wait_on_sched_tag, cpu);
531+
}
532+
seq_printf(m, "%lu\n", count);
533+
return 0;
534+
}
535+
487536
#define CTX_RQ_SEQ_OPS(name, type) \
488537
static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
489538
__acquires(&ctx->lock) \
@@ -599,6 +648,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
599648
{"active", 0400, hctx_active_show},
600649
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
601650
{"type", 0400, hctx_type_show},
651+
{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
652+
{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
602653
{},
603654
};
604655

@@ -670,6 +721,11 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
670721
snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
671722
hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
672723

724+
if (!hctx->wait_on_hw_tag)
725+
hctx->wait_on_hw_tag = alloc_percpu(unsigned long);
726+
if (!hctx->wait_on_sched_tag)
727+
hctx->wait_on_sched_tag = alloc_percpu(unsigned long);
728+
673729
debugfs_create_files(q, hctx->debugfs_dir, hctx,
674730
blk_mq_debugfs_hctx_attrs);
675731

@@ -684,6 +740,11 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
684740
debugfs_remove_recursive(hctx->debugfs_dir);
685741
hctx->sched_debugfs_dir = NULL;
686742
hctx->debugfs_dir = NULL;
743+
744+
free_percpu(hctx->wait_on_hw_tag);
745+
hctx->wait_on_hw_tag = NULL;
746+
free_percpu(hctx->wait_on_sched_tag);
747+
hctx->wait_on_sched_tag = NULL;
687748
}
688749

689750
void blk_mq_debugfs_register_hctxs(struct request_queue *q)
@@ -815,3 +876,26 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
815876
debugfs_remove_recursive(hctx->sched_debugfs_dir);
816877
hctx->sched_debugfs_dir = NULL;
817878
}
879+
880+
/**
881+
* blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
882+
* @hctx: hardware context associated with the tag allocation
883+
* @is_sched: true if the starved pool is the software scheduler
884+
*
885+
* Evaluates the exhausted tag pool and safely increments the appropriate
886+
* per-cpu debugfs starvation counter.
887+
*
888+
* Note: A race window exists during rapid device probe or CPU hotplug
889+
* where I/O might be submitted before blk_mq_debugfs_register_hctx() has
890+
* completed allocating the per-CPU counters. Therefore, the pointer is
891+
* explicitly checked to prevent a NULL pointer dereference.
892+
*/
893+
void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
894+
bool is_sched)
895+
{
896+
unsigned long __percpu *tags = is_sched ? hctx->wait_on_sched_tag :
897+
hctx->wait_on_hw_tag;
898+
899+
if (likely(tags))
900+
this_cpu_inc(*tags);
901+
}

block/blk-mq-debugfs.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ struct blk_mq_debugfs_attr {
1717
const struct seq_operations *seq_ops;
1818
};
1919

20+
void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
21+
bool is_sched);
2022
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
2123
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
2224

@@ -35,6 +37,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
3537

3638
void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
3739
#else
40+
static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
41+
bool is_sched)
42+
{
43+
}
44+
3845
static inline void blk_mq_debugfs_register(struct request_queue *q)
3946
{
4047
}

block/blk-mq-tag.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "blk.h"
1818
#include "blk-mq.h"
1919
#include "blk-mq-sched.h"
20+
#include "blk-mq-debugfs.h"
2021

2122
/*
2223
* Recalculate wakeup batch when tag is shared by hctx.
@@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
191192
trace_block_rq_tag_wait(data->q, data->hctx,
192193
data->rq_flags & RQF_SCHED_TAGS);
193194

195+
blk_mq_debugfs_inc_wait_tags(data->hctx,
196+
data->rq_flags & RQF_SCHED_TAGS);
197+
194198
bt_prev = bt;
195199
io_schedule();
196200

include/linux/blk-mq.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,18 @@ struct blk_mq_hw_ctx {
453453
struct dentry *debugfs_dir;
454454
/** @sched_debugfs_dir: debugfs directory for the scheduler. */
455455
struct dentry *sched_debugfs_dir;
456+
/**
457+
* @wait_on_hw_tag: Cumulative per-cpu counter incremented each
458+
* time a submitting context is forced to block due to physical
459+
* hardware tag exhaustion.
460+
*/
461+
unsigned long __percpu *wait_on_hw_tag;
462+
/**
463+
* @wait_on_sched_tag: Cumulative per-cpu counter incremented each
464+
* time a submitting context is forced to block due to software
465+
* scheduler tag exhaustion.
466+
*/
467+
unsigned long __percpu *wait_on_sched_tag;
456468
#endif
457469

458470
/**

0 commit comments

Comments
 (0)