Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions block/blk-mq-debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <linux/blkdev.h>
#include <linux/build_bug.h>
#include <linux/debugfs.h>
#include <linux/percpu.h>

#include "blk.h"
#include "blk-mq.h"
Expand Down Expand Up @@ -484,6 +485,54 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
return 0;
}

/**
* hctx_wait_on_hw_tag_show - display hardware tag starvation count
* @data: generic pointer to the associated hardware context (hctx)
* @m: seq_file pointer for debugfs output formatting
*
* Prints the cumulative number of times a submitting context was forced
* to block due to the exhaustion of physical hardware driver tags.
*
* Return: 0 on success.
*/
static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
unsigned long count = 0;
int cpu;

if (hctx->wait_on_hw_tag) {
for_each_possible_cpu(cpu)
count += *per_cpu_ptr(hctx->wait_on_hw_tag, cpu);
}
seq_printf(m, "%lu\n", count);
return 0;
}

/**
* hctx_wait_on_sched_tag_show - display scheduler tag starvation count
* @data: generic pointer to the associated hardware context (hctx)
* @m: seq_file pointer for debugfs output formatting
*
* Prints the cumulative number of times a submitting context was forced
* to block due to the exhaustion of software scheduler tags.
*
* Return: 0 on success.
*/
static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
unsigned long count = 0;
int cpu;

if (hctx->wait_on_sched_tag) {
for_each_possible_cpu(cpu)
count += *per_cpu_ptr(hctx->wait_on_sched_tag, cpu);
}
seq_printf(m, "%lu\n", count);
return 0;
}

#define CTX_RQ_SEQ_OPS(name, type) \
static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
__acquires(&ctx->lock) \
Expand Down Expand Up @@ -599,6 +648,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
{"type", 0400, hctx_type_show},
{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
{},
};

Expand Down Expand Up @@ -815,3 +866,61 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
debugfs_remove_recursive(hctx->sched_debugfs_dir);
hctx->sched_debugfs_dir = NULL;
}

/**
 * blk_mq_debugfs_alloc_hctx_stats - set up the per-cpu tag wait counters
 * @hctx: hardware context that owns the counters
 * @gfp: allocation flags handed to the per-cpu allocator
 *
 * Allocates @hctx->wait_on_hw_tag and @hctx->wait_on_sched_tag, each as a
 * per-cpu unsigned long. A counter whose pointer is already non-NULL is
 * left alone, so calling this again does not replace existing storage.
 *
 * Allocation failure is not reported: the function returns nothing and the
 * affected pointer simply stays NULL.
 */
void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx, gfp_t gfp)
{
	/* Hardware (driver) tag wait counter. */
	if (hctx->wait_on_hw_tag == NULL)
		hctx->wait_on_hw_tag = alloc_percpu_gfp(unsigned long, gfp);

	/* Scheduler tag wait counter. */
	if (hctx->wait_on_sched_tag == NULL)
		hctx->wait_on_sched_tag = alloc_percpu_gfp(unsigned long, gfp);
}

/**
 * blk_mq_debugfs_free_hctx_stats - release the per-cpu tag wait counters
 * @hctx: hardware context that owns the counters
 *
 * Returns both per-cpu counters to the allocator and resets the pointers in
 * @hctx to NULL.
 *
 * Context: must only be called during hardware queue teardown, when the
 * queue is safely frozen and no I/O submission can race to increment the
 * statistics.
 */
void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx)
{
	free_percpu(hctx->wait_on_hw_tag);
	free_percpu(hctx->wait_on_sched_tag);

	/* Leave no stale pointers behind in the hctx. */
	hctx->wait_on_hw_tag = NULL;
	hctx->wait_on_sched_tag = NULL;
}

/**
 * blk_mq_debugfs_inc_wait_tags - count one wait for a tag
 * @hctx: hardware context the tag allocation was attempted on
 * @is_sched: true to bump the scheduler tag counter, false to bump the
 *            hardware tag counter
 *
 * Picks the per-cpu counter selected by @is_sched and increments the slot
 * of the CPU the caller is currently running on.
 *
 * The counter pointer is sampled once and checked before use: it is NULL
 * when the per-cpu allocation did not succeed (for example under memory
 * pressure), in which case the event is silently not counted.
 */
void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
				  bool is_sched)
{
	unsigned long __percpu *counter;

	if (is_sched)
		counter = READ_ONCE(hctx->wait_on_sched_tag);
	else
		counter = READ_ONCE(hctx->wait_on_hw_tag);

	/* Statistics are best effort; nothing to do without storage. */
	if (unlikely(!counter))
		return;

	this_cpu_inc(*counter);
}
19 changes: 19 additions & 0 deletions block/blk-mq-debugfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ struct blk_mq_debugfs_attr {
const struct seq_operations *seq_ops;
};

void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
bool is_sched);
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);

Expand All @@ -26,6 +28,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_register_hctxs(struct request_queue *q);
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx,
gfp_t gfp);
void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx);

void blk_mq_debugfs_register_sched(struct request_queue *q);
void blk_mq_debugfs_unregister_sched(struct request_queue *q);
Expand All @@ -35,6 +40,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);

void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
#else
/*
 * Empty inline stand-in on the #else side of this header's conditional, so
 * callers can invoke it without an #ifdef of their own. Does nothing.
 * NOTE(review): presumably the debugfs-disabled configuration; the guarding
 * #if is not visible here - confirm.
 */
static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
bool is_sched)
{
}

static inline void blk_mq_debugfs_register(struct request_queue *q)
{
}
Expand All @@ -56,6 +66,15 @@ static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
{
}

/*
 * Empty inline stand-in: performs no allocation and touches neither @hctx
 * nor @gfp.
 * NOTE(review): presumably used when the debugfs support is compiled out;
 * the guarding conditional is not visible here - confirm.
 */
static inline void blk_mq_debugfs_alloc_hctx_stats(struct blk_mq_hw_ctx *hctx,
gfp_t gfp)
{
}

/*
 * Empty inline stand-in: frees nothing and leaves @hctx untouched.
 * NOTE(review): presumably used when the debugfs support is compiled out;
 * the guarding conditional is not visible here - confirm.
 */
static inline void blk_mq_debugfs_free_hctx_stats(struct blk_mq_hw_ctx *hctx)
{
}

static inline void blk_mq_debugfs_register_sched(struct request_queue *q)
{
}
Expand Down
8 changes: 8 additions & 0 deletions block/blk-mq-tag.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
#include <linux/kmemleak.h>

#include <linux/delay.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

/*
* Recalculate wakeup batch when tag is shared by hctx.
Expand Down Expand Up @@ -187,6 +189,12 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != BLK_MQ_NO_TAG)
break;

trace_block_rq_tag_wait(data->q, data->hctx,
data->rq_flags & RQF_SCHED_TAGS);

blk_mq_debugfs_inc_wait_tags(data->hctx,
data->rq_flags & RQF_SCHED_TAGS);

bt_prev = bt;
io_schedule();

Expand Down
5 changes: 5 additions & 0 deletions block/blk-mq.c
Original file line number Diff line number Diff line change
Expand Up @@ -3991,6 +3991,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_free_flush_queue_callback);
hctx->fq = NULL;

blk_mq_debugfs_free_hctx_stats(hctx);

spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
Expand All @@ -4016,6 +4018,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
{
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

blk_mq_debugfs_alloc_hctx_stats(hctx, gfp);

hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
if (!hctx->fq)
goto fail;
Expand All @@ -4041,6 +4045,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
blk_free_flush_queue(hctx->fq);
hctx->fq = NULL;
fail:
blk_mq_debugfs_free_hctx_stats(hctx);
return -1;
}

Expand Down
12 changes: 12 additions & 0 deletions include/linux/blk-mq.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,18 @@ struct blk_mq_hw_ctx {
struct dentry *debugfs_dir;
/** @sched_debugfs_dir: debugfs directory for the scheduler. */
struct dentry *sched_debugfs_dir;
/**
* @wait_on_hw_tag: Cumulative per-cpu counter incremented each
* time a submitting context is forced to block due to physical
* hardware tag exhaustion.
*/
unsigned long __percpu *wait_on_hw_tag;
/**
* @wait_on_sched_tag: Cumulative per-cpu counter incremented each
* time a submitting context is forced to block due to software
* scheduler tag exhaustion.
*/
unsigned long __percpu *wait_on_sched_tag;
#endif

/**
Expand Down
43 changes: 43 additions & 0 deletions include/trace/events/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,49 @@ DECLARE_EVENT_CLASS(block_rq,
IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
);

/**
 * block_rq_tag_wait - triggered when a request is starved of a tag
 * @q: request queue of the target device
 * @hctx: hardware context of the request experiencing starvation
 * @is_sched_tag: indicates whether the starved pool is the software scheduler
 *
 * Called immediately before the submitting context is forced to block due
 * to the exhaustion of available tags (i.e., physical hardware driver tags
 * or software scheduler tags). This trace point indicates that the context
 * will be placed into an uninterruptible state via io_schedule() until an
 * active request completes and relinquishes its assigned tag.
 *
 * The recorded depth is the nr_tags of the pool selected by @is_sched_tag
 * (@hctx->sched_tags when true, @hctx->tags otherwise), so @is_sched_tag
 * must only be true when @hctx->sched_tags is valid.
 */
TRACE_EVENT(block_rq_tag_wait,

	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
		 bool is_sched_tag),

	TP_ARGS(q, hctx, is_sched_tag),

	TP_STRUCT__entry(
		__field( dev_t,		dev		)
		__field( u32,		hctx_id		)
		__field( u32,		nr_tags		)
		__field( bool,		is_sched_tag	)
	),

	TP_fast_assign(
		/* A queue without a disk attached is reported as device 0,0. */
		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
		__entry->hctx_id	= hctx->queue_num;
		__entry->is_sched_tag	= is_sched_tag;

		if (is_sched_tag)
			__entry->nr_tags = hctx->sched_tags->nr_tags;
		else
			__entry->nr_tags = hctx->tags->nr_tags;
	),

	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->hctx_id,
		  __entry->is_sched_tag ? "scheduler" : "hardware",
		  __entry->nr_tags)
);

/**
* block_rq_insert - insert block operation request into queue
* @rq: block IO operation request
Expand Down
Loading