Commit 9656c1d

Kaitao Cheng authored and kawasaki committed
block: Introduce the UFQ I/O scheduler
Introduce IOSCHED_UFQ, a blk-mq elevator ("ufq: User-programmable Flexible
Queueing") whose policy is supplied by an eBPF program via struct_ops
(insert, dispatch, merge, finish, etc.). When no eBPF program is attached,
the UFQ I/O scheduler uses a simple per-ctx queueing policy (similar to
none). After an eBPF program is attached, the user-defined scheduling
policy replaces UFQ's built-in queueing policy, while per-ctx queues remain
available as a fallback mechanism.

Signed-off-by: Kaitao Cheng <[email protected]>
1 parent ef67188 commit 9656c1d
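
For illustration only (not part of this commit): a BPF-side policy is a
struct_ops map implementing ufq_iosched_ops. The sketch below is a
hypothetical toy policy; it assumes a vmlinux.h generated from a kernel
carrying this patch, the usual libbpf struct_ops conventions, and the
return-value semantics implied by the stubs in ufq-bpfops.c.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical BPF-side UFQ policy sketch: tracks in-flight requests and
 * implements has_req. Callback names mirror struct ufq_iosched_ops; the
 * return semantics are assumed from the stubs in this patch. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

static __u64 nr_inflight;	/* single global counter, toy policy only */

SEC("struct_ops/insert_req")
int BPF_PROG(toy_insert, struct request_queue *q, struct request *rq,
	     blk_insert_t flags)
{
	__sync_fetch_and_add(&nr_inflight, 1);
	return 0;	/* like insert_req_stub(): keep rq on the per-ctx queue */
}

SEC("struct_ops/finish_req")
void BPF_PROG(toy_finish, struct request *rq)
{
	if (nr_inflight)
		__sync_fetch_and_sub(&nr_inflight, 1);
}

SEC("struct_ops/has_req")
bool BPF_PROG(toy_has_req, struct request_queue *q, int rqs_count)
{
	/* has_req_stub() reports rqs_count > 0; add our own state on top */
	return rqs_count > 0 || nr_inflight > 0;
}

SEC(".struct_ops.link")
struct ufq_iosched_ops toy_ops = {
	.insert_req = (void *)toy_insert,
	.finish_req = (void *)toy_finish,
	.has_req    = (void *)toy_has_req,
	.name       = "toy",
};

Attaching the map (e.g. with libbpf's bpf_map__attach_struct_ops() on the
generated skeleton) lands in bpf_ufq_reg(), which copies the ops into the
global ufq_ops; detaching tears the policy back down through bpf_ufq_unreg().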

11 files changed: 934 additions, 8 deletions

block/Kconfig.iosched (8 additions, 0 deletions)

@@ -44,4 +44,12 @@ config BFQ_CGROUP_DEBUG
 	  Enable some debugging help. Currently it exports additional stat
 	  files in a cgroup which can be useful for debugging.
 
+config IOSCHED_UFQ
+	tristate "UFQ I/O scheduler"
+	default y
+	help
+	  The UFQ I/O scheduler is a programmable I/O scheduler. When
+	  enabled, an out-of-kernel I/O scheduler based on eBPF can be
+	  designed to interact with it, leveraging its customizable
+	  hooks to redefine I/O scheduling policies.
 endmenu

block/Makefile (1 addition, 0 deletions)

@@ -24,6 +24,7 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
 obj-$(CONFIG_IOSCHED_BFQ)	+= bfq.o
+obj-$(CONFIG_IOSCHED_UFQ)	+= ufq-iosched.o ufq-bpfops.o ufq-kfunc.o
 
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= bio-integrity.o blk-integrity.o t10-pi.o \
 					bio-integrity-auto.o

block/blk-merge.c (43 additions, 6 deletions)

@@ -774,8 +774,8 @@ u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
  * For non-mq, this has to be called with the request spinlock acquired.
  * For mq with scheduling, the appropriate queue wide lock should be held.
  */
-static struct request *attempt_merge(struct request_queue *q,
-				     struct request *req, struct request *next)
+static struct request *attempt_merge(struct request_queue *q, struct request *req,
+				     struct request *next, bool nohash)
 {
 	if (!rq_mergeable(req) || !rq_mergeable(next))
 		return NULL;
@@ -842,7 +842,7 @@ static struct request *attempt_merge(struct request_queue *q,
 
 	req->__data_len += blk_rq_bytes(next);
 
-	if (!blk_discard_mergable(req))
+	if (!nohash && !blk_discard_mergable(req))
 		elv_merge_requests(q, req, next);
 
 	blk_crypto_rq_put_keyslot(next);
@@ -868,7 +868,7 @@ static struct request *attempt_back_merge(struct request_queue *q,
 	struct request *next = elv_latter_request(q, rq);
 
 	if (next)
-		return attempt_merge(q, rq, next);
+		return attempt_merge(q, rq, next, false);
 
 	return NULL;
 }
@@ -879,11 +879,17 @@ static struct request *attempt_front_merge(struct request_queue *q,
 	struct request *prev = elv_former_request(q, rq);
 
 	if (prev)
-		return attempt_merge(q, prev, rq);
+		return attempt_merge(q, prev, rq, false);
 
 	return NULL;
 }
 
+struct request *bpf_attempt_merge(struct request_queue *q, struct request *rq,
+				  struct request *next)
+{
+	return attempt_merge(q, rq, next, true);
+}
+
 /*
  * Try to merge 'next' into 'rq'. Return true if the merge happened, false
  * otherwise. The caller is responsible for freeing 'next' if the merge
@@ -892,7 +898,7 @@ static struct request *attempt_front_merge(struct request_queue *q,
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 			   struct request *next)
 {
-	return attempt_merge(q, rq, next);
+	return attempt_merge(q, rq, next, false);
 }
 
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
@@ -1169,3 +1175,34 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 	}
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
+
+bool blk_mq_sched_merge_fn(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs, struct request **merged_request,
+		struct request *rq, enum elv_merge type, void (*fn)
+		(struct request_queue *, struct request *, enum elv_merge))
+{
+	switch (type) {
+	case ELEVATOR_BACK_MERGE:
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			return false;
+		if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
+			return false;
+		*merged_request = attempt_back_merge(q, rq);
+		if (!*merged_request)
+			fn(q, rq, ELEVATOR_BACK_MERGE);
+		return true;
+	case ELEVATOR_FRONT_MERGE:
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			return false;
+		if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
+			return false;
+		*merged_request = attempt_front_merge(q, rq);
+		if (!*merged_request)
+			fn(q, rq, ELEVATOR_FRONT_MERGE);
+		return true;
+	case ELEVATOR_DISCARD_MERGE:
+		return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
+	default:
+		return false;
+	}
+}
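
blk_mq_sched_merge_fn() factors blk_mq_sched_try_merge() so the post-merge
notification becomes a caller-supplied callback instead of the hardcoded
elv_merged_request(), letting an elevator re-file the grown request in its
own structures. A hypothetical caller is sketched below; it is not part of
this diff, the function names are illustrative, and scheduler locking is
omitted for brevity.

/* Hypothetical elevator bio-merge path built on blk_mq_sched_merge_fn(). */
static void ufq_request_merged(struct request_queue *q, struct request *rq,
			       enum elv_merge type)
{
	/* rq grew at its front or back: re-sort it in the policy's queues */
}

static bool ufq_bio_merge(struct request_queue *q, struct bio *bio,
			  unsigned int nr_segs)
{
	struct request *free = NULL;
	struct request *rq = q->last_merge;
	enum elv_merge type;
	bool merged;

	if (!rq)
		return false;
	type = blk_try_merge(rq, bio);
	if (type == ELEVATOR_NO_MERGE)
		return false;

	merged = blk_mq_sched_merge_fn(q, bio, nr_segs, &free, rq, type,
				       ufq_request_merged);
	if (free)
		blk_mq_free_request(free);
	return merged;
}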

block/blk-mq-sched.h (4 additions, 0 deletions)

@@ -7,6 +7,10 @@
 
 #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
 
+bool blk_mq_sched_merge_fn(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs, struct request **merged_request,
+		struct request *rq, enum elv_merge type, void (*fn)
+		(struct request_queue *, struct request *, enum elv_merge));
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs, struct request **merged_request);
 bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,

block/blk-mq.c (7 additions, 1 deletion)

@@ -796,7 +796,7 @@ static void blk_mq_finish_request(struct request *rq)
 	}
 }
 
-static void __blk_mq_free_request(struct request *rq)
+void __blk_mq_free_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
@@ -1844,6 +1844,12 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 		if (list_empty(&ctx->rq_lists[type]))
 			sbitmap_clear_bit(sb, bitnr);
 	}
+
+	if (dispatch_data->rq) {
+		dispatch_data->rq->rq_flags |= RQF_STARTED;
+		if (hctx->queue->last_merge == dispatch_data->rq)
+			hctx->queue->last_merge = NULL;
+	}
 	spin_unlock(&ctx->lock);
 
 	return !dispatch_data->rq;

block/blk-mq.h (1 addition, 1 deletion)

@@ -56,7 +56,7 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_ctx *start);
 void blk_mq_put_rq_ref(struct request *rq);
-
+void __blk_mq_free_request(struct request *rq);
 /*
  * Internal helpers for allocating/freeing the request map
  */

block/blk.h (2 additions, 0 deletions)

@@ -449,6 +449,8 @@ static inline unsigned get_max_segment_size(const struct queue_limits *lim,
 
 int ll_back_merge_fn(struct request *req, struct bio *bio,
 		unsigned int nr_segs);
+struct request *bpf_attempt_merge(struct request_queue *q, struct request *rq,
+		struct request *next);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 		struct request *next);
 unsigned int blk_recalc_rq_segments(struct request *rq);

block/ufq-bpfops.c (213 additions, 0 deletions, new file)

@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026 KylinSoft Corporation.
+ * Copyright (c) 2026 Kaitao Cheng <[email protected]>
+ */
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/string.h>
+#include "ufq-iosched.h"
+
+struct ufq_iosched_ops ufq_ops;
+
+static const struct bpf_func_proto *
+bpf_ufq_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id, prog);
+}
+
+static bool bpf_ufq_is_valid_access(int off, int size,
+				    enum bpf_access_type type,
+				    const struct bpf_prog *prog,
+				    struct bpf_insn_access_aux *info)
+{
+	if (type != BPF_READ)
+		return false;
+	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+		return false;
+	if (off % size != 0)
+		return false;
+
+	/*
+	 * merge_req's third argument is int *type. btf_ctx_access() treats
+	 * pointers that are not "pointer to struct" as scalars (no reg_type),
+	 * so loading the pointer from ctx leaves a SCALAR and *type stores
+	 * fail verification. Model it as a read/write buffer of merge_type.
+	 */
+	if (off == 16 && size == sizeof(__u64) &&
+	    prog->aux->attach_func_name &&
+	    !strcmp(prog->aux->attach_func_name, "merge_req")) {
+		if (!btf_ctx_access(off, size, type, prog, info))
+			return false;
+		info->reg_type = PTR_TO_BUF;
+		return true;
+	}
+
+	return btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_ufq_verifier_ops = {
+	.get_func_proto		= bpf_ufq_get_func_proto,
+	.is_valid_access	= bpf_ufq_is_valid_access,
+};
+
+static int bpf_ufq_init_member(const struct btf_type *t,
+			       const struct btf_member *member,
+			       void *kdata, const void *udata)
+{
+	const struct ufq_iosched_ops *uops = udata;
+	struct ufq_iosched_ops *ops = kdata;
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+	int ret;
+
+	switch (moff) {
+	case offsetof(struct ufq_iosched_ops, name):
+		ret = bpf_obj_name_cpy(ops->name, uops->name,
+				       sizeof(ops->name));
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			return -EINVAL;
+		return 1;
+	/* other var adding .... */
+	}
+
+	return 0;
+}
+
+static int bpf_ufq_check_member(const struct btf_type *t,
+				const struct btf_member *member,
+				const struct bpf_prog *prog)
+{
+	return 0;
+}
+
+static int bpf_ufq_enable(struct ufq_iosched_ops *ops)
+{
+	ufq_ops = *ops;
+	return 0;
+}
+
+static void bpf_ufq_disable(struct ufq_iosched_ops *ops)
+{
+	memset(&ufq_ops, 0, sizeof(ufq_ops));
+}
+
+static int bpf_ufq_reg(void *kdata, struct bpf_link *link)
+{
+	return bpf_ufq_enable(kdata);
+}
+
+static void bpf_ufq_unreg(void *kdata, struct bpf_link *link)
+{
+	bpf_ufq_disable(kdata);
+}
+
+static int bpf_ufq_init(struct btf *btf)
+{
+	return 0;
+}
+
+static int bpf_ufq_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+	/*
+	 * UFQ does not support live-updating an already-attached BPF scheduler:
+	 * partial failure during callback setup (e.g. init_sched) would be hard
+	 * to reason about, and update can race with unregister/teardown.
+	 */
+	return -EOPNOTSUPP;
+}
+
+static int bpf_ufq_validate(void *kdata)
+{
+	return 0;
+}
+
+static int init_sched_stub(struct request_queue *q)
+{
+	return -EPERM;
+}
+
+static int exit_sched_stub(struct request_queue *q)
+{
+	return -EPERM;
+}
+
+static int insert_req_stub(struct request_queue *q, struct request *rq,
+			   blk_insert_t flags)
+{
+	return 0;
+}
+
+static struct request *dispatch_req_stub(struct request_queue *q)
+{
+	return NULL;
+}
+
+static bool has_req_stub(struct request_queue *q, int rqs_count)
+{
+	return rqs_count > 0;
+}
+
+static void finish_req_stub(struct request *rq)
+{
+}
+
+static struct request *former_req_stub(struct request_queue *q, struct request *rq)
+{
+	return NULL;
+}
+
+static struct request *next_req_stub(struct request_queue *q, struct request *rq)
+{
+	return NULL;
+}
+
+static struct request *merge_req_stub(struct request_queue *q, struct request *rq,
+				      int *type)
+{
+	*type = ELEVATOR_NO_MERGE;
+	return NULL;
+}
+
+static void req_merged_stub(struct request_queue *q, struct request *rq,
+			    int type)
+{
+}
+
+static struct ufq_iosched_ops __bpf_ops_ufq_ops = {
+	.init_sched	= init_sched_stub,
+	.exit_sched	= exit_sched_stub,
+	.insert_req	= insert_req_stub,
+	.dispatch_req	= dispatch_req_stub,
+	.has_req	= has_req_stub,
+	.former_req	= former_req_stub,
+	.next_req	= next_req_stub,
+	.merge_req	= merge_req_stub,
+	.req_merged	= req_merged_stub,
+	.finish_req	= finish_req_stub,
+};
+
+static struct bpf_struct_ops bpf_iosched_ufq_ops = {
+	.verifier_ops	= &bpf_ufq_verifier_ops,
+	.reg		= bpf_ufq_reg,
+	.unreg		= bpf_ufq_unreg,
+	.check_member	= bpf_ufq_check_member,
+	.init_member	= bpf_ufq_init_member,
+	.init		= bpf_ufq_init,
+	.update		= bpf_ufq_update,
+	.validate	= bpf_ufq_validate,
+	.name		= "ufq_iosched_ops",
+	.owner		= THIS_MODULE,
+	.cfi_stubs	= &__bpf_ops_ufq_ops
+};
+
+int bpf_ufq_ops_init(void)
+{
+	return register_bpf_struct_ops(&bpf_iosched_ufq_ops, ufq_iosched_ops);
+}
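
The off == 16 special case in bpf_ufq_is_valid_access() covers the third
u64 context slot, i.e. merge_req's int *type argument: marking it
PTR_TO_BUF is what lets a BPF implementation store the chosen merge type
through the pointer. A minimal hypothetical callback, mirroring
merge_req_stub() and not part of this commit, might look like:

SEC("struct_ops/merge_req")
struct request *BPF_PROG(toy_merge, struct request_queue *q,
			 struct request *rq, int *type)
{
	/* this store is exactly what the PTR_TO_BUF override permits */
	*type = ELEVATOR_NO_MERGE;
	return NULL;		/* no merge candidate found */
}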
