Skip to content

Commit 8bd178e

Browse files
bvanasschekawasaki
authored and committed
blk-zoned: Support pipelining of zoned writes
Support pipelining of zoned writes if the write order is preserved per hardware queue. Track per zone to which software queue writes have been queued. If zoned writes are pipelined, submit new writes to the same software queue as the writes that are already in progress. This prevents reordering by submitting requests for the same zone to different software or hardware queues. In disk_zone_wplug_schedule_bio_work(), only increment the zwplug reference count if queuing zwplug->bio_work succeeded since with this patch applied the bio_work may already be queued if disk_zone_wplug_schedule_bio_work() is called. Cc: Damien Le Moal <[email protected]> Cc: Christoph Hellwig <[email protected]> Signed-off-by: Bart Van Assche <[email protected]>
1 parent e9a8e2d commit 8bd178e

2 files changed

Lines changed: 77 additions & 16 deletions

File tree

block/blk-mq.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3166,8 +3166,8 @@ void blk_mq_submit_bio(struct bio *bio)
31663166
/*
31673167
* A BIO that was released from a zone write plug has already been
31683168
* through the preparation in this function, already holds a reference
3169-
* on the queue usage counter, and is the only write BIO in-flight for
3170-
* the target zone. Go straight to preparing a request for it.
3169+
* on the queue usage counter. Go straight to preparing a request for
3170+
* it.
31713171
*/
31723172
if (bio_zone_write_plugging(bio)) {
31733173
nr_segs = bio->__bi_nr_segments;

block/blk-zoned.c

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ static const char *const zone_cond_name[] = {
5353
* @zone_no: The number of the zone the plug is managing.
5454
* @wp_offset: The zone write pointer location relative to the start of the zone
5555
* as a number of 512B sectors.
56+
* @from_cpu: Software queue to submit writes from for drivers that preserve
57+
* the write order.
5658
* @bio_list: The list of BIOs that are currently plugged.
5759
* @bio_work: Work struct to handle issuing of plugged BIOs
5860
* @rcu_head: RCU head to free zone write plugs with an RCU grace period.
@@ -65,6 +67,7 @@ struct blk_zone_wplug {
6567
unsigned int flags;
6668
unsigned int zone_no;
6769
unsigned int wp_offset;
70+
int from_cpu;
6871
struct bio_list bio_list;
6972
struct work_struct bio_work;
7073
struct rcu_head rcu_head;
@@ -74,8 +77,7 @@ struct blk_zone_wplug {
7477
/*
7578
* Zone write plug flags bits:
7679
* - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
77-
* that is, that write BIOs are being throttled due to a write BIO already
78-
* being executed or the zone write plug bio list is not empty.
80+
* that is, that write BIOs are being throttled.
7981
* - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
8082
* write pointer offset and need to update it.
8183
* - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
@@ -572,6 +574,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
572574
zwplug->flags = 0;
573575
zwplug->zone_no = zno;
574576
zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
577+
zwplug->from_cpu = -1;
575578
bio_list_init(&zwplug->bio_list);
576579
INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
577580
zwplug->disk = disk;
@@ -768,14 +771,23 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
768771
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
769772
struct blk_zone_wplug *zwplug)
770773
{
774+
int cpu;
775+
771776
/*
772-
* Take a reference on the zone write plug and schedule the submission
773-
* of the next plugged BIO. blk_zone_wplug_bio_work() will release the
774-
* reference we take here.
777+
* Schedule a blk_zone_wplug_bio_work() call and increase the zone write
778+
* plug reference count. blk_zone_wplug_bio_work() will release the
779+
* reference we take here. Increasing the zone write plug reference
780+
* count after the queue_work_on() call is safe because all callers hold
781+
* the zone write plug lock and blk_zone_wplug_bio_work() obtains the
782+
* same lock before decrementing the reference count.
775783
*/
776784
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
777-
refcount_inc(&zwplug->ref);
778-
queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
785+
if (zwplug->from_cpu >= 0)
786+
cpu = zwplug->from_cpu;
787+
else
788+
cpu = WORK_CPU_UNBOUND;
789+
if (queue_work_on(cpu, disk->zone_wplugs_wq, &zwplug->bio_work))
790+
refcount_inc(&zwplug->ref);
779791
}
780792

781793
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
@@ -972,14 +984,18 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
972984
return true;
973985
}
974986

975-
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
987+
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs,
988+
int rq_cpu)
976989
{
977990
struct gendisk *disk = bio->bi_bdev->bd_disk;
991+
const bool ordered_hwq = bio_op(bio) != REQ_OP_ZONE_APPEND &&
992+
disk->queue->limits.features & BLK_FEAT_ORDERED_HWQ;
978993
sector_t sector = bio->bi_iter.bi_sector;
979994
bool schedule_bio_work = false;
980995
struct blk_zone_wplug *zwplug;
981996
gfp_t gfp_mask = GFP_NOIO;
982997
unsigned long flags;
998+
int from_cpu = -1;
983999

9841000
/*
9851001
* BIOs must be fully contained within a zone so that we use the correct
@@ -1032,14 +1048,44 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
10321048
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
10331049
goto add_to_bio_list;
10341050

1051+
/*
1052+
* The code below has been organized such that zwplug->from_cpu and
1053+
* zwplug->flags are only modified after it is clear that a request will
1054+
* be added to the bio list or that it will be submitted by the
1055+
* caller. This prevents that any changes to these member variables have
1056+
* to be reverted if the blk_zone_wplug_prepare_bio() call fails.
1057+
*/
1058+
1059+
if (ordered_hwq) {
1060+
if (zwplug->from_cpu >= 0)
1061+
from_cpu = zwplug->from_cpu;
1062+
else
1063+
from_cpu = smp_processor_id();
1064+
if (from_cpu != rq_cpu) {
1065+
zwplug->from_cpu = from_cpu;
1066+
goto add_to_bio_list;
1067+
}
1068+
}
1069+
10351070
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
10361071
spin_unlock_irqrestore(&zwplug->lock, flags);
10371072
bio_io_error(bio);
10381073
return true;
10391074
}
10401075

1041-
/* Otherwise, plug and submit the BIO. */
1042-
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1076+
if (ordered_hwq) {
1077+
/*
1078+
* Submit future writes from the same CPU core as ongoing
1079+
* writes.
1080+
*/
1081+
zwplug->from_cpu = from_cpu;
1082+
} else {
1083+
/*
1084+
* The block driver does not preserve the write order. Plug and
1085+
* let the caller submit the BIO.
1086+
*/
1087+
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1088+
}
10431089

10441090
spin_unlock_irqrestore(&zwplug->lock, flags);
10451091

@@ -1147,7 +1193,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs, int rq_cpu)
11471193
fallthrough;
11481194
case REQ_OP_WRITE:
11491195
case REQ_OP_WRITE_ZEROES:
1150-
return blk_zone_wplug_handle_write(bio, nr_segs);
1196+
return blk_zone_wplug_handle_write(bio, nr_segs, rq_cpu);
11511197
case REQ_OP_ZONE_RESET:
11521198
return blk_zone_wplug_handle_reset_or_finish(bio, 0);
11531199
case REQ_OP_ZONE_FINISH:
@@ -1179,6 +1225,16 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
11791225

11801226
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
11811227

1228+
/*
1229+
* zwplug->from_cpu must not change while one or more writes are pending
1230+
* for the zone associated with zwplug. zwplug->ref is 2 when the plug
1231+
* is unused (one reference taken when the plug was allocated and
1232+
* another reference taken by the caller context). Reset
1233+
* zwplug->from_cpu if no more writes are pending.
1234+
*/
1235+
if (refcount_read(&zwplug->ref) == 2)
1236+
zwplug->from_cpu = -1;
1237+
11821238
/*
11831239
* If the zone is full (it was fully written or finished, or empty
11841240
* (it was reset), remove its zone write plug from the hash table.
@@ -1279,6 +1335,8 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
12791335
{
12801336
struct blk_zone_wplug *zwplug =
12811337
container_of(work, struct blk_zone_wplug, bio_work);
1338+
bool ordered_hwq =
1339+
zwplug->disk->queue->limits.features & BLK_FEAT_ORDERED_HWQ;
12821340
struct block_device *bdev;
12831341
unsigned long flags;
12841342
struct bio *bio;
@@ -1324,7 +1382,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
13241382
} else {
13251383
blk_mq_submit_bio(bio);
13261384
}
1327-
} while (0);
1385+
} while (ordered_hwq);
13281386

13291387
put_zwplug:
13301388
/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
@@ -1851,17 +1909,20 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
18511909
unsigned int zwp_zone_no, zwp_ref;
18521910
unsigned int zwp_bio_list_size;
18531911
unsigned long flags;
1912+
int from_cpu;
18541913

18551914
spin_lock_irqsave(&zwplug->lock, flags);
18561915
zwp_zone_no = zwplug->zone_no;
18571916
zwp_flags = zwplug->flags;
18581917
zwp_ref = refcount_read(&zwplug->ref);
18591918
zwp_wp_offset = zwplug->wp_offset;
18601919
zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
1920+
from_cpu = zwplug->from_cpu;
18611921
spin_unlock_irqrestore(&zwplug->lock, flags);
18621922

1863-
seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
1864-
zwp_wp_offset, zwp_bio_list_size);
1923+
seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u from_cpu %d\n",
1924+
zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
1925+
zwp_bio_list_size, from_cpu);
18651926
}
18661927

18671928
int queue_zone_wplugs_show(void *data, struct seq_file *m)

0 commit comments

Comments (0)