@@ -53,6 +53,8 @@ static const char *const zone_cond_name[] = {
5353 * @zone_no: The number of the zone the plug is managing.
5454 * @wp_offset: The zone write pointer location relative to the start of the zone
5555 * as a number of 512B sectors.
56+ * @from_cpu: CPU from which writes are submitted for drivers that preserve
57+ * the write order, or -1 if no CPU has been selected.
5658 * @bio_list: The list of BIOs that are currently plugged.
5759 * @bio_work: Work struct to handle issuing of plugged BIOs
5860 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
@@ -65,6 +67,7 @@ struct blk_zone_wplug {
6567 unsigned int flags ;
6668 unsigned int zone_no ;
6769 unsigned int wp_offset ;
70+ int from_cpu ;
6871 struct bio_list bio_list ;
6972 struct work_struct bio_work ;
7073 struct rcu_head rcu_head ;
@@ -74,8 +77,7 @@ struct blk_zone_wplug {
7477/*
7578 * Zone write plug flags bits:
7679 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
77- * that is, that write BIOs are being throttled due to a write BIO already
78- * being executed or the zone write plug bio list is not empty.
80+ * that is, that write BIOs are being throttled.
7981 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
8082 * write pointer offset and need to update it.
8183 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
@@ -572,6 +574,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
572574 zwplug -> flags = 0 ;
573575 zwplug -> zone_no = zno ;
574576 zwplug -> wp_offset = bdev_offset_from_zone_start (disk -> part0 , sector );
577+ zwplug -> from_cpu = -1 ;
575578 bio_list_init (& zwplug -> bio_list );
576579 INIT_WORK (& zwplug -> bio_work , blk_zone_wplug_bio_work );
577580 zwplug -> disk = disk ;
@@ -768,14 +771,23 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
768771static void disk_zone_wplug_schedule_bio_work (struct gendisk * disk ,
769772 struct blk_zone_wplug * zwplug )
770773{
774+ int cpu ;
775+
771776 /*
772- * Take a reference on the zone write plug and schedule the submission
773- * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
774- * reference we take here.
777+ * Schedule a blk_zone_wplug_bio_work() call and increase the zone write
778+ * plug reference count. blk_zone_wplug_bio_work() will release the
779+ * reference we take here. Increasing the zone write plug reference
780+ * count after the queue_work_on() call is safe because all callers hold
781+ * the zone write plug lock and blk_zone_wplug_bio_work() obtains the
782+ * same lock before decrementing the reference count.
775783 */
776784 WARN_ON_ONCE (!(zwplug -> flags & BLK_ZONE_WPLUG_PLUGGED ));
777- refcount_inc (& zwplug -> ref );
778- queue_work (disk -> zone_wplugs_wq , & zwplug -> bio_work );
785+ if (zwplug -> from_cpu >= 0 )
786+ cpu = zwplug -> from_cpu ;
787+ else
788+ cpu = WORK_CPU_UNBOUND ;
789+ if (queue_work_on (cpu , disk -> zone_wplugs_wq , & zwplug -> bio_work ))
790+ refcount_inc (& zwplug -> ref );
779791}
780792
781793static inline void disk_zone_wplug_add_bio (struct gendisk * disk ,
@@ -972,14 +984,18 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
972984 return true;
973985}
974986
975- static bool blk_zone_wplug_handle_write (struct bio * bio , unsigned int nr_segs )
987+ static bool blk_zone_wplug_handle_write (struct bio * bio , unsigned int nr_segs ,
988+ int rq_cpu )
976989{
977990 struct gendisk * disk = bio -> bi_bdev -> bd_disk ;
991+ const bool ordered_hwq = bio_op (bio ) != REQ_OP_ZONE_APPEND &&
992+ disk -> queue -> limits .features & BLK_FEAT_ORDERED_HWQ ;
978993 sector_t sector = bio -> bi_iter .bi_sector ;
979994 bool schedule_bio_work = false;
980995 struct blk_zone_wplug * zwplug ;
981996 gfp_t gfp_mask = GFP_NOIO ;
982997 unsigned long flags ;
998+ int from_cpu = -1 ;
983999
9841000 /*
9851001 * BIOs must be fully contained within a zone so that we use the correct
@@ -1032,14 +1048,44 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
10321048 if (zwplug -> flags & BLK_ZONE_WPLUG_PLUGGED )
10331049 goto add_to_bio_list ;
10341050
1051+ /*
1052+ * The code below has been organized such that zwplug->from_cpu and
1053+ * zwplug->flags are only modified after it is clear that a request will
1054+ * be added to the bio list or that it will be submitted by the
1055+ * caller. This avoids having to revert changes to these member
1056+ * variables if the blk_zone_wplug_prepare_bio() call fails.
1057+ */
1058+
1059+ if (ordered_hwq ) {
1060+ if (zwplug -> from_cpu >= 0 )
1061+ from_cpu = zwplug -> from_cpu ;
1062+ else
1063+ from_cpu = smp_processor_id ();
1064+ if (from_cpu != rq_cpu ) {
1065+ zwplug -> from_cpu = from_cpu ;
1066+ goto add_to_bio_list ;
1067+ }
1068+ }
1069+
10351070 if (!blk_zone_wplug_prepare_bio (zwplug , bio )) {
10361071 spin_unlock_irqrestore (& zwplug -> lock , flags );
10371072 bio_io_error (bio );
10381073 return true;
10391074 }
10401075
1041- /* Otherwise, plug and submit the BIO. */
1042- zwplug -> flags |= BLK_ZONE_WPLUG_PLUGGED ;
1076+ if (ordered_hwq ) {
1077+ /*
1078+ * Submit future writes from the same CPU core as ongoing
1079+ * writes.
1080+ */
1081+ zwplug -> from_cpu = from_cpu ;
1082+ } else {
1083+ /*
1084+ * The block driver does not preserve the write order. Plug and
1085+ * let the caller submit the BIO.
1086+ */
1087+ zwplug -> flags |= BLK_ZONE_WPLUG_PLUGGED ;
1088+ }
10431089
10441090 spin_unlock_irqrestore (& zwplug -> lock , flags );
10451091
@@ -1147,7 +1193,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs, int rq_cpu)
11471193 fallthrough ;
11481194 case REQ_OP_WRITE :
11491195 case REQ_OP_WRITE_ZEROES :
1150- return blk_zone_wplug_handle_write (bio , nr_segs );
1196+ return blk_zone_wplug_handle_write (bio , nr_segs , rq_cpu );
11511197 case REQ_OP_ZONE_RESET :
11521198 return blk_zone_wplug_handle_reset_or_finish (bio , 0 );
11531199 case REQ_OP_ZONE_FINISH :
@@ -1179,6 +1225,16 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
11791225
11801226 zwplug -> flags &= ~BLK_ZONE_WPLUG_PLUGGED ;
11811227
1228+ /*
1229+ * zwplug->from_cpu must not change while one or more writes are pending
1230+ * for the zone associated with zwplug. zwplug->ref is 2 when the plug
1231+ * is unused (one reference taken when the plug was allocated and
1232+ * another reference taken by the caller context). Reset
1233+ * zwplug->from_cpu if no more writes are pending.
1234+ */
1235+ if (refcount_read (& zwplug -> ref ) == 2 )
1236+ zwplug -> from_cpu = -1 ;
1237+
11821238 /*
11831239 * If the zone is full (it was fully written or finished) or empty
11841240 * (it was reset), remove its zone write plug from the hash table.
@@ -1279,6 +1335,8 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
12791335{
12801336 struct blk_zone_wplug * zwplug =
12811337 container_of (work , struct blk_zone_wplug , bio_work );
1338+ bool ordered_hwq =
1339+ zwplug -> disk -> queue -> limits .features & BLK_FEAT_ORDERED_HWQ ;
12821340 struct block_device * bdev ;
12831341 unsigned long flags ;
12841342 struct bio * bio ;
@@ -1324,7 +1382,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
13241382 } else {
13251383 blk_mq_submit_bio (bio );
13261384 }
1327- } while (0 );
1385+ } while (ordered_hwq );
13281386
13291387put_zwplug :
13301388 /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
@@ -1851,17 +1909,20 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
18511909 unsigned int zwp_zone_no , zwp_ref ;
18521910 unsigned int zwp_bio_list_size ;
18531911 unsigned long flags ;
1912+ int from_cpu ;
18541913
18551914 spin_lock_irqsave (& zwplug -> lock , flags );
18561915 zwp_zone_no = zwplug -> zone_no ;
18571916 zwp_flags = zwplug -> flags ;
18581917 zwp_ref = refcount_read (& zwplug -> ref );
18591918 zwp_wp_offset = zwplug -> wp_offset ;
18601919 zwp_bio_list_size = bio_list_size (& zwplug -> bio_list );
1920+ from_cpu = zwplug -> from_cpu ;
18611921 spin_unlock_irqrestore (& zwplug -> lock , flags );
18621922
1863- seq_printf (m , "%u 0x%x %u %u %u\n" , zwp_zone_no , zwp_flags , zwp_ref ,
1864- zwp_wp_offset , zwp_bio_list_size );
1923+ seq_printf (m , "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u from_cpu %d\n" ,
1924+ zwp_zone_no , zwp_flags , zwp_ref , zwp_wp_offset ,
1925+ zwp_bio_list_size , from_cpu );
18651926}
18661927
18671928int queue_zone_wplugs_show (void * data , struct seq_file * m )
0 commit comments