Skip to content

Commit c70dcfe

Browse files
zhangyi089kawasaki
authored and committed
block: introduce BLK_FEAT_WRITE_ZEROES_UNMAP to queue limits features
Currently, disks primarily implement the write zeroes command (aka REQ_OP_WRITE_ZEROES) through two mechanisms: the first involves physically writing zeroes to the disk media (e.g., HDDs), while the second performs an unmap operation on the logical blocks, effectively putting them into a deallocated state (e.g., SSDs). The first method is generally slow, while the second method is typically very fast. For example, on certain NVMe SSDs that support NVME_NS_DEAC, submitting REQ_OP_WRITE_ZEROES requests with the NVME_WZ_DEAC bit can accelerate the write zeroes operation by placing disk blocks into a deallocated state, which opportunistically avoids writing zeroes to media while still guaranteeing that subsequent reads from the specified block range will return zeroed data. This is a best-effort optimization, not a mandatory requirement; some devices may partially fall back to writing physical zeroes due to factors such as misalignment or being asked to clear a block range smaller than the device's internal allocation unit. Therefore, the speed of this operation is not guaranteed. It is difficult to determine whether the storage device supports the unmap write zeroes operation. We cannot determine this by only querying bdev_limits(bdev)->max_write_zeroes_sectors. First, add a new queue limit feature, BLK_FEAT_WRITE_ZEROES_UNMAP, to indicate whether a device supports this unmap write zeroes operation. Then, add a new counterpart flag, BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED, and a sysfs entry, which allow users to disable this operation if the speed is very slow on some special devices. Finally, for the stacked devices case, BLK_FEAT_WRITE_ZEROES_UNMAP should be supported both by the stacking driver and all underlying devices. Thanks to Martin K. Petersen for optimizing the documentation of the write_zeroes_unmap sysfs interface. Signed-off-by: Zhang Yi <[email protected]>
1 parent a73409f commit c70dcfe

4 files changed

Lines changed: 69 additions & 0 deletions

File tree

Documentation/ABI/stable/sysfs-block

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,26 @@ Description:
778778
0, write zeroes is not supported by the device.
779779

780780

781+
What: /sys/block/<disk>/queue/write_zeroes_unmap
782+
Date: January 2025
783+
Contact: Zhang Yi <[email protected]>
784+
Description:
785+
[RW] When read, this file will display whether the device has
786+
enabled the unmap write zeroes operation. This operation
787+
indicates that the device supports zeroing data in a specified
788+
block range without incurring the cost of physically writing
789+
zeroes to media for each individual block. It implements a
790+
zeroing operation which opportunistically avoids writing zeroes
791+
to media while still guaranteeing that subsequent reads from the
792+
specified block range will return zeroed data. This operation is
793+
a best-effort optimization; a device may fall back to physically
794+
writing zeroes to media due to other factors such as
795+
misalignment or being asked to clear a block range smaller than
796+
the device's internal allocation unit. So the speed of this
797+
operation is not guaranteed. Writing a value of '0' to this file
798+
disables this operation.
799+
800+
781801
What: /sys/block/<disk>/queue/zone_append_max_bytes
782802
Date: May 2020
783803

block/blk-settings.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
698698
t->features &= ~BLK_FEAT_NOWAIT;
699699
if (!(b->features & BLK_FEAT_POLL))
700700
t->features &= ~BLK_FEAT_POLL;
701+
if (!(b->features & BLK_FEAT_WRITE_ZEROES_UNMAP))
702+
t->features &= ~BLK_FEAT_WRITE_ZEROES_UNMAP;
701703

702704
t->flags |= (b->flags & BLK_FLAG_MISALIGNED);
703705

@@ -820,6 +822,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
820822
t->zone_write_granularity = 0;
821823
t->max_zone_append_sectors = 0;
822824
}
825+
826+
if (!t->max_write_zeroes_sectors)
827+
t->features &= ~BLK_FEAT_WRITE_ZEROES_UNMAP;
828+
823829
blk_stack_atomic_writes_limits(t, b, start);
824830

825831
return ret;

block/blk-sysfs.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,29 @@ static int queue_wc_store(struct gendisk *disk, const char *page,
457457
return 0;
458458
}
459459

460+
/*
 * sysfs show handler: formats the result of blk_queue_write_zeroes_unmap()
 * for disk->queue into @page as an unsigned decimal followed by a newline,
 * and returns whatever sysfs_emit() returns.
 */
static ssize_t queue_write_zeroes_unmap_show(struct gendisk *disk, char *page)
461+
{
462+
return sysfs_emit(page, "%u\n",
463+
blk_queue_write_zeroes_unmap(disk->queue));
464+
}
465+
466+
/*
 * sysfs store handler: parses the user-supplied value in @page via
 * queue_var_store() and toggles BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED in
 * @lim->flags accordingly. A non-zero value clears the flag; zero sets it.
 *
 * Only @lim->flags is modified here; @lim->features is left untouched.
 * NOTE(review): @disk is unused in this body, and applying @lim to the
 * queue is presumably done by the caller - not visible here, confirm.
 *
 * Returns 0 on success, or the negative value returned by
 * queue_var_store() when parsing fails.
 */
static int queue_write_zeroes_unmap_store(struct gendisk *disk,
467+
const char *page, size_t count, struct queue_limits *lim)
468+
{
469+
unsigned long val;
470+
ssize_t ret;
471+
472+
ret = queue_var_store(&val, page, count);
473+
if (ret < 0)
474+
return ret;
475+
476+
/* Any non-zero input re-enables the operation by clearing the flag. */
if (val)
477+
lim->flags &= ~BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED;
478+
else
479+
lim->flags |= BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED;
480+
return 0;
481+
}
482+
460483
#define QUEUE_RO_ENTRY(_prefix, _name) \
461484
static struct queue_sysfs_entry _prefix##_entry = { \
462485
.attr = { .name = _name, .mode = 0444 }, \
@@ -514,6 +537,7 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
514537

515538
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
516539
QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
540+
QUEUE_LIM_RW_ENTRY(queue_write_zeroes_unmap, "write_zeroes_unmap");
517541
QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
518542
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
519543

@@ -662,6 +686,7 @@ static struct attribute *queue_attrs[] = {
662686
&queue_atomic_write_unit_min_entry.attr,
663687
&queue_atomic_write_unit_max_entry.attr,
664688
&queue_max_write_zeroes_sectors_entry.attr,
689+
&queue_write_zeroes_unmap_entry.attr,
665690
&queue_max_zone_append_sectors_entry.attr,
666691
&queue_zone_write_granularity_entry.attr,
667692
&queue_rotational_entry.attr,

include/linux/blkdev.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,9 @@ typedef unsigned int __bitwise blk_features_t;
340340
#define BLK_FEAT_ATOMIC_WRITES \
341341
((__force blk_features_t)(1u << 16))
342342

343+
/* supports unmap write zeroes command */
344+
#define BLK_FEAT_WRITE_ZEROES_UNMAP ((__force blk_features_t)(1u << 17))
345+
343346
/*
344347
* Flags automatically inherited when stacking limits.
345348
*/
@@ -360,6 +363,10 @@ typedef unsigned int __bitwise blk_flags_t;
360363
/* passthrough command IO accounting */
361364
#define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2))
362365

366+
/* disable the unmap write zeroes operation */
367+
#define BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED \
368+
((__force blk_flags_t)(1u << 3))
369+
363370
struct queue_limits {
364371
blk_features_t features;
365372
blk_flags_t flags;
@@ -1378,6 +1385,17 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
13781385
return bdev_limits(bdev)->max_write_zeroes_sectors;
13791386
}
13801387

1388+
/*
 * Returns true only when the queue's limits have BLK_FEAT_WRITE_ZEROES_UNMAP
 * set in features AND BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED is not set in
 * flags; false otherwise.
 */
static inline bool blk_queue_write_zeroes_unmap(struct request_queue *q)
1389+
{
1390+
return (q->limits.features & BLK_FEAT_WRITE_ZEROES_UNMAP) &&
1391+
!(q->limits.flags & BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED);
1392+
}
1393+
1394+
/*
 * Block-device convenience wrapper: looks up the request queue with
 * bdev_get_queue() and returns blk_queue_write_zeroes_unmap() for it.
 */
static inline bool bdev_write_zeroes_unmap(struct block_device *bdev)
1395+
{
1396+
return blk_queue_write_zeroes_unmap(bdev_get_queue(bdev));
1397+
}
1398+
13811399
static inline bool bdev_nonrot(struct block_device *bdev)
13821400
{
13831401
return blk_queue_nonrot(bdev_get_queue(bdev));

0 commit comments

Comments
 (0)