From a73409f6be35b0505330585e09422225d23bdc27 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Mon, 9 Jun 2025 20:58:35 +0900 Subject: [PATCH 01/11] adding ci files --- .github/workflows/kernel_build.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/kernel_build.yml diff --git a/.github/workflows/kernel_build.yml b/.github/workflows/kernel_build.yml new file mode 100644 index 0000000000000..cd0f9d145429c --- /dev/null +++ b/.github/workflows/kernel_build.yml @@ -0,0 +1,28 @@ +name: blktests-ci + +on: + pull_request: + +jobs: + build-kernel: + runs-on: ubuntu-latest + steps: + - name: Configure git + run: | + git config --global --add safe.directory '*' + - name: Checkout git + run: | + sudo apt-get install -y libelf-dev + mkdir -p linux + cd linux + git init + git remote add origin https://github.com/${{ github.repository }} + git fetch origin --depth=5 ${{ github.event.pull_request.head.sha }} + git reset --hard ${{ github.event.pull_request.head.sha }} + git log -1 + - name: Build kernel + run: | + cd linux + make defconfig + make -j 8 + From c70dcfe4685a9bd4a21128f92ce4e37fdfccfc1f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:41 +0800 Subject: [PATCH 02/11] block: introduce BLK_FEAT_WRITE_ZEROES_UNMAP to queue limits features Currently, disks primarily implement the write zeroes command (aka REQ_OP_WRITE_ZEROES) through two mechanisms: the first involves physically writing zeros to the disk media (e.g., HDDs), while the second performs an unmap operation on the logical blocks, effectively putting them into a deallocated state (e.g., SSDs). The first method is generally slow, while the second method is typically very fast. 
For example, on certain NVMe SSDs that support NVME_NS_DEAC, submitting REQ_OP_WRITE_ZEROES requests with the NVME_WZ_DEAC bit can accelerate the write zeros operation by placing disk blocks into a deallocated state, which opportunistically avoids writing zeroes to media while still guaranteeing that subsequent reads from the specified block range will return zeroed data. This is a best-effort optimization, not a mandatory requirement; some devices may partially fall back to writing physical zeroes due to factors such as misalignment or being asked to clear a block range smaller than the device's internal allocation unit. Therefore, the speed of this operation is not guaranteed. It is difficult to determine whether the storage device supports the unmap write zeroes operation. We cannot determine this by only querying bdev_limits(bdev)->max_write_zeroes_sectors. First, add a new queue limit feature, BLK_FEAT_WRITE_ZEROES_UNMAP, to indicate whether a device supports this unmap write zeroes operation. Then, add a new counterpart flag, BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED and a sysfs entry, which allow users to disable this operation if the speed is very slow on some special devices. Finally, for the stacked devices cases, the BLK_FEAT_WRITE_ZEROES_UNMAP should be supported both by the stacking driver and all underlying devices. Thanks to Martin K. Petersen for optimizing the documentation of the write_zeroes_unmap sysfs interface. Signed-off-by: Zhang Yi --- Documentation/ABI/stable/sysfs-block | 20 ++++++++++++++++++++ block/blk-settings.c | 6 ++++++ block/blk-sysfs.c | 25 +++++++++++++++++++++++++ include/linux/blkdev.h | 18 ++++++++++++++++++ 4 files changed, 69 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 4ba771b56b3b5..8e7d513286c4e 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -778,6 +778,26 @@ Description: 0, write zeroes is not supported by the device. 
+What: /sys/block//queue/write_zeroes_unmap +Date: January 2025 +Contact: Zhang Yi +Description: + [RW] When read, this file will display whether the device has + enabled the unmap write zeroes operation. This operation + indicates that the device supports zeroing data in a specified + block range without incurring the cost of physically writing + zeroes to media for each individual block. It implements a + zeroing operation which opportunistically avoids writing zeroes + to media while still guaranteeing that subsequent reads from the + specified block range will return zeroed data. This operation is + a best-effort optimization, a device may fall back to physically + writing zeroes to media due to other factors such as + misalignment or being asked to clear a block range smaller than + the device's internal allocation unit. So the speed of this + operation is not guaranteed. Writing a value of '0' to this file + disables this operation. + + What: /sys/block//queue/zone_append_max_bytes Date: May 2020 Contact: linux-block@vger.kernel.org diff --git a/block/blk-settings.c b/block/blk-settings.c index a000daafbfb48..de99763fd6685 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -698,6 +698,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->features &= ~BLK_FEAT_NOWAIT; if (!(b->features & BLK_FEAT_POLL)) t->features &= ~BLK_FEAT_POLL; + if (!(b->features & BLK_FEAT_WRITE_ZEROES_UNMAP)) + t->features &= ~BLK_FEAT_WRITE_ZEROES_UNMAP; t->flags |= (b->flags & BLK_FLAG_MISALIGNED); @@ -820,6 +822,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } + + if (!t->max_write_zeroes_sectors) + t->features &= ~BLK_FEAT_WRITE_ZEROES_UNMAP; + blk_stack_atomic_writes_limits(t, b, start); return ret; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b2b9b89d6967c..e918b2c93aed8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -457,6 +457,29 
@@ static int queue_wc_store(struct gendisk *disk, const char *page, return 0; } +static ssize_t queue_write_zeroes_unmap_show(struct gendisk *disk, char *page) +{ + return sysfs_emit(page, "%u\n", + blk_queue_write_zeroes_unmap(disk->queue)); +} + +static int queue_write_zeroes_unmap_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) +{ + unsigned long val; + ssize_t ret; + + ret = queue_var_store(&val, page, count); + if (ret < 0) + return ret; + + if (val) + lim->flags &= ~BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED; + else + lim->flags |= BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED; + return 0; +} + #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ @@ -514,6 +537,7 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RW_ENTRY(queue_write_zeroes_unmap, "write_zeroes_unmap"); QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); @@ -662,6 +686,7 @@ static struct attribute *queue_attrs[] = { &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, + &queue_write_zeroes_unmap_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 332b56f323d92..6f1cf97b1f006 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -340,6 +340,9 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_ATOMIC_WRITES \ ((__force blk_features_t)(1u << 16)) +/* supports unmap write zeroes command */ +#define BLK_FEAT_WRITE_ZEROES_UNMAP ((__force blk_features_t)(1u << 17)) + /* * 
Flags automatically inherited when stacking limits. */ @@ -360,6 +363,10 @@ typedef unsigned int __bitwise blk_flags_t; /* passthrough command IO accounting */ #define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2)) +/* disable the unmap write zeroes operation */ +#define BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED \ + ((__force blk_flags_t)(1u << 3)) + struct queue_limits { blk_features_t features; blk_flags_t flags; @@ -1378,6 +1385,17 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) return bdev_limits(bdev)->max_write_zeroes_sectors; } +static inline bool blk_queue_write_zeroes_unmap(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_WRITE_ZEROES_UNMAP) && + !(q->limits.flags & BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED); +} + +static inline bool bdev_write_zeroes_unmap(struct block_device *bdev) +{ + return blk_queue_write_zeroes_unmap(bdev_get_queue(bdev)); +} + static inline bool bdev_nonrot(struct block_device *bdev) { return blk_queue_nonrot(bdev_get_queue(bdev)); From 0fbeb026db771d621d21ec14c46e9c6397ee05de Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:42 +0800 Subject: [PATCH 03/11] nvme: set BLK_FEAT_WRITE_ZEROES_UNMAP if device supports DEAC bit When the device supports the Write Zeroes command and the DEAC bit, it indicates that the deallocate bit in the Write Zeroes command is supported, and the bytes read from a deallocated logical block are zeroes. This means the device supports unmap Write Zeroes, so set the BLK_FEAT_WRITE_ZEROES_UNMAP feature to the device's queue limit. 
Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig --- drivers/nvme/host/core.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 92697f98c601d..703f60a77cd3f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2420,22 +2420,25 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, else lim.write_stream_granularity = 0; - ret = queue_limits_commit_update(ns->disk->queue, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue, memflags); - goto out; - } - - set_capacity_and_notify(ns->disk, capacity); - /* * Only set the DEAC bit if the device guarantees that reads from * deallocated data return zeroes. While the DEAC bit does not * require that, it must be a no-op if reads from deallocated data * do not return zeroes. */ - if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) + if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) { ns->head->features |= NVME_NS_DEAC; + if (lim.max_write_zeroes_sectors) + lim.features |= BLK_FEAT_WRITE_ZEROES_UNMAP; + } + + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue, memflags); + goto out; + } + + set_capacity_and_notify(ns->disk, capacity); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue, memflags); From 193890b7c29637de34f99667087de7363948e5cf Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:43 +0800 Subject: [PATCH 04/11] nvme-multipath: add BLK_FEAT_WRITE_ZEROES_UNMAP support Set the BLK_FEAT_WRITE_ZEROES_UNMAP feature while creating multipath stacking queue limits by default. This feature shall be disabled if any attached namespace does not support it. 
Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 140079ff86e6b..9b6c4a6317a4e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -745,7 +745,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_set_stacking_limits(&lim); lim.dma_alignment = 3; lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | - BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES; + BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES | + BLK_FEAT_WRITE_ZEROES_UNMAP; if (head->ids.csi == NVME_CSI_ZNS) lim.features |= BLK_FEAT_ZONED; From aae87d2ad1f7f3696241cce510890b1ff4d71eb1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:44 +0800 Subject: [PATCH 05/11] nvmet: set WZDS and DRB if device supports BLK_FEAT_WRITE_ZEROES_UNMAP Set the WZDS and DRB bits in the namespace dlfeat if the underlying block device supports BLK_FEAT_WRITE_ZEROES_UNMAP, so that the nvme target device supports the unmapped write zeroes command. 
Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index eba42df2f8215..03d69d5003593 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -46,6 +46,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->npda = id->npdg; /* NOWS = Namespace Optimal Write Size */ id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev)); + + /* Set WZDS and DRB if device supports unmapped write zeroes */ + if (bdev_write_zeroes_unmap(bdev)) + id->dlfeat = (1 << 3) | 0x1; } void nvmet_bdev_ns_disable(struct nvmet_ns *ns) From 84dca0f5911925a754ef7a831a8f86183b25d12f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:45 +0800 Subject: [PATCH 06/11] scsi: sd: set BLK_FEAT_WRITE_ZEROES_UNMAP if device supports unmap zeroing mode When the device supports the Write Zeroes command and the zeroing mode is set to SD_ZERO_WS16_UNMAP or SD_ZERO_WS10_UNMAP, this means that the device supports unmap Write Zeroes, so set the corresponding BLK_FEAT_WRITE_ZEROES_UNMAP feature to the device's queue limit. 
Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig --- drivers/scsi/sd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3f6e87705b62e..c34b7fac876dd 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1118,6 +1118,11 @@ static void sd_config_write_same(struct scsi_disk *sdkp, else sdkp->zeroing_mode = SD_ZERO_WRITE; + if (sdkp->max_ws_blocks && + (sdkp->zeroing_mode == SD_ZERO_WS16_UNMAP || + sdkp->zeroing_mode == SD_ZERO_WS10_UNMAP)) + lim->features |= BLK_FEAT_WRITE_ZEROES_UNMAP; + if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* From 9bb38bc0e353556216c06c84b104c5a63ad8b162 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:46 +0800 Subject: [PATCH 07/11] dm: add BLK_FEAT_WRITE_ZEROES_UNMAP support Set the BLK_FEAT_WRITE_ZEROES_UNMAP feature on stacking queue limits by default. This feature shall be disabled if any underlying device does not support it. Signed-off-by: Zhang Yi Reviewed-by: Benjamin Marzinski --- drivers/md/dm-table.c | 7 +++++-- drivers/md/dm.c | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 6b23e777e10e7..4d450713b69da 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -599,7 +599,8 @@ int dm_split_args(int *argc, char ***argvp, char *input) static void dm_set_stacking_limits(struct queue_limits *limits) { blk_set_stacking_limits(limits); - limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; + limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL | + BLK_FEAT_WRITE_ZEROES_UNMAP; } /* @@ -1851,8 +1852,10 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, limits->discard_alignment = 0; } - if (!dm_table_supports_write_zeroes(t)) + if (!dm_table_supports_write_zeroes(t)) { limits->max_write_zeroes_sectors = 0; + limits->features &= ~BLK_FEAT_WRITE_ZEROES_UNMAP; + } if 
(!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5ab7574c0c76a..b59c3dbeaaf10 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1096,6 +1096,7 @@ void disable_write_zeroes(struct mapped_device *md) /* device doesn't really support WRITE ZEROES, disable it */ limits->max_write_zeroes_sectors = 0; + limits->features &= ~BLK_FEAT_WRITE_ZEROES_UNMAP; } static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) From 63391a36208cf221723196d54557bdc1e24e7c30 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:47 +0800 Subject: [PATCH 08/11] fs: introduce FALLOC_FL_WRITE_ZEROES to fallocate With the development of flash-based storage devices, we can quickly write zeros to SSDs using the WRITE_ZERO command if the devices do not actually write physical zeroes to the media. Therefore, we can use this command to quickly preallocate a real all-zero file with written extents. This approach should be beneficial for subsequent pure overwriting within this file, as it can save on block allocation and, consequently, significant metadata changes, which should greatly improve overwrite performance on certain filesystems. Therefore, introduce a new operation FALLOC_FL_WRITE_ZEROES to fallocate. This flag is used to convert a specified range of a file to zeros by issuing a zeroing operation. Blocks should be allocated for the regions that span holes in the file, and the entire range is converted to written extents. If the underlying device supports the actual offload write zeroes command, the process of zeroing out operation can be accelerated. If it does not, we currently don't prevent the file system from writing actual zeros to the device. This provides users with a new method to quickly generate a zeroed file, users no longer need to write zero data to create a file with written extents. 
Users can determine whether a disk supports the unmap write zeroes operation by querying this sysfs interface: /sys/block//queue/write_zeroes_unmap Finally, this flag cannot be specified in conjunction with the FALLOC_FL_KEEP_SIZE since allocating written extents beyond file EOF is not permitted. In addition, filesystems that always require out-of-place writes should not support this flag since they still need to allocate new blocks during subsequent overwrites. Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig --- fs/open.c | 1 + include/linux/falloc.h | 3 ++- include/uapi/linux/falloc.h | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 7828234a7caa4..b777e11e55220 100644 --- a/fs/open.c +++ b/fs/open.c @@ -281,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: + case FALLOC_FL_WRITE_ZEROES: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 3f49f3df6af5f..7c38c6b76b60f 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -36,7 +36,8 @@ struct space_resv { FALLOC_FL_COLLAPSE_RANGE | \ FALLOC_FL_ZERO_RANGE | \ FALLOC_FL_INSERT_RANGE | \ - FALLOC_FL_UNSHARE_RANGE) + FALLOC_FL_UNSHARE_RANGE | \ + FALLOC_FL_WRITE_ZEROES) /* on ia32 l_start is on a 32-bit boundary */ #if defined(CONFIG_X86_64) diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h index 5810371ed72bb..265aae7ff8c18 100644 --- a/include/uapi/linux/falloc.h +++ b/include/uapi/linux/falloc.h @@ -78,4 +78,22 @@ */ #define FALLOC_FL_UNSHARE_RANGE 0x40 +/* + * FALLOC_FL_WRITE_ZEROES is used to convert a specified range of a file to + * zeros by issuing a zeroing operation. Blocks should be allocated for the + * regions that span holes in the file, and the entire range is converted to + * written extents. 
This flag is beneficial for subsequent pure overwriting + * within this range, as it can save on block allocation and, consequently, + * significant metadata changes. Therefore, filesystems that always require + * out-of-place writes should not support this flag. + * + * Different filesystems may implement different limitations on the + * granularity of the zeroing operation. Most will preferably be accelerated + * by submitting write zeroes command if the backing storage supports, which + * may not physically write zeros to the media. + * + * This flag cannot be specified in conjunction with the FALLOC_FL_KEEP_SIZE. + */ +#define FALLOC_FL_WRITE_ZEROES 0x80 + #endif /* _UAPI_FALLOC_H_ */ From f70426349d8a03f97ab3ffe5bf5a1bcd8c6ca92c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:48 +0800 Subject: [PATCH 09/11] block: factor out common part in blkdev_fallocate() Only the flags passed to blkdev_issue_zeroout() differ among the two zeroing branches in blkdev_fallocate(). Therefore, do cleanup by factoring them out. Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig --- block/fops.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/block/fops.c b/block/fops.c index 1309861d4c2c4..e1c921549d289 100644 --- a/block/fops.c +++ b/block/fops.c @@ -850,6 +850,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; + unsigned int flags; int error; /* Fail if we don't recognize the flags. */ @@ -877,34 +878,29 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); - /* - * Invalidate the page cache, including dirty pages, for valid - * de-allocate mode calls to fallocate(). 
- */ switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + flags = BLKDEV_ZERO_NOUNMAP; break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, - BLKDEV_ZERO_NOFALLBACK); + flags = BLKDEV_ZERO_NOFALLBACK; break; default: error = -EOPNOTSUPP; + goto fail; } + /* + * Invalidate the page cache, including dirty pages, for valid + * de-allocate mode calls to fallocate(). + */ + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); + if (error) + goto fail; + + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, flags); fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); From 3da3890b518c7eeaedc51c2c5af7a3833da02285 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:49 +0800 Subject: [PATCH 10/11] block: add FALLOC_FL_WRITE_ZEROES support Add support for FALLOC_FL_WRITE_ZEROES, if the block device enables the unmap write zeroes operation, it will issue a write zeroes command. 
Signed-off-by: Zhang Yi --- block/fops.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/block/fops.c b/block/fops.c index e1c921549d289..050c16f5974a4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -841,7 +841,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE) + FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) @@ -856,6 +856,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the device does not enable the + * unmap write zeroes operation. + */ + if (!bdev_write_zeroes_unmap(bdev) && + (mode & FALLOC_FL_WRITE_ZEROES)) + return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); @@ -886,6 +893,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOFALLBACK; break; + case FALLOC_FL_WRITE_ZEROES: + flags = 0; + break; default: error = -EOPNOTSUPP; goto fail; From ad7f53838d9017f99d6f01cc1e6bcfa5634255ae Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 10:08:50 +0800 Subject: [PATCH 11/11] ext4: add FALLOC_FL_WRITE_ZEROES support Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables the unmap write zeroes operation. This first allocates blocks as unwritten, then issues a zero command outside of the running journal handle, and finally converts them to a written state. 
Signed-off-by: Zhang Yi --- fs/ext4/extents.c | 66 ++++++++++++++++++++++++++++++------- include/trace/events/ext4.h | 3 +- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b543a46fc8096..29ce9f6287d08 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4501,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, struct ext4_map_blocks map; unsigned int credits; loff_t epos, old_size = i_size_read(inode); + unsigned int blkbits = inode->i_blkbits; + bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4513,6 +4515,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + /* + * Do the actual write zero during a running journal transaction + * costs a lot. First allocate an unwritten extent and then + * convert it to written after zeroing it out. + */ + if (flags & EXT4_GET_BLOCKS_ZERO) { + flags &= ~EXT4_GET_BLOCKS_ZERO; + flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; + alloc_zero = true; + } + /* * credits to insert 1 extent into extent tree */ @@ -4549,9 +4562,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * allow a full retry cycle for any remaining allocations */ retries = 0; - map.m_lblk += ret; - map.m_len = len = len - ret; - epos = (loff_t)map.m_lblk << inode->i_blkbits; + epos = (loff_t)(map.m_lblk + ret) << blkbits; inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) @@ -4571,6 +4582,21 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ret2 = ret3 ? 
ret3 : ret2; if (unlikely(ret2)) break; + + if (alloc_zero && + (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { + ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, + map.m_len); + if (likely(!ret2)) + ret2 = ext4_convert_unwritten_extents(NULL, + inode, (loff_t)map.m_lblk << blkbits, + (loff_t)map.m_len << blkbits); + if (ret2) + break; + } + + map.m_lblk += ret; + map.m_len = len = len - ret; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -4636,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (end_lblk > start_lblk) { ext4_lblk_t zero_blks = end_lblk - start_lblk; - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + if (mode & FALLOC_FL_WRITE_ZEROES) + flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; + else + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, new_size, flags); if (ret) @@ -4745,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the underlying device does not + * enable the unmap write zeroes operation. 
+ */ + if (!bdev_write_zeroes_unmap(inode->i_sb->s_bdev) && + (mode & FALLOC_FL_WRITE_ZEROES)) + return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | - FALLOC_FL_INSERT_RANGE)) + FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES)) return -EOPNOTSUPP; inode_lock(inode); @@ -4780,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) goto out_invalidate_lock; - if (mode & FALLOC_FL_PUNCH_HOLE) + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: ret = ext4_punch_hole(file, offset, len); - else if (mode & FALLOC_FL_COLLAPSE_RANGE) + break; + case FALLOC_FL_COLLAPSE_RANGE: ret = ext4_collapse_range(file, offset, len); - else if (mode & FALLOC_FL_INSERT_RANGE) + break; + case FALLOC_FL_INSERT_RANGE: ret = ext4_insert_range(file, offset, len); - else if (mode & FALLOC_FL_ZERO_RANGE) + break; + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_WRITE_ZEROES: ret = ext4_zero_range(file, offset, len, mode); - else + break; + default: ret = -EOPNOTSUPP; + } out_invalidate_lock: filemap_invalidate_unlock(mapping); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 156908641e68f..6f9cf28117330 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -92,7 +92,8 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ - { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) + { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}, \ + { FALLOC_FL_WRITE_ZEROES, "WRITE_ZEROES"}) TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);