From 4df3436cc5602ea838a312709c26bf155c8134bd Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 29 Jul 2025 20:21:31 +0530 Subject: [PATCH 1/5] fs: add a new user_write_streams() callback so that a filesystem can control the number of write streams available to user space. Signed-off-by: Kanchan Joshi --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d7051..4bf06edc3c8ec 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2379,6 +2379,11 @@ struct super_operations { */ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); void (*shutdown)(struct super_block *sb); + /* + * Implement this callback if filesystem wants to control the + * number of streams that are available to user space. + */ + u8 (*user_write_streams)(struct super_block *sb); }; /* From 82e8ec9e8ac35a082836974c0ac2b5c282241a45 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 29 Jul 2025 20:21:32 +0530 Subject: [PATCH 2/5] fs: add the interface to query user write streams Add a new fcntl, F_GET_MAX_WRITE_STREAMS. It returns the number of write streams that are available to user space. To determine that number, use the ->user_write_streams() callback when the involved filesystem provides it. In the absence of such a callback, use the 'max_write_streams' queue limit of the underlying block device. 
Signed-off-by: Kanchan Joshi --- fs/fcntl.c | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/fcntl.h | 5 +++++ 2 files changed, 36 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 5598e4d574229..36ca833e9a0bd 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -394,6 +395,33 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, return 0; } +static u8 vfs_user_write_streams(struct inode *inode) +{ + struct super_block *sb; + + if (S_ISBLK(inode->i_mode)) + return bdev_max_write_streams(I_BDEV(inode)); + + sb = inode->i_sb; + /* If available, use per-mount/fs policy */ + if (sb->s_op && sb->s_op->user_write_streams) + return sb->s_op->user_write_streams(sb); + /* otherwise, fallback to queue limit */ + if (sb->s_bdev) + return bdev_max_write_streams(sb->s_bdev); + return 0; +} + +static long fcntl_get_max_write_streams(struct file *file) +{ + struct inode *inode = file_inode(file); + + if (S_ISBLK(inode->i_mode)) + inode = file->f_mapping->host; + + return vfs_user_write_streams(inode); +} + /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { @@ -552,6 +580,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, cmd, arg); break; + case F_GET_MAX_WRITE_STREAMS: + err = fcntl_get_max_write_streams(filp); + break; default: break; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94ebc..87ec808d0f035 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -61,6 +61,11 @@ #define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) #define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +/* + * Query available write streams + */ +#define F_GET_MAX_WRITE_STREAMS (F_LINUX_SPECIFIC_BASE + 15) + /* * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be * used to clear any hints previously set. 
From d26c89840b0a584234f86a43a1aad108fa054b2d Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 29 Jul 2025 20:21:33 +0530 Subject: [PATCH 3/5] fs: add a write stream field to the inode Prepare for supporting per-inode write streams. Part of the existing 32-bit hole is used for the new field. Signed-off-by: Kanchan Joshi --- fs/inode.c | 1 + include/linux/fs.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e2..bb1a9a043b32d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -250,6 +250,7 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; + inode->i_write_stream = 0; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4bf06edc3c8ec..79f9552620e0b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -725,7 +725,8 @@ struct inode { /* Misc */ u32 i_state; - /* 32-bit hole */ + u8 i_write_stream; + /* 24-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ From a4466def8628fc0276c1d070ad07bf80969720d7 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 29 Jul 2025 20:21:34 +0530 Subject: [PATCH 4/5] fs: propagate write stream bio->bi_write_stream is not set by the filesystem code. Use the inode's write stream value to set it wherever the filesystem code already sets bio->bi_write_hint. 
Signed-off-by: Kanchan Joshi --- fs/btrfs/extent_io.c | 1 + fs/buffer.c | 14 +++++++++----- fs/direct-io.c | 1 + fs/ext4/page-io.c | 1 + fs/iomap/direct-io.c | 1 + fs/iomap/ioend.c | 1 + fs/mpage.c | 1 + 7 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c953297aa89a0..513d23bd57519 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -666,6 +666,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; + bbio->bio.bi_write_stream = inode->vfs_inode.i_write_stream; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; diff --git a/fs/buffer.c b/fs/buffer.c index 6a8752f7bbedb..8cb25c5f047b0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -55,7 +55,8 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - enum rw_hint hint, struct writeback_control *wbc); + enum rw_hint hint, u8 write_stream, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1931,7 +1932,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, - inode->i_write_hint, wbc); + inode->i_write_hint, + inode->i_write_stream, wbc); nr_underway++; } bh = next; @@ -1986,7 +1988,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, if (buffer_async_write(bh)) { clear_buffer_dirty(bh); submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, - inode->i_write_hint, wbc); + inode->i_write_hint, + inode->i_write_stream, wbc); nr_underway++; } bh = next; @@ -2778,7 +2781,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static void submit_bh_wbc(blk_opf_t opf, struct 
buffer_head *bh, - enum rw_hint write_hint, + enum rw_hint write_hint, u8 write_stream, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; @@ -2807,6 +2810,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; + bio->bi_write_stream = write_stream; bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); @@ -2826,7 +2830,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); + submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, 0, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/direct-io.c b/fs/direct-io.c index 2267f5ae7f77a..ae8429b120ddb 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -410,6 +410,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, if (dio->is_pinned) bio_set_flag(bio, BIO_PAGE_PINNED); bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = file_inode(dio->iocb->ki_filp)->i_write_stream; sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 39abfeec5f36c..e5c4a691065e6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -447,6 +447,7 @@ static void io_submit_add_bh(struct ext4_io_submit *io, if (io->io_bio == NULL) { io_submit_init_bio(io, bh); io->io_bio->bi_write_hint = inode->i_write_hint; + io->io_bio->bi_write_stream = inode->i_write_stream; } if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b84f6af2eb4c8..dc35b27316b76 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -430,6 +430,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); 
bio->bi_write_hint = inode->i_write_hint; + bio->bi_write_stream = inode->i_write_stream; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab260..652525c93fdd4 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -107,6 +107,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_write_hint = wpc->inode->i_write_hint; + bio->bi_write_stream = wpc->inode->i_write_stream; wbc_init_bio(wpc->wbc, bio); wpc->nr_folios = 0; return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags); diff --git a/fs/mpage.c b/fs/mpage.c index c5fd821fd30e5..6a50bbe38adc3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -595,6 +595,7 @@ static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio, bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; + bio->bi_write_stream = inode->i_write_stream; } /* From fcf2b665284dae583c8ee4ddf0489ae1956a1ca4 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 29 Jul 2025 20:21:35 +0530 Subject: [PATCH 5/5] fs: add set and query write stream Add two new fcntls: F_GET_WRITE_STREAM - to query the write stream on the inode; F_SET_WRITE_STREAM - to set the write stream on the inode. Applications should query the number of available streams by calling F_GET_MAX_WRITE_STREAMS first. If the returned value is N, applications can choose any value from 1 to N when setting the stream. Setting the value 0 is not flagged as an error, as that implies no stream, but setting a value larger than the number of available streams is rejected with -EINVAL. 
Signed-off-by: Kanchan Joshi --- fs/fcntl.c | 33 +++++++++++++++++++++++++++++++++ include/uapi/linux/fcntl.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 36ca833e9a0bd..ce89393f8dbf0 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -422,6 +422,33 @@ static long fcntl_get_max_write_streams(struct file *file) return vfs_user_write_streams(inode); } +static long fcntl_get_write_stream(struct file *file) +{ + struct inode *inode = file_inode(file); + + if (S_ISBLK(inode->i_mode)) + inode = file->f_mapping->host; + + return inode->i_write_stream; +} + +static long fcntl_set_write_stream(struct file *file, unsigned long arg) +{ + struct inode *inode = file_inode(file); + + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) + return -EPERM; + + if (S_ISBLK(inode->i_mode)) + inode = file->f_mapping->host; + + if (arg > vfs_user_write_streams(inode)) + return -EINVAL; + + WRITE_ONCE(inode->i_write_stream, arg); + return 0; +} + /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { @@ -583,6 +610,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GET_MAX_WRITE_STREAMS: err = fcntl_get_max_write_streams(filp); break; + case F_GET_WRITE_STREAM: + err = fcntl_get_write_stream(filp); + break; + case F_SET_WRITE_STREAM: + err = fcntl_set_write_stream(filp, arg); + break; default: break; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 87ec808d0f035..dd3c498515ce0 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -65,6 +65,8 @@ * Query available write streams */ #define F_GET_MAX_WRITE_STREAMS (F_LINUX_SPECIFIC_BASE + 15) +#define F_GET_WRITE_STREAM (F_LINUX_SPECIFIC_BASE + 16) +#define F_SET_WRITE_STREAM (F_LINUX_SPECIFIC_BASE + 17) /* * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be