Skip to content

Commit ad0c23c

Browse files
morbidrsa authored and kdave committed
btrfs: zoned: limit number of zones reclaimed in flush_space()
Limit the number of zones reclaimed in flush_space()'s RECLAIM_ZONES state. This prevents potentially long-running reclaim sweeps from blocking other tasks in the system while the system is already under pressure, which causes those tasks to hang. An example of this can be seen here, triggered by fstests generic/551: generic/551 [ 27.042349] run fstests generic/551 at 2026-02-27 11:05:30 BTRFS: device fsid 78c16e29-20d9-4c8e-bc04-7ba431be38ff devid 1 transid 8 /dev/vdb (254:16) scanned by mount (806) BTRFS info (device vdb): first mount of filesystem 78c16e29-20d9-4c8e-bc04-7ba431be38ff BTRFS info (device vdb): using crc32c checksum algorithm BTRFS info (device vdb): host-managed zoned block device /dev/vdb, 64 zones of 268435456 bytes BTRFS info (device vdb): zoned mode enabled with zone size 268435456 BTRFS info (device vdb): checking UUID tree BTRFS info (device vdb): enabling free space tree INFO: task kworker/u38:1:90 blocked for more than 120 seconds. Not tainted 7.0.0-rc1+ #345 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u38:1 state:D stack:0 pid:90 tgid:90 ppid:2 task_flags:0x4208060 flags:0x00080000 Workqueue: events_unbound btrfs_async_reclaim_data_space Call Trace: <TASK> __schedule+0x34f/0xe70 schedule+0x41/0x140 schedule_timeout+0xa3/0x110 ? mark_held_locks+0x40/0x70 ? lockdep_hardirqs_on_prepare+0xd8/0x1c0 ? trace_hardirqs_on+0x18/0x100 ? lockdep_hardirqs_on+0x84/0x130 ? _raw_spin_unlock_irq+0x33/0x50 wait_for_completion+0xa4/0x150 ? __flush_work+0x24c/0x550 __flush_work+0x339/0x550 ? __pfx_wq_barrier_func+0x10/0x10 ? wait_for_completion+0x39/0x150 flush_space+0x243/0x660 ? find_held_lock+0x2b/0x80 ? kvm_sched_clock_read+0x11/0x20 ? local_clock_noinstr+0x17/0x110 ? local_clock+0x15/0x30 ? lock_release+0x1b7/0x4b0 do_async_reclaim_data_space+0xe8/0x160 btrfs_async_reclaim_data_space+0x19/0x30 process_one_work+0x20a/0x5f0 ? lock_is_held_type+0xcd/0x130 worker_thread+0x1e2/0x3c0 ? 
__pfx_worker_thread+0x10/0x10 kthread+0x103/0x150 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x20d/0x320 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 </TASK> Showing all locks held in the system: 1 lock held by khungtaskd/67: #0: ffffffff824d58e0 (rcu_read_lock){....}-{1:3}, at: debug_show_all_locks+0x3d/0x194 2 locks held by kworker/u38:1/90: #0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0 #1: ffffc90000c17e58 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0 5 locks held by kworker/u39:1/191: #0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0 #1: ffffc90000dfbe58 ((work_completion)(&fs_info->reclaim_bgs_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0 #2: ffff888101da0420 (sb_writers#9){.+.+}-{0:0}, at: process_one_work+0x20a/0x5f0 #3: ffff88811040a648 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: btrfs_reclaim_bgs_work+0x1de/0x770 #4: ffff888110408a18 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x95a/0x20f0 1 lock held by aio-dio-write-v/980: #0: ffff888110093008 (&sb->s_type->i_mutex_key#15){++++}-{4:4}, at: btrfs_inode_lock+0x51/0xb0 ============================================= To prevent these long-running reclaims from blocking the system, only reclaim 5 block_groups in the RECLAIM_ZONES state of flush_space(). Also, as these reclaims are now constrained, this opens up the use of a synchronous call to btrfs_reclaim_block_groups(), eliminating the need to place the reclaim task on a workqueue and then flush the workqueue again. Reviewed-by: Boris Burkov <[email protected]> Signed-off-by: Johannes Thumshirn <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent bd0ffde commit ad0c23c

3 files changed

Lines changed: 14 additions & 6 deletions

File tree

fs/btrfs/block-group.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1909,7 +1909,7 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b
19091909
return true;
19101910
}
19111911

1912-
static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
1912+
static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed)
19131913
{
19141914
struct btrfs_fs_info *fs_info = bg->fs_info;
19151915
struct btrfs_space_info *space_info = bg->space_info;
@@ -2036,15 +2036,18 @@ static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
20362036
if (space_info->total_bytes < old_total)
20372037
btrfs_set_periodic_reclaim_ready(space_info, true);
20382038
spin_unlock(&space_info->lock);
2039+
if (!ret)
2040+
(*reclaimed)++;
20392041

20402042
return ret;
20412043
}
20422044

2043-
static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
2045+
void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit)
20442046
{
20452047
struct btrfs_block_group *bg;
20462048
struct btrfs_space_info *space_info;
20472049
LIST_HEAD(retry_list);
2050+
int reclaimed = 0;
20482051

20492052
if (!btrfs_should_reclaim(fs_info))
20502053
return;
@@ -2080,7 +2083,7 @@ static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
20802083

20812084
space_info = bg->space_info;
20822085
spin_unlock(&fs_info->unused_bgs_lock);
2083-
ret = btrfs_reclaim_block_group(bg);
2086+
ret = btrfs_reclaim_block_group(bg, &reclaimed);
20842087

20852088
if (ret && !READ_ONCE(space_info->periodic_reclaim))
20862089
btrfs_link_bg_list(bg, &retry_list);
@@ -2099,6 +2102,8 @@ static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
20992102
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
21002103
goto end;
21012104
spin_lock(&fs_info->unused_bgs_lock);
2105+
if (reclaimed >= limit)
2106+
break;
21022107
}
21032108
spin_unlock(&fs_info->unused_bgs_lock);
21042109
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -2114,7 +2119,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
21142119
struct btrfs_fs_info *fs_info =
21152120
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
21162121

2117-
btrfs_reclaim_block_groups(fs_info);
2122+
btrfs_reclaim_block_groups(fs_info, -1);
21182123
}
21192124

21202125
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)

fs/btrfs/block-group.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
350350
struct btrfs_chunk_map *map);
351351
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
352352
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
353+
void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit);
353354
void btrfs_reclaim_bgs_work(struct work_struct *work);
354355
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
355356
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);

fs/btrfs/space-info.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
212212

213213
#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
214214

215+
#define BTRFS_ZONED_SYNC_RECLAIM_BATCH (5)
216+
215217
/*
216218
* Calculate chunk size depending on volume type (regular or zoned).
217219
*/
@@ -918,8 +920,8 @@ static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes,
918920
if (btrfs_is_zoned(fs_info)) {
919921
btrfs_reclaim_sweep(fs_info);
920922
btrfs_delete_unused_bgs(fs_info);
921-
btrfs_reclaim_bgs(fs_info);
922-
flush_work(&fs_info->reclaim_bgs_work);
923+
btrfs_reclaim_block_groups(fs_info,
924+
BTRFS_ZONED_SYNC_RECLAIM_BATCH);
923925
ASSERT(current->journal_info == NULL);
924926
ret = btrfs_commit_current_transaction(root);
925927
} else {

0 commit comments

Comments
 (0)