Skip to content

Commit f84c9dd

Browse files
leitao authored and htejun committed
workqueue: add time-based panic for stalls
Add a new module parameter 'panic_on_stall_time' that triggers a panic when a workqueue stall has persisted for at least the specified number of seconds.

Unlike 'panic_on_stall', which counts accumulated stall events, this parameter triggers based on the duration of a single continuous stall. This is useful for catching truly stuck workqueues rather than accumulating transient stalls.

Usage: workqueue.panic_on_stall_time=120

This would panic if any workqueue pool has been stalled for 120 seconds or more.

The stall duration is measured from the pool's last progress (pool_ts), which accounts for legitimate system stalls.

Signed-off-by: Breno Leitao <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
1 parent 32d572e commit f84c9dd

2 files changed

Lines changed: 26 additions & 4 deletions

File tree

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8339,6 +8339,14 @@ Kernel parameters
83398339
The default is set by CONFIG_BOOTPARAM_WQ_STALL_PANIC,
83408340
which is 0 (disabled) if not configured.
83418341

8342+
workqueue.panic_on_stall_time=<uint>
8343+
Panic when a workqueue stall has been continuous for
8344+
the specified number of seconds. Unlike panic_on_stall
8345+
which counts accumulated stall events, this triggers
8346+
based on the duration of a single continuous stall.
8347+
8348+
The default is 0, which disables the time-based panic.
8349+
83428350
workqueue.cpu_intensive_thresh_us=
83438351
Per-cpu work items which run for longer than this
83448352
threshold are automatically considered CPU intensive

kernel/workqueue.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7571,6 +7571,10 @@ static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
75717571
static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
75727572
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
75737573

7574+
static unsigned int wq_panic_on_stall_time;
7575+
module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
7576+
MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
7577+
75747578
/*
75757579
* Show workers that might prevent the processing of pending work items.
75767580
* The only candidates are CPU-bound workers in the running state.
@@ -7622,14 +7626,21 @@ static void show_cpu_pools_hogs(void)
76227626
rcu_read_unlock();
76237627
}
76247628

7625-
static void panic_on_wq_watchdog(void)
7629+
/*
7630+
* It triggers a panic in two scenarios: when the total number of stalls
7631+
* reaches a threshold, and when a stall has lasted at least
7632+
* wq_panic_on_stall_time
7633+
*/
7634+
static void panic_on_wq_watchdog(unsigned int stall_time_sec)
76267635
{
76277636
static unsigned int wq_stall;
76287637

76297638
if (wq_panic_on_stall) {
76307639
wq_stall++;
76317640
BUG_ON(wq_stall >= wq_panic_on_stall);
76327641
}
7642+
7643+
BUG_ON(wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time);
76337644
}
76347645

76357646
static void wq_watchdog_reset_touched(void)
@@ -7644,10 +7655,12 @@ static void wq_watchdog_reset_touched(void)
76447655
static void wq_watchdog_timer_fn(struct timer_list *unused)
76457656
{
76467657
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
7658+
unsigned int max_stall_time = 0;
76477659
bool lockup_detected = false;
76487660
bool cpu_pool_stall = false;
76497661
unsigned long now = jiffies;
76507662
struct worker_pool *pool;
7663+
unsigned int stall_time;
76517664
int pi;
76527665

76537666
if (!thresh)
@@ -7681,14 +7694,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
76817694
/* did we stall? */
76827695
if (time_after(now, ts + thresh)) {
76837696
lockup_detected = true;
7697+
stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
7698+
max_stall_time = max(max_stall_time, stall_time);
76847699
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
76857700
pool->cpu_stall = true;
76867701
cpu_pool_stall = true;
76877702
}
76887703
pr_emerg("BUG: workqueue lockup - pool");
76897704
pr_cont_pool_info(pool);
7690-
pr_cont(" stuck for %us!\n",
7691-
jiffies_to_msecs(now - pool_ts) / 1000);
7705+
pr_cont(" stuck for %us!\n", stall_time);
76927706
}
76937707

76947708

@@ -7701,7 +7715,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
77017715
show_cpu_pools_hogs();
77027716

77037717
if (lockup_detected)
7704-
panic_on_wq_watchdog();
7718+
panic_on_wq_watchdog(max_stall_time);
77057719

77067720
wq_watchdog_reset_touched();
77077721
mod_timer(&wq_watchdog_timer, jiffies + thresh);

0 commit comments

Comments
 (0)