Skip to content

Commit 9bdc648

Browse files
committed
Merge tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue updates from Tejun Heo: - Rework the rescuer to process work items one-by-one instead of slurping all pending work items in a single pass. As there is only one rescuer per workqueue, a single long-blocking work item could cause high latency for all tasks queued behind it, even after memory pressure is relieved and regular kworkers become available to service them. - Add CONFIG_BOOTPARAM_WQ_STALL_PANIC build-time option and workqueue.panic_on_stall_time parameter for time-based stall panic, giving systems more control over workqueue stall handling. - Replace BUG_ON() with panic() in the stall panic path for clearer intent and more informative output. * tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq: workqueue: replace BUG_ON with panic in panic_on_wq_watchdog workqueue: add time-based panic for stalls workqueue: add CONFIG_BOOTPARAM_WQ_STALL_PANIC option workqueue: Process extra works in rescuer on memory pressure workqueue: Process rescuer work items one-by-one using a cursor workqueue: Make send_mayday() take a PWQ argument directly
2 parents 1e83ccd + 9cb8b0f commit 9bdc648

3 files changed

Lines changed: 148 additions & 35 deletions

File tree

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8376,7 +8376,16 @@ Kernel parameters
83768376
CONFIG_WQ_WATCHDOG. It sets the number of stall
83778377
events required to trigger a panic.
83788378

8379-
The default is 0, which disables the panic on stall.
8379+
The default is set by CONFIG_BOOTPARAM_WQ_STALL_PANIC,
8380+
which is 0 (disabled) if not configured.
8381+
8382+
workqueue.panic_on_stall_time=<uint>
8383+
Panic when a workqueue stall has been continuous for
8384+
the specified number of seconds. Unlike panic_on_stall
8385+
which counts accumulated stall events, this triggers
8386+
based on the duration of a single continuous stall.
8387+
8388+
The default is 0, which disables the time-based panic.
83808389

83818390
workqueue.cpu_intensive_thresh_us=
83828391
Per-cpu work items which run for longer than this

kernel/workqueue.c

Lines changed: 115 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ enum wq_internal_consts {
117117
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
118118
CREATE_COOLDOWN = HZ, /* time to breath after fail */
119119

120+
RESCUER_BATCH = 16, /* process items per turn */
121+
120122
/*
121123
* Rescue workers are used only on emergencies and shared by
122124
* all cpus. Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
286288
struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
287289
struct list_head pwqs_node; /* WR: node on wq->pwqs */
288290
struct list_head mayday_node; /* MD: node on wq->maydays */
291+
struct work_struct mayday_cursor; /* L: cursor on pool->worklist */
289292

290293
u64 stats[PWQ_NR_STATS];
291294

@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
11201123
return NULL;
11211124
}
11221125

1126+
static void mayday_cursor_func(struct work_struct *work)
1127+
{
1128+
/* should not be processed, only for marking position */
1129+
BUG();
1130+
}
1131+
11231132
/**
11241133
* move_linked_works - move linked works to a list
11251134
* @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
11821191

11831192
lockdep_assert_held(&pool->lock);
11841193

1194+
/* The cursor work should not be processed */
1195+
if (unlikely(work->func == mayday_cursor_func)) {
1196+
/* only worker_thread() can possibly take this branch */
1197+
WARN_ON_ONCE(worker->rescue_wq);
1198+
if (nextp)
1199+
*nextp = list_next_entry(work, entry);
1200+
list_del_init(&work->entry);
1201+
return false;
1202+
}
1203+
11851204
/*
11861205
* A single work shouldn't be executed concurrently by multiple workers.
11871206
* __queue_work() ensures that @work doesn't jump to a different pool
@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
29762995
reap_dying_workers(&cull_list);
29772996
}
29782997

2979-
static void send_mayday(struct work_struct *work)
2998+
static void send_mayday(struct pool_workqueue *pwq)
29802999
{
2981-
struct pool_workqueue *pwq = get_work_pwq(work);
29823000
struct workqueue_struct *wq = pwq->wq;
29833001

29843002
lockdep_assert_held(&wq_mayday_lock);
@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
30163034
* rescuers.
30173035
*/
30183036
list_for_each_entry(work, &pool->worklist, entry)
3019-
send_mayday(work);
3037+
send_mayday(get_work_pwq(work));
30203038
}
30213039

30223040
raw_spin_unlock(&wq_mayday_lock);
@@ -3440,22 +3458,57 @@ static int worker_thread(void *__worker)
34403458
static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
34413459
{
34423460
struct worker_pool *pool = pwq->pool;
3461+
struct work_struct *cursor = &pwq->mayday_cursor;
34433462
struct work_struct *work, *n;
34443463

3445-
/* need rescue? */
3446-
if (!pwq->nr_active || !need_to_create_worker(pool))
3464+
/* have work items to rescue? */
3465+
if (!pwq->nr_active)
34473466
return false;
34483467

3449-
/*
3450-
* Slurp in all works issued via this workqueue and
3451-
* process'em.
3452-
*/
3453-
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
3454-
if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
3468+
/* need rescue? */
3469+
if (!need_to_create_worker(pool)) {
3470+
/*
3471+
* The pool has idle workers and doesn't need the rescuer, so it
3472+
* could simply return false here.
3473+
*
3474+
* However, the memory pressure might not be fully relieved.
3475+
* In PERCPU pool with concurrency enabled, having idle workers
3476+
* does not necessarily mean memory pressure is gone; it may
3477+
* simply mean regular workers have woken up, completed their
3478+
* work, and gone idle again due to concurrency limits.
3479+
*
3480+
* In this case, those working workers may later sleep again,
3481+
* the pool may run out of idle workers, and it will have to
3482+
* allocate new ones and wait for the timer to send mayday,
3483+
* causing unnecessary delay - especially if memory pressure
3484+
* was never resolved throughout.
3485+
*
3486+
* Do more work while memory pressure persists, to reduce relapse,
3487+
* using (pool->flags & POOL_MANAGER_ACTIVE) as an imprecise signal,
3488+
* unless there are other PWQs needing help.
3489+
*/
3490+
if (!(pool->flags & POOL_MANAGER_ACTIVE) ||
3491+
!list_empty(&pwq->wq->maydays))
3492+
return false;
3493+
}
3494+
3495+
/* search from the start or cursor if available */
3496+
if (list_empty(&cursor->entry))
3497+
work = list_first_entry(&pool->worklist, struct work_struct, entry);
3498+
else
3499+
work = list_next_entry(cursor, entry);
3500+
3501+
/* find the next work item to rescue */
3502+
list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
3503+
if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
34553504
pwq->stats[PWQ_STAT_RESCUED]++;
3505+
/* put the cursor for next search */
3506+
list_move_tail(&cursor->entry, &n->entry);
3507+
return true;
3508+
}
34563509
}
34573510

3458-
return !list_empty(&rescuer->scheduled);
3511+
return false;
34593512
}
34603513

34613514
/**
@@ -3512,6 +3565,7 @@ static int rescuer_thread(void *__rescuer)
35123565
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
35133566
struct pool_workqueue, mayday_node);
35143567
struct worker_pool *pool = pwq->pool;
3568+
unsigned int count = 0;
35153569

35163570
__set_current_state(TASK_RUNNING);
35173571
list_del_init(&pwq->mayday_node);
@@ -3524,31 +3578,27 @@ static int rescuer_thread(void *__rescuer)
35243578

35253579
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
35263580

3527-
if (assign_rescuer_work(pwq, rescuer)) {
3581+
while (assign_rescuer_work(pwq, rescuer)) {
35283582
process_scheduled_works(rescuer);
35293583

35303584
/*
3531-
* The above execution of rescued work items could
3532-
* have created more to rescue through
3533-
* pwq_activate_first_inactive() or chained
3534-
* queueing. Let's put @pwq back on mayday list so
3535-
* that such back-to-back work items, which may be
3536-
* being used to relieve memory pressure, don't
3537-
* incur MAYDAY_INTERVAL delay inbetween.
3585+
* If the per-turn work item limit is reached and other
3586+
* PWQs are in mayday, requeue mayday for this PWQ and
3587+
* let the rescuer handle the other PWQs first.
35383588
*/
3539-
if (pwq->nr_active && need_to_create_worker(pool)) {
3589+
if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
3590+
pwq->nr_active && need_to_create_worker(pool)) {
35403591
raw_spin_lock(&wq_mayday_lock);
3541-
/*
3542-
* Queue iff somebody else hasn't queued it already.
3543-
*/
3544-
if (list_empty(&pwq->mayday_node)) {
3545-
get_pwq(pwq);
3546-
list_add_tail(&pwq->mayday_node, &wq->maydays);
3547-
}
3592+
send_mayday(pwq);
35483593
raw_spin_unlock(&wq_mayday_lock);
3594+
break;
35493595
}
35503596
}
35513597

3598+
/* The cursor cannot be left behind without the rescuer watching it. */
3599+
if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
3600+
list_del_init(&pwq->mayday_cursor.entry);
3601+
35523602
/*
35533603
* Leave this pool. Notify regular workers; otherwise, we end up
35543604
* with 0 concurrency and stalling the execution.
@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
51675217
INIT_LIST_HEAD(&pwq->pwqs_node);
51685218
INIT_LIST_HEAD(&pwq->mayday_node);
51695219
kthread_init_work(&pwq->release_work, pwq_release_workfn);
5220+
5221+
/*
5222+
* Set the dummy cursor work with valid function and get_work_pwq().
5223+
*
5224+
* The cursor work should only be in the pwq->pool->worklist, and
5225+
* should not be treated as a processable work item.
5226+
*
5227+
* WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
5228+
* surprising to kernel debugging tools and reviewers.
5229+
*/
5230+
INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
5231+
atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
5232+
WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
51705233
}
51715234

51725235
/* sync @pwq with the current state of its associated wq and link it */
@@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
75087571
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
75097572
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
75107573

7511-
static unsigned int wq_panic_on_stall;
7574+
static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
75127575
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
75137576

7577+
static unsigned int wq_panic_on_stall_time;
7578+
module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
7579+
MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
7580+
75147581
/*
75157582
* Show workers that might prevent the processing of pending work items.
75167583
* The only candidates are CPU-bound workers in the running state.
@@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
75627629
rcu_read_unlock();
75637630
}
75647631

7565-
static void panic_on_wq_watchdog(void)
7632+
/*
7633+
* Trigger a panic in either of two scenarios: when the total number of
7634+
* stalls reaches the wq_panic_on_stall threshold, or when a single
7635+
* stall lasts at least wq_panic_on_stall_time seconds.
7636+
*/
7637+
static void panic_on_wq_watchdog(unsigned int stall_time_sec)
75667638
{
75677639
static unsigned int wq_stall;
75687640

75697641
if (wq_panic_on_stall) {
75707642
wq_stall++;
7571-
BUG_ON(wq_stall >= wq_panic_on_stall);
7643+
if (wq_stall >= wq_panic_on_stall)
7644+
panic("workqueue: %u stall(s) exceeded threshold %u\n",
7645+
wq_stall, wq_panic_on_stall);
75727646
}
7647+
7648+
if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
7649+
panic("workqueue: stall lasted %us, exceeding threshold %us\n",
7650+
stall_time_sec, wq_panic_on_stall_time);
75737651
}
75747652

75757653
static void wq_watchdog_reset_touched(void)
@@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
75847662
static void wq_watchdog_timer_fn(struct timer_list *unused)
75857663
{
75867664
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
7665+
unsigned int max_stall_time = 0;
75877666
bool lockup_detected = false;
75887667
bool cpu_pool_stall = false;
75897668
unsigned long now = jiffies;
75907669
struct worker_pool *pool;
7670+
unsigned int stall_time;
75917671
int pi;
75927672

75937673
if (!thresh)
@@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
76217701
/* did we stall? */
76227702
if (time_after(now, ts + thresh)) {
76237703
lockup_detected = true;
7704+
stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
7705+
max_stall_time = max(max_stall_time, stall_time);
76247706
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
76257707
pool->cpu_stall = true;
76267708
cpu_pool_stall = true;
76277709
}
76287710
pr_emerg("BUG: workqueue lockup - pool");
76297711
pr_cont_pool_info(pool);
7630-
pr_cont(" stuck for %us!\n",
7631-
jiffies_to_msecs(now - pool_ts) / 1000);
7712+
pr_cont(" stuck for %us!\n", stall_time);
76327713
}
76337714

76347715

@@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
76417722
show_cpu_pools_hogs();
76427723

76437724
if (lockup_detected)
7644-
panic_on_wq_watchdog();
7725+
panic_on_wq_watchdog(max_stall_time);
76457726

76467727
wq_watchdog_reset_touched();
76477728
mod_timer(&wq_watchdog_timer, jiffies + thresh);

lib/Kconfig.debug

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,6 +1322,29 @@ config WQ_WATCHDOG
13221322
state. This can be configured through kernel parameter
13231323
"workqueue.watchdog_thresh" and its sysfs counterpart.
13241324

1325+
config BOOTPARAM_WQ_STALL_PANIC
1326+
int "Panic on Nth workqueue stall"
1327+
default 0
1328+
range 0 100
1329+
depends on WQ_WATCHDOG
1330+
help
1331+
Set the number of workqueue stalls to trigger a kernel panic.
1332+
A workqueue stall occurs when a worker pool doesn't make forward
1333+
progress on a pending work item for over 30 seconds (configurable
1334+
using the workqueue.watchdog_thresh parameter).
1335+
1336+
If n = 0, the kernel will not panic on stall. If n > 0, the kernel
1337+
will panic after n stall warnings.
1338+
1339+
The panic can be used in combination with panic_timeout,
1340+
to cause the system to reboot automatically after a
1341+
stall has been detected. This feature is useful for
1342+
high-availability systems that have uptime guarantees and
1343+
where a stall must be resolved ASAP.
1344+
1345+
This setting can be overridden at runtime via the
1346+
workqueue.panic_on_stall kernel parameter.
1347+
13251348
config WQ_CPU_INTENSIVE_REPORT
13261349
bool "Report per-cpu work items which hog CPU for too long"
13271350
depends on DEBUG_KERNEL

0 commit comments

Comments
 (0)