@@ -117,6 +117,8 @@ enum wq_internal_consts {
117117 MAYDAY_INTERVAL = HZ / 10 , /* and then every 100ms */
118118 CREATE_COOLDOWN = HZ , /* time to breathe after fail */
119119
120+ RESCUER_BATCH = 16 , /* process items per turn */
121+
120122 /*
121123 * Rescue workers are used only on emergencies and shared by
122124 * all cpus. Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
286288 struct list_head pending_node ; /* LN: node on wq_node_nr_active->pending_pwqs */
287289 struct list_head pwqs_node ; /* WR: node on wq->pwqs */
288290 struct list_head mayday_node ; /* MD: node on wq->maydays */
291+ struct work_struct mayday_cursor ; /* L: cursor on pool->worklist */
289292
290293 u64 stats [PWQ_NR_STATS ];
291294
@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
11201123 return NULL ;
11211124}
11221125
static void mayday_cursor_func(struct work_struct *work)
{
	/*
	 * A cursor work only marks a rescuer's position on a pool's
	 * worklist; it must never be selected for execution.
	 */
	BUG();
}
1131+
11231132/**
11241133 * move_linked_works - move linked works to a list
11251134 * @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
11821191
11831192 lockdep_assert_held (& pool -> lock );
11841193
1194+ /* The cursor work should not be processed */
1195+ if (unlikely (work -> func == mayday_cursor_func )) {
1196+ /* only worker_thread() can possibly take this branch */
1197+ WARN_ON_ONCE (worker -> rescue_wq );
1198+ if (nextp )
1199+ * nextp = list_next_entry (work , entry );
1200+ list_del_init (& work -> entry );
1201+ return false;
1202+ }
1203+
11851204 /*
11861205 * A single work shouldn't be executed concurrently by multiple workers.
11871206 * __queue_work() ensures that @work doesn't jump to a different pool
@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
29762995 reap_dying_workers (& cull_list );
29772996}
29782997
2979- static void send_mayday (struct work_struct * work )
2998+ static void send_mayday (struct pool_workqueue * pwq )
29802999{
2981- struct pool_workqueue * pwq = get_work_pwq (work );
29823000 struct workqueue_struct * wq = pwq -> wq ;
29833001
29843002 lockdep_assert_held (& wq_mayday_lock );
@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
30163034 * rescuers.
30173035 */
30183036 list_for_each_entry (work , & pool -> worklist , entry )
3019- send_mayday (work );
3037+ send_mayday (get_work_pwq ( work ) );
30203038 }
30213039
30223040 raw_spin_unlock (& wq_mayday_lock );
@@ -3440,22 +3458,57 @@ static int worker_thread(void *__worker)
34403458static bool assign_rescuer_work (struct pool_workqueue * pwq , struct worker * rescuer )
34413459{
34423460 struct worker_pool * pool = pwq -> pool ;
3461+ struct work_struct * cursor = & pwq -> mayday_cursor ;
34433462 struct work_struct * work , * n ;
34443463
3445- /* need rescue? */
3446- if (!pwq -> nr_active || ! need_to_create_worker ( pool ) )
3464+ /* have work items to rescue? */
3465+ if (!pwq -> nr_active )
34473466 return false;
34483467
3449- /*
3450- * Slurp in all works issued via this workqueue and
3451- * process'em.
3452- */
3453- list_for_each_entry_safe (work , n , & pool -> worklist , entry ) {
3454- if (get_work_pwq (work ) == pwq && assign_work (work , rescuer , & n ))
3468+ /* need rescue? */
3469+ if (!need_to_create_worker (pool )) {
3470+ /*
3471+ * The pool has idle workers and doesn't need the rescuer, so it
3472+ * could simply return false here.
3473+ *
3474+ * However, the memory pressure might not be fully relieved.
3475+ * In PERCPU pool with concurrency enabled, having idle workers
3476+ * does not necessarily mean memory pressure is gone; it may
3477+ * simply mean regular workers have woken up, completed their
3478+ * work, and gone idle again due to concurrency limits.
3479+ *
3480+ * In this case, those working workers may later sleep again,
3481+ * the pool may run out of idle workers, and it will have to
3482+ * allocate new ones and wait for the timer to send mayday,
3483+ * causing unnecessary delay - especially if memory pressure
3484+ * was never resolved throughout.
3485+ *
3486+ * Do more work if memory pressure is still on to reduce
3487+ * relapse, using (pool->flags & POOL_MANAGER_ACTIVE), though
3488+ * not precisely, unless there are other PWQs needing help.
3489+ */
3490+ if (!(pool -> flags & POOL_MANAGER_ACTIVE ) ||
3491+ !list_empty (& pwq -> wq -> maydays ))
3492+ return false;
3493+ }
3494+
3495+ /* search from the start or cursor if available */
3496+ if (list_empty (& cursor -> entry ))
3497+ work = list_first_entry (& pool -> worklist , struct work_struct , entry );
3498+ else
3499+ work = list_next_entry (cursor , entry );
3500+
3501+ /* find the next work item to rescue */
3502+ list_for_each_entry_safe_from (work , n , & pool -> worklist , entry ) {
3503+ if (get_work_pwq (work ) == pwq && assign_work (work , rescuer , & n )) {
34553504 pwq -> stats [PWQ_STAT_RESCUED ]++ ;
3505+ /* put the cursor for next search */
3506+ list_move_tail (& cursor -> entry , & n -> entry );
3507+ return true;
3508+ }
34563509 }
34573510
3458- return ! list_empty ( & rescuer -> scheduled ) ;
3511+ return false ;
34593512}
34603513
34613514/**
@@ -3512,6 +3565,7 @@ static int rescuer_thread(void *__rescuer)
35123565 struct pool_workqueue * pwq = list_first_entry (& wq -> maydays ,
35133566 struct pool_workqueue , mayday_node );
35143567 struct worker_pool * pool = pwq -> pool ;
3568+ unsigned int count = 0 ;
35153569
35163570 __set_current_state (TASK_RUNNING );
35173571 list_del_init (& pwq -> mayday_node );
@@ -3524,31 +3578,27 @@ static int rescuer_thread(void *__rescuer)
35243578
35253579 WARN_ON_ONCE (!list_empty (& rescuer -> scheduled ));
35263580
3527- if (assign_rescuer_work (pwq , rescuer )) {
3581+ while (assign_rescuer_work (pwq , rescuer )) {
35283582 process_scheduled_works (rescuer );
35293583
35303584 /*
3531- * The above execution of rescued work items could
3532- * have created more to rescue through
3533- * pwq_activate_first_inactive() or chained
3534- * queueing. Let's put @pwq back on mayday list so
3535- * that such back-to-back work items, which may be
3536- * being used to relieve memory pressure, don't
3537- * incur MAYDAY_INTERVAL delay inbetween.
3585+ * If the per-turn work item limit is reached and other
3586+ * PWQs are in mayday, requeue mayday for this PWQ and
3587+ * let the rescuer handle the other PWQs first.
35383588 */
3539- if (pwq -> nr_active && need_to_create_worker (pool )) {
3589+ if (++ count > RESCUER_BATCH && !list_empty (& pwq -> wq -> maydays ) &&
3590+ pwq -> nr_active && need_to_create_worker (pool )) {
35403591 raw_spin_lock (& wq_mayday_lock );
3541- /*
3542- * Queue iff somebody else hasn't queued it already.
3543- */
3544- if (list_empty (& pwq -> mayday_node )) {
3545- get_pwq (pwq );
3546- list_add_tail (& pwq -> mayday_node , & wq -> maydays );
3547- }
3592+ send_mayday (pwq );
35483593 raw_spin_unlock (& wq_mayday_lock );
3594+ break ;
35493595 }
35503596 }
35513597
3598+ /* The cursor cannot be left behind without the rescuer watching it. */
3599+ if (!list_empty (& pwq -> mayday_cursor .entry ) && list_empty (& pwq -> mayday_node ))
3600+ list_del_init (& pwq -> mayday_cursor .entry );
3601+
35523602 /*
35533603 * Leave this pool. Notify regular workers; otherwise, we end up
35543604 * with 0 concurrency and stalling the execution.
@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
51675217 INIT_LIST_HEAD (& pwq -> pwqs_node );
51685218 INIT_LIST_HEAD (& pwq -> mayday_node );
51695219 kthread_init_work (& pwq -> release_work , pwq_release_workfn );
5220+
5221+ /*
5222+ * Set the dummy cursor work with valid function and get_work_pwq().
5223+ *
5224+ * The cursor work should only be in the pwq->pool->worklist, and
5225+ * should not be treated as a processable work item.
5226+ *
5227+ * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
5228+ * surprising to kernel debugging tools and reviewers.
5229+ */
5230+ INIT_WORK (& pwq -> mayday_cursor , mayday_cursor_func );
5231+ atomic_long_set (& pwq -> mayday_cursor .data , (unsigned long )pwq |
5232+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE );
51705233}
51715234
51725235/* sync @pwq with the current state of its associated wq and link it */
@@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
75087571static unsigned long wq_watchdog_touched = INITIAL_JIFFIES ;
75097572static DEFINE_PER_CPU (unsigned long, wq_watchdog_touched_cpu ) = INITIAL_JIFFIES ;
75107573
7511- static unsigned int wq_panic_on_stall ;
7574+ static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC ;
75127575module_param_named (panic_on_stall , wq_panic_on_stall , uint , 0644 );
75137576
7577+ static unsigned int wq_panic_on_stall_time ;
7578+ module_param_named (panic_on_stall_time , wq_panic_on_stall_time , uint , 0644 );
7579+ MODULE_PARM_DESC (panic_on_stall_time , "Panic if stall exceeds this many seconds (0=disabled)" );
7580+
75147581/*
75157582 * Show workers that might prevent the processing of pending work items.
75167583 * The only candidates are CPU-bound workers in the running state.
@@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
75627629 rcu_read_unlock ();
75637630}
75647631
7565- static void panic_on_wq_watchdog (void )
7632+ /*
7633+ * It triggers a panic in two scenarios: when the total number of stalls
7634+ * exceeds a threshold, and when a stall lasts longer than
7635+ * wq_panic_on_stall_time
7636+ */
7637+ static void panic_on_wq_watchdog (unsigned int stall_time_sec )
75667638{
75677639 static unsigned int wq_stall ;
75687640
75697641 if (wq_panic_on_stall ) {
75707642 wq_stall ++ ;
7571- BUG_ON (wq_stall >= wq_panic_on_stall );
7643+ if (wq_stall >= wq_panic_on_stall )
7644+ panic ("workqueue: %u stall(s) exceeded threshold %u\n" ,
7645+ wq_stall , wq_panic_on_stall );
75727646 }
7647+
7648+ if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time )
7649+ panic ("workqueue: stall lasted %us, exceeding threshold %us\n" ,
7650+ stall_time_sec , wq_panic_on_stall_time );
75737651}
75747652
75757653static void wq_watchdog_reset_touched (void )
@@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
75847662static void wq_watchdog_timer_fn (struct timer_list * unused )
75857663{
75867664 unsigned long thresh = READ_ONCE (wq_watchdog_thresh ) * HZ ;
7665+ unsigned int max_stall_time = 0 ;
75877666 bool lockup_detected = false;
75887667 bool cpu_pool_stall = false;
75897668 unsigned long now = jiffies ;
75907669 struct worker_pool * pool ;
7670+ unsigned int stall_time ;
75917671 int pi ;
75927672
75937673 if (!thresh )
@@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
76217701 /* did we stall? */
76227702 if (time_after (now , ts + thresh )) {
76237703 lockup_detected = true;
7704+ stall_time = jiffies_to_msecs (now - pool_ts ) / 1000 ;
7705+ max_stall_time = max (max_stall_time , stall_time );
76247706 if (pool -> cpu >= 0 && !(pool -> flags & POOL_BH )) {
76257707 pool -> cpu_stall = true;
76267708 cpu_pool_stall = true;
76277709 }
76287710 pr_emerg ("BUG: workqueue lockup - pool" );
76297711 pr_cont_pool_info (pool );
7630- pr_cont (" stuck for %us!\n" ,
7631- jiffies_to_msecs (now - pool_ts ) / 1000 );
7712+ pr_cont (" stuck for %us!\n" , stall_time );
76327713 }
76337714
76347715
@@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
76417722 show_cpu_pools_hogs ();
76427723
76437724 if (lockup_detected )
7644- panic_on_wq_watchdog ();
7725+ panic_on_wq_watchdog (max_stall_time );
76457726
76467727 wq_watchdog_reset_touched ();
76477728 mod_timer (& wq_watchdog_timer , jiffies + thresh );
0 commit comments