Skip to content

Commit ca174c7

Browse files
Waiman-Long authored and htejun committed
cgroup/cpuset: Call rebuild_sched_domains() directly in hotplug
Besides deferring the call to housekeeping_update(), commit 6df415a ("cgroup/cpuset: Defer housekeeping_update() calls from CPU hotplug to workqueue") also defers the rebuild_sched_domains() call to the workqueue. So a newly offlined CPU may still be in a sched domain, or a newly onlined CPU may not show up in the sched domains, for a short transition period. That could be a problem in some corner cases and can be the cause of a reported test failure[1]. Fix it by calling rebuild_sched_domains_cpuslocked() directly in hotplug as before. If isolated partition invalidation or recreation is being done, the housekeeping_update() call to update the housekeeping cpumasks will still be deferred to a workqueue.

In commit 3bfe479 ("cgroup/cpuset: Move housekeeping_update()/rebuild_sched_domains() together"), housekeeping_update() is called before rebuild_sched_domains() because it needs to access the HK_TYPE_DOMAIN housekeeping cpumask. That is now changed to use the static HK_TYPE_DOMAIN_BOOT cpumask as HK_TYPE_DOMAIN cpumask is now changeable at run time. As a result, we can move the rebuild_sched_domains() call before housekeeping_update() with the slight advantage that it will be done in the same cpus_read_lock critical section without the possibility of interference by a concurrent cpu hot add/remove operation.

As it doesn't make sense to acquire cpuset_mutex/cpuset_top_mutex after calling housekeeping_update() and immediately release them again, move the cpuset_full_unlock() operation inside update_hk_sched_domains() and rename it to cpuset_update_sd_hk_unlock() to signify that it will release the full set of locks.

[1] https://lore.kernel.org/lkml/[email protected]

Fixes: 6df415a ("cgroup/cpuset: Defer housekeeping_update() calls from CPU hotplug to workqueue")
Tested-by: Jon Hunter <[email protected]>
Reviewed-by: Chen Ridong <[email protected]>
Signed-off-by: Waiman Long <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
1 parent 5ee8dbf commit ca174c7

1 file changed

Lines changed: 31 additions & 28 deletions

File tree

kernel/cgroup/cpuset.c

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -879,7 +879,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
879879
/*
880880
* Cgroup v2 doesn't support domain attributes, just set all of them
881881
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
882-
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
882+
* subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs.
883883
*/
884884
for (i = 0; i < ndoms; i++) {
885885
/*
@@ -888,7 +888,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
888888
*/
889889
if (!csa || csa[i] == &top_cpuset)
890890
cpumask_and(doms[i], top_cpuset.effective_cpus,
891-
housekeeping_cpumask(HK_TYPE_DOMAIN));
891+
housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
892892
else
893893
cpumask_copy(doms[i], csa[i]->effective_cpus);
894894
if (dattr)
@@ -1329,17 +1329,22 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
13291329
}
13301330

13311331
/*
1332-
* update_hk_sched_domains - Update HK cpumasks & rebuild sched domains
1332+
* cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock
13331333
*
1334-
* Update housekeeping cpumasks and rebuild sched domains if necessary.
1335-
* This should be called at the end of cpuset or hotplug actions.
1334+
* Update housekeeping cpumasks and rebuild sched domains if necessary and
1335+
* then do a cpuset_full_unlock().
1336+
* This should be called at the end of cpuset operation.
13361337
*/
1337-
static void update_hk_sched_domains(void)
1338+
static void cpuset_update_sd_hk_unlock(void)
1339+
__releases(&cpuset_mutex)
1340+
__releases(&cpuset_top_mutex)
13381341
{
1342+
/* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
1343+
if (force_sd_rebuild)
1344+
rebuild_sched_domains_locked();
1345+
13391346
if (update_housekeeping) {
1340-
/* Updating HK cpumasks implies rebuild sched domains */
13411347
update_housekeeping = false;
1342-
force_sd_rebuild = true;
13431348
cpumask_copy(isolated_hk_cpus, isolated_cpus);
13441349

13451350
/*
@@ -1350,22 +1355,19 @@ static void update_hk_sched_domains(void)
13501355
mutex_unlock(&cpuset_mutex);
13511356
cpus_read_unlock();
13521357
WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
1353-
cpus_read_lock();
1354-
mutex_lock(&cpuset_mutex);
1358+
mutex_unlock(&cpuset_top_mutex);
1359+
} else {
1360+
cpuset_full_unlock();
13551361
}
1356-
/* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
1357-
if (force_sd_rebuild)
1358-
rebuild_sched_domains_locked();
13591362
}
13601363

13611364
/*
1362-
* Work function to invoke update_hk_sched_domains()
1365+
* Work function to invoke cpuset_update_sd_hk_unlock()
13631366
*/
13641367
static void hk_sd_workfn(struct work_struct *work)
13651368
{
13661369
cpuset_full_lock();
1367-
update_hk_sched_domains();
1368-
cpuset_full_unlock();
1370+
cpuset_update_sd_hk_unlock();
13691371
}
13701372

13711373
/**
@@ -3230,8 +3232,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
32303232

32313233
free_cpuset(trialcs);
32323234
out_unlock:
3233-
update_hk_sched_domains();
3234-
cpuset_full_unlock();
3235+
cpuset_update_sd_hk_unlock();
32353236
if (of_cft(of)->private == FILE_MEMLIST)
32363237
schedule_flush_migrate_mm();
32373238
return retval ?: nbytes;
@@ -3338,8 +3339,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
33383339
cpuset_full_lock();
33393340
if (is_cpuset_online(cs))
33403341
retval = update_prstate(cs, val);
3341-
update_hk_sched_domains();
3342-
cpuset_full_unlock();
3342+
cpuset_update_sd_hk_unlock();
33433343
return retval ?: nbytes;
33443344
}
33453345

@@ -3513,8 +3513,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
35133513
/* Reset valid partition back to member */
35143514
if (is_partition_valid(cs))
35153515
update_prstate(cs, PRS_MEMBER);
3516-
update_hk_sched_domains();
3517-
cpuset_full_unlock();
3516+
cpuset_update_sd_hk_unlock();
35183517
}
35193518

35203519
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3923,11 +3922,13 @@ static void cpuset_handle_hotplug(void)
39233922
rcu_read_unlock();
39243923
}
39253924

3926-
39273925
/*
3928-
* Queue a work to call housekeeping_update() & rebuild_sched_domains()
3929-
* There will be a slight delay before the HK_TYPE_DOMAIN housekeeping
3930-
* cpumask can correctly reflect what is in isolated_cpus.
3926+
* rebuild_sched_domains() will always be called directly if needed
3927+
* to make sure that newly added or removed CPU will be reflected in
3928+
* the sched domains. However, if isolated partition invalidation
3929+
* or recreation is being done (update_housekeeping set), a work item
3930+
* will be queued to call housekeeping_update() to update the
3931+
* corresponding housekeeping cpumasks after some slight delay.
39313932
*
39323933
* We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
39333934
* is still pending. Before the pending bit is cleared, the work data
@@ -3936,8 +3937,10 @@ static void cpuset_handle_hotplug(void)
39363937
* previously queued work. Since hk_sd_workfn() doesn't use the work
39373938
* item at all, this is not a problem.
39383939
*/
3939-
if (update_housekeeping || force_sd_rebuild)
3940-
queue_work(system_unbound_wq, &hk_sd_work);
3940+
if (force_sd_rebuild)
3941+
rebuild_sched_domains_cpuslocked();
3942+
if (update_housekeeping)
3943+
queue_work(system_dfl_wq, &hk_sd_work);
39413944

39423945
free_tmpmasks(ptmp);
39433946
}

0 commit comments

Comments
 (0)