Skip to content

Commit 98a2689

Browse files
Peter Zijlstra authored and gregkh committed
sched/fair: Proportional newidle balance
commit 33cf66d upstream.

Add a randomized algorithm that runs newidle balancing proportional to its success rate.

This improves schbench significantly:

  6.18-rc4:               2.22 Mrps/s
  6.18-rc4+revert:        2.04 Mrps/s
  6.18-rc4+revert+random: 2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

  6.17:               -6%
  6.17+revert:         0%
  6.17+revert+random: -1%

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Dietmar Eggemann <[email protected]>
Tested-by: Dietmar Eggemann <[email protected]>
Tested-by: Chris Mason <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent d4ffb9c commit 98a2689

6 files changed

Lines changed: 64 additions & 4 deletions

File tree

include/linux/sched/topology.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,9 @@ struct sched_domain {
9292
unsigned int nr_balance_failed; /* initialise to 0 */
9393

9494
/* idle_balance() stats */
95+
unsigned int newidle_call;
96+
unsigned int newidle_success;
97+
unsigned int newidle_ratio;
9598
u64 max_newidle_lb_cost;
9699
unsigned long last_decay_max_lb_cost;
97100

kernel/sched/core.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
121121
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
122122

123123
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
124+
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
124125

125126
#ifdef CONFIG_SCHED_PROXY_EXEC
126127
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8591,6 +8592,8 @@ void __init sched_init_smp(void)
85918592
{
85928593
sched_init_numa(NUMA_NO_NODE);
85938594

8595+
prandom_init_once(&sched_rnd_state);
8596+
85948597
/*
85958598
* There's no userspace yet to cause hotplug operations; hence all the
85968599
* CPU masks are stable and all blatant races in the below code cannot

kernel/sched/fair.c

Lines changed: 40 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12122,11 +12122,27 @@ void update_max_interval(void)
1212212122
max_load_balance_interval = HZ*num_online_cpus()/10;
1212312123
}
1212412124

12125-
static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
12125+
static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
12126+
{
12127+
sd->newidle_call++;
12128+
sd->newidle_success += success;
12129+
12130+
if (sd->newidle_call >= 1024) {
12131+
sd->newidle_ratio = sd->newidle_success;
12132+
sd->newidle_call /= 2;
12133+
sd->newidle_success /= 2;
12134+
}
12135+
}
12136+
12137+
static inline bool
12138+
update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
1212612139
{
1212712140
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
1212812141
unsigned long now = jiffies;
1212912142

12143+
if (cost)
12144+
update_newidle_stats(sd, success);
12145+
1213012146
if (cost > sd->max_newidle_lb_cost) {
1213112147
/*
1213212148
* Track max cost of a domain to make sure to not delay the
@@ -12174,7 +12190,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
1217412190
* Decay the newidle max times here because this is a regular
1217512191
* visit to all the domains.
1217612192
*/
12177-
need_decay = update_newidle_cost(sd, 0);
12193+
need_decay = update_newidle_cost(sd, 0, 0);
1217812194
max_cost += sd->max_newidle_lb_cost;
1217912195

1218012196
/*
@@ -12819,17 +12835,37 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
1281912835
break;
1282012836

1282112837
if (sd->flags & SD_BALANCE_NEWIDLE) {
12838+
unsigned int weight = 1;
12839+
12840+
if (sched_feat(NI_RANDOM)) {
12841+
/*
12842+
* Throw a 1k sided dice; and only run
12843+
* newidle_balance according to the success
12844+
* rate.
12845+
*/
12846+
u32 d1k = sched_rng() % 1024;
12847+
weight = 1 + sd->newidle_ratio;
12848+
if (d1k > weight) {
12849+
update_newidle_stats(sd, 0);
12850+
continue;
12851+
}
12852+
weight = (1024 + weight/2) / weight;
12853+
}
1282212854

1282312855
pulled_task = sched_balance_rq(this_cpu, this_rq,
1282412856
sd, CPU_NEWLY_IDLE,
1282512857
&continue_balancing);
1282612858

1282712859
t1 = sched_clock_cpu(this_cpu);
1282812860
domain_cost = t1 - t0;
12829-
update_newidle_cost(sd, domain_cost);
12830-
1283112861
curr_cost += domain_cost;
1283212862
t0 = t1;
12863+
12864+
/*
12865+
* Track max cost of a domain to make sure to not delay the
12866+
* next wakeup on the CPU.
12867+
*/
12868+
update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
1283312869
}
1283412870

1283512871
/*

kernel/sched/features.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
121121
SCHED_FEAT(UTIL_EST, true)
122122

123123
SCHED_FEAT(LATENCY_WARN, false)
124+
125+
/*
126+
* Do newidle balancing proportional to its success rate using randomization.
127+
*/
128+
SCHED_FEAT(NI_RANDOM, true)

kernel/sched/sched.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
#ifndef _KERNEL_SCHED_SCHED_H
66
#define _KERNEL_SCHED_SCHED_H
77

8+
#include <linux/prandom.h>
89
#include <linux/sched/affinity.h>
910
#include <linux/sched/autogroup.h>
1011
#include <linux/sched/cpufreq.h>
@@ -1349,6 +1350,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
13491350
}
13501351

13511352
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1353+
DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
1354+
1355+
static inline u32 sched_rng(void)
1356+
{
1357+
return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
1358+
}
13521359

13531360
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
13541361
#define this_rq() this_cpu_ptr(&runqueues)

kernel/sched/topology.c

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_level *tl,
16621662

16631663
.last_balance = jiffies,
16641664
.balance_interval = sd_weight,
1665+
1666+
/* 50% success rate */
1667+
.newidle_call = 512,
1668+
.newidle_success = 256,
1669+
.newidle_ratio = 512,
1670+
16651671
.max_newidle_lb_cost = 0,
16661672
.last_decay_max_lb_cost = jiffies,
16671673
.child = child,

0 commit comments

Comments
 (0)