@@ -131,6 +131,14 @@ enum wq_internal_consts {
131131 WORKER_ID_LEN = 10 + WQ_NAME_LEN , /* "kworker/R-" + WQ_NAME_LEN */
132132};
133133
/* Layout of shards within one LLC pod */
struct llc_shard_layout {
	int nr_large_shards;	/* shards holding (cores_per_shard + 1) cores */
	int cores_per_shard;	/* base number of cores per default shard */
	int nr_shards;		/* total number of shards */
	/* number of default shards = nr_shards - nr_large_shards */
};
141+
134142/*
135143 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
136144 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
@@ -410,6 +418,7 @@ static const char * const wq_affn_names[WQ_AFFN_NR_TYPES] = {
410418 [WQ_AFFN_CPU ] = "cpu" ,
411419 [WQ_AFFN_SMT ] = "smt" ,
412420 [WQ_AFFN_CACHE ] = "cache" ,
421+ [WQ_AFFN_CACHE_SHARD ] = "cache_shard" ,
413422 [WQ_AFFN_NUMA ] = "numa" ,
414423 [WQ_AFFN_SYSTEM ] = "system" ,
415424};
@@ -432,6 +441,9 @@ module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh
432441static bool wq_power_efficient = IS_ENABLED (CONFIG_WQ_POWER_EFFICIENT_DEFAULT );
433442module_param_named (power_efficient , wq_power_efficient , bool , 0444 );
434443
444+ static unsigned int wq_cache_shard_size = 8 ;
445+ module_param_named (cache_shard_size , wq_cache_shard_size , uint , 0444 );
446+
435447static bool wq_online ; /* can kworkers be created yet? */
436448static bool wq_topo_initialized __read_mostly = false;
437449
@@ -8155,6 +8167,175 @@ static bool __init cpus_share_numa(int cpu0, int cpu1)
81558167 return cpu_to_node (cpu0 ) == cpu_to_node (cpu1 );
81568168}
81578169
8170+ /* Maps each CPU to its shard index within the LLC pod it belongs to */
8171+ static int cpu_shard_id [NR_CPUS ] __initdata ;
8172+
8173+ /**
8174+ * llc_count_cores - count distinct cores (SMT groups) within an LLC pod
8175+ * @pod_cpus: the cpumask of CPUs in the LLC pod
8176+ * @smt_pods: the SMT pod type, used to identify sibling groups
8177+ *
8178+ * A core is represented by the lowest-numbered CPU in its SMT group. Returns
8179+ * the number of distinct cores found in @pod_cpus.
8180+ */
8181+ static int __init llc_count_cores (const struct cpumask * pod_cpus ,
8182+ struct wq_pod_type * smt_pods )
8183+ {
8184+ const struct cpumask * sibling_cpus ;
8185+ int nr_cores = 0 , c ;
8186+
8187+ /*
8188+ * Count distinct cores by only counting the first CPU in each
8189+ * SMT sibling group.
8190+ */
8191+ for_each_cpu (c , pod_cpus ) {
8192+ sibling_cpus = smt_pods -> pod_cpus [smt_pods -> cpu_pod [c ]];
8193+ if (cpumask_first (sibling_cpus ) == c )
8194+ nr_cores ++ ;
8195+ }
8196+
8197+ return nr_cores ;
8198+ }
8199+
8200+ /*
8201+ * llc_shard_size - number of cores in a given shard
8202+ *
8203+ * Cores are spread as evenly as possible. The first @nr_large_shards shards are
8204+ * "large shards" with (cores_per_shard + 1) cores; the rest are "default
8205+ * shards" with cores_per_shard cores.
8206+ */
8207+ static int __init llc_shard_size (int shard_id , int cores_per_shard , int nr_large_shards )
8208+ {
8209+ /* The first @nr_large_shards shards are large shards */
8210+ if (shard_id < nr_large_shards )
8211+ return cores_per_shard + 1 ;
8212+
8213+ /* The remaining shards are default shards */
8214+ return cores_per_shard ;
8215+ }
8216+
8217+ /*
8218+ * llc_calc_shard_layout - compute the shard layout for an LLC pod
8219+ * @nr_cores: number of distinct cores in the LLC pod
8220+ *
8221+ * Chooses the number of shards that keeps average shard size closest to
8222+ * wq_cache_shard_size. Returns a struct describing the total number of shards,
8223+ * the base size of each, and how many are large shards.
8224+ */
8225+ static struct llc_shard_layout __init llc_calc_shard_layout (int nr_cores )
8226+ {
8227+ struct llc_shard_layout layout ;
8228+
8229+ /* Ensure at least one shard; pick the count closest to the target size */
8230+ layout .nr_shards = max (1 , DIV_ROUND_CLOSEST (nr_cores , wq_cache_shard_size ));
8231+ layout .cores_per_shard = nr_cores / layout .nr_shards ;
8232+ layout .nr_large_shards = nr_cores % layout .nr_shards ;
8233+
8234+ return layout ;
8235+ }
8236+
8237+ /*
8238+ * llc_shard_is_full - check whether a shard has reached its core capacity
8239+ * @cores_in_shard: number of cores already assigned to this shard
8240+ * @shard_id: index of the shard being checked
8241+ * @layout: the shard layout computed by llc_calc_shard_layout()
8242+ *
8243+ * Returns true if @cores_in_shard equals the expected size for @shard_id.
8244+ */
8245+ static bool __init llc_shard_is_full (int cores_in_shard , int shard_id ,
8246+ const struct llc_shard_layout * layout )
8247+ {
8248+ return cores_in_shard == llc_shard_size (shard_id , layout -> cores_per_shard ,
8249+ layout -> nr_large_shards );
8250+ }
8251+
8252+ /**
8253+ * llc_populate_cpu_shard_id - populate cpu_shard_id[] for each CPU in an LLC pod
8254+ * @pod_cpus: the cpumask of CPUs in the LLC pod
8255+ * @smt_pods: the SMT pod type, used to identify sibling groups
8256+ * @nr_cores: number of distinct cores in @pod_cpus (from llc_count_cores())
8257+ *
8258+ * Walks @pod_cpus in order. At each SMT group leader, advances to the next
8259+ * shard once the current shard is full. Results are written to cpu_shard_id[].
8260+ */
8261+ static void __init llc_populate_cpu_shard_id (const struct cpumask * pod_cpus ,
8262+ struct wq_pod_type * smt_pods ,
8263+ int nr_cores )
8264+ {
8265+ struct llc_shard_layout layout = llc_calc_shard_layout (nr_cores );
8266+ const struct cpumask * sibling_cpus ;
8267+ /* Count the number of cores in the current shard_id */
8268+ int cores_in_shard = 0 ;
8269+ /* This is a cursor for the shards. Go from zero to nr_shards - 1*/
8270+ int shard_id = 0 ;
8271+ int c ;
8272+
8273+ /* Iterate at every CPU for a given LLC pod, and assign it a shard */
8274+ for_each_cpu (c , pod_cpus ) {
8275+ sibling_cpus = smt_pods -> pod_cpus [smt_pods -> cpu_pod [c ]];
8276+ if (cpumask_first (sibling_cpus ) == c ) {
8277+ /* This is the CPU leader for the siblings */
8278+ if (llc_shard_is_full (cores_in_shard , shard_id , & layout )) {
8279+ shard_id ++ ;
8280+ cores_in_shard = 0 ;
8281+ }
8282+ cores_in_shard ++ ;
8283+ cpu_shard_id [c ] = shard_id ;
8284+ } else {
8285+ /*
8286+ * The siblings' shard MUST be the same as the leader.
8287+ * never split threads in the same core.
8288+ */
8289+ cpu_shard_id [c ] = cpu_shard_id [cpumask_first (sibling_cpus )];
8290+ }
8291+ }
8292+
8293+ WARN_ON_ONCE (shard_id != (layout .nr_shards - 1 ));
8294+ }
8295+
8296+ /**
8297+ * precompute_cache_shard_ids - assign each CPU its shard index within its LLC
8298+ *
8299+ * Iterates over all LLC pods. For each pod, counts distinct cores then assigns
8300+ * shard indices to all CPUs in the pod. Must be called after WQ_AFFN_CACHE and
8301+ * WQ_AFFN_SMT have been initialized.
8302+ */
8303+ static void __init precompute_cache_shard_ids (void )
8304+ {
8305+ struct wq_pod_type * llc_pods = & wq_pod_types [WQ_AFFN_CACHE ];
8306+ struct wq_pod_type * smt_pods = & wq_pod_types [WQ_AFFN_SMT ];
8307+ const struct cpumask * cpus_sharing_llc ;
8308+ int nr_cores ;
8309+ int pod ;
8310+
8311+ if (!wq_cache_shard_size ) {
8312+ pr_warn ("workqueue: cache_shard_size must be > 0, setting to 1\n" );
8313+ wq_cache_shard_size = 1 ;
8314+ }
8315+
8316+ for (pod = 0 ; pod < llc_pods -> nr_pods ; pod ++ ) {
8317+ cpus_sharing_llc = llc_pods -> pod_cpus [pod ];
8318+
8319+ /* Number of cores in this given LLC */
8320+ nr_cores = llc_count_cores (cpus_sharing_llc , smt_pods );
8321+ llc_populate_cpu_shard_id (cpus_sharing_llc , smt_pods , nr_cores );
8322+ }
8323+ }
8324+
8325+ /*
8326+ * cpus_share_cache_shard - test whether two CPUs belong to the same cache shard
8327+ *
8328+ * Two CPUs share a cache shard if they are in the same LLC and have the same
8329+ * shard index. Used as the pod affinity callback for WQ_AFFN_CACHE_SHARD.
8330+ */
8331+ static bool __init cpus_share_cache_shard (int cpu0 , int cpu1 )
8332+ {
8333+ if (!cpus_share_cache (cpu0 , cpu1 ))
8334+ return false;
8335+
8336+ return cpu_shard_id [cpu0 ] == cpu_shard_id [cpu1 ];
8337+ }
8338+
81588339/**
81598340 * workqueue_init_topology - initialize CPU pods for unbound workqueues
81608341 *
@@ -8170,6 +8351,8 @@ void __init workqueue_init_topology(void)
81708351 init_pod_type (& wq_pod_types [WQ_AFFN_CPU ], cpus_dont_share );
81718352 init_pod_type (& wq_pod_types [WQ_AFFN_SMT ], cpus_share_smt );
81728353 init_pod_type (& wq_pod_types [WQ_AFFN_CACHE ], cpus_share_cache );
8354+ precompute_cache_shard_ids ();
8355+ init_pod_type (& wq_pod_types [WQ_AFFN_CACHE_SHARD ], cpus_share_cache_shard );
81738356 init_pod_type (& wq_pod_types [WQ_AFFN_NUMA ], cpus_share_numa );
81748357
81758358 wq_topo_initialized = true;
0 commit comments