@@ -114,48 +114,15 @@ static int ncpus_cmp_func(const void *l, const void *r)
114114 return ln -> ncpus - rn -> ncpus ;
115115}
116116
117- /*
118- * Allocate group number for each node, so that for each node:
119- *
120- * 1) the allocated number is >= 1
121- *
122- * 2) the allocated number is <= active CPU number of this node
123- *
124- * The actual allocated total groups may be less than @numgrps when
125- * active total CPU number is less than @numgrps.
126- *
127- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
128- * for each node.
129- */
130- static void alloc_nodes_groups (unsigned int numgrps ,
131- cpumask_var_t * node_to_cpumask ,
132- const struct cpumask * cpu_mask ,
133- const nodemask_t nodemsk ,
134- struct cpumask * nmsk ,
135- struct node_groups * node_groups )
117+ static void alloc_groups_to_nodes (unsigned int numgrps ,
118+ unsigned int numcpus ,
119+ struct node_groups * node_groups ,
120+ unsigned int num_nodes )
136121{
137- unsigned n , remaining_ncpus = 0 ;
138-
139- for (n = 0 ; n < nr_node_ids ; n ++ ) {
140- node_groups [n ].id = n ;
141- node_groups [n ].ncpus = UINT_MAX ;
142- }
143-
144- for_each_node_mask (n , nodemsk ) {
145- unsigned ncpus ;
146-
147- cpumask_and (nmsk , cpu_mask , node_to_cpumask [n ]);
148- ncpus = cpumask_weight (nmsk );
149-
150- if (!ncpus )
151- continue ;
152- remaining_ncpus += ncpus ;
153- node_groups [n ].ncpus = ncpus ;
154- }
122+ unsigned int n , remaining_ncpus = numcpus ;
123+ unsigned int ngroups , ncpus ;
155124
156- numgrps = min_t (unsigned , remaining_ncpus , numgrps );
157-
158- sort (node_groups , nr_node_ids , sizeof (node_groups [0 ]),
125+ sort (node_groups , num_nodes , sizeof (node_groups [0 ]),
159126 ncpus_cmp_func , NULL );
160127
161128 /*
@@ -226,9 +193,8 @@ static void alloc_nodes_groups(unsigned int numgrps,
226193 * finally for each node X: grps(X) <= ncpu(X).
227194 *
228195 */
229- for (n = 0 ; n < nr_node_ids ; n ++ ) {
230- unsigned ngroups , ncpus ;
231196
197+ for (n = 0 ; n < num_nodes ; n ++ ) {
232198 if (node_groups [n ].ncpus == UINT_MAX )
233199 continue ;
234200
@@ -246,12 +212,201 @@ static void alloc_nodes_groups(unsigned int numgrps,
246212 }
247213}
248214
215+ /*
216+ * Allocate group number for each node, so that for each node:
217+ *
218+ * 1) the allocated number is >= 1
219+ *
220+ * 2) the allocated number is <= active CPU number of this node
221+ *
222+ * The actual allocated total groups may be less than @numgrps when
223+ * active total CPU number is less than @numgrps.
224+ *
225+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
226+ * for each node.
227+ */
228+ static void alloc_nodes_groups (unsigned int numgrps ,
229+ cpumask_var_t * node_to_cpumask ,
230+ const struct cpumask * cpu_mask ,
231+ const nodemask_t nodemsk ,
232+ struct cpumask * nmsk ,
233+ struct node_groups * node_groups )
234+ {
235+ unsigned int n , numcpus = 0 ;
236+
237+ for (n = 0 ; n < nr_node_ids ; n ++ ) {
238+ node_groups [n ].id = n ;
239+ node_groups [n ].ncpus = UINT_MAX ;
240+ }
241+
242+ for_each_node_mask (n , nodemsk ) {
243+ unsigned int ncpus ;
244+
245+ cpumask_and (nmsk , cpu_mask , node_to_cpumask [n ]);
246+ ncpus = cpumask_weight (nmsk );
247+
248+ if (!ncpus )
249+ continue ;
250+ numcpus += ncpus ;
251+ node_groups [n ].ncpus = ncpus ;
252+ }
253+
254+ numgrps = min_t (unsigned int , numcpus , numgrps );
255+ alloc_groups_to_nodes (numgrps , numcpus , node_groups , nr_node_ids );
256+ }
257+
258+ static void assign_cpus_to_groups (unsigned int ncpus ,
259+ struct cpumask * nmsk ,
260+ struct node_groups * nv ,
261+ struct cpumask * masks ,
262+ unsigned int * curgrp ,
263+ unsigned int last_grp )
264+ {
265+ unsigned int v , cpus_per_grp , extra_grps ;
266+ /* Account for rounding errors */
267+ extra_grps = ncpus - nv -> ngroups * (ncpus / nv -> ngroups );
268+
269+ /* Spread allocated groups on CPUs of the current node */
270+ for (v = 0 ; v < nv -> ngroups ; v ++ , * curgrp += 1 ) {
271+ cpus_per_grp = ncpus / nv -> ngroups ;
272+
273+ /* Account for extra groups to compensate rounding errors */
274+ if (extra_grps ) {
275+ cpus_per_grp ++ ;
276+ -- extra_grps ;
277+ }
278+
279+ /*
280+ * wrapping has to be considered given 'startgrp'
281+ * may start anywhere
282+ */
283+ if (* curgrp >= last_grp )
284+ * curgrp = 0 ;
285+ grp_spread_init_one (& masks [* curgrp ], nmsk , cpus_per_grp );
286+ }
287+ }
288+
289+ static int alloc_cluster_groups (unsigned int ncpus ,
290+ unsigned int ngroups ,
291+ struct cpumask * node_cpumask ,
292+ cpumask_var_t msk ,
293+ const struct cpumask * * * clusters_ptr ,
294+ struct node_groups * * cluster_groups_ptr )
295+ {
296+ unsigned int ncluster = 0 ;
297+ unsigned int cpu , nc , n ;
298+ const struct cpumask * cluster_mask ;
299+ const struct cpumask * * clusters ;
300+ struct node_groups * cluster_groups ;
301+
302+ cpumask_copy (msk , node_cpumask );
303+
304+ /* Probe how many clusters in this node. */
305+ while (1 ) {
306+ cpu = cpumask_first (msk );
307+ if (cpu >= nr_cpu_ids )
308+ break ;
309+
310+ cluster_mask = topology_cluster_cpumask (cpu );
311+ if (!cpumask_weight (cluster_mask ))
312+ goto no_cluster ;
313+ /* Clean out CPUs on the same cluster. */
314+ cpumask_andnot (msk , msk , cluster_mask );
315+ ncluster ++ ;
316+ }
317+
318+ /* If ngroups < ncluster, cross cluster is inevitable, skip. */
319+ if (ncluster == 0 || ncluster > ngroups )
320+ goto no_cluster ;
321+
322+ /* Allocate memory based on cluster number. */
323+ clusters = kcalloc (ncluster , sizeof (struct cpumask * ), GFP_KERNEL );
324+ if (!clusters )
325+ goto no_cluster ;
326+ cluster_groups = kcalloc (ncluster , sizeof (struct node_groups ), GFP_KERNEL );
327+ if (!cluster_groups )
328+ goto fail_cluster_groups ;
329+
330+ /* Filling cluster info for later process. */
331+ cpumask_copy (msk , node_cpumask );
332+ for (n = 0 ; n < ncluster ; n ++ ) {
333+ cpu = cpumask_first (msk );
334+ cluster_mask = topology_cluster_cpumask (cpu );
335+ nc = cpumask_weight_and (cluster_mask , node_cpumask );
336+ clusters [n ] = cluster_mask ;
337+ cluster_groups [n ].id = n ;
338+ cluster_groups [n ].ncpus = nc ;
339+ cpumask_andnot (msk , msk , cluster_mask );
340+ }
341+
342+ alloc_groups_to_nodes (ngroups , ncpus , cluster_groups , ncluster );
343+
344+ * clusters_ptr = clusters ;
345+ * cluster_groups_ptr = cluster_groups ;
346+ return ncluster ;
347+
348+ fail_cluster_groups :
349+ kfree (clusters );
350+ no_cluster :
351+ return 0 ;
352+ }
353+
354+ /*
355+ * Try group CPUs evenly for cluster locality within a NUMA node.
356+ *
357+ * Return: true if success, false otherwise.
358+ */
359+ static bool __try_group_cluster_cpus (unsigned int ncpus ,
360+ unsigned int ngroups ,
361+ struct cpumask * node_cpumask ,
362+ struct cpumask * masks ,
363+ unsigned int * curgrp ,
364+ unsigned int last_grp )
365+ {
366+ struct node_groups * cluster_groups ;
367+ const struct cpumask * * clusters ;
368+ unsigned int ncluster ;
369+ bool ret = false;
370+ cpumask_var_t nmsk ;
371+ unsigned int i , nc ;
372+
373+ if (!zalloc_cpumask_var (& nmsk , GFP_KERNEL ))
374+ goto fail_nmsk_alloc ;
375+
376+ ncluster = alloc_cluster_groups (ncpus , ngroups , node_cpumask , nmsk ,
377+ & clusters , & cluster_groups );
378+
379+ if (ncluster == 0 )
380+ goto fail_no_clusters ;
381+
382+ for (i = 0 ; i < ncluster ; i ++ ) {
383+ struct node_groups * nv = & cluster_groups [i ];
384+
385+ /* Get the cpus on this cluster. */
386+ cpumask_and (nmsk , node_cpumask , clusters [nv -> id ]);
387+ nc = cpumask_weight (nmsk );
388+ if (!nc )
389+ continue ;
390+ WARN_ON_ONCE (nv -> ngroups > nc );
391+
392+ assign_cpus_to_groups (nc , nmsk , nv , masks , curgrp , last_grp );
393+ }
394+
395+ ret = true;
396+ kfree (cluster_groups );
397+ kfree (clusters );
398+ fail_no_clusters :
399+ free_cpumask_var (nmsk );
400+ fail_nmsk_alloc :
401+ return ret ;
402+ }
403+
249404static int __group_cpus_evenly (unsigned int startgrp , unsigned int numgrps ,
250405 cpumask_var_t * node_to_cpumask ,
251406 const struct cpumask * cpu_mask ,
252407 struct cpumask * nmsk , struct cpumask * masks )
253408{
254- unsigned int i , n , nodes , cpus_per_grp , extra_grps , done = 0 ;
409+ unsigned int i , n , nodes , done = 0 ;
255410 unsigned int last_grp = numgrps ;
256411 unsigned int curgrp = startgrp ;
257412 nodemask_t nodemsk = NODE_MASK_NONE ;
@@ -287,7 +442,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
287442 alloc_nodes_groups (numgrps , node_to_cpumask , cpu_mask ,
288443 nodemsk , nmsk , node_groups );
289444 for (i = 0 ; i < nr_node_ids ; i ++ ) {
290- unsigned int ncpus , v ;
445+ unsigned int ncpus ;
291446 struct node_groups * nv = & node_groups [i ];
292447
293448 if (nv -> ngroups == UINT_MAX )
@@ -301,28 +456,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
301456
302457 WARN_ON_ONCE (nv -> ngroups > ncpus );
303458
304- /* Account for rounding errors */
305- extra_grps = ncpus - nv -> ngroups * (ncpus / nv -> ngroups );
306-
307- /* Spread allocated groups on CPUs of the current node */
308- for (v = 0 ; v < nv -> ngroups ; v ++ , curgrp ++ ) {
309- cpus_per_grp = ncpus / nv -> ngroups ;
310-
311- /* Account for extra groups to compensate rounding errors */
312- if (extra_grps ) {
313- cpus_per_grp ++ ;
314- -- extra_grps ;
315- }
316-
317- /*
318- * wrapping has to be considered given 'startgrp'
319- * may start anywhere
320- */
321- if (curgrp >= last_grp )
322- curgrp = 0 ;
323- grp_spread_init_one (& masks [curgrp ], nmsk ,
324- cpus_per_grp );
459+ if (__try_group_cluster_cpus (ncpus , nv -> ngroups , nmsk ,
460+ masks , & curgrp , last_grp )) {
461+ done += nv -> ngroups ;
462+ continue ;
325463 }
464+
465+ assign_cpus_to_groups (ncpus , nmsk , nv , masks , & curgrp ,
466+ last_grp );
326467 done += nv -> ngroups ;
327468 }
328469 kfree (node_groups );
0 commit comments