From d06beed024fb06ca5171e1d18726b99a7ed3a1b3 Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 5 May 2026 11:31:40 -0400 Subject: [PATCH 1/2] compiler: restrict nested parallelism to supported compilers (intel) --- conftest.py | 7 +++- devito/passes/iet/languages/openmp.py | 14 +++++++- devito/passes/iet/parpragma.py | 7 +++- tests/test_caching.py | 2 ++ tests/test_dle.py | 46 +++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index b2d49697fb..74854eb061 100644 --- a/conftest.py +++ b/conftest.py @@ -38,7 +38,7 @@ def skipif(items, whole_module=False): accepted.update({'device', 'device-C', 'device-openmp', 'device-openacc', 'device-aomp', 'cpu64-icc', 'cpu64-icx', 'cpu64-nvc', 'noadvisor', 'cpu64-arm', 'cpu64-icpx', 'chkpnt'}) - accepted.update({'nodevice', 'noomp'}) + accepted.update({'nodevice', 'noomp', 'nointel'}) unknown = sorted(set(items) - accepted) if unknown: raise ValueError(f"Illegal skipif argument(s) `{unknown}`") @@ -93,6 +93,11 @@ def skipif(items, whole_module=False): if i == 'noomp' and 'openmp' not in configuration['language']: skipit = "Must use openmp" break + # Skip if not using an Intel compiler + if i == 'nointel' and \ + not isinstance(configuration['compiler'], (IntelCompiler, OneapiCompiler)): + skipit = "Must use an Intel compiler" + break # Skip if it won't run on Arm if i == 'cpu64-arm' and isinstance(configuration['platform'], Arm): skipit = "Arm doesn't support x86-specific instructions" diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index 32ace9d473..ce5decc518 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -5,7 +5,9 @@ from sympy import And, Ne, Not from devito.arch import AMDGPUX, INTELGPUX, NVIDIAX, PVC -from devito.arch.compiler import CustomCompiler, GNUCompiler, NvidiaCompiler +from devito.arch.compiler import ( + CustomCompiler, GNUCompiler, IntelCompiler, 
NvidiaCompiler, OneapiCompiler +) from devito.ir import ( Call, Conditional, DeviceCall, FindSymbols, List, ParallelBlock, PointerCast, Pragma, Prodder, While @@ -276,6 +278,16 @@ def _support_complex_reduction(cls, compiler): # Gcc doesn't supports complex reduction return not isinstance(compiler, GNUCompiler) + @classmethod + def _support_nested_parallelism(cls, compiler): + # In case we have a CustomCompiler + if isinstance(compiler, CustomCompiler): + compiler = compiler._base() + if isinstance(compiler, (IntelCompiler, OneapiCompiler)): # noqa: SIM103 + return True + else: + return False + class Ompizer(AbstractOmpizer): langbb = OmpBB diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 3cb072104c..56eb6fb779 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -54,6 +54,10 @@ def _support_array_reduction(cls, compiler): def _support_complex_reduction(cls, compiler): return False + @classmethod + def _support_nested_parallelism(cls, compiler): + return False + @property def simd_reg_nbytes(self): return self.platform.simd_reg_nbytes @@ -344,7 +348,8 @@ def _make_guard(self, parregion): def _make_nested_partree(self, partree): # Apply heuristic - if self.nhyperthreads <= self.nested: + if self.nhyperthreads <= self.nested or \ + not self._support_nested_parallelism(self.compiler): return partree # Note: there might be multiple sub-trees amenable to nested parallelism, diff --git a/tests/test_caching.py b/tests/test_caching.py index 8bfaaae837..c56ec95bf2 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -5,6 +5,7 @@ import pytest from sympy import Expr +from conftest import skipif from devito import ( ConditionalDimension, Constant, DefaultDimension, Dimension, Eq, Function, Grid, Operator, SparseFunction, SparseTimeFunction, SubDimension, TensorFunction, @@ -467,6 +468,7 @@ def test_grid_dtypes(self): assert hash(grid0) != hash(grid1) + @skipif('nointel') def 
test_special_symbols(self): """ This test checks the singletonization, through the caching infrastructure, diff --git a/tests/test_dle.py b/tests/test_dle.py index d71eb8ae1e..843ec7233f 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -11,6 +11,7 @@ PrecomputedSparseTimeFunction, ReduceMax, ReduceMin, ReduceMinMax, SpaceDimension, SparseTimeFunction, SubDimension, TimeFunction, configuration, cos, dimensions, info ) +from devito.arch.compiler import IntelCompiler, OneapiCompiler from devito.exceptions import InvalidArgument from devito.ir.iet import ( Expression, FindNodes, IsPerfectIteration, Iteration, retrieve_iteration_tree @@ -1237,6 +1238,7 @@ def test_parallel_prec_inject(self): class TestNestedParallelism: + @skipif('nointel') def test_basic(self): grid = Grid(shape=(3, 3, 3)) @@ -1249,6 +1251,7 @@ def test_basic(self): 'par-dynamic-work': 0})) # Does it compile? Honoring the OpenMP specification isn't trivial + print(op) assert op.cfunction # Does it produce the right result @@ -1268,6 +1271,7 @@ def test_basic(self): assert iterations[2].pragmas[0].ccode.value ==\ 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' + @skipif('nointel') def test_collapsing(self): grid = Grid(shape=(3, 3, 3)) @@ -1276,6 +1280,7 @@ def test_collapsing(self): op = Operator(Eq(u.forward, u + f + 1), opt=('blocking', 'openmp', {'par-nested': 0, + 'cire-rotate': True, 'par-collapse-ncores': 1, 'par-collapse-work': 0, 'par-dynamic-work': 0})) @@ -1297,6 +1302,7 @@ def test_collapsing(self): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') + @skipif('nointel') def test_multiple_subnests_v0(self): grid = Grid(shape=(3, 3, 3)) x, y, z = grid.dimensions @@ -1329,6 +1335,7 @@ def test_multiple_subnests_v0(self): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') + @skipif('nointel') def test_multiple_subnests_v1(self): """ Unlike ``test_multiple_subnestes_v0``, now we use the 
``cire-rotate=True`` @@ -1367,6 +1374,7 @@ def test_multiple_subnests_v1(self): assert trees[-1][3].pragmas[0].ccode.value ==\ 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' + @skipif('nointel') @pytest.mark.parametrize('blocklevels', [1, 2]) def test_nested_cache_blocking_structure_subdims(self, blocklevels): """ @@ -1430,6 +1438,7 @@ def test_nested_cache_blocking_structure_subdims(self, blocklevels): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') + @skipif('nointel') @pytest.mark.parametrize('exprs,collapsed,scheduling', [ (['Eq(u.forward, u.dx)'], '2', 'static'), (['Eq(u.forward, u.dy)'], '2', 'static'), @@ -1461,3 +1470,40 @@ def test_collapsing_w_wo_halo(self, exprs, collapsed, scheduling): assert iterations[1].pragmas[0].ccode.value ==\ "".join([ompfor_string, scheduling_string]) + + @skipif('device') + def test_nested_parallelism_support(self): + grid = Grid(shape=(10, 10, 10)) + + f = Function(name='f', grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + v1 = TimeFunction(name="v1", grid=grid, space_order=4) + + f.data_with_halo[:] = 0.5 + v.data_with_halo[:] = 1. + v1.data_with_halo[:] = 1. 
+ + eqn = Eq(v.forward, (v.dx * (1 + 2*f) * f).dx) + op = Operator(eqn, opt=('advanced', {'openmp': True, 'par-nested': 0})) + + bns, _ = assert_blocking(op, {'x0_blk0'}) + trees = retrieve_iteration_tree(bns['x0_blk0']) + assert len(trees) == 2 + + # Check omp pragmas + assert trees[0][0].pragmas[0].ccode.value == \ + 'omp for collapse(2) schedule(dynamic,1)' + if isinstance(configuration['compiler'], (IntelCompiler, OneapiCompiler)): + # Supports nested parallelism + assert trees[0][2].pragmas[0].ccode.value == \ + '#pragma omp parallel for collapse(2) schedule(dynamic,1)'\ + ' num_threads(nthreads_nested)' + assert trees[1][2].pragmas[0].ccode.value == \ + trees[0][2].pragmas[0].ccode.value + else: + # Most compilers don't support nested parallelism + assert not trees[0][2].pragmas + assert not trees[1][2].pragmas + + # Should compile properly + op.cfunction # noqa: B018 From 192a7b589a02a8844b2836920c686f4d56a8f1d8 Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 5 May 2026 13:03:44 -0400 Subject: [PATCH 2/2] compiler: refine nested parallel support with decl check --- conftest.py | 7 +------ devito/arch/compiler.py | 2 +- devito/passes/iet/languages/openmp.py | 8 ++++---- devito/passes/iet/parpragma.py | 15 ++++++++++++--- tests/test_caching.py | 5 ++--- tests/test_dle.py | 26 ++++++++++++-------------- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/conftest.py b/conftest.py index 74854eb061..b2d49697fb 100644 --- a/conftest.py +++ b/conftest.py @@ -38,7 +38,7 @@ def skipif(items, whole_module=False): accepted.update({'device', 'device-C', 'device-openmp', 'device-openacc', 'device-aomp', 'cpu64-icc', 'cpu64-icx', 'cpu64-nvc', 'noadvisor', 'cpu64-arm', 'cpu64-icpx', 'chkpnt'}) - accepted.update({'nodevice', 'noomp', 'nointel'}) + accepted.update({'nodevice', 'noomp'}) unknown = sorted(set(items) - accepted) if unknown: raise ValueError(f"Illegal skipif argument(s) `{unknown}`") @@ -93,11 +93,6 @@ def skipif(items, whole_module=False): if i == 
'noomp' and 'openmp' not in configuration['language']: skipit = "Must use openmp" break - # Skip if not using an Intel compiler - if i == 'nointel' and \ - not isinstance(configuration['compiler'], (IntelCompiler, OneapiCompiler)): - skipit = "Must use an Intel compiler" - break # Skip if it won't run on Arm if i == 'cpu64-arm' and isinstance(configuration['platform'], Arm): skipit = "Arm doesn't support x86-specific instructions" diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py index 58b1e30204..aaa584aaa2 100644 --- a/devito/arch/compiler.py +++ b/devito/arch/compiler.py @@ -894,7 +894,7 @@ def __lookup_cmds__(self): check_output(["mpiicc", f"-cc={self.CC}", "--version"]).decode("utf-8") self.MPICC = 'mpiicc' self.MPICXX = 'mpicxx' - except FileNotFoundError: + except (FileNotFoundError, CalledProcessError): self.MPICC = 'mpicc' self.MPICXX = 'mpicxx' diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index ce5decc518..242887fa8b 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -283,10 +283,10 @@ def _support_nested_parallelism(cls, compiler): # In case we have a CustomCompiler if isinstance(compiler, CustomCompiler): compiler = compiler._base() - if isinstance(compiler, (IntelCompiler, OneapiCompiler)): # noqa: SIM103 - return True - else: - return False + # Only supported by icc (IntelCompiler) but not by + # OneAPI's DPC++ compiler (OneapiCompiler) that inherits from IntelCompiler + return isinstance(compiler, IntelCompiler) and not \ + isinstance(compiler, OneapiCompiler) class Ompizer(AbstractOmpizer): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 56eb6fb779..d5752fec52 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -346,10 +346,18 @@ def _make_parregion(self, partree, parrays): def _make_guard(self, parregion): return parregion + def _support_uindices(self, uindices): + if not 
uindices: + # No secondary indices, so we can apply nested parallelism + return True + else: + # Secondary indices require compiler support for nested parallelism + # over multi-index loops such as for(int i = 0, j = 1; ...) + return self._support_nested_parallelism(self.compiler) + def _make_nested_partree(self, partree): # Apply heuristic - if self.nhyperthreads <= self.nested or \ - not self._support_nested_parallelism(self.compiler): + if self.nhyperthreads <= self.nested: return partree # Note: there might be multiple sub-trees amenable to nested parallelism, @@ -371,7 +379,8 @@ def _make_nested_partree(self, partree): # within a block) candidates = [] for i in inner: - if self.key(i) and any((j.dim.root is i.dim.root) for j in outer): + if self.key(i) and any((j.dim.root is i.dim.root) for j in outer) and \ + self._support_uindices(i.uindices): candidates.append(i) elif candidates: # If there's at least one candidate but `i` doesn't honor the diff --git a/tests/test_caching.py b/tests/test_caching.py index c56ec95bf2..2df7dc516d 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -5,12 +5,11 @@ import pytest from sympy import Expr -from conftest import skipif from devito import ( ConditionalDimension, Constant, DefaultDimension, Dimension, Eq, Function, Grid, Operator, SparseFunction, SparseTimeFunction, SubDimension, TensorFunction, TensorTimeFunction, TimeFunction, VectorFunction, VectorTimeFunction, _SymbolCache, - clear_cache, solve + clear_cache, solve, switchconfig ) from devito.types import ( DeviceID, LocalObject, NPThreads, NThreadsBase, Object, Scalar, Symbol, ThreadID @@ -468,7 +467,7 @@ def test_grid_dtypes(self): assert hash(grid0) != hash(grid1) - @skipif('nointel') + @switchconfig(compiler='icc') def test_special_symbols(self): """ This test checks the singletonization, through the caching infrastructure, diff --git a/tests/test_dle.py b/tests/test_dle.py index 843ec7233f..b672465dba 100644 --- a/tests/test_dle.py +++ 
b/tests/test_dle.py @@ -9,7 +9,8 @@ from 
devito import ( CustomDimension, DefaultDimension, Dimension, Eq, Function, Grid, Inc, Operator, PrecomputedSparseTimeFunction, ReduceMax, ReduceMin, ReduceMinMax, SpaceDimension, - SparseTimeFunction, SubDimension, TimeFunction, configuration, cos, dimensions, info + SparseTimeFunction, SubDimension, TimeFunction, configuration, cos, dimensions, info, + switchconfig ) from devito.arch.compiler import IntelCompiler, OneapiCompiler from devito.exceptions import InvalidArgument @@ -1238,7 +1239,6 @@ def test_parallel_prec_inject(self): class TestNestedParallelism: - @skipif('nointel') def test_basic(self): grid = Grid(shape=(3, 3, 3)) @@ -1251,7 +1251,6 @@ def test_basic(self): 'par-dynamic-work': 0})) # Does it compile? Honoring the OpenMP specification isn't trivial - print(op) assert op.cfunction # Does it produce the right result @@ -1271,7 +1270,6 @@ def test_basic(self): assert iterations[2].pragmas[0].ccode.value ==\ 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' - @skipif('nointel') def test_collapsing(self): grid = Grid(shape=(3, 3, 3)) @@ -1280,7 +1278,6 @@ def test_collapsing(self): op = Operator(Eq(u.forward, u + f + 1), opt=('blocking', 'openmp', {'par-nested': 0, - 'cire-rotate': True, 'par-collapse-ncores': 1, 'par-collapse-work': 0, 'par-dynamic-work': 0})) @@ -1302,7 +1299,7 @@ def test_collapsing(self): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') - @skipif('nointel') + @switchconfig(compiler='icc') def test_multiple_subnests_v0(self): grid = Grid(shape=(3, 3, 3)) x, y, z = grid.dimensions @@ -1335,7 +1332,7 @@ def test_multiple_subnests_v0(self): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') - @skipif('nointel') + @switchconfig(compiler='icc') def test_multiple_subnests_v1(self): """ Unlike ``test_multiple_subnestes_v0``, now we use the ``cire-rotate=True`` @@ -1374,7 +1371,6 @@ def test_multiple_subnests_v1(self): assert 
trees[-1][3].pragmas[0].ccode.value ==\ 'omp parallel for schedule(dynamic,1) num_threads(nthreads_nested)' - @skipif('nointel') @pytest.mark.parametrize('blocklevels', [1, 2]) def test_nested_cache_blocking_structure_subdims(self, blocklevels): """ @@ -1438,7 +1434,6 @@ def test_nested_cache_blocking_structure_subdims(self, blocklevels): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') - @skipif('nointel') @pytest.mark.parametrize('exprs,collapsed,scheduling', [ (['Eq(u.forward, u.dx)'], '2', 'static'), (['Eq(u.forward, u.dy)'], '2', 'static'), @@ -1471,7 +1466,6 @@ def test_collapsing_w_wo_halo(self, exprs, collapsed, scheduling): assert iterations[1].pragmas[0].ccode.value ==\ "".join([ompfor_string, scheduling_string]) - @skipif('device') def test_nested_parallelism_support(self): grid = Grid(shape=(10, 10, 10)) @@ -1484,7 +1478,9 @@ def test_nested_parallelism_support(self): v1.data_with_halo[:] = 1. eqn = Eq(v.forward, (v.dx * (1 + 2*f) * f).dx) - op = Operator(eqn, opt=('advanced', {'openmp': True, 'par-nested': 0})) + op = Operator(eqn, opt=('advanced', {'openmp': True, + 'par-collapse-ncores': 1, + 'par-nested': 0})) bns, _ = assert_blocking(op, {'x0_blk0'}) trees = retrieve_iteration_tree(bns['x0_blk0']) @@ -1493,13 +1489,15 @@ def test_nested_parallelism_support(self): # Check omp pragmas assert trees[0][0].pragmas[0].ccode.value == \ 'omp for collapse(2) schedule(dynamic,1)' - if isinstance(configuration['compiler'], (IntelCompiler, OneapiCompiler)): + if isinstance(configuration['compiler'], IntelCompiler) and \ + not isinstance(configuration['compiler'], OneapiCompiler): # Supports nested parallelism assert trees[0][2].pragmas[0].ccode.value == \ - '#pragma omp parallel for collapse(2) schedule(dynamic,1)'\ + 'omp parallel for collapse(2) schedule(dynamic,1)'\ ' num_threads(nthreads_nested)' assert trees[1][2].pragmas[0].ccode.value == \ - trees[0][2].pragmas[0].ccode.value + 'omp parallel for collapse(2) 
schedule(static,1)'\ + ' num_threads(nthreads_nested)' else: # Most compilers don't support nested parallelism assert not trees[0][2].pragmas