diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py index 58b1e30204..aaa584aaa2 100644 --- a/devito/arch/compiler.py +++ b/devito/arch/compiler.py @@ -894,7 +894,7 @@ def __lookup_cmds__(self): check_output(["mpiicc", f"-cc={self.CC}", "--version"]).decode("utf-8") self.MPICC = 'mpiicc' self.MPICXX = 'mpicxx' - except FileNotFoundError: + except (FileNotFoundError, CalledProcessError): self.MPICC = 'mpicc' self.MPICXX = 'mpicxx' diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index 32ace9d473..242887fa8b 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -5,7 +5,9 @@ from sympy import And, Ne, Not from devito.arch import AMDGPUX, INTELGPUX, NVIDIAX, PVC -from devito.arch.compiler import CustomCompiler, GNUCompiler, NvidiaCompiler +from devito.arch.compiler import ( + CustomCompiler, GNUCompiler, IntelCompiler, NvidiaCompiler, OneapiCompiler +) from devito.ir import ( Call, Conditional, DeviceCall, FindSymbols, List, ParallelBlock, PointerCast, Pragma, Prodder, While @@ -276,6 +278,16 @@ def _support_complex_reduction(cls, compiler): # Gcc doesn't supports complex reduction return not isinstance(compiler, GNUCompiler) + @classmethod + def _support_nested_parallelism(cls, compiler): + # In case we have a CustomCompiler + if isinstance(compiler, CustomCompiler): + compiler = compiler._base() + # Only supported by icc (IntelCompiler) but not by + # OneAPI's DPC++ compiler (OneapiCompiler) that inherits from IntelCompiler + return isinstance(compiler, IntelCompiler) and not \ + isinstance(compiler, OneapiCompiler) + class Ompizer(AbstractOmpizer): langbb = OmpBB diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 3cb072104c..d5752fec52 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -54,6 +54,10 @@ def _support_array_reduction(cls, compiler): def _support_complex_reduction(cls, compiler): return False + 
@classmethod + def _support_nested_parallelism(cls, compiler): + return False + @property def simd_reg_nbytes(self): return self.platform.simd_reg_nbytes @@ -342,6 +346,15 @@ def _make_parregion(self, partree, parrays): def _make_guard(self, parregion): return parregion + def _support_uindices(self, uindices): + if not uindices: + # No secondary indices, so we can apply nested parallelism + return True + else: + # Compiler supports nested parallelism with multiple indices + # such as for(int i = 0, j=1; ...) + return self._support_nested_parallelism(self.compiler) + def _make_nested_partree(self, partree): # Apply heuristic if self.nhyperthreads <= self.nested: @@ -366,7 +379,8 @@ def _make_nested_partree(self, partree): # within a block) candidates = [] for i in inner: - if self.key(i) and any((j.dim.root is i.dim.root) for j in outer): + if self.key(i) and any((j.dim.root is i.dim.root) for j in outer) and \ + self._support_uindices(i.uindices): candidates.append(i) elif candidates: # If there's at least one candidate but `i` doesn't honor the diff --git a/tests/test_caching.py b/tests/test_caching.py index 8bfaaae837..2df7dc516d 100644 --- a/tests/test_caching.py +++ b/tests/test_caching.py @@ -9,7 +9,7 @@ ConditionalDimension, Constant, DefaultDimension, Dimension, Eq, Function, Grid, Operator, SparseFunction, SparseTimeFunction, SubDimension, TensorFunction, TensorTimeFunction, TimeFunction, VectorFunction, VectorTimeFunction, _SymbolCache, - clear_cache, solve + clear_cache, solve, switchconfig ) from devito.types import ( DeviceID, LocalObject, NPThreads, NThreadsBase, Object, Scalar, Symbol, ThreadID @@ -467,6 +467,7 @@ def test_grid_dtypes(self): assert hash(grid0) != hash(grid1) + @switchconfig(compiler='icc') def test_special_symbols(self): """ This test checks the singletonization, through the caching infrastructure, diff --git a/tests/test_dle.py b/tests/test_dle.py index d71eb8ae1e..b672465dba 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ 
-9,8 +9,10 @@ from devito import ( CustomDimension, DefaultDimension, Dimension, Eq, Function, Grid, Inc, Operator, PrecomputedSparseTimeFunction, ReduceMax, ReduceMin, ReduceMinMax, SpaceDimension, - SparseTimeFunction, SubDimension, TimeFunction, configuration, cos, dimensions, info + SparseTimeFunction, SubDimension, TimeFunction, configuration, cos, dimensions, info, + switchconfig ) +from devito.arch.compiler import IntelCompiler, OneapiCompiler from devito.exceptions import InvalidArgument from devito.ir.iet import ( Expression, FindNodes, IsPerfectIteration, Iteration, retrieve_iteration_tree @@ -1297,6 +1299,7 @@ def test_collapsing(self): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') + @switchconfig(compiler='icc') def test_multiple_subnests_v0(self): grid = Grid(shape=(3, 3, 3)) x, y, z = grid.dimensions @@ -1329,6 +1332,7 @@ def test_multiple_subnests_v0(self): ('omp parallel for collapse(2) schedule(dynamic,1) ' 'num_threads(nthreads_nested)') + @switchconfig(compiler='icc') def test_multiple_subnests_v1(self): """ Unlike ``test_multiple_subnestes_v0``, now we use the ``cire-rotate=True`` @@ -1461,3 +1465,43 @@ def test_collapsing_w_wo_halo(self, exprs, collapsed, scheduling): assert iterations[1].pragmas[0].ccode.value ==\ "".join([ompfor_string, scheduling_string]) + + def test_nested_parallelism_support(self): + grid = Grid(shape=(10, 10, 10)) + + f = Function(name='f', grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + v1 = TimeFunction(name="v1", grid=grid, space_order=4) + + f.data_with_halo[:] = 0.5 + v.data_with_halo[:] = 1. + v1.data_with_halo[:] = 1. 
+ + eqn = Eq(v.forward, (v.dx * (1 + 2*f) * f).dx) + op = Operator(eqn, opt=('advanced', {'openmp': True, + 'par-collapse-ncores': 1, + 'par-nested': 0})) + + bns, _ = assert_blocking(op, {'x0_blk0'}) + trees = retrieve_iteration_tree(bns['x0_blk0']) + assert len(trees) == 2 + + # Check omp pragmas + assert trees[0][0].pragmas[0].ccode.value == \ + 'omp for collapse(2) schedule(dynamic,1)' + if isinstance(configuration['compiler'], IntelCompiler) and \ + not isinstance(configuration['compiler'], OneapiCompiler): + # Supports nested parallelism + assert trees[0][2].pragmas[0].ccode.value == \ + 'omp parallel for collapse(2) schedule(dynamic,1)'\ + ' num_threads(nthreads_nested)' + assert trees[1][2].pragmas[0].ccode.value == \ + 'omp parallel for collapse(2) schedule(static,1)'\ + ' num_threads(nthreads_nested)' + else: + # Most compilers don't support nested parallelism + assert not trees[0][2].pragmas + assert not trees[1][2].pragmas + + # Should compile properly + op.cfunction # noqa: B018