From 1608a5b40ccbb1810cea0d992b3fb45f218c0d2d Mon Sep 17 00:00:00 2001 From: xsjk Date: Fri, 8 May 2026 02:37:09 +0800 Subject: [PATCH 1/6] Fix CUDA minmax reduction for volatile collectors --- compyle/array.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/compyle/array.py b/compyle/array.py index 5b5a034..e737ac4 100644 --- a/compyle/array.py +++ b/compyle/array.py @@ -97,6 +97,22 @@ def get_backend(backend=None): minmax_operator_tpl = """ + __device__ ${dtype}() + { + } + + __device__ ${dtype}(${dtype} const volatile &src) + { + % for prop in prop_names: + % if not only_max: + this->cur_min_${prop} = src.cur_min_${prop}; + % endif + % if not only_min: + this->cur_max_${prop} = src.cur_max_${prop}; + % endif + % endfor + } + __device__ ${dtype} volatile &operator=( ${dtype} const &src) volatile { From 4e1470e1a828f710210f9b13f7c463a842543f68 Mon Sep 17 00:00:00 2001 From: xsjk Date: Fri, 3 Jul 2026 15:51:00 +0800 Subject: [PATCH 2/6] Support Python 3.14 AST constants --- compyle/ast_utils.py | 12 ++++++++++++ compyle/cython_generator.py | 9 +++++---- compyle/jit.py | 15 +++++++++++---- compyle/template.py | 11 +++++++++-- compyle/translator.py | 20 ++++++++++++++++---- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/compyle/ast_utils.py b/compyle/ast_utils.py index e622142..eaa190d 100644 --- a/compyle/ast_utils.py +++ b/compyle/ast_utils.py @@ -9,6 +9,18 @@ basestring = str if PY_VER > 2 else basestring +def get_string_value(node): + """Return a string literal's value or None if *node* is not a string.""" + ast_constant = getattr(ast, 'Constant', None) + if ast_constant is not None and isinstance(node, ast_constant) and \ + isinstance(node.value, str): + return node.value + ast_str = getattr(ast, 'Str', None) + if ast_str is not None and isinstance(node, ast_str): + return node.s + return None + + class NameLister(ast.NodeVisitor): """Utility class to collect the Names in an AST. """ diff --git a/compyle/cython_generator.py b/compyle/cython_generator.py index a0e7e51..7ab9ba9 100644 --- a/compyle/cython_generator.py +++ b/compyle/cython_generator.py @@ -21,7 +21,7 @@ from .types import KnownType, Undefined, get_declare_info from .config import get_config -from .ast_utils import get_assigned, has_return +from .ast_utils import get_assigned, get_string_value, has_return from .utils import getsourcelines logger = logging.getLogger(__name__) @@ -247,11 +247,12 @@ def parse_declare(code): if call.func.id != 'declare': raise CodeGenerationError('Unknown declare statement: %s' % code) arg0 = call.args[0] - if not isinstance(arg0, ast.Str): - err = 'Type should be a string, given :%r' % arg0.s + type_str = get_string_value(arg0) + if type_str is None: + err = 'Type should be a string, given :%r' % getattr(arg0, 'value', arg0) raise CodeGenerationError(err) - return get_declare_info(arg0.s) + return get_declare_info(type_str) class CythonGenerator(object): diff --git a/compyle/jit.py b/compyle/jit.py index 080fd42..a2585df 100644 --- a/compyle/jit.py +++ b/compyle/jit.py @@ -8,6 +8,7 @@ import time from pytools import memoize from .config import get_config +from .ast_utils import get_string_value from .cython_generator import CythonGenerator from .transpiler import Transpiler, BUILTINS from .types import (dtype_to_ctype, get_declare_info, @@ -198,15 +199,16 @@ def warn(self, message, node): warnings.warn(msg) def visit_declare(self, node): - if not isinstance(node.args[0], ast.Str): + type_str = get_string_value(node.args[0]) + if type_str is None: self.error("Argument to declare should be a string.", node) - type_str = node.args[0].s return self.get_declare_type(type_str) def visit_cast(self, node): - if not isinstance(node.args[1], ast.Str): + type_str = get_string_value(node.args[1]) + if type_str is None: self.error("Cast type should be a string.", node) - return node.args[1].s + return type_str def visit_address(self, node): base_type = self.visit(node.args[0]) @@ -294,6 +296,11 @@ def visit_BinOp(self, node): def visit_Num(self, node): return get_ctype_from_arg(node.n) + def visit_Constant(self, node): + if isinstance(node.value, str): + return None + return get_ctype_from_arg(node.value) + def visit_UnaryOp(self, node): return self.visit(node.operand) diff --git a/compyle/template.py b/compyle/template.py index 55e6a78..c1c887c 100644 --- a/compyle/template.py +++ b/compyle/template.py @@ -9,6 +9,13 @@ getfullargspec = inspect.getfullargspec +def _string_value(node): + value = node.value + if isinstance(value, ast.Constant): + return value.value + return value.s + + class Template(object): def __init__(self, name): self.name = name @@ -45,8 +52,8 @@ def _get_code(self): args += extra_args arg_string = ', '.join(args) body = m.body[0].body - template = body[-1].value.s - docstring = body[0].value.s if len(body) == 2 else '' + template = _string_value(body[-1]) + docstring = _string_value(body[0]) if len(body) == 2 else '' name = self.name sig = 'def {name}({args}):\n """{docs}\n """'.format( name=name, args=arg_string, docs=docstring diff --git a/compyle/translator.py b/compyle/translator.py index 7a10a92..c786222 100644 --- a/compyle/translator.py +++ b/compyle/translator.py @@ -25,6 +25,7 @@ from .cython_generator import ( CodeGenerationError, KnownType, Undefined, all_numeric ) +from .ast_utils import get_string_value from .utils import getsource PY_VER = sys.version_info.major @@ -235,7 +236,7 @@ def _indent_block(self, code): def _remove_docstring(self, body): if body and isinstance(body[0], ast.Expr) and \ - isinstance(body[0].value, ast.Str): + get_string_value(body[0].value) is not None: return body[1:] else: return body @@ -351,9 +352,9 @@ def visit_Assign(self, node): left, right = node.targets[0], node.value if isinstance(right, ast.Call) and \ isinstance(right.func, ast.Name) and right.func.id == 'declare': - if not isinstance(right.args[0], ast.Str): + type = get_string_value(right.args[0]) + if type is None: self.error("Argument to declare should be a string.", node) - type = right.args[0].s if isinstance(left, ast.Name): self._known.add(left.id) return self._get_variable_declaration(type, [self.visit(left)]) @@ -395,7 +396,10 @@ def visit_Call(self, node): elif 'atomic' in node.func.id: return self.render_atomic(node.func.id, node.args[0]) elif node.func.id == 'cast': - return '(%s) (%s)' % (node.args[1].s, self.visit(node.args[0])) + type_str = get_string_value(node.args[1]) + if type_str is None: + self.error("Cast type should be a string.", node) + return '(%s) (%s)' % (type_str, self.visit(node.args[0])) else: return '{func}({args})'.format( func=node.func.id, @@ -682,6 +686,14 @@ def visit_NameConstant(self, node): else: return value + def visit_Constant(self, node): + value = node.value + if value is True or value is False or value is None: + return self._replacements[value] + if isinstance(value, str): + return r'"%s"' % value + return literal_to_float(value, self._use_double) + def visit_Not(self, node): return '!' From 03728ccf4b02a1de91797a1f063694c4edc85490 Mon Sep 17 00:00:00 2001 From: xsjk Date: Tue, 23 Jun 2026 21:35:34 +0800 Subject: [PATCH 3/6] Suppress pytools cache sync warning --- compyle/cuda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compyle/cuda.py b/compyle/cuda.py index f888de6..fe9002c 100644 --- a/compyle/cuda.py +++ b/compyle/cuda.py @@ -1396,7 +1396,8 @@ def __init__(self, dtype, generic_scan_kernel_cache = WriteOncePersistentDict( "pycuda-generated-scan-kernel-cache-v1", - key_builder=_NumpyTypesKeyBuilder()) + key_builder=_NumpyTypesKeyBuilder(), + safe_sync=False) class GenericScanKernel(_GenericScanKernelBase): From 5545d59a3351559e6105f6c48bdee603e480097a Mon Sep 17 00:00:00 2001 From: xsjk Date: Wed, 24 Jun 2026 15:23:33 +0800 Subject: [PATCH 4/6] Fix CUDA kernel profiling return semantics --- compyle/profile.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/compyle/profile.py b/compyle/profile.py index 09bd946..b207dc5 100644 --- a/compyle/profile.py +++ b/compyle/profile.py @@ -179,9 +179,16 @@ def _profile_knl(*args, **kwargs): _record_profile(name, end - start) return event elif backend == 'cuda': - exec_time = kernel(*args, **kwargs, time_kernel=True) - _record_profile(name, exec_time) - return exec_time + from pycuda import driver as cuda + stream = kwargs.get('stream') + start = cuda.Event() + end = cuda.Event() + start.record(stream) + result = kernel(*args, **kwargs) + end.record(stream) + end.synchronize() + _record_profile(name, end.time_since(start) * 1e-3) + return result else: start = time.time() kernel(*args, **kwargs) From 7a9e3e7de927025c23f600a7affaf7acc3b712d7 Mon Sep 17 00:00:00 2001 From: xsjk Date: Tue, 30 Jun 2026 16:11:50 +0800 Subject: [PATCH 5/6] Use pooled CUDA allocations --- compyle/array.py | 9 ++++++--- compyle/cuda.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/compyle/array.py b/compyle/array.py index e737ac4..e452913 100644 --- a/compyle/array.py +++ b/compyle/array.py @@ -303,7 +303,8 @@ def to_device(array, backend='cython'): out = gpuarray.to_device(get_queue(), array) elif backend == 'cuda': import pycuda.gpuarray as gpuarray - out = gpuarray.to_gpu(array) + from .cuda import get_cuda_allocator + out = gpuarray.to_gpu(array, allocator=get_cuda_allocator()) return wrap_array(out, backend) @@ -341,7 +342,8 @@ def empty(n, dtype, backend='cython'): out = gpuarray.empty(get_queue(), n, dtype) elif backend == 'cuda': import pycuda.gpuarray as gpuarray - out = gpuarray.empty(n, dtype) + from .cuda import get_cuda_allocator + out = gpuarray.empty(n, dtype, allocator=get_cuda_allocator()) else: out = np.empty(n, dtype=dtype) return wrap_array(out, backend) @@ -358,7 +360,8 @@ def zeros(n, dtype, backend='cython'): out = gpuarray.zeros(get_queue(), n, dtype) elif backend == 'cuda': import pycuda.gpuarray as gpuarray - out = gpuarray.zeros(n, dtype) + from .cuda import get_cuda_allocator + out = gpuarray.zeros(n, dtype, allocator=get_cuda_allocator()) else: out = np.zeros(n, dtype=dtype) return wrap_array(out, backend) diff --git a/compyle/cuda.py b/compyle/cuda.py index fe9002c..745f04f 100644 --- a/compyle/cuda.py +++ b/compyle/cuda.py @@ -18,6 +18,7 @@ import numpy as np import six _cuda_ctx = False +_cuda_memory_pool = None def set_context(): @@ -27,6 +28,15 @@ def set_context(): _cuda_ctx = True +def get_cuda_allocator(): + global _cuda_memory_pool + set_context() + if _cuda_memory_pool is None: + from pycuda.tools import DeviceMemoryPool + _cuda_memory_pool = DeviceMemoryPool() + return _cuda_memory_pool.allocate + + # The following code is taken from pyopencl for struct mapping. # it should be ported over to pycuda eventually. import pycuda.gpuarray as gpuarray # noqa From b14869ade0b13624b1dae98988b34dab5a37f9d5 Mon Sep 17 00:00:00 2001 From: xsjk Date: Fri, 3 Jul 2026 15:51:32 +0800 Subject: [PATCH 6/6] perf: reduce CUDA host synchronization overhead --- compyle/jit.py | 27 +++++----- compyle/low_level.py | 9 ++-- compyle/parallel.py | 27 +++++----- compyle/tests/test_jit.py | 91 ++++++++++++++++++++++++++++++++- compyle/tests/test_low_level.py | 47 +++++++++++++++++ compyle/tests/test_parallel.py | 83 +++++++++++++++++++++++++++++- 6 files changed, 254 insertions(+), 30 deletions(-) diff --git a/compyle/jit.py b/compyle/jit.py index a2585df..43ed0ba 100644 --- a/compyle/jit.py +++ b/compyle/jit.py @@ -370,11 +370,12 @@ def __call__(self, *args, **kw): c_func(*c_args, **kw) self.queue.finish() elif self.backend == 'cuda': - import pycuda.driver as drv - event = drv.Event() c_func(*c_args, **kw) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() class ReductionJIT(parallel.ReductionBase): @@ -449,11 +450,12 @@ def __call__(self, *args, **kw): self.queue.finish() return result.get() elif self.backend == 'cuda': - import pycuda.driver as drv - event = drv.Event() result = c_func(*c_args, **kw) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() return result.get() @@ -569,8 +571,9 @@ def __call__(self, **kwargs): c_func(*[c_args_dict[k] for k in output_arg_keys]) self.queue.finish() elif self.backend == 'cuda': - import pycuda.driver as drv - event = drv.Event() c_func(*[c_args_dict[k] for k in output_arg_keys]) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() diff --git a/compyle/low_level.py b/compyle/low_level.py index 280e801..3ea567e 100644 --- a/compyle/low_level.py +++ b/compyle/low_level.py @@ -260,15 +260,16 @@ def __call__(self, *args, **kw): self.knl(*c_args) self.queue.finish() elif self.backend == 'cuda': - import pycuda.driver as drv shared_mem_size = int(self._get_local_size(args, ls[0])) num_blocks = int((n + ls[0] - 1) / ls[0]) num_tpb = int(ls[0]) - event = drv.Event() self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1), shared=shared_mem_size) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() class _prange(Extern): diff --git a/compyle/parallel.py b/compyle/parallel.py index b1c53ec..0976148 100644 --- a/compyle/parallel.py +++ b/compyle/parallel.py @@ -547,11 +547,12 @@ def __call__(self, *args, **kw): self.c_func(*c_args, **kw) self.queue.finish() elif self.backend == 'cuda': - import pycuda.driver as drv - event = drv.Event() self.c_func(*c_args, **kw) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() class Elementwise(object): @@ -809,11 +810,12 @@ def __call__(self, *args): self.queue.finish() return result.get() elif self.backend == 'cuda': - import pycuda.driver as drv - event = drv.Event() result = self.c_func(*c_args) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() return result.get() @@ -1229,11 +1231,12 @@ def __call__(self, **kwargs): self.c_func(*[c_args_dict[k] for k in output_arg_keys]) self.queue.finish() elif self.backend == 'cuda': - import pycuda.driver as drv - event = drv.Event() self.c_func(*[c_args_dict[k] for k in output_arg_keys]) - event.record() - event.synchronize() + if get_config().profile: + import pycuda.driver as drv + event = drv.Event() + event.record() + event.synchronize() class Scan(object): diff --git a/compyle/tests/test_jit.py b/compyle/tests/test_jit.py index 3f09a75..110dd72 100644 --- a/compyle/tests/test_jit.py +++ b/compyle/tests/test_jit.py @@ -1,12 +1,16 @@ from math import sin import unittest import numpy as np +from unittest.mock import patch from pytest import importorskip from ..config import get_config, use_config from ..array import wrap -from ..jit import get_binop_return_type, AnnotationHelper +from ..jit import ( + AnnotationHelper, ElementwiseJIT, ReductionJIT, ScanJIT, + get_binop_return_type +) from ..types import annotate from ..parallel import Elementwise, Reduction, Scan @@ -32,6 +36,91 @@ def undeclared_f(a, b): return g(h_ab) +class TestCUDAJITSynchronization(unittest.TestCase): + def _patch_cuda_event(self): + sync_calls = [] + + class FakeEvent: + def record(self): + pass + + def synchronize(self): + sync_calls.append("sync") + + return sync_calls, patch("pycuda.driver.Event", FakeEvent) + + def test_cuda_elementwise_jit_does_not_synchronize_without_profile(self): + importorskip("pycuda") + + @annotate + def axpb(i, x): + x[i] = x[i] + 1.0 + + kernel = ElementwiseJIT(axpb, backend="cuda") + kernel._generate_kernel = lambda *args: lambda *c_args, **kw: None + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + kernel(np.zeros(8)) + + assert sync_calls == [] + + def test_cuda_scan_jit_does_not_synchronize_without_profile(self): + importorskip("pycuda") + + @annotate(input="doublep", return_="double") + def input_expr(i, input): + return input[i] + + @annotate(output="doublep", item="double") + def output_expr(i, item, output): + output[i] = item + + scan = ScanJIT(input=input_expr, output=output_expr, backend="cuda") + output_expr.arg_keys = {scan._get_backend_key(): ["input", "output"]} + scan._generate_kernel = lambda **kwargs: lambda *c_args: None + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + scan(input=np.zeros(8), output=np.zeros(8)) + + assert sync_calls == [] + + def test_cuda_reduction_jit_does_not_event_synchronize_without_profile(self): + importorskip("pycuda") + + class FakeResult: + def get(self): + return 1.0 + + reduction = ReductionJIT("a+b", backend="cuda") + reduction._generate_kernel = ( + lambda *args: lambda *c_args, **kw: FakeResult() + ) + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + assert reduction(np.zeros(8)) == 1.0 + + assert sync_calls == [] + + def test_cuda_elementwise_jit_synchronizes_with_profile(self): + importorskip("pycuda") + + @annotate + def axpb(i, x): + x[i] = x[i] + 1.0 + + kernel = ElementwiseJIT(axpb, backend="cuda") + kernel._generate_kernel = lambda *args: lambda *c_args, **kw: None + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=True), event_patch: + kernel(np.zeros(8)) + + assert sync_calls == ["sync"] + + class TestAnnotationHelper(unittest.TestCase): def test_const_as_call_arg(self): # Given diff --git a/compyle/tests/test_low_level.py b/compyle/tests/test_low_level.py index 61b66cb..6e0049d 100644 --- a/compyle/tests/test_low_level.py +++ b/compyle/tests/test_low_level.py @@ -1,5 +1,6 @@ import unittest import numpy as np +from unittest.mock import patch from pytest import importorskip @@ -13,6 +14,52 @@ class TestKernel(unittest.TestCase): + def _patch_cuda_event(self): + sync_calls = [] + + class FakeEvent: + def record(self): + pass + + def synchronize(self): + sync_calls.append("sync") + + return sync_calls, patch("pycuda.driver.Event", FakeEvent) + + def _make_cuda_kernel(self): + class FakeArray: + data = np.zeros(8) + + kernel = object.__new__(Kernel) + kernel.backend = "cuda" + kernel.knl = lambda *c_args, **kw: None + kernel._get_workgroup_size = lambda n: ((8,), (128,)) + kernel._get_args = lambda args, workgroup_size: [] + kernel._get_local_size = lambda args, workgroup_size: 0 + return kernel, FakeArray() + + def test_cuda_kernel_does_not_synchronize_without_profile(self): + importorskip("pycuda") + + kernel, fake_array = self._make_cuda_kernel() + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + kernel(fake_array) + + assert sync_calls == [] + + def test_cuda_kernel_synchronizes_with_profile(self): + importorskip("pycuda") + + kernel, fake_array = self._make_cuda_kernel() + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=True), event_patch: + kernel(fake_array) + + assert sync_calls == ["sync"] + def test_simple_kernel_opencl(self): importorskip('pyopencl') diff --git a/compyle/tests/test_parallel.py b/compyle/tests/test_parallel.py index 8fed025..df84a69 100644 --- a/compyle/tests/test_parallel.py +++ b/compyle/tests/test_parallel.py @@ -1,19 +1,100 @@ from math import sin import unittest import numpy as np +from unittest.mock import patch from pytest import importorskip from ..config import get_config, use_config from ..array import wrap, zeros from ..types import annotate, declare -from ..parallel import Elementwise, Reduction, Scan +from ..parallel import ( + Elementwise, ElementwiseBase, Reduction, ReductionBase, Scan, ScanBase +) from ..low_level import atomic_inc, atomic_dec from .test_jit import g MY_CONST = 42 +class TestCUDAParallelSynchronization(unittest.TestCase): + def _patch_cuda_event(self): + sync_calls = [] + + class FakeEvent: + def record(self): + pass + + def synchronize(self): + sync_calls.append("sync") + + return sync_calls, patch("pycuda.driver.Event", FakeEvent) + + def test_cuda_elementwise_base_does_not_synchronize_without_profile(self): + importorskip("pycuda") + + kernel = object.__new__(ElementwiseBase) + kernel.backend = "cuda" + kernel.c_func = lambda *c_args, **kw: None + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + kernel(np.zeros(8)) + + assert sync_calls == [] + + def test_cuda_reduction_base_does_not_event_synchronize_without_profile(self): + importorskip("pycuda") + + class FakeResult: + def get(self): + return 1.0 + + reduction = object.__new__(ReductionBase) + reduction.backend = "cuda" + reduction.c_func = lambda *c_args: FakeResult() + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + assert reduction(np.zeros(8)) == 1.0 + + assert sync_calls == [] + + def test_cuda_scan_base_does_not_synchronize_without_profile(self): + importorskip("pycuda") + + class OutputFunc: + pass + + scan = object.__new__(ScanBase) + scan.backend = "cuda" + scan._config = get_config() + scan.c_func = lambda *c_args: None + scan.output_func = OutputFunc() + scan.output_func.arg_keys = { + scan._get_backend_key(): ["input", "output"] + } + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=False), event_patch: + scan(input=np.zeros(8), output=np.zeros(8)) + + assert sync_calls == [] + + def test_cuda_elementwise_base_synchronizes_with_profile(self): + importorskip("pycuda") + + kernel = object.__new__(ElementwiseBase) + kernel.backend = "cuda" + kernel.c_func = lambda *c_args, **kw: None + sync_calls, event_patch = self._patch_cuda_event() + + with use_config(profile=True), event_patch: + kernel(np.zeros(8)) + + assert sync_calls == ["sync"] + + @annotate(x='int', return_='int') def external(x): return x