KernelTuner · Imke7 · Nov 20, 2025 · Dec 20, 2025 · Feb 6, 2026 · Mar 1, 2026
diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml
@@ -21,7 +21,7 @@ jobs:
 
         strategy:
             matrix:
-                os: [ubuntu-latest, macos-latest]
+                os: [ubuntu-latest, macos-13]
 
         steps:
             - uses: actions/checkout@v4

diff --git a/examples/__init__.py b/examples/__init__.py
diff --git a/examples/generic_python/call_functions.py b/examples/generic_python/call_functions.py
@@ -0,0 +1,114 @@
+import torch
+
+def call_tilus(kernel_function, args, kwargs):
+    """Invoke ``kernel_function`` directly with the given arguments.
+
+    ``args`` is unpacked as positional arguments and ``kwargs`` as keyword
+    arguments. Whatever the kernel returns is discarded.
+    """
+    positional = tuple(args)
+    keywords = dict(kwargs)
+    kernel_function(*positional, **keywords)
+
+
+def call_triton(kernel_function, args, kwargs, grid, threads, params):
+    """Launch ``kernel_function[grid]`` with optional launch options.
+
+    ``num_warps`` and ``num_stages`` are forwarded as keyword arguments when
+    they are present in ``params``. They are merged into a copy of ``kwargs``
+    so the caller's dictionary is left untouched and launch options from one
+    configuration cannot leak into the next launch that reuses the same dict.
+
+    ``threads`` is accepted but not used by this function.
+    """
+    launch_kwargs = dict(kwargs)
+    for option in ("num_warps", "num_stages"):
+        if option in params:
+            launch_kwargs[option] = params[option]
+
+    kernel_function[grid](*args, **launch_kwargs)
+
+
+def call_tilelang(kernel_function, args, kwargs):
+    """Build a kernel from ``kwargs`` and run it on ``args``.
+
+    ``kernel_function`` is first called with the keyword arguments only; the
+    object it returns is then called with the positional arguments.
+    """
+    kernel_function(**kwargs)(*args)
+
+
+def call_numba(kernel_function, args, kwargs, grid, threads):
+    """Launch ``kernel_function[grid, threads]`` with Numba-compatible arguments.
+
+    Every ``torch.Tensor`` in ``args`` is wrapped with
+    ``numba.cuda.as_cuda_array``; all other arguments are passed through
+    unchanged. The converted list is what the kernel is launched with.
+    """
+    from numba import cuda
+
+    numba_args = [
+        cuda.as_cuda_array(arg) if isinstance(arg, torch.Tensor) else arg
+        for arg in args
+    ]
+
+    # Launch with the converted arguments, not the raw torch tensors.
+    kernel_function[grid, threads](*numba_args, **kwargs)
+
+
+def call_cupyx(kernel_function, args, kwargs, grid, threads):
+    """Launch ``kernel_function(grid, threads, args)`` with CuPy arguments.
+
+    Every ``torch.Tensor`` in ``args`` is converted with ``cupy.from_dlpack``;
+    other arguments are forwarded as they are. The arguments are handed to the
+    kernel as a single tuple.
+    """
+    import cupy as cp
+
+    # NOTE(review): kwargs is accepted but never forwarded — confirm intended.
+    converted = tuple(
+        cp.from_dlpack(item) if isinstance(item, torch.Tensor) else item
+        for item in args
+    )
+    kernel_function(grid, threads, converted)
+
+
+def call_cute(kernel_function, args, kwargs, grid, threads, params):
+    """Compile (once per parameter set) and run a CuTe kernel.
+
+    Torch tensors in ``args`` are converted with
+    ``cutlass.cute.runtime.from_dlpack``. Compiled kernels are memoised in the
+    function attribute ``call_cute.custom_cache``; the key is the type name of
+    ``kernel_function`` followed by the tuning-parameter values in sorted-key
+    order, joined with underscores. ``grid`` and ``threads`` are not used here.
+    """
+    import cutlass.cute as cute
+    from cutlass.cute.runtime import from_dlpack
+
+    # Lazily create the cache on first use.
+    try:
+        cache = call_cute.custom_cache
+    except AttributeError:
+        cache = call_cute.custom_cache = {}
+
+    # Convert Torch tensors to CuTe tensors; leave everything else alone.
+    cute_args = [
+        from_dlpack(item) if isinstance(item, torch.Tensor) else item
+        for item in args
+    ]
+
+    # NOTE(review): the key uses the *type* name of kernel_function, so two
+    # different kernels of the same type with equal parameter values would
+    # share a cache entry — confirm this is intended.
+    key_parts = [type(kernel_function).__name__]
+    key_parts.extend(str(params[name]) for name in sorted(params))
+    cache_key = "_".join(key_parts)
+
+    # Compile only on a cache miss.
+    if cache_key not in cache:
+        cache[cache_key] = cute.compile(kernel_function, *cute_args)
+    compiled_kernel = cache[cache_key]
+
+    compiled_kernel(*cute_args, **kwargs)
+
+
+def call_taichi(kernel_function, args, kwargs):
+    """Call ``kernel_function`` with ``args`` and ``kwargs`` unpacked.
+
+    No argument conversion is performed and the kernel's return value is
+    discarded.
+    """
+    launch = kernel_function
+    launch(*args, **kwargs)
+
+
+def call_warp(kernel_function, args, kwargs, grid, threads, params):
+    """Launch a kernel through ``warp.launch``.
+
+    Torch tensors in ``args`` are converted with ``warp.from_torch``. The
+    launch ``block_dim`` and ``dim`` are taken from ``params`` when those keys
+    are present; otherwise ``block_dim`` is the product of the three entries
+    of ``threads`` and ``dim`` is the per-axis product ``grid[i] * threads[i]``.
+    ``kwargs`` is not used by this function.
+    """
+    import warp as wp
+
+    # Convert Torch tensors to Warp arrays; pass other values through.
+    inputs = [
+        wp.from_torch(item) if isinstance(item, torch.Tensor) else item
+        for item in args
+    ]
+
+    # Tuning parameters take precedence over the computed launch geometry.
+    if 'block_dim' in params:
+        block_dim = params['block_dim']
+    else:
+        tx, ty, tz = threads[0], threads[1], threads[2]
+        block_dim = tx * ty * tz
+
+    if 'dim' in params:
+        launch_dim = params['dim']
+    else:
+        launch_dim = [blocks * threads[axis] for axis, blocks in enumerate(grid)]
+
+    wp.launch(kernel_function, dim=launch_dim, inputs=inputs, block_dim=block_dim)
diff --git a/examples/generic_python/cute_vec_add.py b/examples/generic_python/cute_vec_add.py
@@ -0,0 +1,57 @@
+import torch 
+
+import cutlass
+import cutlass.cute as cute
+
+from kernel_tuner import tune_kernel
+from call_functions import call_cute
+
+# Device kernel: each thread handles one element, writing
+# gC[i] = gA[i] + gB[i] for i = block_dim.x * block_idx.x + thread_idx.x.
+# Threads whose index is not below `size` do nothing.
+@cute.kernel
+def vec_add_kernel(
+    gA: cute.Tensor,
+    gB: cute.Tensor,
+    gC: cute.Tensor,
+    size: cute.Int32,
+):
+    # Only the x components of the thread/block indices and block size are used.
+    tidx, _, _ = cute.arch.thread_idx()
+    bidx, _, _ = cute.arch.block_idx()
+    bdim, _, _ = cute.arch.block_dim()
+    thread_id = bdim * bidx + tidx
+
+    # Bounds guard for the last block when size is not a multiple of bdim.
+    if thread_id < size:
+        gC[thread_id] = gA[thread_id] + gB[thread_id]
+
+
+# Host-side entry point: builds the vec_add_kernel call and launches it on a
+# one-dimensional grid of ceil(size / num_threads_per_block) blocks.
+@cute.jit
+def vec_add(
+    mA: cute.Tensor,
+    mB: cute.Tensor,
+    mC: cute.Tensor,
+    size: cute.Int32,
+):
+    # NOTE(review): main() lists `num_threads_per_block` as a tuning parameter,
+    # but it is a fixed literal here — confirm how the tuner overrides it.
+    num_threads_per_block = 256
+
+    kernel = vec_add_kernel(mA, mB, mC, size)
+
+    kernel.launch(
+        grid=(cute.ceil_div(size, num_threads_per_block), 1, 1),
+        block = (num_threads_per_block, 1, 1),
+    )
+
+
+def main():
+    """Tune the ``vec_add`` example over a range of thread-block sizes.
+
+    Creates two random float16 input vectors and a zeroed output vector on the
+    CUDA device, then calls ``tune_kernel`` with ``call_cute`` as the call
+    function. Only the third argument (the output vector) is verified, against
+    ``a + b`` computed by torch.
+    """
+    size = 16384
+    a = torch.randn(size, device="cuda", dtype=torch.float16)
+    b = torch.randn(size, device="cuda", dtype=torch.float16)
+    c = torch.zeros(size, device="cuda", dtype=torch.float16)
+
+    args = [a, b, c, size]
+    # Powers of two from 1 to 1024; generated rather than typed out so that no
+    # entry can be mistyped (the list previously contained 265 instead of 256).
+    tune_params = {"num_threads_per_block": [2**i for i in range(11)]}
+    answer = [None, None, (a+b).cpu(), None]
+
+    tune_kernel("vec_add", __file__, size, args, tune_params, answer=answer,
+                lang="generic_python", call_function=call_cute, verbose=True)
+
+
+# Run the tuning only when executed as a script, so that importing this file
+# (for example to look up `vec_add`) does not start a tuning run.
+if __name__ == "__main__":
+    main()
+
diff --git a/examples/generic_python/matmul/cupy_matmul.py b/examples/generic_python/matmul/cupy_matmul.py
@@ -0,0 +1,71 @@
+import cupy as cp
+from cupyx import jit
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_cupyx
+
+
+# Matrix multiply kernel: each thread computes one output element
+# c[row, col] = sum over kk in range(K) of a[row, kk] * b[kk, col].
+# The row index comes from the y block/thread indices, the column from x.
+@jit.rawkernel()
+def gemm(a, b, c, M, N, K):
+    row = jit.blockIdx.y * jit.blockDim.y + jit.threadIdx.y
+    col = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
+
+    # Threads that fall outside the M x N output do nothing.
+    if row < M and col < N:
+        acc = 0.0
+        for kk in range(K):
+            acc += a[row, kk] * b[kk, col]
+        c[row, col] = acc
+
+
+def run(M, N, K):
+    """Run ``gemm`` once with a fixed 16x16 block and verify the result.
+
+    Builds random float16 matrices of shape (M, K) and (K, N) with CuPy,
+    launches the kernel, and compares the output against ``cp.matmul``.
+
+    Raises:
+        AssertionError: if the kernel output is not close to the reference
+            (rtol=1e-2, atol=1e-1).
+    """
+    # float16 matrices on GPU
+    a = cp.random.random((M, K)).astype(cp.float16)
+    b = cp.random.random((K, N)).astype(cp.float16)
+    c = cp.zeros((M, N), dtype=cp.float16)
+
+    # block / grid configuration; grid is rounded up to cover all of c
+    block = (16, 16)
+    grid = ((N + block[0] - 1) // block[0], (M + block[1] - 1) // block[1])
+
+    # launch kernel
+    gemm[grid, block](a, b, c, M, N, K)
+    cp.cuda.Device().synchronize()
+
+    # Correctness verification. Raised explicitly rather than via `assert`
+    # so the check still runs when Python is started with -O.
+    c_ref = cp.matmul(a, b)
+    if not cp.allclose(c, c_ref, rtol=1e-2, atol=1e-1):
+        raise AssertionError("gemm output does not match the cp.matmul reference")
+
+    print("Success")
+
+
+def tune(M, N, K):
+    """Tune ``gemm`` over 2-D block sizes with Kernel Tuner.
+
+    Inputs are random float16 NumPy matrices of shape (M, K) and (K, N); the
+    third argument (the output matrix) is checked against ``A.dot(B)`` with an
+    absolute tolerance of 1e-1. Block sizes are powers of two from 1 to 1024
+    per axis, restricted to at most 1024 threads per block.
+    """
+    # random test data. Here we had to use numpy arrays instead of cupy.
+    lhs = np.random.rand(M, K).astype(np.float16)
+    rhs = np.random.rand(K, N).astype(np.float16)
+    out = np.zeros((M, N), dtype=np.float16)
+
+    powers_of_two = [2**i for i in range(11)]
+    options = {
+        "kernel_name": "gemm",
+        "kernel_source": __file__,
+        "problem_size": (N, M),
+        "arguments": [lhs, rhs, out, M, N, K],
+        "tune_params": {
+            "block_size_x": list(powers_of_two),
+            "block_size_y": list(powers_of_two),
+        },
+        "answer": [None, None, lhs.dot(rhs), None, None, None],
+        "atol": 1e-1,
+        "call_function": call_cupyx,
+        "lang": "generic_python",
+        "restrictions": ["block_size_x * block_size_y <= 1024"],
+        "verbose": True,
+    }
+
+    _results, _env = tune_kernel(**options)
+
+
+# Script entry point: tune the kernel for square 1024 x 1024 matrices.
+if __name__ == "__main__":
+    M, N, K = 1024, 1024, 1024
+    tune(M, N, K)
+