diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml
index f86240c7f..dec26fdba 100644
--- a/.github/workflows/test-python-package.yml
+++ b/.github/workflows/test-python-package.yml
@@ -21,7 +21,7 @@ jobs:
 
         strategy:
             matrix:
-                os: [ubuntu-latest, macos-latest]
+                os: [ubuntu-latest, macos-13]
 
         steps:
             - uses: actions/checkout@v4
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/generic_python/call_functions.py b/examples/generic_python/call_functions.py
new file mode 100644
index 000000000..8c74790e1
--- /dev/null
+++ b/examples/generic_python/call_functions.py
@@ -0,0 +1,114 @@
+import torch
+
+def call_tilus(kernel_function, args, kwargs):
+    kernel_function(*args, **kwargs) 
+
+
+def call_triton(kernel_function, args, kwargs, grid, threads, params):
+    if "num_warps" in params.keys():
+        kwargs["num_warps"] = params["num_warps"]
+    if "num_stages" in params.keys():
+        kwargs["num_stages"] = params["num_stages"]
+    
+    kernel_function[grid](*args, **kwargs)
+
+
+def call_tilelang(kernel_function, args, kwargs):
+    compiled_kernel = kernel_function(**kwargs)
+    compiled_kernel(*args)
+
+
+def call_numba(kernel_function, args, kwargs, grid, threads):
+    from numba import cuda
+    
+    numba_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            numba_args.append(cuda.as_cuda_array(arg))
+        else:
+            numba_args.append(arg)
+
+    kernel_function[grid, threads](*args, **kwargs)
+
+
+def call_cupyx(kernel_function, args, kwargs, grid, threads):
+    import cupy as cp
+
+    cupy_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            cupy_args.append(cp.from_dlpack(arg))
+        else:
+            cupy_args.append(arg)
+    kernel_function(grid, threads, tuple(cupy_args))
+
+
+def call_cute(kernel_function, args, kwargs, grid, threads, params):
+    import cutlass.cute as cute
+    from cutlass.cute.runtime import from_dlpack
+
+    # Initialize cache if it does not exist
+    if not hasattr(call_cute, "custom_cache"):
+        call_cute.custom_cache = {}  
+
+    # Convert Torch tensors to CuTe tensors with correct layout
+    cute_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            arg_ = from_dlpack(arg)
+            cute_args.append(arg_)
+        else:
+            cute_args.append(arg)
+
+    # Form cache key from tuning parameters
+    param_keys = sorted(params.keys())
+    cache_str = type(kernel_function).__name__
+    for k in param_keys:
+        cache_str += "_" + str(params[k]) 
+    
+    # Check if kernel exists in cache. Otherwise, compile and save
+    if cache_str in call_cute.custom_cache:
+        compiled_kernel = call_cute.custom_cache[cache_str]
+    else: 
+        compiled_kernel = cute.compile(kernel_function, *cute_args)
+        call_cute.custom_cache[cache_str] = compiled_kernel
+
+    compiled_kernel(*cute_args, **kwargs)
+
+
+def call_taichi(kernel_function, args, kwargs):
+    kernel_function(*args, **kwargs)
+
+
+def call_warp(kernel_function, args, kwargs, grid, threads, params):
+    import warp as wp
+    
+    # Convert Torch tensors to Warp args
+    warp_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            warp_args.append(wp.from_torch(arg))
+        else:
+            warp_args.append(arg)
+
+    # Check if block_dim is in the tuning parameters. Otherwise, use
+    # the computed thread block dimensions.
+    if 'block_dim' in params.keys():
+        threads_per_block = params['block_dim']
+    else:
+        threads_per_block = threads[0] * threads[1] * threads[2]
+
+    # Check if dim is in the tuning parameters. Otherwise, compute from
+    # grid and threads.
+    if 'dim' in params.keys():
+        dimensions = params['dim']
+    else:
+        dimensions = [grid[i] * threads[i] for i in range(len(grid))]
+    
+    # launch kernel
+    wp.launch(
+        kernel_function,
+        dim=dimensions,
+        inputs=warp_args,
+        block_dim=threads_per_block
+    )
\ No newline at end of file
diff --git a/examples/generic_python/cute_vec_add.py b/examples/generic_python/cute_vec_add.py
new file mode 100644
index 000000000..2744abb77
--- /dev/null
+++ b/examples/generic_python/cute_vec_add.py
@@ -0,0 +1,57 @@
+import torch 
+
+import cutlass
+import cutlass.cute as cute
+
+from kernel_tuner import tune_kernel
+from call_functions import call_cute
+
+@cute.kernel
+def vec_add_kernel(
+    gA: cute.Tensor,
+    gB: cute.Tensor,
+    gC: cute.Tensor,
+    size: cute.Int32,
+):
+    tidx, _, _ = cute.arch.thread_idx()
+    bidx, _, _ = cute.arch.block_idx()
+    bdim, _, _ = cute.arch.block_dim()
+    thread_id = bdim * bidx + tidx
+
+    if thread_id < size:
+        gC[thread_id] = gA[thread_id] + gB[thread_id]
+
+
+@cute.jit
+def vec_add(
+    mA: cute.Tensor,
+    mB: cute.Tensor,
+    mC: cute.Tensor,
+    size: cute.Int32,
+):
+    num_threads_per_block = 256
+
+    kernel = vec_add_kernel(mA, mB, mC, size)
+
+    kernel.launch(
+        grid=(cute.ceil_div(size, num_threads_per_block), 1, 1),
+        block = (num_threads_per_block, 1, 1),
+    )
+
+
+def main():
+    size = 16384
+    a = torch.randn(size, device="cuda", dtype=torch.float16)
+    b = torch.randn(size, device="cuda", dtype=torch.float16)
+    c = torch.zeros(size, device="cuda", dtype=torch.float16)
+
+    args = [a, b, c, size]
+    tune_params = {"num_threads_per_block": [1, 2, 4, 8, 16, 32, 64, 128, 265, 512, 1024]}
+    answer = [None, None, (a+b).cpu(), None]
+
+    tune_kernel("vec_add", __file__, size, args, tune_params, answer=answer,
+                lang="generic_python", call_function=call_cute, verbose=True)
+
+
+main()
+
diff --git a/examples/generic_python/matmul/cupy_matmul.py b/examples/generic_python/matmul/cupy_matmul.py
new file mode 100644
index 000000000..1101b5fae
--- /dev/null
+++ b/examples/generic_python/matmul/cupy_matmul.py
@@ -0,0 +1,71 @@
+import cupy as cp
+from cupyx import jit
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_cupyx
+
+
+@jit.rawkernel()
+def gemm(a, b, c, M, N, K):
+    row = jit.blockIdx.y * jit.blockDim.y + jit.threadIdx.y
+    col = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
+    
+    if row < M and col < N:
+        acc = 0.0
+        for kk in range(K):
+            acc += a[row, kk] * b[kk, col]   
+        c[row, col] = acc                   
+
+
+def run(M, N, K):
+    # float16 matrices on GPU
+    a = cp.random.random((M, K)).astype(cp.float16)
+    b = cp.random.random((K, N)).astype(cp.float16)
+    c = cp.zeros((M, N), dtype=cp.float16)
+
+    # block / grid configuration
+    block = (16, 16)
+    grid = ((N + block[0] - 1) // block[0], (M + block[1] - 1) // block[1])
+
+    # launch kernel
+    gemm[grid, block](a, b, c, M, N, K)
+    cp.cuda.Device().synchronize()
+
+    # Correctness verification
+    c_ref = cp.matmul(a, b)
+    assert cp.allclose(c, c_ref, rtol=1e-2, atol=1e-1)
+
+    print("Succes")
+
+
+def tune(M, N, K):
+    # random test data. Here we had to use numpy arrays instead of cupy.
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+
+    args = [A, B, C, M, N, K]
+    size = (N, M)
+    tune_params = {"block_size_x": [2**i for i in range(11)], "block_size_y": [2**i for i in range(11)]}
+    restrictions = ["block_size_x * block_size_y <= 1024"]
+
+    results, env = tune_kernel(
+        kernel_name="gemm",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        answer=[None, None, A.dot(B), None, None, None],
+        atol=1e-1,
+        call_function=call_cupyx, 
+        lang="generic_python",
+        restrictions=restrictions,
+        verbose=True,   
+    )
+
+
+if __name__ == "__main__":
+    M, N, K = 1024, 1024, 1024
+    tune(M, N, K)
+    
\ No newline at end of file
diff --git a/examples/generic_python/matmul/cute_matmul.py b/examples/generic_python/matmul/cute_matmul.py
new file mode 100644
index 000000000..d584a6aa8
--- /dev/null
+++ b/examples/generic_python/matmul/cute_matmul.py
@@ -0,0 +1,1065 @@
+import math
+import torch
+from typing import Tuple
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+from cutlass.cute.runtime import from_dlpack
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_cute
+
+# might need export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH to work
+
+## Basic Matmul ================================================================
+
+@cute.kernel
+def naive_matmul_kernel(gA: cute.Tensor, gB: cute.Tensor, gC: cute.Tensor, M: int, K: int, N: int):
+    tx, ty, _ = cute.arch.thread_idx()
+    bx, by, _ = cute.arch.block_idx()
+    bdx, bdy, _ = cute.arch.block_dim()
+
+    # Global indices
+    n = bx * bdx + tx
+    m = by * bdy + ty
+
+    if m < M and n < N:
+        acc = cutlass.Float32(0.0)
+
+        for k in range(K):
+            a = gA[m, k].to(cutlass.Float32)
+            b = gB[k, n].to(cutlass.Float32)
+            acc += a * b
+
+        gC[m, n] = acc.to(gC.element_type)
+
+
+@cute.jit
+def matmul(
+    mA: cute.Tensor,
+    mB: cute.Tensor,
+    mC: cute.Tensor,
+):
+    block_size_x = 16
+    block_size_y = 16
+    block = (block_size_x, block_size_y, 1)
+
+    M, K = mA.shape
+    _, N = mB.shape
+
+    grid = (
+        (N + block[0] - 1) // block[0],
+        (M + block[1] - 1) // block[1],
+        1,
+    )
+
+    kernel = naive_matmul_kernel(mA, mB, mC, M, K, N)
+
+    kernel.launch(grid=grid, block=block)
+
+
+def run_naive_matmul(M, N, K):
+    a = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    b = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    c = torch.zeros(M, N, device="cuda", dtype=torch.float16)
+    c_ref = a @ b
+
+    # Convert to CuTe tensors
+    a_ = from_dlpack(a, assumed_align=16)  
+    b_ = from_dlpack(b, assumed_align=16)  
+    c_ = from_dlpack(c, assumed_align=16)  
+
+    compiled_kernel = cute.compile(matmul, a_, b_, c_)
+    compiled_kernel(a_, b_, c_)
+
+    assert torch.allclose(c, c_ref, atol=M * 2 **(-11), rtol=1e-2)
+    print("Succes")
+
+
+def tune_naive_matmul(M, N, K):
+    a = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    b = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    c = torch.zeros(M, N, device="cuda", dtype=torch.float16)
+
+    args = [a, b, c]
+    size = M * N 
+    answer = [None, None, (a @ b).cpu()]
+    tune_params = dict()
+    tune_params["block_size_x"] = [2**i for i in range(1, 10)]
+    tune_params["block_size_y"] = [2**i for i in range(1, 10)]
+    restrictions = ["block_size_x * block_size_y >= 32, block_size_x * block_size_y <= 1024"]
+
+    results, env = tune_kernel("matmul", __file__, size, args, tune_params, lang="generic_python", 
+        call_function=call_cute, answer=answer,  atol=M * 2 **(-11), restrictions=restrictions, verbose=False)
+
+
+
+# This kernel is copied from the CuTe DSL project:
+# https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/tensorop_gemm.py
+#
+# Original example: tensorop_gemm.py
+# Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Modifications in file:
+# - Tuning paramters made more explicit
+
+## Optimized matmul ==========================================================
+# Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+"""
+A dense GEMM (C = A * B) example for the NVIDIA Ampere architecture using CUTE DSL.
+- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M")
+- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K")
+- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M")
+
+This GEMM kernel supports the following features:
+    - Utilizes Ampere's tensor cores for matrix multiply-accumulate (MMA) operations
+    - Threadblock rasterization to improve data re-use
+    - Supports multi-stage pipeline to overlap computation and memory access
+    - Implements shared memory buffering for epilogue to increase coalesced global memory access
+
+This GEMM works as follows:
+1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using asynchronous copies.
+2. Perform matrix multiply-accumulate (MMA) operations.
+3. Store results from registers (RMEM) to shared memory (SMEM), then to global memory (GMEM).
+
+The Ampere tensor core instruction used operates as follows:
+- Read matrix A from SMEM
+- Read matrix B from SMEM
+- Perform MMA operation and store the result in Accumulator(register)
+
+To run this example:
+
+.. code-block:: bash
+
+    python examples/ampere/tensorop_gemm.py                                  \
+      --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1                        \
+      --ab_dtype Float16                                                     \
+      --c_dtype Float16 --acc_dtype Float32                                  \
+      --a_major m --b_major n --c_major n
+
+The above example command computes with M=8192, N=8192, K=8192,
+batch_count=1. The atom layout's shape is 2x2x1 and the input, mma
+accumulator, and output data type are set as fp16, fp32 and fp16,
+respectively.
+
+To collect performance with NCU profiler:
+
+.. code-block:: bash
+
+    ncu python examples/ampere/tensorop_gemm.py                              \
+      --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1                        \
+      --ab_dtype Float16                                                     \
+      --c_dtype Float16 --acc_dtype Float32                                  \
+      --a_major m --b_major n --c_major n                                    \
+      --skip_ref_check --iterations 2
+
+Constraints:
+* Supported input and output data types: fp16
+* Support accumulator data types: f32
+* Default tile shape is set to be 128x128x32
+* Atom layout's MNK shape is set so that tile shape can be divided by MMA
+  instruction shape
+* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned,
+  i.e, number of elements is a multiple of 8
+"""
+
+
+class TensorOpGemm:
+    def __init__(self):
+        self.ab_dtype = cutlass.Float16
+        self.c_dtype = cutlass.Float16
+        self.acc_dtype = cutlass.Float32
+        self.bM = 128 # extracted from cta_tiler for KT support
+        self.bN = 128 # extracted from cta_tiler for KT support
+        self.bK = 32  # extracted from cta_tiler for KT support
+        self.cta_tiler = (self.bM, self.bN, self.bK)
+        self.num_stages = 3
+        atom_lay_M = 2 # extracted from atom_layout_mnk for KT support
+        atom_lay_N = 2 # extracted from atom_layout_mnk for KT support
+        atom_lay_K = 1 # extracted from atom_layout_mnk for KT support
+        self.atom_layout_mnk = (atom_lay_M, atom_lay_N, atom_lay_K)  # moved from init parameter to here for KT support
+        self.num_threads = atom_lay_M * atom_lay_N * atom_lay_K * 32
+
+        self.mma_inst_shape = (16, 8, 16)
+        mmaM, mmaN, mmaK = self.mma_inst_shape
+
+        assert self.bM % (atom_lay_M * mmaM) == 0, (
+            "bM must be divisible by MMA instruction"
+        )
+        assert self.bN % (atom_lay_N * mmaN) == 0, (
+            "bN must be divisible by MMA instruction"
+        )
+        assert atom_lay_K == 1, "this example does not support atom layout K > 1"
+        assert self.bK % mmaK == 0, "bK must be divisible by MMA instruction"
+        assert self.num_stages >= 3, "num_stages must be greater than or equal to 3"
+
+    @cute.jit
+    def __call__(
+        self,
+        mA: cute.Tensor,
+        mB: cute.Tensor,
+        mC: cute.Tensor,
+        epilogue_op: cutlass.Constexpr = lambda x: x,
+    ):
+        # The grid divides the problems's M, N, and L dimensions by the
+        # respective modes of the tile shape (bM, bN, 1). The K dimension is
+        # handled within a block via a multistage process.
+
+        self.a_major_mode = utils.LayoutEnum.from_tensor(mA)
+        self.b_major_mode = utils.LayoutEnum.from_tensor(mB)
+        self.c_major_mode = utils.LayoutEnum.from_tensor(mC)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Shared memory layout:
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        # Creates a layout with the size required for the provided tile
+        # size and num stages (stages are used for K dimension) that is also
+        # sectioned into 64x8 or 8x32 layout atoms. The swizzle is set so that
+        # the atom for the shared memory -> register copy does not encounter
+        # bank conflicts
+
+        # assume the input is 16B align
+        ab_copy_bits = 128
+        sA_layout = self._make_smem_layout_AB(
+            mA.element_type,
+            self.a_major_mode,
+            ab_copy_bits,
+            (self.cta_tiler[0], self.cta_tiler[2], self.num_stages),
+        )
+        sB_layout = self._make_smem_layout_AB(
+            mB.element_type,
+            self.b_major_mode,
+            ab_copy_bits,
+            (self.cta_tiler[1], self.cta_tiler[2], self.num_stages),
+        )
+
+        # Creates a similar layout but without num_stages or layout atoms
+        sC_layout = self._make_smem_layout_C(
+            mC.element_type,
+            self.c_major_mode,
+            ab_copy_bits,
+            (self.cta_tiler[0], self.cta_tiler[1]),
+        )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Tiled copy:
+        # The majorness of tA/tB/tC follows the majorness of gA/gB/gC,
+        # enabling merged accesses to global memory for faster data
+        # transfer between global and shared memory.
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        # Create a copy atom for a global to shared memory asynchronous copy
+        atom_async_copy = cute.make_copy_atom(
+            cute.nvgpu.cpasync.CopyG2SOp(
+                cache_mode=cute.nvgpu.cpasync.LoadCacheMode.GLOBAL
+            ),
+            mA.element_type,
+            num_bits_per_copy=ab_copy_bits,
+        )
+
+        # Create thread layouts for tiled copy from the copy atom where the
+        # thread layout simply follows the leading dimension of the tensor
+        tiled_copy_A = self._make_gmem_tiled_copy_AB(
+            atom_async_copy, mA.element_type, self.a_major_mode, ab_copy_bits
+        )
+        tiled_copy_B = self._make_gmem_tiled_copy_AB(
+            atom_async_copy, mB.element_type, self.b_major_mode, ab_copy_bits
+        )
+
+        # Creates a synchronous copy atom and thread layouts for the epilogue
+        c_copy_bits = 128
+        atom_sync_copy = cute.make_copy_atom(
+            cute.nvgpu.CopyUniversalOp(),
+            mC.element_type,
+            num_bits_per_copy=c_copy_bits,
+        )
+        tiled_copy_C = self._make_gmem_tiled_copy_C(
+            atom_sync_copy, mC.element_type, self.c_major_mode, c_copy_bits
+        )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Tiled MMA
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        # Creates a mma atom with 16x8x16 shape for MNK
+        op = cute.nvgpu.warp.MmaF16BF16Op(
+            self.ab_dtype, self.acc_dtype, self.mma_inst_shape
+        )
+
+        permutation_mnk = (
+            self.atom_layout_mnk[0] * self.mma_inst_shape[0],
+            # if atom layout's N-mode is 1, to leverage the largest coalesced
+            # shared memory -> register copy, set the tiled mma's N mode to 16
+            self.atom_layout_mnk[1] * self.mma_inst_shape[1] * 2,
+            self.atom_layout_mnk[2] * self.mma_inst_shape[2],
+        )
+
+        # Created a tiled mma that tiles the atom according to specified layout.
+        # For a 2x2x1 atom layout, the mma atom is duplicated 4 times, twice
+        # across M and twice across N
+        tC = cute.make_layout(self.atom_layout_mnk)
+        tiled_mma = cute.make_tiled_mma(
+            op,
+            tC,
+            permutation_mnk=permutation_mnk,
+        )
+
+        # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, l)
+        grid_dim = cute.ceil_div(mC.shape, (self.bM, self.bN, 1))
+
+        # Add threadblock rasterization to improve re-use of data
+        raster_factor = 1
+        grid_dim_n = cute.size(grid_dim[1])
+        # Thresholds picked so that it doesn't cause too many no-op CTAs
+        if grid_dim_n > 5:
+            raster_factor = 8
+        elif grid_dim_n > 2:
+            raster_factor = 4
+        elif grid_dim_n > 1:
+            raster_factor = 2
+        rasterization_remap_grid_dim = (
+            cute.size(grid_dim[0]) * raster_factor,
+            (cute.size(grid_dim[1]) + raster_factor - 1) // raster_factor,
+            cute.size(grid_dim[2]),
+        )
+
+        self.kernel(
+            mA,
+            mB,
+            mC,
+            sA_layout,
+            sB_layout,
+            sC_layout,
+            tiled_copy_A,
+            tiled_copy_B,
+            tiled_copy_C,
+            tiled_mma,
+            raster_factor,
+            epilogue_op,
+        ).launch(
+            grid=rasterization_remap_grid_dim,
+            block=[self.num_threads, 1, 1],
+        )
+
+    @cute.kernel
+    def kernel(
+        self,
+        mA: cute.Tensor,
+        mB: cute.Tensor,
+        mC: cute.Tensor,
+        sA_layout: cute.ComposedLayout,
+        sB_layout: cute.ComposedLayout,
+        sC_layout: cute.ComposedLayout,
+        tiled_copy_A: cute.TiledCopy,
+        tiled_copy_B: cute.TiledCopy,
+        tiled_copy_C: cute.TiledCopy,
+        tiled_mma: cute.TiledMma,
+        rasterization_factor: cutlass.Int32,
+        epilogue_op: cutlass.Constexpr = lambda x: x,
+    ):
+        # Thread index, block index
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, bidy, bidz = cute.arch.block_idx()
+        grid_dim = cute.ceil_div(mC.shape, (self.bM, self.bN, 1))
+        offset_tile_x, offset_tile_y = self.raster_tile(
+            bidx, bidy, rasterization_factor
+        )
+        # Early exit if CTA is out of range
+        if grid_dim[0] <= offset_tile_x or grid_dim[1] <= offset_tile_y:
+            pass
+        else:
+            tiler_coord = (offset_tile_x, offset_tile_y, None)
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Get the appropriate tiles for this thread block.
+            # gA: (BLK_M, BLK_N, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N)
+            # ///////////////////////////////////////////////////////////////////////////////
+            gA = cute.local_tile(
+                mA[None, None, bidz],
+                tiler=self.cta_tiler,
+                coord=tiler_coord,
+                proj=(1, None, 1),
+            )
+            gB = cute.local_tile(
+                mB[None, None, bidz],
+                tiler=self.cta_tiler,
+                coord=tiler_coord,
+                proj=(None, 1, 1),
+            )
+            gC = cute.local_tile(
+                mC[None, None, bidz],
+                tiler=self.cta_tiler,
+                coord=tiler_coord,
+                proj=(1, 1, None),
+            )
+
+            # By default, if the tensor k mode does not divide into the tile k
+            # size, then last tiles in the k dimension are irregular.
+            # Instead, make the first tiles irregular when k is irregular.
+            # This allows us to handle the irregular tile first to avoid
+            # checking for this condition within the mainloop.
+
+            # residual_k is a negative number indicating the amount needed to
+            # shift the pointer by in dimension k
+            residual_k = cute.size(mA, mode=[1]) - cutlass.Int32(self.bK) * cute.size(
+                gA, mode=[2]
+            )
+
+            # move the pointer of gA/gB in the `-k` direction
+            gA = cute.domain_offset((0, residual_k, 0), gA)
+            gB = cute.domain_offset((0, residual_k, 0), gB)
+            # input is 16B aligned
+            gA = cute.make_tensor(gA.iterator.align(16), gA.layout)
+            gB = cute.make_tensor(gB.iterator.align(16), gB.layout)
+
+            # Construct identity layout for sA and sB (mirrors global tensors,
+            # used for predication only)
+            mcA = cute.make_identity_tensor(mA.layout.shape)
+            mcB = cute.make_identity_tensor(mB.layout.shape)
+            cA = cute.local_tile(
+                mcA[None, None, bidz],
+                tiler=self.cta_tiler,
+                coord=tiler_coord,
+                proj=(1, None, 1),
+            )
+            cB = cute.local_tile(
+                mcB[None, None, bidz],
+                tiler=self.cta_tiler,
+                coord=tiler_coord,
+                proj=(None, 1, 1),
+            )
+
+            cA = cute.domain_offset((0, residual_k, 0), cA)
+            cB = cute.domain_offset((0, residual_k, 0), cB)
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Create shared memory buffers and get the appropriate fragments for this thread.
+            # sA:   (BLK_M, BLK_K, PIPE)       , sB:   (BLK_N, BLK_K, PIPE)
+            # tAgA: (CPY, CPY_M, CPY_K, k)     , tBgB: (CPY, CPY_N, CPY_K, k)
+            # tAsA: (CPY, CPY_M, CPY_K, PIPE)  , tBsB: (CPY, CPY_N, CPY_K, PIPE)
+            # ///////////////////////////////////////////////////////////////////////////////
+            @cute.struct
+            class SharedStorageAB:
+                a: cute.struct.Align[
+                    cute.struct.MemRange[mA.element_type, cute.cosize(sA_layout)],
+                    16,
+                ]
+                b: cute.struct.Align[
+                    cute.struct.MemRange[mB.element_type, cute.cosize(sB_layout)],
+                    16,
+                ]
+
+            @cute.struct
+            class SharedStorageC:
+                c: cute.struct.Align[
+                    cute.struct.MemRange[mC.element_type, cute.cosize(sC_layout)],
+                    16,
+                ]
+
+            # Shared memory buffer
+            smem = cutlass.utils.SmemAllocator()
+            # Shared memory allocated for operations with A, B will be
+            # overwritten for operations on C. This is to improve performance
+            # by reducing the size of shared memory requested by each block
+            storage = smem.allocate(
+                max(SharedStorageAB.size_in_bytes(), SharedStorageC.size_in_bytes()),
+                byte_alignment=16,
+            )
+            sA = SharedStorageAB(storage).a.get_tensor(sA_layout)
+            sB = SharedStorageAB(storage).b.get_tensor(sB_layout)
+            sC = SharedStorageC(storage).c.get_tensor(sC_layout)
+
+            thr_copy_A = tiled_copy_A.get_slice(tidx)
+            thr_copy_B = tiled_copy_B.get_slice(tidx)
+            thr_copy_C = tiled_copy_C.get_slice(tidx)
+            tAgA = thr_copy_A.partition_S(gA)
+            tAsA = thr_copy_A.partition_D(sA)
+            tBgB = thr_copy_B.partition_S(gB)
+            tBsB = thr_copy_B.partition_D(sB)
+            tCsC_epilogue = thr_copy_C.partition_S(sC)
+            tCgC_epilogue = thr_copy_C.partition_D(gC)
+
+            # Repeat the partitioning with identity layouts
+            tAcA = thr_copy_A.partition_S(cA)
+            tBcB = thr_copy_B.partition_S(cB)
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Predicate: Mark indices that need to copy when problem_shape isn't a multiple
+            # of tile_shape
+            # ///////////////////////////////////////////////////////////////////////////////
+
+            # For predication over the tensors A (M/K), B (N/K), and (in the
+            # epilogue) C (M/N), we will compute it in a fashion similar to an
+            # outer product. The predication along one of the dimensions is
+            # evaluated and stored in a predication tensor. Then, the
+            # predication for the remaining dimension is handled later via an
+            # if/else branch at the copy.
+            # For A and B, predication booleans along M/N are stored in a
+            # predication tensor and along K is handled via a if/else branch.
+
+            # Allocate predicate tensors for M and N. Predication is checked
+            # at the granularity of a copy atom, so the predicate tensor does not
+            # need separate booleans for individual elements within a copy
+            # atom (for example, the elements of tAgA.shape[0][0].)
+            tApA = cute.make_rmem_tensor(
+                cute.make_layout(
+                    (
+                        tAgA.shape[0][1],
+                        cute.size(tAgA, mode=[1]),
+                        cute.size(tAgA, mode=[2]),
+                    ),
+                    stride=(cute.size(tAgA, mode=[1]), 1, 0),
+                ),
+                cutlass.Boolean,
+            )
+            tBpB = cute.make_rmem_tensor(
+                cute.make_layout(
+                    (
+                        tBsB.shape[0][1],
+                        cute.size(tBsB, mode=[1]),
+                        cute.size(tBsB, mode=[2]),
+                    ),
+                    stride=(cute.size(tBsB, mode=[1]), 1, 0),
+                ),
+                cutlass.Boolean,
+            )
+            # Set predicates for M/N bounds
+            for rest_v in range(tApA.shape[0]):
+                for m in range(tApA.shape[1]):
+                    tApA[rest_v, m, 0] = cute.elem_less(
+                        tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0]
+                    )
+            for rest_v in range(tBpB.shape[0]):
+                for n in range(tBpB.shape[1]):
+                    tBpB[rest_v, n, 0] = cute.elem_less(
+                        tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0]
+                    )
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Prefetch Prologue
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Clear the smem tiles to account for predicated off loads
+            tAsA.fill(0)
+            tBsB.fill(0)
+            cute.arch.sync_threads()
+            # Start async loads for the first k-tile. Here we take care of the k residue
+            # via if/else check along the k dimension. Because we shifted the identity tensor
+            # by the residue_k and because the identity tensor is a coord tensor, the
+            # values of any identity tensor element that is poison is less than -1
+            num_smem_stages = cute.size(tAsA, mode=[3])
+            k_tile_count = cute.size(tAgA, mode=[3])
+            k_tile_index = cutlass.Int32(0)
+
+            for k in range(tApA.shape[2]):
+                if cute.elem_less(cutlass.Int32(-1), tAcA[0, 0, k, 0][1]):
+                    cute.copy(
+                        tiled_copy_A,
+                        tAgA[None, None, k, k_tile_index],
+                        tAsA[None, None, k, 0],
+                        pred=tApA[None, None, k],
+                    )
+            for k in range(tBpB.shape[2]):
+                if cute.elem_less(cutlass.Int32(-1), tBcB[0, 0, k, 0][1]):
+                    cute.copy(
+                        tiled_copy_B,
+                        tBgB[None, None, k, k_tile_index],
+                        tBsB[None, None, k, 0],
+                        pred=tBpB[None, None, k],
+                    )
+            k_tile_index = k_tile_index + 1
+            cute.arch.cp_async_commit_group()
+
+            # Start async loads for rest of the k-tiles
+            for k_tile in range(1, num_smem_stages - 1):
+                if k_tile == k_tile_count:
+                    tApA.fill(0)
+                    tBpB.fill(0)
+                cute.copy(
+                    tiled_copy_A,
+                    tAgA[None, None, None, k_tile_index],
+                    tAsA[None, None, None, k_tile],
+                    pred=tApA,
+                )
+                cute.copy(
+                    tiled_copy_B,
+                    tBgB[None, None, None, k_tile_index],
+                    tBsB[None, None, None, k_tile],
+                    pred=tBpB,
+                )
+                k_tile_index = k_tile_index + 1
+                cute.arch.cp_async_commit_group()
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Tile MMA compute thread partitions and allocate accumulators
+            # ///////////////////////////////////////////////////////////////////////////////
+            thr_mma = tiled_mma.get_slice(tidx)
+            tCsA = thr_mma.partition_A(sA)
+            tCsB = thr_mma.partition_B(sB)
+            tCsC = thr_mma.partition_C(sC)
+            tCgC = thr_mma.partition_C(gC)
+            tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0])
+            tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0])
+            tCrC = tiled_mma.make_fragment_C(tCgC)
+            # Clear the accumulator
+            tCrC.fill(0.0)
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Copy Atom A/B retiling
+            # ///////////////////////////////////////////////////////////////////////////////
+
+            # Create the copy atoms for the copy from shared memory to register
+            atom_copy_s2r_A = cute.make_copy_atom(
+                cute.nvgpu.warp.LdMatrix8x8x16bOp(
+                    self.a_major_mode != utils.LayoutEnum.ROW_MAJOR, 4
+                ),
+                mA.element_type,
+            )
+            atom_copy_s2r_B = cute.make_copy_atom(
+                cute.nvgpu.warp.LdMatrix8x8x16bOp(
+                    self.b_major_mode != utils.LayoutEnum.ROW_MAJOR, 4
+                ),
+                mB.element_type,
+            )
+
+            # Creates the tiled copy so that it matches the thread-value layout
+            # expected by the tiled mma
+            tiled_copy_s2r_A = cute.make_tiled_copy_A(atom_copy_s2r_A, tiled_mma)
+            tiled_copy_s2r_B = cute.make_tiled_copy_B(atom_copy_s2r_B, tiled_mma)
+
+            thr_copy_ldmatrix_A = tiled_copy_s2r_A.get_slice(tidx)
+            thr_copy_ldmatrix_B = tiled_copy_s2r_B.get_slice(tidx)
+            tCsA_copy_view = thr_copy_ldmatrix_A.partition_S(sA)
+            tCrA_copy_view = thr_copy_ldmatrix_A.retile(tCrA)
+            tCsB_copy_view = thr_copy_ldmatrix_B.partition_S(sB)
+            tCrB_copy_view = thr_copy_ldmatrix_B.retile(tCrB)
+
+            # Current pipe index in smem to read from / write to
+            smem_pipe_read = 0
+            smem_pipe_write = num_smem_stages - 1
+
+            tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read]
+            tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read]
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # PREFETCH register pipeline
+            # ///////////////////////////////////////////////////////////////////////////////
+            num_k_block = cute.size(tCrA, mode=[2])
+            if num_k_block > 1:
+                # Wait until our first prefetched tile is loaded in
+                cute.arch.cp_async_wait_group(num_smem_stages - 2)
+                cute.arch.sync_threads()
+                # Prefetch the first k-block rmem from the first k-tile
+                cute.copy(
+                    tiled_copy_s2r_A,
+                    tCsA_p[None, None, 0],
+                    tCrA_copy_view[None, None, 0],
+                )
+                cute.copy(
+                    tiled_copy_s2r_B,
+                    tCsB_p[None, None, 0],
+                    tCrB_copy_view[None, None, 0],
+                )
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Mainloop
+            # 1. Shared memory pipeline (gmem -> smem):
+            #    The default smem pipeline depth is 3, meaning that for shared
+            # memory buffers, we allocate three times the size described by the
+            # CTA tiler. We prefetch 2 of these buffers before entering the main
+            # loop. Considering only the transfer from global memory to shared
+            # memory, the general structure of the mainloop is:
+            #   (1) copy k-tile from gmem to smem;
+            #   (2) perform gemm computation on k-tile;
+            #   (3) wait for the next copy to finish.
+            #    The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command
+            # waits for the number of unfinished 'copy' to be <= 1. The advantage
+            # of this approach is that it allows for simultaneous production
+            # (i.e., step (1)) and consumption (i.e., step (2)) of smem.
+            #    A common misconception is to prefetch N buffers and rewrite
+            # the pipeline logic to wait on N-1 pending copies. The disadvantage
+            # of this approach is that it requires fully consuming a buffer in
+            # order to open an empty buffer for the next copy.
+            # 2. Register pipeline (smem -> register):
+            #    Similarly, the register pipeline produces i+1, consumes i, and
+            # produces i+2... Notably, i and i+1 do not use the same register,
+            # eliminating dependencies on the same register for better parallelism.
+            # 3. Combining the smem and register pipelines results in the mainloop.
+            # ///////////////////////////////////////////////////////////////////////////////
+            for k_tile in range(k_tile_count):
+                for k_block in cutlass.range(num_k_block, unroll_full=True):
+                    if k_block == num_k_block - 1:
+                        tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read]
+                        tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read]
+                        cute.arch.cp_async_wait_group(num_smem_stages - 2)
+                        cute.arch.sync_threads()
+
+                    # Load A, B from shared memory to registers for k_block + 1
+                    k_block_next = (k_block + 1) % num_k_block  # static
+                    cute.copy(
+                        tiled_copy_s2r_A,
+                        tCsA_p[None, None, k_block_next],
+                        tCrA_copy_view[None, None, k_block_next],
+                    )
+                    cute.copy(
+                        tiled_copy_s2r_B,
+                        tCsB_p[None, None, k_block_next],
+                        tCrB_copy_view[None, None, k_block_next],
+                    )
+
+                    # Fetch next A: To better interleave global memory access and compute
+                    # instructions, we intentionally use the sequence: copy A, perform GEMM,
+                    # then copy B.
+                    if k_block == 0:
+                        if k_tile + num_smem_stages - 1 < k_tile_count:
+                            cute.copy(
+                                tiled_copy_A,
+                                tAgA[None, None, None, k_tile_index],
+                                tAsA[None, None, None, smem_pipe_write],
+                                pred=tApA,
+                            )
+
+                    # Thread-level register gemm for k_block
+                    cute.gemm(
+                        tiled_mma,
+                        tCrC,
+                        tCrA[None, None, k_block],
+                        tCrB[None, None, k_block],
+                        tCrC,
+                    )
+
+                    # Fetch next B and update smem pipeline read/write
+                    if k_block == 0:
+                        if k_tile + num_smem_stages - 1 < k_tile_count:
+                            cute.copy(
+                                tiled_copy_B,
+                                tBgB[None, None, None, k_tile_index],
+                                tBsB[None, None, None, smem_pipe_write],
+                                pred=tBpB,
+                            )
+                        k_tile_index = k_tile_index + 1
+                        cute.arch.cp_async_commit_group()
+                        smem_pipe_write = smem_pipe_read
+                        smem_pipe_read = smem_pipe_read + 1
+                        if smem_pipe_read == num_smem_stages:
+                            smem_pipe_read = 0
+
+            # Sync before epilogue
+            cute.arch.cp_async_wait_group(0)
+            cute.arch.sync_threads()
+
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Epilogue with fusion
+            # ///////////////////////////////////////////////////////////////////////////////
+            tCrD = cute.make_fragment_like(tCrC, self.c_dtype)
+            tCrD[None] = epilogue_op(tCrC.load()).to(self.c_dtype)
+
+            # Copy results of D back to shared memory
+            cute.autovec_copy(tCrD, tCsC) 
+
+            # Create coord tensor for C
+            ceilM, ceilN, _ = cute.ceil_div(mC.shape, (self.bM, self.bN, 1))
+            mcC = cute.make_identity_tensor(
+                (
+                    cute.size(ceilM) * self.cta_tiler[0],
+                    cute.size(ceilN) * self.cta_tiler[1],
+                    1,
+                )
+            )
+            cC = cute.local_tile(
+                mcC[None, None, bidz],
+                tiler=self.cta_tiler,
+                coord=tiler_coord,
+                proj=(1, 1, None),
+            )
+            tCcC = thr_copy_C.partition_S(cC)
+
+            tCrC_epilogue = cute.make_fragment_like(tCsC_epilogue)
+            # Wait for all writes to shared memory to finish before starting copies
+            # using the new layouts
+            cute.arch.sync_threads()
+            cute.autovec_copy(tCsC_epilogue, tCrC_epilogue) 
+
+            # Create predication tensor for m
+            tCpC = cute.make_rmem_tensor(
+                cute.make_layout(
+                    (
+                        tCgC_epilogue.shape[0][1],
+                        cute.size(tCgC_epilogue, mode=[1]),
+                        cute.size(tCgC_epilogue, mode=[2]),
+                    ),
+                    stride=(cute.size(tCgC_epilogue, mode=[1]), 1, 0),
+                ),
+                cutlass.Boolean,
+            )
+            for rest_v in range(tCpC.shape[0]):
+                for m in range(tCpC.shape[1]):
+                    tCpC[rest_v, m, 0] = cute.elem_less(
+                        tCcC[(0, rest_v), m, 0][0], mC.shape[0]
+                    )
+
+            # Copy to global memory using better vectorization
+            for rest_v in range(tCpC.shape[0]):
+                for n in range(tCpC.shape[2]):
+                    if cute.elem_less(tCcC[(0, rest_v), 0, n][1], mC.shape[1]):
+                        cute.copy(
+                            tiled_copy_C,
+                            tCrC_epilogue[None, None, n],
+                            tCgC_epilogue[None, None, n],
+                            pred=tCpC[None, None, n],
+                        )
+        return
+
+    def _make_smem_layout_AB(self, dtype, major_mode, copy_bits, smem_tiler):
+        major_mode_size = (
+            smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0]
+        )
+        major_mode_size = 64 if major_mode_size >= 64 else major_mode_size
+
+        swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits))
+        swizzle_bits = min(swizzle_bits, 3)
+
+        layout_atom_outer = (
+            cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1))
+            if major_mode == utils.LayoutEnum.ROW_MAJOR
+            else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size))
+        )
+        layout_atom = cute.make_composed_layout(
+            cute.make_swizzle(swizzle_bits, 3, 3),
+            0,
+            layout_atom_outer,
+        )
+        layout = cute.tile_to_shape(layout_atom, smem_tiler, (0, 1, 2))
+        return layout
+
+    def _make_smem_layout_C(self, dtype, major_mode, copy_bits, smem_tiler):
+        major_mode_size = (
+            smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0]
+        )
+
+        swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits))
+        swizzle_bits = min(swizzle_bits, 3)
+
+        layout_atom_outer = (
+            cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1))
+            if major_mode == utils.LayoutEnum.ROW_MAJOR
+            else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size))
+        )
+        layout_atom = cute.make_composed_layout(
+            cute.make_swizzle(swizzle_bits, 3, 4),
+            0,
+            layout_atom_outer,
+        )
+
+        # Due to the thread layout of the mma, remove swizzle in C to
+        # prevent shared memory fragments owned by an single thread from
+        # holding swizzles
+        if major_mode == utils.LayoutEnum.COL_MAJOR:
+            layout_atom = cute.make_composed_layout(
+                cute.make_swizzle(0, 3, 4), 0, layout_atom_outer
+            )
+        layout = cute.tile_to_shape(
+            layout_atom,
+            smem_tiler,
+            (0, 1),
+        )
+        return layout
+
+    def _make_gmem_tiled_copy_AB(self, atom_copy, dtype, major_mode, copy_bits):
+        copy_elems = copy_bits // dtype.width
+        shape_dim_1 = cute.size(self.bK) // copy_elems
+        # thread layout for copy
+        thread_layout = cute.make_layout(
+            (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1)
+        )
+        if major_mode != utils.LayoutEnum.ROW_MAJOR:
+            shape_dim_0 = cute.size(self.bM) // copy_elems
+            thread_layout = cute.make_layout(
+                (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0)
+            )
+        # Value layout for copy
+        value_layout = (
+            cute.make_layout((1, copy_elems))
+            if major_mode == utils.LayoutEnum.ROW_MAJOR
+            else cute.make_layout((copy_elems, 1))
+        )
+        return cute.make_tiled_copy_tv(atom_copy, thread_layout, value_layout)
+
+    def _make_gmem_tiled_copy_C(self, atom_copy, dtype, major_mode, copy_bits):
+        copy_elems = copy_bits // dtype.width
+        shape_dim_1 = cute.size(self.bN) // copy_elems
+        # thread layout for copy
+        thread_layout = cute.make_layout(
+            (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1)
+        )
+        if major_mode != utils.LayoutEnum.ROW_MAJOR:
+            shape_dim_0 = cute.size(self.bM) // copy_elems
+            thread_layout = cute.make_layout(
+                (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0)
+            )
+        value_layout = (
+            cute.make_layout((1, copy_elems))
+            if major_mode == utils.LayoutEnum.ROW_MAJOR
+            else cute.make_layout((copy_elems, 1))
+        )
+        return cute.make_tiled_copy_tv(atom_copy, thread_layout, value_layout)
+
+    def raster_tile(self, i, j, f):
+        new_i = i // f
+        new_j = (i % f) + (j * f)
+        return (new_i, new_j)
+
+
+# Need a custum call function for the optimized kernel because of the tensor layout
+def call_cute_custom(kernel_function, args, kwargs, grid, threads, params):
+    # Initialize cache if it does not exist
+    if not hasattr(call_cute_custom, "custom_cache"):
+        call_cute_custom.custom_cache = {}  
+
+    # Convert Torch tensors to CuTe tensors with correct layout
+    cute_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            cute_tensor = (
+                from_dlpack(arg, assumed_align=16)
+                .mark_layout_dynamic(leading_dim=1) 
+                .mark_compact_shape_dynamic(
+                    mode=1,
+                    stride_order=(2, 0, 1),   
+                    divisibility=8,        
+                )
+            )
+            cute_args.append(cute_tensor)
+        else:
+            cute_args.append(arg)
+
+    # Form cache key from tuning parameters
+    param_keys = sorted(params.keys())
+    cache_str = type(kernel_function).__name__
+    for k in param_keys:
+        cache_str += "_" + str(params[k]) 
+    
+    # Check if kernel exists in cache. Otherwise, compile and save
+    if cache_str in call_cute_custom.custom_cache:
+        compiled_kernel = call_cute_custom.custom_cache[cache_str]
+    else: 
+        # Wrap in try-except because CuTe gives TypeError for certain block sizes. This 
+        # stops the tuning process and we do not want this
+        try: 
+            compiled_kernel = cute.compile(kernel_function, *cute_args)
+            call_cute_custom.custom_cache[cache_str] = compiled_kernel
+        except:
+            raise RuntimeError("Invalid configuration")
+            
+        
+
+    compiled_kernel(*cute_args)
+
+
+
+def run_optimized(M, N, K, L=1):
+    a = torch.randn((L, M, K), device="cuda", dtype=torch.float16).permute(1, 2, 0)
+    b = torch.randn((L, N, K), device="cuda", dtype=torch.float16).permute(1, 2, 0) # b is transposed
+    c = torch.empty((L, M, N), device="cuda", dtype=torch.float16).permute(1, 2, 0)
+
+    c_ref = torch.matmul(
+        a.permute(2, 0, 1),   
+        b.permute(2, 1, 0)    
+    ).permute(1, 2, 0)  
+
+    def create_cute_tensor(torch_tensor): 
+        cute_tensor = (
+            from_dlpack(torch_tensor, assumed_align=16)
+            .mark_layout_dynamic(leading_dim=1) 
+            .mark_compact_shape_dynamic(
+                mode=1,
+                stride_order=(2, 0, 1),   
+                divisibility=8,        
+            )
+        )
+        return cute_tensor
+
+    a_, b_, c_ = create_cute_tensor(a), create_cute_tensor(b), create_cute_tensor(c)
+
+    gemm = TensorOpGemm()
+    compiled_kernel = cute.compile(gemm, a_, b_, c_)
+    compiled_kernel(a_, b_, c_)
+
+    assert torch.allclose(c_ref, c, atol= M * 2**(-11), rtol=1e-2)
+    print("Sucess")
+
+
+def tune_optimized(M, N, K, L=1):   
+    a = torch.randn((L, M, K), device="cuda", dtype=torch.float16).permute(1, 2, 0)
+    b = torch.randn((L, N, K), device="cuda", dtype=torch.float16).permute(1, 2, 0) # b is transposed
+    c = torch.empty((L, M, N), device="cuda", dtype=torch.float16).permute(1, 2, 0)
+
+    c_ref = torch.matmul(
+        a.permute(2, 0, 1),   
+        b.permute(2, 1, 0)    
+    ).permute(1, 2, 0)        
+    
+
+    args = [a, b, c]
+    tune_params = {
+        "bM":[16, 32, 64, 128, 256], 
+        "bN":[32, 64, 128, 256],
+        "bK": [16, 32, 64, 128], # must be divisable by 16
+        "num_stages": [3, 4, 5], # restricted to >= 3 by the kernel
+        "atom_lay_M": [1, 2, 4],
+        "atom_lay_N": [1, 2], 
+    }
+
+    restrictions = [
+        "bM % (atom_lay_M * 16) == 0", "bN % (atom_lay_N * 8) == 0", # layout
+        "atom_lay_M * atom_lay_N * 32 <= 1024", # number of threads
+        "num_stages * (bK * (bM + bN)) * 2 <= 49152", # SMEM
+        "bM >= atom_lay_M * 32", # ensure each atom sees at least 2 MMA tiles per dimension
+        "bN >= atom_lay_N * 16", # ensure each atom sees at least 2 MMA tiles per dimension
+    ]
+
+
+
+    results, env = tune_kernel("TensorOpGemm", __file__, M * N, args, tune_params, verbose=True, restrictions=restrictions, #strategy="bayes_opt",
+                               lang="generic_python", call_function=call_cute_custom, answer=[None, None, c_ref.cpu()], atol=M * 2**(-10))
+
+
+
+if __name__ == "__main__":
+    m, n, k = 4096, 4096, 4096
+    run_naive_matmul(m, n, k)
+    tune_naive_matmul(m, n, k)
+
+    mnkl = (4096, 4096, 4096, 1)
+    tune_optimized(m, n, k)
+    run_optimized(m, n, k)
diff --git a/examples/generic_python/matmul/numba_matmul.py b/examples/generic_python/matmul/numba_matmul.py
new file mode 100644
index 000000000..71a6b1b3a
--- /dev/null
+++ b/examples/generic_python/matmul/numba_matmul.py
@@ -0,0 +1,260 @@
+import numpy as np
+from numba import cuda, float32
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_numba
+
+
+# Source: https://nvidia.github.io/numba-cuda/user/examples.html#matrix-multiplication
+@cuda.jit(cache=True)
+def matmul(A, B, C):
+    i, j = cuda.grid(2)
+    if i < C.shape[0] and j < C.shape[1]:
+        tmp = 0.
+        for k in range(A.shape[1]):
+            tmp += A[i, k] * B[k, j]
+        C[i, j] = tmp
+
+
+# Translated to Numba-CUDA from https://github.com/cupy/cupy/blob/main/examples/gemm/sgemm.cu
+@cuda.jit(cache=True)
+def optimized_matmul(M, N, K, A, B, C):
+    DIM_X = 16
+    DIM_Y = 16
+    BLK_M = 64
+    BLK_N = 64
+    BLK_K = 16
+    THR_M = 4 # Should be equal to BLK_M / DIM_X
+    THR_N = 4 # Should be equal to BLK_N / DIM_Y 
+
+    # thread indices
+    idx = cuda.threadIdx.x
+    idy = cuda.threadIdx.y
+
+    blx = cuda.blockIdx.x
+    bly = cuda.blockIdx.y
+
+    # shared memory
+    sA = cuda.shared.array((BLK_K, BLK_M), np.float16)
+    sB = cuda.shared.array((BLK_N, BLK_K), np.float16)
+
+    # registers
+    rC = cuda.local.array((THR_N, THR_M), float32)
+    rA = cuda.local.array(THR_M, float32)
+    rB = cuda.local.array(THR_N, float32)
+
+    # init accumulator
+    for n in range(THR_N):
+        for m in range(THR_M):
+            rC[n][m] = 0.0
+
+    # global indices
+    base_row = blx * BLK_M
+    base_col = bly * BLK_N
+
+    # loop over K tiles
+    for kk in range(0, K, BLK_K):
+
+        # load A tile into shared memory
+        for i in range(idy, BLK_K, DIM_Y):
+            for j in range(idx, BLK_M, DIM_X):
+                row = base_row + j
+                col = kk + i
+                if row < M and col < K:
+                    sA[i, j] = A[row, col]
+                else:
+                    sA[i, j] = 0.0
+
+        # load B tile
+        for i in range(idy, BLK_N, DIM_Y):
+            for j in range(idx, BLK_K, DIM_X):
+                row = kk + j
+                col = base_col + i
+                if row < K and col < N:
+                    sB[i, j] = B[row, col]
+                else:
+                    sB[i, j] = 0.0
+
+        cuda.syncthreads()
+
+        # compute
+        for k in range(BLK_K):
+            for m in range(THR_M):
+                rA[m] = float32(sA[k, m * DIM_X + idx])
+
+            for n in range(THR_N):
+                rB[n] = float32(sB[n * DIM_Y + idy, k])
+
+            for n in range(THR_N):
+                for m in range(THR_M):
+                    rC[n][m] += rA[m] * rB[n]
+
+        cuda.syncthreads()
+
+    # write back
+    for n in range(THR_N):
+        col = base_col + n * DIM_Y + idy   
+        for m in range(THR_M):
+            row = base_row + m * DIM_X + idx   
+            if row < M and col < N:
+                C[row, col] = np.float16(rC[n][m])
+
+
+def run_basic(M, N, K):
+    # create numpy arrays
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+    C_ref = A @ B
+
+    # copy to GPU
+    A_d = cuda.to_device(A)
+    B_d = cuda.to_device(B)
+    C_d = cuda.to_device(C)
+
+    # threads per block
+    threads = (16, 16)
+
+    # compute grid size (ceil division)
+    blocks = (
+        (M + threads[0] - 1) // threads[0],
+        (N + threads[1] - 1) // threads[1],
+    )
+
+    # launch kernel
+    matmul[blocks, threads](A_d, B_d, C_d)
+    cuda.synchronize()
+
+    # copy result back
+    C_result = C_d.copy_to_host()
+
+    # check
+    np.testing.assert_allclose(C_result, C_ref, rtol=1e-2, atol=M * 2**(-11))
+    print("Succes")
+
+
+def run_optimized(M, N, K):
+    # inputs 
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+    C_ref = A @ B
+
+    # move to GPU
+    dA = cuda.to_device(A)
+    dB = cuda.to_device(B)
+    dC = cuda.to_device(C)
+
+    threads_per_block = (16, 16)
+    blocks_per_grid = (
+        (M + 63) // 64, # BLK_M = 64
+        (N + 63) // 64, # BLK_N = 64
+    )
+
+    # launch
+    optimized_matmul[blocks_per_grid, threads_per_block](M, N, K, dA, dB, dC)
+    cuda.synchronize()
+
+    # copy result back
+    C_result = dC.copy_to_host()
+    
+    # check
+    np.testing.assert_allclose(C_result, C_ref, rtol=1e-2, atol=M * 2**(-11))
+    print("Succes")
+
+
+
+
+def tune_basic(M, N, K):
+    # create inputs as normal, but do not copy to device
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+
+    size = (M, N)
+    args = [A, B, C]
+    tune_params = dict()
+    tune_params["block_size_x"] = [2**i for i in range(1, 10)]
+    tune_params["block_size_y"] = [2**i for i in range(1, 10)]
+
+    restrictions = ["block_size_x * block_size_y <= 1024"]
+    
+    answer = [None, None, A @ B]
+    atol = M * 2**(-11)
+
+    results, env = tune_kernel(
+        kernel_name="matmul",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=answer,
+        atol=atol,
+        restrictions=restrictions,
+        call_function=call_numba,
+    )
+
+
+def tune_optimized(M, N, K):
+    # create inputs as normal, but do not copy to device
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+
+    size = (M, N)
+    args = [M, N, K, A, B, C]
+    tune_params = {
+        "DIM_X": [2**i for i in range(1, 10)],
+        "DIM_Y": [2**i for i in range(1, 10)],
+        "BLK_M": [2**i for i in range(1, 10)],
+        "BLK_N": [2**i for i in range(1, 10)],
+        "BLK_K": [2**i for i in range(1, 10)],
+        "THR_M": [2**i for i in range(1, 9)], # Restricted to BLK_M / DIM_X
+        "THR_N": [2**i for i in range(1, 9)], # Restricted to BLK_N / DIM_Y
+    }
+    
+    answer = [None, None, None, None, None, A @ B]
+    atol = M * 2**(-11)
+
+    restrictions = [
+        "BLK_M % DIM_X == 0",
+        "BLK_N % DIM_Y == 0",
+        "THR_M == BLK_M / DIM_X",
+        "THR_N == BLK_N / DIM_Y",
+        "DIM_X * DIM_Y <= 1024",
+        "DIM_x * DIM_Y >= 32",
+    ]
+
+    results, env = tune_kernel(
+        kernel_name="optimized_matmul",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=answer,
+        atol=atol,
+        restrictions = restrictions,
+        block_size_names = ["DIM_X", "DIM_Y"],
+        grid_div_x = ["BLK_M"],
+        grid_div_y = ["BLK_N"],
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 100},
+        call_function=call_numba,
+    )
+
+
+
+if __name__ == "__main__":
+    M, N, K = 1024, 1024, 1024
+
+    run_basic(M, N, K)
+    tune_basic(M, N, K)
+
+    run_optimized(M, N, K)
+    tune_optimized(M, N, K)
+
+
+
+  
\ No newline at end of file
diff --git a/examples/generic_python/matmul/taichi_matmul.py b/examples/generic_python/matmul/taichi_matmul.py
new file mode 100644
index 000000000..bb3c53a81
--- /dev/null
+++ b/examples/generic_python/matmul/taichi_matmul.py
@@ -0,0 +1,63 @@
+import numpy as np
+import taichi as ti
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_taichi
+
+ti.init(arch=ti.gpu)
+
+BLOCK_DIM = 512
+@ti.kernel
+def matmul(A: ti.types.ndarray(dtype=ti.f16, ndim=2), B: ti.types.ndarray(dtype=ti.f16, ndim=2), C: ti.types.ndarray(dtype=ti.f16, ndim=2)):
+    K_dim = A.shape[1]
+
+    ti.loop_config(block_dim=BLOCK_DIM)  
+    for i, j in C:
+        sum = 0.0
+        for k in range(K_dim):
+            sum += A[i, k] * B[k, j]
+        C[i, j] = ti.cast(sum, ti.f16)
+
+
+def run(M, N, K):
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+    C_ref = A @ B
+
+    matmul(A, B, C)
+
+    np.testing.assert_allclose(C, C_ref, rtol=1e-2, atol=M * 2**(-11))
+    print("Succes")
+
+
+def tune(M, N, K):
+    A = np.random.rand(M, K).astype(np.float16)
+    B = np.random.rand(K, N).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+
+    size = M * N
+    args = [A, B, C]
+    tune_params = {"BLOCK_DIM": [2**i for i in range(5, 11)]}
+
+    answer = [None, None, A @ B]
+
+    results, env = tune_kernel(
+        kernel_name="matmul",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        answer=answer,
+        lang="generic_python",
+        call_function=call_taichi,
+        atol=M * 2**(-11),
+        block_size_names = ["BLOCK_DIM"],
+    )
+
+if __name__ == "__main__":
+    run(1024, 1024, 1024)
+    tune(1024, 1024, 1024)
+
+
+    
diff --git a/examples/generic_python/matmul/tilelang_matmul.py b/examples/generic_python/matmul/tilelang_matmul.py
new file mode 100644
index 000000000..751c0add4
--- /dev/null
+++ b/examples/generic_python/matmul/tilelang_matmul.py
@@ -0,0 +1,324 @@
+import torch
+import tilelang
+import tilelang.language as T
+
+import itertools
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_tilelang
+
+# https://github.com/tile-ai/tilelang/tree/main/examples/gemm
+# num_threads and num_stages added as variables to enable tuning.
+@tilelang.jit
+def matmul_basic(M:int, N:int, K:int, block_M:int, block_N:int, block_K:int, dtype:str="float16", accum_dtype:str="float32"):
+    @T.prim_func
+    def gemm(
+        A: T.Tensor((M, K), dtype),
+        B: T.Tensor((K, N), dtype),
+        C: T.Tensor((M, N), dtype),
+    ):
+        num_threads = 128
+        num_stages = 3
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by):
+            # We do use shared memory, even though this is a basic kernel. However, you don't 
+            # really get around this because T.gemm can not handle global memory directly.
+            A_shared = T.alloc_shared((block_M, block_K), dtype)
+            B_shared = T.alloc_shared((block_K, block_N), dtype)
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+            T.clear(C_local)
+            
+            # We do use a pipelining optimization here, because this is 'the basic way' 
+            # of writing for loops in TileLang. 
+            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                T.copy(A[by * block_M, k * block_K], A_shared)
+                T.copy(B[k * block_K, bx * block_N], B_shared)
+                T.gemm(A_shared, B_shared, C_local)
+
+            T.copy(C_local, C[by * block_M, bx * block_N])
+            
+    return gemm
+
+
+
+# This kernel is copied from the TileLang project:
+# https://github.com/tile-ai/tilelang/tree/main/examples/gemm/example_gemm.py
+#
+# Original example: example_gemm.py
+# Copyright (c) the TileLang authors
+#
+# Modifications in file:
+# - num_threads and num_stages added as metaparameters
+# - Removed annotated memory layout for tiles A and B
+# - dummy parameter to trigger fresh compilation when timing repeated tuning
+@tilelang.jit
+def matmul_opt(M, N, K, block_M, block_N, block_K, num_threads=128, num_stages=3, dummy=0, dtype=T.float16, accum_dtype=T.float):
+    @T.prim_func
+    def main(
+        A: T.Tensor((M, K), dtype),
+        B: T.Tensor((K, N), dtype),
+        C: T.Tensor((M, N), dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by):
+            # Allocate shared and local fragments
+            A_shared = T.alloc_shared((block_M, block_K), dtype)
+            B_shared = T.alloc_shared((block_K, block_N), dtype)
+            C_local  = T.alloc_fragment((block_M, block_N), accum_dtype)
+
+            # Enable swizzle-based rasterization for better L2 locality
+            panel_size = 10
+            T.use_swizzle(panel_size=panel_size, enable=True)
+
+            # Clear the local accumulation buffer
+            T.clear(C_local)
+
+            # Pipelined iteration over K dimension
+            for idx in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                # Copy tile of A
+                T.copy(A[by * block_M, idx * block_K], A_shared)
+
+                # Parallel copy tile of B
+                for ko, j in T.Parallel(block_K, block_N):
+                    B_shared[ko, j] = B[idx * block_K + ko, bx * block_N + j]
+
+                # Perform local GEMM on the shared-memory tiles
+                T.gemm(A_shared, B_shared, C_local)
+
+            # Copy the result tile back
+            T.copy(C_local, C[by * block_M, bx * block_N])
+
+    return main
+
+
+
+
+def get_configs(kt=False):
+    """
+    Generate a list of kernel tuning configuration dictionaries for a tiled matrix-multiply.
+    This function is used for tuning experiments with the built-in tuner
+
+    Returns:
+        List[dict]: A list of configuration dictionaries 
+    """
+    
+    block_M = [32, 64, 128, 256]
+    block_N = [32, 64, 128, 256]
+    block_K = [16, 32, 64, 128]
+    num_threads = [256]#[64, 128, 256, 512]
+    num_stages = [3]#[0, 1, 2, 3, 4]
+    panel_size = [10]#[4, 6, 8, 10]
+
+
+    _configs = list(
+        itertools.product(
+            block_M,
+            block_N,
+            block_K,
+            num_threads,
+            num_stages,
+            panel_size,
+        )
+    )
+
+    if kt:
+        configs = {
+            "block_M": block_M,
+            "block_N": block_N,
+            "block_K": block_K,
+            "num_stages": num_stages,
+            "num_threads": num_threads,
+            "panel_size": panel_size,
+        }
+    else:
+        configs = [
+            {
+                "block_M": c[0],
+                "block_N": c[1],
+                "block_K": c[2],
+                "num_threads": c[3],
+                "num_stages": c[4],
+                "panel_size": c[5],
+            }
+            for c in _configs
+        ]
+    return configs
+
+
+
+# For autotuning experiment
+# https://github.com/tile-ai/tilelang/blob/main/examples/gemm/example_gemm_autotune.py
+@tilelang.autotune(configs=get_configs(), warmup=1, rep=32)
+@tilelang.jit
+def matmul_opt_autotune(M, N, K, block_M, block_N, block_K, num_threads=128, num_stages=3, panel_size=10, dummy=0, dtype=T.float16, accum_dtype=T.float):
+    @T.prim_func
+    def main(
+        A: T.Tensor((M, K), dtype),
+        B: T.Tensor((K, N), dtype),
+        C: T.Tensor((M, N), dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by):
+            # Allocate shared and local fragments
+            A_shared = T.alloc_shared((block_M, block_K), dtype)
+            B_shared = T.alloc_shared((block_K, block_N), dtype)
+            C_local  = T.alloc_fragment((block_M, block_N), accum_dtype)
+
+            # Enable swizzle-based rasterization for better L2 locality
+            T.use_swizzle(panel_size=panel_size, enable=True)
+
+            # Clear the local accumulation buffer
+            T.clear(C_local)
+
+            # Pipelined iteration over K dimension
+            for idx in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                # Copy tile of A
+                T.copy(A[by * block_M, idx * block_K], A_shared)
+
+                # Parallel copy tile of B
+                for ko, j in T.Parallel(block_K, block_N):
+                    B_shared[ko, j] = B[idx * block_K + ko, bx * block_N + j]
+
+                # Perform local GEMM on the shared-memory tiles
+                T.gemm(A_shared, B_shared, C_local)
+
+            # Copy the result tile back
+            T.copy(C_local, C[by * block_M, bx * block_N])
+
+    return main
+
+
+
+def run_basic(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.zeros(M, N, device="cuda", dtype=torch.float16)
+    C_ref = A @ B
+
+    kernel = matmul_basic(
+        M, N, K,
+        block_M=64,
+        block_N=64,
+        block_K=32,
+    )
+
+    kernel(A, B, C)
+
+    assert torch.allclose(C, C_ref, atol=M * 2 **(-11), rtol=1e-2)
+    print("Succes")
+
+
+def run_opt(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.zeros(M, N, device="cuda", dtype=torch.float16)
+    C_ref = A @ B
+
+    kernel = matmul_opt(
+        M, N, K,
+        dummy=0,
+        block_M=64,
+        block_N=64,
+        block_K=32,
+        num_stages=3,
+        num_threads=128,
+    )
+
+    kernel(A, B, C)
+
+    assert torch.allclose(C, C_ref, atol=M * 2 **(-11), rtol=1e-2)
+    print("Succes")
+
+
+def tune_basic(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    size = (M, N)
+    
+
+    args = [A, B, C]
+
+    tune_params = dict()
+    tune_params["block_M"] = [2**i for i in range(5, 9)]
+    tune_params["block_N"] = [2**i for i in range(5, 9)]
+    tune_params["block_K"] = [2**i for i in range(4, 8)] 
+    tune_params["num_threads"] = [64, 128, 256, 512]
+    tune_params["num_stages"] = [0, 1, 2, 3, 4]
+    tune_params["M"] = [M]
+    tune_params["N"] = [N]
+    tune_params["K"] = [K]
+
+    restrictions = [
+        "2 * num_stages * block_K * (block_M + block_N) <= 49152"
+    ]
+
+    results, env = tune_kernel(
+        kernel_name="matmul_basic",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, C_ref.cpu()],
+        atol=M * 2**(-11),
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 100},
+        call_function=call_tilelang,
+        verbose=True,
+        restrictions=restrictions,
+    )
+
+
+def tune_opt(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    size = (M, N)
+    
+    args = [A, B, C]
+
+    tune_params = dict()
+    tune_params["block_M"] = [2**i for i in range(5, 9)]
+    tune_params["block_N"] = [2**i for i in range(5, 9)]
+    tune_params["block_K"] = [2**i for i in range(4, 8)] 
+    tune_params["num_threads"] = [64, 128, 256, 512]
+    tune_params["num_stages"] = [0, 1, 2, 3, 4]
+    tune_params["panel_size"] = [4, 6, 8, 10]
+    tune_params["M"] = [M]
+    tune_params["N"] = [N]
+    tune_params["K"] = [K]
+
+    restrictions = [
+        "2 * num_stages * block_K * (block_M + block_N) <= 49152"
+    ]
+
+    results, env = tune_kernel(
+        kernel_name="matmul_opt",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, C_ref.cpu()],
+        atol=M * 2**(-11),
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 100},
+        call_function=call_tilelang,
+        verbose=True,
+        restrictions=restrictions,
+    )
+
+
+
+
+if __name__ == "__main__":
+    M, N, K = 4096, 4096, 4096
+    run_basic(M, N, K)
+    run_opt(M, N, K)
+
+    tune_basic(M, N, K)
+    tune_opt(M, N, K)
+
+
diff --git a/examples/generic_python/matmul/tilus_matmul.py b/examples/generic_python/matmul/tilus_matmul.py
new file mode 100644
index 000000000..c65cdeaa7
--- /dev/null
+++ b/examples/generic_python/matmul/tilus_matmul.py
@@ -0,0 +1,263 @@
+import torch
+
+import tilus
+from tilus import float16, float32, int32
+from tilus.utils import cdiv
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_tilus
+
+
+# This kernel is copied from the Tilus project:
+# https://github.com/NVIDIA/tilus/blob/main/examples/matmul/matmul_v0.py
+#
+# Original example: matmul_v0.py
+# Copyright (c) the Tilus authors
+class MatmulBasic(tilus.Script):
+    def __init__(self):
+        super().__init__()
+        # we define three hyperparameters: ``block_m``, ``block_n``, and ``block_k`` to determine the tile size on
+        # m, n, and k dimensions for each `thread block` of the kernel.
+        self.block_m = 64
+        self.block_n = 64
+        self.block_k = 16
+
+    def __call__(
+        self,
+        m_size: int32, n_size: int, k_size: int, # Matrix dimensions 
+        a_ptr: ~float16, b_ptr: ~float16, c_ptr: ~float16, # Matrix pointers 
+    ):
+        self.attrs.blocks = [
+            cdiv(m_size, self.block_m),  # the x dimension size of the grid
+            cdiv(n_size, self.block_n),  # the y dimension size of the grid
+        ]
+        num_warps = 1 # added for tuning
+        self.attrs.warps = num_warps  # the number of warps per thread block, must be a compile-time known integer
+
+        # define two int32 variables to store the offsets of the m and n dimensions for the current thread block.
+        offset_m: int32 = self.block_m * self.blockIdx.x
+        offset_n: int32 = self.block_n * self.blockIdx.y
+
+        # create two global tensors `ga` and `gb` to represent the input matrices A and B, respectively.
+        ga = self.global_view(a_ptr, dtype=float16, shape=[m_size, k_size])
+        gb = self.global_view(b_ptr, dtype=float16, shape=[k_size, n_size])
+
+        # create a register tensor `acc` to accumulate the results of the matrix multiplication.
+        acc = self.register_tensor(
+            dtype=float32, shape=[self.block_m, self.block_n], init=0.0
+        )
+
+        # iterate over the k dimension in blocks of size `block_k`.
+        for k in range(cdiv(k_size, self.block_k)):
+            # calculate the offset for the current block in the k dimension
+            offset_k = k * self.block_k
+
+            # load a block of matrix A and B into register tensors `a` and `b`.
+            a = self.load_global(
+                ga, offsets=[offset_m, offset_k], shape=[self.block_m, self.block_k]
+            )
+            b = self.load_global(
+                gb, offsets=[offset_k, offset_n], shape=[self.block_k, self.block_n]
+            )
+
+            # perform the dot product: acc = a @ b + acc
+            self.dot(a, b, acc, out=acc)
+
+        # after the loop, we cast the accumulated result `acc` to float16 type and store it back to the output matrix C.
+        acc_f16 = self.cast(acc, dtype=float16)
+        gc = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
+        self.store_global(gc, acc_f16, offsets=[offset_m, offset_n])
+
+
+
+
+# This kernel is copied from the Tilus project:
+# https://nvidia.github.io/tilus/stable/tutorials/matmul-ampere/matmul/matmul_v4.html
+#
+# Original example: matmul_v4.py
+# Copyright (c) the Tilus authors
+#
+# Modifications in file:
+# - Removed auto-tuning decorators
+# - Added default values (None) for the __init__ parameters
+class MatmulOpt(tilus.Script):
+    def __init__(self, num_warps=None, block_m=None, block_n=None, block_k=None, num_stages=None):
+        super().__init__()
+        self.block_m = block_m
+        self.block_n = block_n
+        self.block_k = block_k
+        self.num_warps = num_warps
+        self.num_stages = num_stages
+
+    def __call__(
+        self,
+        m_size: int32, n_size: int, k_size: int,
+        a_ptr: ~float16, b_ptr: ~float16, c_ptr: ~float16,
+    ):
+        self.attrs.blocks = [cdiv(m_size, self.block_m), cdiv(n_size, self.block_n)]
+        self.attrs.warps = self.num_warps
+
+        block_m, block_n, block_k = self.block_m, self.block_n, self.block_k
+        offset_m: int32 = block_m * self.blockIdx.x
+        offset_n: int32 = block_n * self.blockIdx.y
+
+        ga = self.global_view(a_ptr, dtype=float16, shape=[m_size, k_size])
+        gb = self.global_view(b_ptr, dtype=float16, shape=[k_size, n_size])
+        sa = self.shared_tensor(dtype=float16, shape=[self.num_stages, block_m, block_k])
+        sb = self.shared_tensor(dtype=float16, shape=[self.num_stages, block_k, block_n])
+        acc = self.register_tensor(dtype=float32, shape=[block_m, block_n], init=0.0)
+
+        for stage in range(self.num_stages - 1):
+            offset_k = stage * self.block_k
+            self.copy_async(src=ga, dst=sa[stage], offsets=[offset_m, offset_k])
+            self.copy_async(src=gb, dst=sb[stage], offsets=[offset_k, offset_n])
+            self.copy_async_commit_group()
+
+        self.copy_async_wait_group(n=self.num_stages - 2)
+        self.sync()
+
+        current_stage: int32 = 0
+        preload_stage: int32 = self.num_stages - 1
+        for offset_k in self.range(0, k_size, block_k, unroll=self.num_stages):
+            # computation for current tile
+            a = self.load_shared(sa[current_stage])
+            b = self.load_shared(sb[current_stage])
+            self.dot(a, b, acc, out=acc)
+
+            # preload the next tile of A and B into shared memory
+            preload_offset_k = offset_k + (self.num_stages - 1) * block_k
+            self.copy_async(
+                src=ga,
+                dst=sa[preload_stage],
+                offsets=[offset_m, preload_offset_k],
+            )
+            self.copy_async(
+                src=gb,
+                dst=sb[preload_stage],
+                offsets=[preload_offset_k, offset_n],
+            )
+            self.copy_async_commit_group()
+
+            # update the stage
+            current_stage = (current_stage + 1) % self.num_stages
+            preload_stage = (preload_stage + 1) % self.num_stages
+            self.copy_async_wait_group(n=self.num_stages - 2)
+            self.sync()
+
+        self.free_shared(sa)
+        self.free_shared(sb)
+
+        casted_acc = self.cast(acc, dtype=float16)
+        gc = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
+        self.store_global(gc, casted_acc, offsets=[offset_m, offset_n])
+
+
+
+def run_basic(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    matmul = MatmulBasic()
+    torch.cuda.synchronize()
+    matmul(M, N, K, A, B, C)
+    torch.cuda.synchronize()
+
+    torch.testing.assert_close(C_ref, C, atol=M * 2**(-11), rtol=1e-2)
+    print("Succes")
+
+
+def run_optimized(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    matmul = MatmulOpt(num_warps=4, block_m=64, block_n=64, block_k=16, num_stages=3)
+    torch.cuda.synchronize()
+
+    matmul(M, N, K, A, B, C)
+    torch.cuda.synchronize()
+
+    torch.testing.assert_close(C_ref, C, atol=M * 2**(-11), rtol=1e-2)
+    print("Succes")
+
+
+def tune_basic(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    size = (M, N)
+
+    args = [M, N, K, A, B, C]
+
+    tune_params = dict()
+    tune_params["block_m"] = [2**i for i in range(5, 9)]
+    tune_params["block_n"] = [2**i for i in range(5, 9)]
+    tune_params["block_k"] = [2**i for i in range(4, 8)]
+    tune_params["num_warps"] = [2, 4, 8, 16]
+
+
+    results, env = tune_kernel(
+        kernel_name="MatmulBasic",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, None, None, None, C_ref.cpu()],
+        atol=M * 2**(-11),
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 200},
+        call_function=call_tilus,
+    )
+
+
+def tune_opt(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    size = (M, N)
+
+    args = [M, N, K, A, B, C]
+
+    tune_params = dict()
+    tune_params["block_m"] = [2**i for i in range(5, 9)]
+    tune_params["block_n"] = [2**i for i in range(5, 9)]
+    tune_params["block_k"] = [2**i for i in range(4, 8)]
+    tune_params["num_warps"] = [2, 4, 8, 16]
+    tune_params["num_stages"] = [2, 3, 4, 5]
+
+    # Shared memory restriction
+    restrictions = ["2 * num_stages * block_k * (block_m + block_n) <= 49152"]
+
+    results, env = tune_kernel(
+        kernel_name="MatmulOpt",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, None, None, None, C_ref.cpu()],
+        atol=M * 2**(-11),
+        restrictions=restrictions,
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 200},
+        call_function=call_tilus,
+    )
+
+
+
+
+
+if __name__ == "__main__":
+    M, N, K = 4096, 4096, 4096
+    run_basic(M, N, K)
+    run_optimized(M, N, K)
+    tune_basic(M, N, K)
+    tune_opt(M, N, K)
\ No newline at end of file
diff --git a/examples/generic_python/matmul/triton_matmul.py b/examples/generic_python/matmul/triton_matmul.py
new file mode 100644
index 000000000..d5522c627
--- /dev/null
+++ b/examples/generic_python/matmul/triton_matmul.py
@@ -0,0 +1,312 @@
+import torch
+import triton
+import triton.language as tl
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_triton
+
+@triton.jit
+def matmul_basic(
+        a_ptr, b_ptr, c_ptr, # Pointers
+        M, N, K, # Matrix sizes
+        stride_am, stride_ak,  # Strides
+        stride_bk, stride_bn,
+        stride_cm, stride_cn,
+        BLOCK_SIZE_M: tl.constexpr, # Tile sizes
+        BLOCK_SIZE_N: tl.constexpr,
+        BLOCK_SIZE_K: tl.constexpr,
+):
+    # Each program computes one BLOCK_SIZE_M x BLOCK_SIZE_N tile of C
+    pid_m = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+
+    # Compute row/column indices for the tile
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    # Create pointers to A and B tiles
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    # Accumulator
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Loop over K dimension
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(
+            a_ptrs,
+            mask=(offs_am[:, None] < M) & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        )
+        
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K) & (offs_bn[None, :] < N),
+            other=0.0,
+        )
+
+        accumulator = tl.dot(a, b, accumulator)
+
+        # advance K
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    # Store result
+    c = accumulator.to(tl.float16)
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=mask)
+
+
+
+
+# This kernel is copied from the Triton project:
+# https://github.com/triton-lang/triton/blob/main/python/tutorials/03-matrix-multiplication.py
+#
+# Original example: 03-matrix-multiplication.py
+# Copyright (c) the Triton authors
+#
+# Modifications in file:
+# - Removed auto-tuning decorators
+# - Removed activation function
+@triton.jit
+def matmul_opt(
+        # Pointers to matrices
+        a_ptr, b_ptr, c_ptr,
+        # Matrix dimensions
+        M, N, K,
+        # The stride variables represent how much to increase the ptr by when moving by 1
+        # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr`
+        # by to get the element one row down (A has M rows).
+        stride_am, stride_ak,  #
+        stride_bk, stride_bn,  #
+        stride_cm, stride_cn,
+        # Meta-parameters
+        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,  #
+        GROUP_SIZE_M: tl.constexpr,  #
+):
+    """Kernel for computing the matmul C = A x B.
+    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
+    """
+    # -----------------------------------------------------------
+    # Map program ids `pid` to the block of C it should compute.
+    # This is done in a grouped ordering to promote L2 data reuse.
+    # See above `L2 Cache Optimizations` section for details.
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # -----------------------------------------------------------
+    # Add some integer bound assumptions.
+    # This helps to guide integer analysis in the backend to optimize
+    # load/store offset address calculation
+    tl.assume(pid_m >= 0)
+    tl.assume(pid_n >= 0)
+    tl.assume(stride_am > 0)
+    tl.assume(stride_ak > 0)
+    tl.assume(stride_bn > 0)
+    tl.assume(stride_bk > 0)
+    tl.assume(stride_cm > 0)
+    tl.assume(stride_cn > 0)
+
+    # ----------------------------------------------------------
+    # Create pointers for the first blocks of A and B.
+    # We will advance this pointer as we move in the K direction
+    # and accumulate
+    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
+    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
+    # See above `Pointer Arithmetic` section for details
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    # -----------------------------------------------------------
+    # Iterate to compute a block of the C matrix.
+    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
+    # of fp32 values for higher accuracy.
+    # `accumulator` will be converted back to fp16 after the loop.
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        # Load the next block of A and B, generate a mask by checking the K dimension.
+        # If it is out of bounds, set it to 0.
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        # We accumulate along the K dimension.
+        accumulator = tl.dot(a, b, accumulator)
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+    c = accumulator.to(tl.float16)
+
+    # -----------------------------------------------------------
+    # Write back the block of the output matrix C with masks.
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+
+def run_basic(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    BLOCK_SIZE_M = 64
+    BLOCK_SIZE_N = 64
+    BLOCK_SIZE_K = 32
+
+    grid = (
+        triton.cdiv(M, BLOCK_SIZE_M),
+        triton.cdiv(N, BLOCK_SIZE_N),
+    )
+
+    matmul_basic[grid](
+        A, B, C,
+        M, N, K,
+        A.stride(0), A.stride(1),
+        B.stride(0), B.stride(1),
+        C.stride(0), C.stride(1),
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+    )
+
+    assert torch.allclose(C, C_ref, rtol=1e-2, atol= M * 2**(-11))
+
+    print("Passed")
+
+
+def run_opt(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    BLOCK_SIZE_M = 64
+    BLOCK_SIZE_N = 64
+    BLOCK_SIZE_K = 32
+    GROUP_SIZE_M = 32
+
+    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),)
+
+    matmul_opt[grid](
+        A, B, C,
+        M, N, K,
+        A.stride(0), A.stride(1),
+        B.stride(0), B.stride(1),
+        C.stride(0), C.stride(1),
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+        GROUP_SIZE_M=GROUP_SIZE_M,
+    )
+
+    assert torch.allclose(C, C_ref, rtol=1e-2, atol= M * 2**(-11))
+
+    print("Passed")
+
+
+def tune_basic(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    size = (M, N)
+
+    args = [A, B, C,
+        M, N, K,
+        A.stride(0), A.stride(1),
+        B.stride(0), B.stride(1),
+        C.stride(0), C.stride(1),
+    ]
+
+    tune_params = dict()
+    tune_params["BLOCK_SIZE_M"] = [2**i for i in range(5, 9)]
+    tune_params["BLOCK_SIZE_N"] = [2**i for i in range(5, 9)]
+    tune_params["BLOCK_SIZE_K"] = [2**i for i in range(4, 8)] 
+    tune_params["num_warps"] = [2, 4, 8, 16]
+    tune_params["num_stages"] = [1, 2, 3, 4, 5]
+
+    results, env = tune_kernel(
+        kernel_name="matmul_basic",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, C_ref.cpu(), None, None, None, None, None, None, None, None, None],
+        atol=M * 2**(-11),
+        block_size_names = ["BLOCK_SIZE_M", "BLOCK_SIZE_M"],
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 100},
+        call_function=call_triton,
+    )
+
+
+def tune_opt(M, N, K):
+    A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+    B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+    C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+    C_ref = A @ B
+
+    size = M * N
+
+    args = [A, B, C,
+        M, N, K,
+        A.stride(0), A.stride(1),
+        B.stride(0), B.stride(1),
+        C.stride(0), C.stride(1),
+    ]
+
+    tune_params = dict()
+    tune_params["BLOCK_SIZE_M"] = [2**i for i in range(5, 9)]
+    tune_params["BLOCK_SIZE_N"] = [2**i for i in range(5, 9)]
+    tune_params["BLOCK_SIZE_K"] = [2**i for i in range(4, 8)] 
+    tune_params["GROUP_SIZE_M"] = [4, 6, 8, 10]
+    tune_params["num_warps"] = [2, 4, 8, 16]
+    tune_params["num_stages"] = [1, 2, 3, 4, 5]
+    tune_params["M_dim"] = [M]
+
+    restrictions = ["GROUP_SIZE_M * BLOCK_SIZE_M < M_dim"] # Otherwise grouped ordering has no effect
+
+    results, env = tune_kernel(
+        kernel_name="matmul_opt",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, C_ref.cpu(), None, None, None, None, None, None, None, None, None],
+        atol=M * 2**(-11),
+        block_size_names = ["BLOCK_SIZE_M", "BLOCK_SIZE_M"],
+        grid_div_x = ["BLOCK_SIZE_M", "BLOCK_SIZE_N"],
+        restrictions = restrictions,
+        strategy = "bayes_opt",
+        strategy_options = {"max_fevals": 100},
+        call_function=call_triton,
+    )
+    
+
+
+if __name__ == "__main__":
+    M, N, K = 4096, 4096, 4096
+    run_basic(M, N, K)
+    run_opt(M, N, K)
+
+    tune_basic(M, N, K)
+    tune_opt(M, N, K)
diff --git a/examples/generic_python/matmul/warp_matmul.py b/examples/generic_python/matmul/warp_matmul.py
new file mode 100644
index 000000000..c4a218b21
--- /dev/null
+++ b/examples/generic_python/matmul/warp_matmul.py
@@ -0,0 +1,90 @@
+import numpy as np
+import warp as wp
+
+from kernel_tuner import tune_kernel
+from examples.generic_python.call_functions import call_warp
+
+wp.init()
+wp.config.enable_backward = False
+
+
+@wp.kernel()
+def gemm(
+    A: wp.array2d(dtype=wp.float16), B: wp.array2d(dtype=wp.float16), C: wp.array2d(dtype=wp.float16)
+):
+    i, j = wp.tid()
+    M = A.shape[0]
+    N = B.shape[1]
+    K = A.shape[1]
+
+    if i >= M or j >= N:
+        return
+
+    # compute dot product
+    sum = wp.float32(0.0)
+    for k in range(K):
+        sum += wp.float32(A[i, k]) * wp.float32(B[k, j])
+
+    # write result
+    C[i, j] = wp.float16(sum)
+
+
+def run_gemm(M, N, K):
+    rng = np.random.default_rng(42)
+    A = rng.random((M, K)).astype(np.float16)
+    B = rng.random((K, N)).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+
+    A_wp = wp.array(A)
+    B_wp = wp.array(B)
+    C_wp = wp.array(C)
+
+    wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
+
+    np.testing.assert_allclose(C_wp.numpy(), A @ B, rtol=1e-2, atol=M * 2**(-11))
+
+    print("Succes")
+
+
+def tune(M, K, N):
+    rng = np.random.default_rng(42)
+    A = rng.random((M, K)).astype(np.float16)
+    B = rng.random((K, N)).astype(np.float16)
+    C = np.zeros((M, N), dtype=np.float16)
+
+    size = (M, N)
+
+    tune_params = dict()
+    tune_params["block_dim"] = [2**i for i in range(5, 11)]
+    tune_params["dim"] = [size]
+
+    args = [A, B, C]
+    answer = [None, None, A @ B]
+
+    results, env = tune_kernel(
+        kernel_name="gemm",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=answer,
+        call_function=call_warp,
+    )
+
+
+
+if __name__ == "__main__":
+    M, N, K = 1024, 1024, 1024
+    
+    run_gemm(M, N, K)    
+    tune(M, N, K)
+
+    
+
+
+    
+
+    
+
+    
diff --git a/examples/generic_python/numba_vec_add.py b/examples/generic_python/numba_vec_add.py
new file mode 100644
index 000000000..3bc90de86
--- /dev/null
+++ b/examples/generic_python/numba_vec_add.py
@@ -0,0 +1,42 @@
+from numba import cuda
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from call_functions import call_numba
+
+
+@cuda.jit
+def f(a, b, c):
+    tid = cuda.grid(1)
+    size = len(c)
+
+    if tid < size:
+        c[tid] = a[tid] + b[tid]
+
+
+def tune():
+
+    N = 100000
+
+    a = np.random.random(N)
+    b = np.random.random(N)
+    c = np.zeros(N)
+    c_expect = a + b
+
+    args = [a, b, c]
+    tune_params = {"block_size_x": [2**i for i in range(10)]}
+
+
+    results, env = tune_kernel(
+        kernel_name="f",
+        kernel_source=__file__,
+        problem_size=N,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, c_expect],
+        call_function=call_numba,
+    )
+
+if __name__ == "__main__":
+    tune()
\ No newline at end of file
diff --git a/examples/generic_python/tilelang_vec_add.py b/examples/generic_python/tilelang_vec_add.py
new file mode 100644
index 000000000..38b4ab309
--- /dev/null
+++ b/examples/generic_python/tilelang_vec_add.py
@@ -0,0 +1,44 @@
+import tilelang
+import tilelang.language as T
+import torch
+
+from kernel_tuner import tune_kernel
+from call_functions import call_tilelang
+
+
+@tilelang.jit  # infers target from tensors at first call
+def add(N: int, dtype: str = 'float32', block: int = 256,):
+
+    @T.prim_func
+    def add_kernel(
+        A: T.Tensor((N,), dtype),
+        B: T.Tensor((N,), dtype),
+        C: T.Tensor((N,), dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block), threads=block) as bx:
+            for i in T.Parallel(block):
+                gi = bx * block + i
+                # Optional — LegalizeSafeMemoryAccess inserts a guard when an access may be OOB
+                C[gi] = A[gi] + B[gi]
+
+    return add_kernel
+
+
+def tune():
+    N = 1 << 20
+    A = torch.randn(N, device='cuda', dtype=torch.float32)
+    B = torch.randn(N, device='cuda', dtype=torch.float32)
+    C = torch.empty(N, device='cuda', dtype=torch.float32)
+
+    args = [A, B, C]
+    tune_params = dict()
+    tune_params["block"] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+    tune_params["N"] = [N] # Not a tune param, but enables an easier call function
+
+    answer = [None, None, (A + B).cpu()]
+    
+    res, env = tune_kernel("add", __file__, N, args, tune_params, lang="generic_python", 
+            call_function=call_tilelang, answer=answer)
+
+if __name__ == "__main__":
+    tune()
\ No newline at end of file
diff --git a/examples/generic_python/tilus_vec_add.py b/examples/generic_python/tilus_vec_add.py
new file mode 100644
index 000000000..8ee96aa07
--- /dev/null
+++ b/examples/generic_python/tilus_vec_add.py
@@ -0,0 +1,101 @@
+import tilus
+from tilus import float32, int32
+from tilus.utils import cdiv
+import torch 
+
+from kernel_tuner import tune_kernel, run_kernel
+from call_functions import call_tilus
+
+
+
+
+class VecAddV(tilus.Script):
+    def __init__(self, block_size_x=None, num_warps=None):
+        super().__init__()
+        self.block_size_x = block_size_x  # number of threads per block
+
+    def __call__(
+        self,
+        n_size: int32,        # size of the vectors
+        a_ptr: ~float32,      # input vector A
+        b_ptr: ~float32,      # input vector B
+        c_ptr: ~float32       # output vector C
+    ):  
+
+        # compute the number of blocks needed
+        self.attrs.blocks = [cdiv(n_size, self.block_size_x)]
+        self.attrs.warps = 4  # number of warps per block
+
+        # calculate the offset for this block
+        offset: int32 = self.block_size_x * self.blockIdx.x
+
+        # create global views for input/output vectors
+        ga = self.global_view(a_ptr, dtype=float32, shape=[n_size])
+        gb = self.global_view(b_ptr, dtype=float32, shape=[n_size])
+        gc = self.global_view(c_ptr, dtype=float32, shape=[n_size])
+
+        a = self.load_global(ga, offsets=[offset], shape=[self.block_size_x])
+        b = self.load_global(gb, offsets=[offset], shape=[self.block_size_x]) 
+        c = a + b
+        self.store_global(gc, c, offsets=[offset])
+
+
+
+
+
+def tune(size):
+    a = torch.randn(size, dtype=torch.float32).cuda()
+    b = torch.randn(size, dtype=torch.float32).cuda()
+    c = torch.empty(size, dtype=torch.float32).cuda()
+    c_expect = a + b
+    
+
+    args = [size, a, b, c]
+    tune_params = dict()
+    tune_params["block_size_x"] = [32, 64, 128, 256, 512, 1024]
+    tune_params["num_warps"] = [4, 8]
+
+
+    results, env = tune_kernel(
+        kernel_name="VecAddV", 
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, None, c_expect.cpu()],
+        call_function=call_tilus,
+        verbose=True,
+    )
+
+
+def run(size):
+    a = torch.randn(size, dtype=torch.float32).cuda()
+    b = torch.randn(size, dtype=torch.float32).cuda()
+    c = torch.empty(size, dtype=torch.float32).cuda()
+    c_expect = a + b
+    
+
+    args = [size, a, b, c]
+
+    results = run_kernel(
+        kernel_name="VecAddV", 
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        params={"block_size_x": 32},
+        lang="generic_python",
+        call_function=call_tilus,
+    )
+
+    c_expect = c_expect.cpu()
+
+    assert torch.allclose(results[-1], c_expect)
+
+
+
+if __name__ == "__main__":
+    size = 10000000
+    tune(size)
+    run(size)
+
diff --git a/examples/generic_python/triton_vec_add.py b/examples/generic_python/triton_vec_add.py
new file mode 100644
index 000000000..cafba79a9
--- /dev/null
+++ b/examples/generic_python/triton_vec_add.py
@@ -0,0 +1,66 @@
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from kernel_tuner import tune_kernel, run_kernel
+from call_functions import call_triton
+
+@triton.jit
+def add_op(x, y):
+    return x + y
+
+@triton.jit
+def add_kernel(x_ptr,  # *Pointer* to first input vector.
+               y_ptr,  # *Pointer* to second input vector.
+               output_ptr,  # *Pointer* to output vector.
+               n_elements,  # Size of the vector.
+               block_size_x: tl.constexpr,  # Number of elements each program should process.
+               # note: `constexpr` so it can be used as a shape value.
+               ):
+    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
+    block_start = pid * block_size_x
+    offsets = block_start + tl.arange(0, block_size_x)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    output = add_op(x, y)
+    tl.store(output_ptr + offsets, output, mask=mask)
+
+
+
+def tune():
+    size = 10000000
+
+    a = torch.randn(size, device='cuda', dtype=torch.float32)
+    b = torch.randn(size, device='cuda', dtype=torch.float32)
+    c = torch.empty_like(b)
+    n = torch.tensor(size, dtype=torch.int32)
+    c_expect = a + b
+
+    args = [a, b, c, size] 
+    tune_params = dict()
+    tune_params["block_size_x"] = [2**i for i in range(11)]
+
+    
+    result = run_kernel("add_kernel", __file__, size, args, {"block_size_x": 256}, 
+               lang="generic_python", call_function=call_triton)   
+    assert np.allclose(c_expect.cpu(), result[2])
+    
+    
+ 
+    results, env = tune_kernel(
+        kernel_name="add_kernel",
+        kernel_source=__file__,
+        problem_size=size,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, c_expect.cpu(), None],
+        call_function=call_triton,
+    )
+
+    
+    
+if __name__ == "__main__":
+    tune()
\ No newline at end of file
diff --git a/examples/generic_python/warp_vec_add.py b/examples/generic_python/warp_vec_add.py
new file mode 100644
index 000000000..81d7fb19e
--- /dev/null
+++ b/examples/generic_python/warp_vec_add.py
@@ -0,0 +1,71 @@
+import warp as wp
+import numpy as np
+import torch
+
+from kernel_tuner import tune_kernel
+from call_functions import call_warp
+
+
+wp.init()
+
+
+@wp.func
+def add_op(x: float, y: float):
+    return x + y
+
+@wp.kernel
+def vec_add(a: wp.array(dtype=float),
+            b: wp.array(dtype=float),
+            c: wp.array(dtype=float),
+            n: int):
+
+    work_per_thread = 8
+    tid = wp.tid()
+    base = tid * work_per_thread
+
+    for i in range(work_per_thread):
+        idx = base + i
+        if idx < n:
+            c[idx] = add_op(a[idx], b[idx])
+
+
+def call_warp(kernel_function, args, kwargs, grid, threads, params):
+    warp_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            warp_args.append(wp.from_torch(arg))
+        else:
+            warp_args.append(arg)
+    dim = params['size']
+    wp.launch(kernel=kernel_function, dim=dim, inputs=warp_args)
+    
+
+def tune():
+    n = 1024
+
+    # Create host arrays
+    a = np.arange(n, dtype=np.float32)
+    b = np.arange(n, 0, -1, dtype=np.float32)
+    c = np.zeros(n, dtype=np.float32)
+    c_expect = a + b
+
+    
+    tune_params = dict()
+    tune_params["work_per_thread"] = [2**i for i in range(10)]
+    tune_params["size"] = [n]
+    args = [a, b, c, n]
+
+    results, env = tune_kernel(
+        kernel_name="vec_add",
+        kernel_source=__file__,
+        problem_size=n,
+        arguments=args,
+        tune_params=tune_params,
+        lang="generic_python",
+        answer=[None, None, c_expect, None],
+        call_function=call_warp,
+    )
+
+
+if __name__ == "__main__":
+    tune()
\ No newline at end of file
diff --git a/kernel_tuner/backends/generic_python.py b/kernel_tuner/backends/generic_python.py
new file mode 100644
index 000000000..2247d5e87
--- /dev/null
+++ b/kernel_tuner/backends/generic_python.py
@@ -0,0 +1,391 @@
+import logging
+import inspect
+import copy
+import traceback # for compile error handling
+import re
+import warnings
+import builtins
+import numpy as np
+
+from kernel_tuner.backends.backend import GPUBackend
+from kernel_tuner.observers.generic_python import GenericPythonRuntimeObserver
+
+try:
+    import torch
+except ImportError:
+    torch = None
+
+
+class GenericPythonFunctions(GPUBackend):
+    """Class that groups the Python DSL functions on maintains state about the device."""
+
+    def __init__(self, device=0, iterations=7, compiler_options=None, observers=None):
+        """Instantiate GenericPythonFunctions object used for interacting with the device.
+        Currently, only CUDA devices are supported.
+
+        Instantiating this object will inspect and store certain device properties at
+        runtime, which are used during compilation and/or execution of kernels by the
+        kernel tuner. Compiler options are not supported for GenericPython.
+
+        :param device: Number of CUDA device to use for this context
+        :type device: int
+
+        :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
+        :type iterations: int
+        """
+
+        if not torch: 
+            logging.error("Unable to import Torch")
+            raise ImportError("Unable to import Torch")
+
+        self.device_id = torch.cuda.current_device()
+        self.device_properties = torch.cuda.get_device_properties(self.device_id)
+        self.name = torch.cuda.get_device_name(self.device_id)
+        self.max_threads = 10**18 # 'inf' to support tile based programming models, which can use less threads then the size of a tile.
+
+        env = dict()
+        env["device_name"] = self.name
+        env["max_threads"] = self.max_threads 
+        env["iterations"] = iterations
+        env["compiler_options"] = compiler_options
+        self.env = env
+
+        self.stream = torch.cuda.default_stream()
+        self.start = torch.cuda.Event(enable_timing=True)
+        self.end = torch.cuda.Event(enable_timing=True)
+
+        # setup observers
+        self.observers = observers or []
+        self.observers.append(GenericPythonRuntimeObserver(self))
+        for obs in self.observers:
+            obs.register_device(self)
+
+        self.units = {"time": "ms", "power": "s,mW", "energy": "J"}
+
+        # Variables to be filled in at compile time, needed for running the kernel after compilation.
+        self.call_function = None
+        self.signature = None 
+        self.gpu_kwargs = None 
+
+        super().__init__(device=device, iterations=iterations, compiler_options=compiler_options, observers=observers)
+
+    def ready_argument_list(self, arguments):
+        """Ready argument list to be passed to the kernel. Converts arguments to Torch GPU Tensors or Python 
+        Scalars. Arguments of built-in Python types are left untouched.
+
+        :param arguments: List of arguments to be passed to the kernel.
+            The order should match the argument list on the kernel.
+            Allowed values are:
+            - numpy.ndarray, and/or numpy.int32, numpy.float32, and so on.
+            - Torch Tensors
+            - Built-in Python types.
+        :type arguments: list
+
+        :returns: A list of arguments that can be passed to the kernel.
+        :rtype: list( Torch.Tensor on GPU, int, float, bool, etc. )
+        """
+        torch_args = []
+
+        for arg in arguments:
+            if isinstance(arg, torch.Tensor):
+                if arg.dim() == 0: # Scalar tensor, convert to Python scalar
+                    torch_args.append(arg.item())
+                else:
+                    if arg.is_cuda: # already on GPU, need deep copy to not overwrite
+                        torch_args.append(arg.clone())
+                    else: # Copy from CPU to GPU
+                        torch_args.append(arg.contiguous().to("cuda"))
+            elif isinstance(arg, np.ndarray): # Convert Numpy CPU array to Torch GPU Tensor
+                torch_args.append(torch.from_numpy(arg).to("cuda"))
+            elif isinstance(arg, np.generic): # Numpy scalar, convert to Python scalar
+                torch_args.append(arg.item())
+            elif isinstance(arg, (int, float, bool, str)): # Is already Python
+                torch_args.append(arg)
+            elif type(arg) in vars(builtins).values():
+                torch_args.append(arg)
+            else:
+                raise TypeError("Unknown argument type: ", type(arg), ". Accepted types are Torch tenors, NumPy arrays and scalars and built-in Python types.")
+
+        return torch_args
+        
+
+    def compile(self, kernel_instance, gpu_args=None):
+        """Compile the kernel by executing it once. This enforces that the kernel is cached. 
+        
+        :param kernel_instance: The kernel instance containing information such as the kernel_source,
+        grid, threads, params, etc.
+        :type kernel_instance: KernelInstance
+
+        :param gpu_args: arguments to be passed to the kernel.
+        :type gpu_args: list(any)
+
+        :returns: A kernel that can be called with the user-defined call function.
+        :rtype: callable
+        """
+        if kernel_instance.kernel_fn is None:
+            raise ValueError("kernel_fn is None, currently Generic Python only supports callable kernel_source")
+
+        if gpu_args is None:
+            raise ValueError("gpu_args is None, Generic Python needs gpu args to compile the kernel")
+        
+        # The first time we compile, we also set the call function and the signature
+        # We need this later to run the kernel.
+        if self.call_function is None or self.signature is None:
+            self.call_function = kernel_instance.kernel_source.call_function
+            self.signature = kernel_instance.kernel_source.signature
+        
+        grid = kernel_instance.grid
+        threads = kernel_instance.threads
+        params = kernel_instance.params
+
+        # If the kernel source is a class, we use the __call__ function as the kernel_function.
+        if inspect.isclass(kernel_instance.kernel_fn):  
+            kernel_function = kernel_instance.kernel_fn()
+        elif callable(kernel_instance.kernel_fn):
+            kernel_function = kernel_instance.kernel_fn
+        else:
+            raise TypeError("kernel function is not a class or function")
+        
+        # Tuning params can contain kernel arguments. In such cases, create keyword arguments with
+        # the values of the tuning params.
+        self.gpu_kwargs = {}
+        if params is not None:
+            for arg_name in self.signature:
+                if arg_name in params:
+                    self.gpu_kwargs[arg_name] = params[arg_name]
+
+        # Call the user-defined call function in order to compile the kernel.
+        self.synchronize()
+        self.call_function(kernel_function, gpu_args, self.gpu_kwargs, grid, threads, params) 
+        self.synchronize()
+
+        return kernel_function
+
+    def start_event(self):
+        """Records the event that marks the start of a measurement."""
+        self.start.record()
+
+    def stop_event(self):
+        """Records the event that marks the end of a measurement."""
+        self.end.record()
+
+    def kernel_finished(self):
+        """Returns True if the kernel has finished, False otherwise."""
+        return self.end.query()
+
+    def run_kernel(self, func, gpu_args, threads, grid, params=None):
+        """Runs the Python kernel passed as 'func'.
+
+        :param func: A cached Python kernel for this specific kernel configuration
+        :type func: callable
+
+        :param gpu_args: A list of arguments to the kernel, order should match the
+            order in the code. 
+        :type gpu_args: list(any)
+
+        :param threads: A tuple listing the number of threads in each dimension of
+            the thread block
+        :type threads: tuple(int, int, int)
+
+        :param grid: A tuple listing the number of thread blocks in each dimension
+            of the grid
+        :type grid: tuple(int, int, int)
+
+        :param params: A dictionary with the tuning params for this specific kernel 
+            configuration
+        :type params: dict
+        """
+        self.call_function(func, gpu_args, self.gpu_kwargs, grid, threads, params) 
+    
+    
+    def synchronize(self):
+        """Halts execution until device has finished its tasks."""
+        torch.cuda.synchronize()
+
+
+    def memset(self, allocation, value, size):
+        """This method must implement setting the memory to a value on the device.
+        Not implemented: Python DSLs usually do not perform explicit memory handling."""
+        pass
+
+
+    def memcpy_dtoh(self, dest, src):
+        """This method must implement a device to host copy.
+        Not implemented: Python DSLs usually do not perform explicit memory handling."""
+        pass
+
+
+    def memcpy_htod(self, dest, src):
+        """This method must implement a host to device copy.
+        Not implemented: Python DSLs usually do not perform explicit memory handling."""
+        pass
+
+
+    def copy_constant_memory_args(self, cmem_args):
+        raise NotImplementedError("Generic Python does not support constant memory")
+
+
+    def copy_shared_memory_args(self, smem_args):
+        raise NotImplementedError("Generic Python does not support shared memory")
+
+
+    def copy_texture_memory_args(self, texmem_args):
+        raise NotImplementedError("Generic Python does not support texture memory")
+    
+    
+    def refresh_memory(self, gpu_memory, host_arguments, should_sync):
+        """Refresh the GPU memory with the untouched host arguments. We overwrite the standard function
+        because Python DSLs do usually do not manage memory explicitely"""
+
+        for i, host_arg in enumerate(host_arguments):
+            if should_sync[i]:
+                gpu_arg = gpu_memory[i]
+
+                # Scalar Python type
+                if isinstance(gpu_arg, (int, float, bool)):
+                    gpu_memory[i] = host_arg
+                elif type(gpu_arg) in vars(builtins).values():
+                    gpu_memory[i] = host_arg
+
+                # GPU tensor
+                elif isinstance(gpu_arg, torch.Tensor):
+                    if isinstance(host_arg, np.ndarray):
+                        gpu_arg.copy_(torch.as_tensor(host_arg))  
+                    elif isinstance(host_arg, torch.Tensor):
+                        if host_arg.is_cuda and host_arg.device != gpu_arg.device:
+                            gpu_arg.copy_(host_arg.to(gpu_arg.device))  # different gpu's, no direct copy allowed
+                        else:
+                            gpu_arg.copy_(host_arg)  
+                    else:
+                        # host_arg is scalar, fill into tensor
+                        gpu_arg.fill_(host_arg)
+               
+
+    def classify_compile_exception(self, e):
+        """Best effort to differentiate between a user error and a resource error. 
+        
+        :param e: the caught exception
+        :type e: exception
+
+        :returns: "resource_error" , "user_error" or "unknown"
+        :rtype: string
+        """
+
+        RESOURCE_KEYWORDS = (
+            # Shared memory
+            "shared memory",
+            "smem",
+            "uses too much shared",
+            "exceeds shared memory",
+            "shared memory limit",
+
+            # Registers / occupancy
+            "too many registers",
+            "register spill",
+            "uses too many registers",
+            "out of registers",
+
+            # Launch configuration
+            "invalid launch configuration",
+            "invalid configuration argument",
+            "threads per block",
+            "block size",
+            "grid size",
+            "num_warps",
+            "num_ctas",
+
+            # Generic resource exhaustion
+            "too many resources",
+            "out of resources",
+            "exceeds maximum",
+            "exceeds limit",
+
+            # Compiler-level indicators
+            "ptxas error",
+            "ptxas fatal",
+            "nvcc error",
+            "cuda error",
+            "llvm error",
+            "mlir error",
+            "lowering failed",
+        )
+
+        USER_ERROR_KEYWORDS = (
+            # Undefined / missing symbols
+            "not defined",
+            "undefined variable",
+            "without definition",
+            "unknown variable",
+            "unbound",
+
+            # Type / shape errors
+            "type mismatch",
+            "invalid type",
+            "cannot convert",
+            "expected .* but got",
+            "incompatible types",
+
+            # Indexing / bounds
+            "index out of bounds",
+            "out of bounds access",
+            "invalid index",
+
+            # IR / AST construction
+            "failed to build",
+            "invalid expression",
+            "malformed",
+            "illegal operation",
+        )
+
+        RESOURCE_ORIGINS = (
+            "ptxas",
+            "nvcc",
+            "cuda",
+            "llvm",
+            "mlir",
+            "cubin",
+            "fatbin",
+        )
+
+        USER_ORIGINS = (
+            "transpiler",
+            "scheduler",
+            "hidet",
+            "frontend",
+            "ast",
+        )
+
+
+        USER_ERROR_TYPES = (
+            NameError,
+            UnboundLocalError,
+            AttributeError,
+            TypeError,
+            SyntaxError,
+            IndentationError,
+        )
+
+        if isinstance(e, USER_ERROR_TYPES):
+            return "user_error"
+        
+        
+        def match_any(patterns, text):
+            return any(re.search(p, text) for p in patterns)
+
+        msg = str(e).lower()
+        tb = "".join(traceback.format_tb(e.__traceback__)).lower()
+
+        
+        if match_any(RESOURCE_KEYWORDS, msg):
+            return "resource_error"
+
+        if match_any(RESOURCE_ORIGINS, msg + tb):
+            return "resource_error"
+
+        if match_any(USER_ERROR_KEYWORDS, msg):
+            return "user_error"
+        
+        if match_any(USER_ORIGINS, msg + tb):
+            return "user_error"
+
+        return "unknown"
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 8f9326c2d..f4816528c 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -376,6 +376,7 @@ def memset(self, allocation, value, size):
         drv.memset_d8(allocation, value, size)
 
     def memcpy_dtoh(self, dest, src):
+        print("dtoh called")
         """Perform a device to host memory copy.
 
         :param dest: A numpy array in host memory to store the data
@@ -390,6 +391,7 @@ def memcpy_dtoh(self, dest, src):
             dest[:] = src
 
     def memcpy_htod(self, dest, src):
+        print("htod called")
         """Perform a host to device memory copy.
 
         :param dest: A GPU memory allocation unit
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 69d952541..1fe5ffbe6 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -16,8 +16,12 @@ def _get_cupy():
 
 import kernel_tuner.util as util
 from kernel_tuner.accuracy import Tunable
+from kernel_tuner.backends.generic_python import GenericPythonFunctions
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
+from kernel_tuner.observers.nvml import NVMLObserver
 from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver, OutputObserver, PrologueObserver
 from kernel_tuner.observers.tegra import TegraObserver
+from kernel_tuner.language import Language
 
 try:
     import torch
@@ -36,6 +40,7 @@ def _get_cupy():
         "name",
         "kernel_source",
         "kernel_string",
+        "kernel_fn",
         "temp_files",
         "threads",
         "grid",
@@ -48,13 +53,27 @@ def _get_cupy():
 class KernelInstance(_KernelInstance):
     """Class that represents the specific parameterized instance of a kernel."""
 
+    def __new__(cls, *args, **kwargs):
+        # Detect old-style calls (without kernel_fn and gpu_kwargs) for old tests
+        if len(args) == 8:  # old version
+            name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments = args
+            kernel_fn = None
+            args = (name, kernel_source, kernel_string, kernel_fn, temp_files, threads, grid, params, arguments)
+        elif "kernel_fn" not in kwargs and len(args) < 9:
+            kwargs["kernel_fn"] = None
+        return super().__new__(cls, *args, **kwargs)
+    
     def delete_temp_files(self):
         """Delete any generated temp files."""
-        for v in self.temp_files.values():
-            util.delete_temp_file(v)
+        tmp_files_list = self.temp_files.values() if isinstance(self.temp_files, dict) else self.temp_files
+        for tmp_file in tmp_files_list:
+            util.delete_temp_file(tmp_file)
 
     def prepare_temp_files_for_error_msg(self):
         """Prepare temp file with source code, and return list of temp file names."""
+        if type(self.kernel_source).__name__ == "KernelSourceFn":
+            return [] # already done during compilation
+
         temp_filename = util.get_temp_filename(suffix=self.kernel_source.get_suffix())
         util.write_file(temp_filename, self.kernel_string)
         ret = [temp_filename]
@@ -62,160 +81,7 @@ def prepare_temp_files_for_error_msg(self):
         return ret
 
 
-class KernelSource(object):
-    """Class that holds the kernel sources.
-
-    There is a primary kernel source, which can be either a source string,
-    a filename (indicating a file containing the kernel source code),
-    or a callable (generating the kernel source code).
-    There can additionally be (one or multiple) secondary kernel sources, which
-    must be filenames.
-    """
-
-    def __init__(self, kernel_name, kernel_sources, lang, defines=None):
-        if not isinstance(kernel_sources, list):
-            kernel_sources = [kernel_sources]
-        self.kernel_sources = kernel_sources
-        self.kernel_name = kernel_name
-        self.defines = defines
-        if lang is None:
-            if callable(self.kernel_sources[0]):
-                raise TypeError("Please specify language when using a code generator function")
-            kernel_string = self.get_kernel_string(0)
-            lang = util.detect_language(kernel_string)
-
-        # The validity of lang is checked later, when creating the DeviceInterface
-        self.lang = lang.upper()
-
-    def get_kernel_string(self, index=0, params=None):
-        """Retrieve the kernel source with the given index and return as a string.
-
-        See util.get_kernel_string() for details.
-
-        :param index: Index of the kernel source in the list of sources.
-        :type index: int
-
-        :param params: Dictionary containing the tunable parameters for this specific
-            kernel instance, only needed when kernel_source is a generator.
-        :type param: dict
-
-        :returns: A string containing the kernel code.
-        :rtype: string
-        """
-        logging.debug("get_kernel_string called")
-
-        if hasattr(self, 'lang') and self.lang.upper() == "HYPERTUNER":
-            return ""
-
-        kernel_source = self.kernel_sources[index]
-        return util.get_kernel_string(kernel_source, params)
-
-    def prepare_list_of_files(
-        self, kernel_name, params, grid, threads, block_size_names
-    ):
-        """Prepare the kernel string along with any additional files.
-
-        The first file in the list is allowed to include or read in the others
-        The files beyond the first are considered additional files that may also contain tunable parameters
-
-        For each file beyond the first this function creates a temporary file with
-        preprocessors statements inserted. Occurrences of the original filenames in the
-        first file are replaced with their temporary counterparts.
-
-        :param kernel_name: A string specifying the kernel name.
-        :type kernel_name: string
-
-        :param params: A dictionary with the tunable parameters for this particular
-            instance.
-        :type params: dict()
-
-        :param grid: The grid dimensions for this instance. The grid dimensions are
-            also inserted into the code as if they are tunable parameters for
-            convenience.
-        :type grid: tuple()
-
-        :param threads: The thread block dimensions for this instance. The thread block are
-            also inserted into the code as if they are tunable parameters for
-            convenience.
-        :type threads: tuple()
-
-        :param block_size_names: A list of strings that denote the names
-            for the thread block dimensions.
-        :type block_size_names: list(string)
-
-        """
-        temp_files = dict()
-
-        if self.lang.upper() == "HYPERTUNER":
-            return tuple(["", "", temp_files])
-
-        for i, f in enumerate(self.kernel_sources):
-            if i > 0 and not util.looks_like_a_filename(f):
-                raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames")
-
-            ks = self.get_kernel_string(i, params)
-            # add preprocessor statements
-            n, ks = util.prepare_kernel_string(
-                kernel_name,
-                ks,
-                params,
-                grid,
-                threads,
-                block_size_names,
-                self.lang,
-                self.defines,
-            )
-
-            if i == 0:
-                # primary kernel source
-                name = n
-                kernel_string = ks
-                continue
-
-            # save secondary kernel sources to temporary files
-
-            # generate temp filename with the same extension
-            temp_file = util.get_temp_filename(suffix="." + f.split(".")[-1])
-            temp_files[f] = temp_file
-            util.write_file(temp_file, ks)
-            # replace occurrences of the additional file's name in the first kernel_string with the name of the temp file
-            kernel_string = kernel_string.replace(f, temp_file)
-
-        return name, kernel_string, temp_files
-
-    def get_user_suffix(self, index=0):
-        """Get the suffix of the kernel filename, if the user specified one. Return None otherwise."""
-        if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]):
-            return "." + self.kernel_sources[index].split(".")[-1]
-        return None
-
-    def get_suffix(self, index=0):
-        """Return a suitable suffix for a kernel filename.
-
-        This uses the user-specified suffix if available, or one based on the
-        lang/backend otherwise.
-        """
-        # TODO: Consider delegating this to the backend
-        suffix = self.get_user_suffix(index)
-        if suffix is not None:
-            return suffix
-
-        _suffixes = {"CUDA": ".cu", "OpenCL": ".cl", "C": ".c"}
-        try:
-            return _suffixes[self.lang]
-        except KeyError:
-            return ".c"
-
-    def check_argument_lists(self, kernel_name, arguments):
-        """Check if the kernel arguments have the correct types.
-
-        This is done by calling util.check_argument_list on each kernel string.
-        """
-        for i, f in enumerate(self.kernel_sources):
-            if not callable(f):
-                util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments)
-            else:
-                logging.debug("Checking of arguments list not supported yet for code generators.")
+# KernelSource has been moved to kernel_sources/kernel_source_str.py to support Generic Python
 
 
 def instantiate_observer(observer, args):
@@ -278,6 +144,7 @@ def __init__(
 
         logging.debug("DeviceInterface instantiated, lang=%s", lang)
 
+        
         # Ensure observers is a list
         observers = observers or []
 
@@ -286,7 +153,7 @@ def __init__(
         observer_args = dict(device=device, platform=platform, compiler=compiler, lang=lang)
         observers = [instantiate_observer(ob, observer_args) for ob in observers]
 
-        if lang.upper() == "CUDA":
+        if lang == Language.CUDA:
             from kernel_tuner.backends.pycuda import PyCudaFunctions
             dev = PyCudaFunctions(
                 device,
@@ -294,7 +161,7 @@ def __init__(
                 iterations=iterations,
                 observers=observers,
             )
-        elif lang.upper() == "CUPY":
+        elif lang == Language.CUPY:
             from kernel_tuner.backends.cupy import CupyFunctions
             dev = CupyFunctions(
                 device,
@@ -302,7 +169,7 @@ def __init__(
                 iterations=iterations,
                 observers=observers,
             )
-        elif lang.upper() == "NVCUDA":
+        elif lang == Language.NVCUDA:
             from kernel_tuner.backends.nvcuda import CudaFunctions
             dev = CudaFunctions(
                 device,
@@ -310,7 +177,7 @@ def __init__(
                 iterations=iterations,
                 observers=observers,
             )
-        elif lang.upper() == "OPENCL":
+        elif lang == Language.OPENCL:
             from kernel_tuner.backends.opencl import OpenCLFunctions
             dev = OpenCLFunctions(
                 device,
@@ -319,7 +186,7 @@ def __init__(
                 iterations=iterations,
                 observers=observers,
             )
-        elif lang.upper() in ["C", "FORTRAN"]:
+        elif lang == Language.C or lang == Language.FORTRAN:
             from kernel_tuner.backends.compiler import CompilerFunctions
             dev = CompilerFunctions(
                 compiler=compiler,
@@ -327,7 +194,7 @@ def __init__(
                 iterations=iterations,
                 observers=observers,
             )
-        elif lang.upper() == "HIP":
+        elif lang == Language.HIP:
             from kernel_tuner.backends.hip import HipFunctions
             dev = HipFunctions(
                 device,
@@ -335,16 +202,23 @@ def __init__(
                 iterations=iterations,
                 observers=observers,
             )
-        elif lang.upper() == "HYPERTUNER":
+        elif lang == Language.HYPERTUNER:
             from kernel_tuner.backends.hypertuner import HypertunerFunctions
             dev = HypertunerFunctions(
                 iterations=iterations,
                 compiler_options=compiler_options
             )
             self.requires_warmup = False
+        elif lang == Language.GENERIC_PYTHON:
+            dev = GenericPythonFunctions( 
+                device,
+                compiler_options=compiler_options,
+                iterations=iterations,
+                observers=observers
+            )
         else:
             raise NotImplementedError(
-                "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet"
+                "Sorry, support for languages other than CUDA, OpenCL, HIP, C, Fortran and Generic Python is not implemented yet"
             )
         self.dev = dev
 
@@ -387,17 +261,23 @@ def __init__(
         if not quiet:
             print("Using: " + self.dev.name)
 
-    def benchmark_prologue(self, func, gpu_args, threads, grid, result):
+    def run_kernel_bench(self, func, gpu_args, threads, grid, stream=None, params=None):
+        if isinstance(self.dev, GenericPythonFunctions): # Generic Python needs params to compile.
+            self.dev.run_kernel(func, gpu_args, threads, grid, params=params)
+        else:
+            self.dev.run_kernel(func, gpu_args, threads, grid)
+
+    def benchmark_prologue(self, func, gpu_args, threads, grid, result, params=None):
         """Benchmark prologue one kernel execution per PrologueObserver."""
         for obs in self.prologue_observers:
             self.dev.synchronize()
             obs.before_start()
-            self.dev.run_kernel(func, gpu_args, threads, grid)
+            self.run_kernel_bench(func, gpu_args, threads, grid, stream=None, params=params)
             self.dev.synchronize()
             obs.after_finish()
             result.update(obs.get_results())
 
-    def benchmark_default(self, func, gpu_args, threads, grid, result):
+    def benchmark_default(self, func, gpu_args, threads, grid, result, params=None):
         """Benchmark one kernel execution for 'iterations' at a time."""
         self.dev.synchronize()
         for _ in range(self.iterations):
@@ -405,7 +285,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
                 obs.before_start()
             self.dev.synchronize()
             self.dev.start_event()
-            self.dev.run_kernel(func, gpu_args, threads, grid)
+            self.run_kernel_bench(func, gpu_args, threads, grid, stream=None, params=params)
             self.dev.stop_event()
             for obs in self.benchmark_observers:
                 obs.after_start()
@@ -417,11 +297,13 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
             for obs in self.benchmark_observers:
                 obs.after_finish()
 
+            #time.sleep(0.1) # prevent termal throttling
+
         for obs in self.benchmark_observers:
             result.update(obs.get_results())
 
 
-    def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
+    def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration, params=None):
         """Benchmark continuously for at least 'duration' seconds."""
         iterations = int(np.ceil(duration / (result["time"] / 1000)))
         self.dev.synchronize()
@@ -429,7 +311,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
             obs.before_start()
         self.dev.start_event()
         for _ in range(iterations):
-            self.dev.run_kernel(func, gpu_args, threads, grid)
+            self.run_kernel_bench(func, gpu_args, threads, grid, stream=None, params=params)
         self.dev.stop_event()
         for obs in self.continuous_observers:
             obs.after_start()
@@ -479,8 +361,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
 
         result = {}
         try:
-            self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result)
-            self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result)
+            self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result, instance.params)
+            self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result, instance.params)
 
             if self.continuous_observers:
                 duration = 1
@@ -488,7 +370,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
                     obs.results = result
                     duration = max(duration, obs.continuous_duration)
 
-                self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration)
+                self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration, instance.params)
 
         except Exception as e:
             # some launches may fail because too many registers are required
@@ -518,6 +400,7 @@ def check_kernel_output(
     ):
         """Runs the kernel once and checks the result against answer."""
         logging.debug("check_kernel_output")
+        cp = _get_cupy()
 
         # if not using custom verify function, check if the length is the same
         if answer:
@@ -526,7 +409,6 @@ def check_kernel_output(
 
             should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)]
         else:
-            cp = _get_cupy()
             cupy_ndarray = (cp.ndarray,) if cp is not None else ()
             should_sync = [
                 isinstance(arg, (np.ndarray, torch.Tensor, DeviceArray) + cupy_ndarray)
@@ -538,33 +420,44 @@ def check_kernel_output(
         self.dev.refresh_memory(gpu_args, instance.arguments, should_sync)
 
         # run the kernel
-        check = self.run_kernel(func, gpu_args, instance)
+        self.dev.synchronize()
+        check = self.run_kernel_check(func, gpu_args, instance)
+        self.dev.synchronize()
         if not check:
             # runtime failure occurred that should be ignored, skip correctness check
             return
 
         # retrieve gpu results to host memory
         result_host = []
-        for i, arg in enumerate(instance.arguments):
-            if should_sync[i]:
-                cp = _get_cupy()
-                cupy_ndarray = (cp.ndarray,) if cp is not None else ()
-                if isinstance(arg, (np.ndarray,) + cupy_ndarray):
-                    result_host.append(np.zeros_like(arg))
-                    self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
-                elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
-                    if not answer[i].is_cuda:
-                        # if the answer is on the host, copy gpu output to host as well
-                        result_host.append(torch.zeros_like(answer[i]))
-                        self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor)
+        if instance.kernel_source is not None and instance.kernel_source.lang == Language.GENERIC_PYTHON:
+            # arguments are either Torch tensors or built-in Python types
+            for i, arg in enumerate(gpu_args):
+                if should_sync[i]:
+                    if isinstance(arg, torch.Tensor):
+                        result_host.append(arg.cpu())
                     else:
-                        result_host.append(gpu_args[i].tensor)
+                        result_host.append(arg)
+                else: 
+                    result_host.append(None)
+        else:
+            for i, arg in enumerate(instance.arguments):
+                if should_sync[i]:
+                    if isinstance(arg, (np.ndarray, cp.ndarray)):
+                        result_host.append(np.zeros_like(arg))
+                        self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
+                    elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
+                        if not answer[i].is_cuda:
+                            # if the answer is on the host, copy gpu output to host as well
+                            result_host.append(torch.zeros_like(answer[i]))
+                            self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor)
+                        else:
+                            result_host.append(gpu_args[i].tensor)
+                    else:
+                        # We should sync this argument, but we do not know how to transfer this type of argument
+                        # What do we do? Should we throw an error?
+                        result_host.append(None)
                 else:
-                    # We should sync this argument, but we do not know how to transfer this type of argument
-                    # What do we do? Should we throw an error?
                     result_host.append(None)
-            else:
-                result_host.append(None)
 
         # Call the output observers
         for obs in self.output_observers:
@@ -582,6 +475,7 @@ def check_kernel_output(
             correct = True
 
         if not correct:
+            print("expected: ", answer, "\ngot: ", result_host)
             raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params))
 
     def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to):
@@ -597,7 +491,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
         instance_string = util.get_instance_string(params)
 
         logging.debug("compile_and_benchmark " + instance_string)
-
         instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
         if isinstance(instance, util.ErrorConfig):
             result[to.objective] = util.InvalidConfig()
@@ -608,7 +501,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
             try:
                 # compile the kernel
                 start_compilation = time.perf_counter()
-                func = self.compile_kernel(instance, verbose)
+                func = self.compile_kernel(instance, verbose, gpu_args)
                 if not func:
                     result[to.objective] = util.CompilationFailedConfig()
                 else:
@@ -651,42 +544,67 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
             # clean up any temporary files, if no error occurred
             instance.delete_temp_files()
 
+        # For Python DSLs, the compilation time also includes one kernel run, so we subtract the runtime
+        if self.lang == Language.GENERIC_PYTHON:
+            last_compilation_time -= result["time"] or 0
+
         result["compile_time"] = last_compilation_time or 0
         result["verification_time"] = last_verification_time or 0
         result["benchmark_time"] = last_benchmark_time or 0
 
         return result
 
-    def compile_kernel(self, instance, verbose):
+    def compile_kernel(self, instance, verbose, gpu_args=None):
         """Compile the kernel for this specific instance."""
         logging.debug("compile_kernel " + instance.name)
 
         # compile kernel_string into device func
         func = None
-        try:
-            func = self.dev.compile(instance)
-        except Exception as e:
-            # compiles may fail because certain kernel configurations use too
-            # much shared memory for example, the desired behavior is to simply
-            # skip over this configuration and try the next one
-            shared_mem_error_messages = [
-                "uses too much shared data",
-                "local memory limit exceeded",
-                r"local memory \(\d+\) exceeds limit \(\d+\)",
-            ]
-            error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
-            if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
-                logging.debug(
-                    "compile_kernel failed due to kernel using too much shared memory"
-                )
-                if verbose:
-                    print(
-                        f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used"
+        
+        # Python DSLs have different error handling, so we make a case distinction.
+        if isinstance(self.dev, GenericPythonFunctions):
+            try:
+                func = self.dev.compile(instance, gpu_args)
+            except Exception as e:
+                exception_type = self.dev.classify_compile_exception(e)
+                if exception_type == "user_error":
+                    error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
+                    print("compile_kernel failed due to error: " + error_message)
+                    print("Error while compiling:", instance.name)
+                    raise e
+                else:
+                    logging.debug(
+                        "compile_kernel failed due to kernel using too many resources"
                     )
-            else:
-                print("compile_kernel failed due to error: " + error_message)
-                print("Error while compiling:", instance.name)
-                raise e
+                    if verbose:
+                        print(
+                            f"skipping config {util.get_instance_string(instance.params)} reason: \n{e}"
+                        )
+        else: # All other languages
+            try:
+                func = self.dev.compile(instance)
+            except Exception as e:  
+                # compiles may fail because certain kernel configurations use too
+                # much shared memory for example, the desired behavior is to simply
+                # skip over this configuration and try the next one
+                shared_mem_error_messages = [
+                    "uses too much shared data",
+                    "local memory limit exceeded",
+                    r"local memory \(\d+\) exceeds limit \(\d+\)",
+                ]
+                error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
+                if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
+                    logging.debug(
+                        "compile_kernel failed due to kernel using too much shared memory"
+                    )
+                    if verbose:
+                        print(
+                            f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used"
+                        )
+                else:
+                    print("compile_kernel failed due to error: " + error_message)
+                    print("Error while compiling:", instance.name)
+                    raise e
         return func
 
     @staticmethod
@@ -725,20 +643,22 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
             params,
             kernel_options.block_size_names,
         )
-        if np.prod(threads) > self.dev.max_threads:
+
+        if kernel_source.lang is not Language.GENERIC_PYTHON and np.prod(threads) > self.dev.max_threads:
             if verbose:
                 print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block")
             return util.InvalidConfig()
 
         # obtain the kernel_string and prepare additional files, if any
-        name, kernel_string, temp_files = kernel_source.prepare_list_of_files(
-            kernel_options.kernel_name,
+        instance_data = kernel_source.prepare_kernel_instance(
+            kernel_options,
             params,
             grid,
             threads,
-            kernel_options.block_size_names,
         )
 
+        name = instance_data.kernel_name
+        kernel_string=instance_data.kernel_str
         # check for templated kernel
         if kernel_source.lang in ["CUDA"] and "<" in name and ">" in name:
             kernel_string, name = wrap_templated_kernel(kernel_string, name)
@@ -747,7 +667,17 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         arguments = _preprocess_gpu_arguments(kernel_options.arguments, params)
 
         # collect everything we know about this instance and return it
-        return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments)
+        return KernelInstance(
+            name=name,
+            kernel_source=kernel_source,
+            kernel_string=kernel_string,
+            kernel_fn=instance_data.kernel_fn,
+            temp_files=instance_data.temp_files,
+            threads=threads,
+            grid=grid,
+            params=params,
+            arguments=arguments,
+        )
 
     def get_environment(self):
         """Return dictionary with information about the environment."""
@@ -784,14 +714,15 @@ def ready_argument_list(self, arguments):
 
         return gpu_args
 
-    def run_kernel(self, func, gpu_args, instance):
+    
+    def run_kernel_check(self, func, gpu_args, instance):
         """Run a compiled kernel instance on a device."""
         logging.debug("run_kernel %s", instance.name)
         logging.debug("thread block dims (%d, %d, %d)", *instance.threads)
         logging.debug("grid dims (%d, %d, %d)", *instance.grid)
 
         try:
-            self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid)
+            self.run_kernel_bench(func, gpu_args, instance.threads, instance.grid, stream=None, params=instance.params)
         except Exception as e:
             if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e):
                 logging.debug("ignoring runtime failure due to too many resources required")
@@ -800,6 +731,7 @@ def run_kernel(self, func, gpu_args, instance):
                 logging.debug("encountered unexpected runtime failure: " + str(e))
                 raise e
         return True
+    
 
 
 def _preprocess_gpu_arguments(old_arguments, params):
diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
index ad9a250db..3c5f3fd85 100644
--- a/kernel_tuner/interface.py
+++ b/kernel_tuner/interface.py
@@ -40,10 +40,12 @@
 import kernel_tuner.core as core
 import kernel_tuner.util as util
 from kernel_tuner.file_utils import get_input_file, get_t4_metadata, get_t4_results, import_class_from_file
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
 from kernel_tuner.util import get_objective_defaults
 from kernel_tuner.runners.sequential import SequentialRunner
 from kernel_tuner.runners.simulation import SimulationRunner
 from kernel_tuner.searchspace import Searchspace
+from kernel_tuner.language import Language
 
 try:
     import torch
@@ -128,9 +130,10 @@ def __deepcopy__(self, _):
         (
             "kernel_source",
             (
-                """The CUDA, OpenCL, HIP, or C kernel code.
+                """The CUDA, OpenCL, HIP, C or Python DSL kernel code.
             It is allowed for the code to be passed as a string, a filename, a function
             that returns a string of code, or a list when the code needs auxilliary files.
+            In the case of a kernel in a Python DSL such as Triton, only a filename is accepted.
 
             To support combined host and device code tuning, a list of
             filenames can be passed. The first file in the list should be the
@@ -154,7 +157,7 @@ def __deepcopy__(self, _):
                 """Specifies the language used for GPU kernels. The kernel_tuner
         automatically detects the language, but if it fails, you may specify
         the language using this argument, currently supported: "CUDA", "CuPy",
-        "nvcuda", "OpenCL", "HIP", or "C".""",
+        "nvcuda", "OpenCL", "HIP", "C", or "Generic_Python.""",
                 "string",
             ),
         ),
@@ -201,7 +204,9 @@ def __deepcopy__(self, _):
             "arguments",
             (
                 """A list of kernel arguments, use numpy arrays for
-            arrays, use numpy.int32 or numpy.float32 for scalars.""",
+            arrays, use numpy.int32 or numpy.float32 for scalars. When the language
+            Generic Python is used, Torch Tensors and built-in Python data types are also 
+            excepted""",
                 "list",
             ),
         ),
@@ -305,6 +310,22 @@ def __deepcopy__(self, _):
                 "dict",
             ),
         ),
+        (
+            "call_function",
+            (
+                """When the language Generic Python is used, a call function that calls the kernel in the Python
+                 DSL must be specified. The function must take the following positional arguments:
+                :kernel_function: the callable function with the tuning parameters inserted.
+                :args: list of kernel arguments, as provided by the user in the <args> argument.
+                :kwargs: dictionary of kernel keyword arguments. If a tuning parameter is in the kernel signature, 
+                    the tuning parameter will be added as a keyword argument.
+                Optionally, the following arguments can be used. The order and name of the arguments must match.
+                :grid: the launch grid (tuple with 3 values), as computed by KernelTuner
+                :threads: the thread block size (tuple with 3 values), as computed by KernelTuner
+                :params: dictionary with the values of the tuning params for the specific configuration.""",
+                "function",
+            ),
+        ),
     ]
 )
 
@@ -621,12 +642,14 @@ def tune_kernel(
     observers=None,
     objective=None,
     objective_higher_is_better=None,
+    call_function=None,
 ):
+  
     start_overhead_time = perf_counter()
     if log:
         logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log)
 
-    kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines)
+    kernelsource = KernelSource(kernel_name, kernel_source, lang, defines, util.normalize_call_function(call_function))
 
     _check_user_input(kernel_name, kernelsource, arguments, block_size_names)
 
@@ -670,6 +693,10 @@ def tune_kernel(
     logging.debug("tuning_options: %s", util.get_config_string(tuning_options))
     logging.debug("device_options: %s", util.get_config_string(device_options))
 
+    # the user-specific call function may of may not have optional grid, threads and params
+    # arguments. We normalize it so that it always accepts all arguments.
+    kernel_options.call_function = util.normalize_call_function(kernel_options.call_function)
+
     # check whether the selected strategy and options are valid
     strategy_string = strategy
     if strategy:
@@ -851,11 +878,12 @@ def run_kernel(
     block_size_names=None,
     quiet=False,
     log=None,
+    call_function=None,
 ):
     if log:
         logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log)
 
-    kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines)
+    kernelsource = KernelSource(kernel_name, kernel_source, lang, defines, util.normalize_call_function(call_function))
 
     _check_user_input(kernel_name, kernelsource, arguments, block_size_names)
 
@@ -864,6 +892,10 @@ def run_kernel(
     kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
     device_options = Options([(k, opts[k]) for k in _device_options.keys()])
 
+    # the user-specific call function may of may not have optional grid, threads and params
+    # arguments. We normalize it so that it always accepts all arguments.
+    kernel_options.call_function = util.normalize_call_function(kernel_options.call_function)
+
     # detect language and create the right device function interface
     dev = core.DeviceInterface(kernelsource, iterations=1, **device_options)
 
@@ -883,8 +915,11 @@ def run_kernel(
         # see if the kernel arguments have correct type
         util.check_argument_list(instance.name, instance.kernel_string, arguments)
 
-        # compile the kernel
-        func = dev.compile_kernel(instance, False)
+        # compile the kernel, Extra logic needed to check if we have a python function
+        if lang.upper() == "TRITON" or lang.upper() == "GENERIC_PYTHON":
+            func = dev.compile_kernel(instance, False, gpu_args)
+        else: 
+            func = dev.compile_kernel(instance, False)
         if func is None:
             raise RuntimeError("cannot compile kernel, too much shared memory used")
 
@@ -903,19 +938,29 @@ def run_kernel(
             instance.delete_temp_files()
 
     # run the kernel
-    if not dev.run_kernel(func, gpu_args, instance):
+    if not dev.run_kernel_check(func, gpu_args, instance):
         raise RuntimeError("runtime error occured, too many resources requested")
 
+
+
     # copy data in GPU memory back to the host
     results = []
-    for i, arg in enumerate(arguments):
-        if numpy.isscalar(arg):
-            results.append(arg)
-        elif isinstance(arg, torch.Tensor):
-            results.append(arg.cpu())
-        else:
-            results.append(numpy.zeros_like(arg))
-            dev.memcpy_dtoh(results[-1], gpu_args[i])
+    if instance.kernel_source.lang == Language.GENERIC_PYTHON:
+        for arg in gpu_args:
+            if isinstance(arg, torch.Tensor):
+                results.append(arg.cpu())
+            else:
+                results.append(arg)
+    else:
+        for i, arg in enumerate(arguments):
+            if numpy.isscalar(arg):
+                results.append(arg)
+            elif isinstance(arg, torch.Tensor):
+                results.append(arg.cpu())
+            else:
+                results.append(numpy.zeros_like(arg))
+                dev.memcpy_dtoh(results[-1], gpu_args[i])
+
 
     return results
 
diff --git a/kernel_tuner/kernel_sources/__init__.py b/kernel_tuner/kernel_sources/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kernel_tuner/kernel_sources/kernel_source.py b/kernel_tuner/kernel_sources/kernel_source.py
new file mode 100644
index 000000000..51fb75d9b
--- /dev/null
+++ b/kernel_tuner/kernel_sources/kernel_source.py
@@ -0,0 +1,82 @@
+import kernel_tuner.util as util
+
+from abc import abstractmethod
+
+from kernel_tuner.language import Language
+from kernel_tuner.kernel_sources.model.prepared_kernel_source_data import PreparedKernelSourceData
+
+
+class KernelSourceFactory(type):
+    '''
+    Factory to dynamically determine if a KernelSource should be a KernelSourceStr or a KernelSourceFn.
+    if lang=generic_python, KernelSourceFn will be used. In all other cases, an instance of KernelSourceStr 
+    is created.
+    
+    We create a kernelsouce by calling KernelSource(...). The call method for KernelSource is replaced by 
+    the call method in KernelSourceFactory, because KernelSource uses that class as metaclass. Inside the 
+    call method, a call to create either KernelSourceStr or KernelSourceFn is performed. This triggers the 
+    __init__ call of the corresponding subclass. In both subclasses, we call super().__init__, which triggers 
+    the __init__ call of the KernelSource class to initalize some common variables.
+    '''
+    def __call__(cls, kernel_name, kernel_sources, lang, defines=None, call_function=None):
+        if lang == None:
+            language = None 
+        else:
+            try:
+                language = Language(lang.upper())
+            except ValueError:
+                raise TypeError(f"Supported languages are {[l.value for l in Language]}")
+            
+        # Determine if we need to create a KernelSourceStr or a KernelSourceFn
+        if (language and language == Language.GENERIC_PYTHON): 
+            ks_str = False 
+        else:
+            ks_str = True
+        
+        from kernel_tuner.kernel_sources.kernel_source_str import KernelSourceStr
+        from kernel_tuner.kernel_sources.kernel_source_fn import KernelSourceFn
+        if cls is KernelSource:  
+            if ks_str:
+                return KernelSourceStr(kernel_name, kernel_sources, lang, defines)
+            else:
+                return KernelSourceFn(kernel_name, kernel_sources, lang, defines, call_function)
+
+        # Else, normal behaviour for subclasses
+        if ks_str:
+            return super().__call__(kernel_name, kernel_sources, lang, defines)
+        else:
+            return super().__call__(kernel_name, kernel_sources, lang, defines, call_function)
+    
+
+
+class KernelSource(metaclass=KernelSourceFactory):
+
+    def __init__(self, kernel_name, kernel_sources, lang, defines=None):
+        if not isinstance(kernel_sources, list):
+            kernel_sources = [kernel_sources]
+        self.kernel_sources = kernel_sources
+        self.kernel_name = kernel_name
+        self.defines = defines
+
+        if lang is None:
+            if callable(self.kernel_sources[0]):
+                raise TypeError("Please specify language when using a code generator function")
+            kernel_string = self.get_kernel_string(0)
+            language = util.detect_language(kernel_string)
+            self.lang = Language(language.upper())
+        else:
+            try:
+                self.lang = Language(lang.upper())
+            except ValueError:
+                raise TypeError(f"Supported languages are {[l.value for l in Language]}")
+            
+
+    @abstractmethod
+    def prepare_kernel_instance(self, kernel_options, params, grid, threads) -> PreparedKernelSourceData:
+        raise NotImplementedError("create_kernel_instance not implemented")
+
+    @abstractmethod
+    def check_argument_lists(self, kernel_name, arguments):
+        raise NotImplementedError("check_argument_lists not implemented")
+
+
diff --git a/kernel_tuner/kernel_sources/kernel_source_fn.py b/kernel_tuner/kernel_sources/kernel_source_fn.py
new file mode 100644
index 000000000..f806d5b87
--- /dev/null
+++ b/kernel_tuner/kernel_sources/kernel_source_fn.py
@@ -0,0 +1,304 @@
+import inspect
+import ast
+import copy
+import uuid
+import sys
+import logging
+
+import astor
+import tempfile
+import importlib.util
+
+from typing import Any
+
+from kernel_tuner.language import Language
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
+from kernel_tuner.kernel_sources.model.prepared_kernel_source_data import PreparedKernelSourceData
+from kernel_tuner.util import get_kernel_ast, get_arg_names
+
+
+class KernelSourceFn(KernelSource):
+    """
+    Class that holds the Python-function-based kernel sources.
+
+    There is a primary kernel source for function-based kernels in Python. The kernel_source
+    must be a path to the file where the kernel with kernel_name lives. The kernel can be 
+    decorated by a JIT decorator. 
+    
+    A call function to specify how the kernel should be launched must be supplied. The call 
+    function must take the following arguments:
+    - kernel_function: the callable function with the tuning parameters inserted. 
+    - args: list of kernel arguments, as provided by the user in the <args> argument.
+    - kwargs: dictionary of kernel keyword arguments. If a tuning parameter is in the kernel signature, 
+      the tuning parameter will be added as a keyword argument.
+    - grid: the launch grid (tuple with 3 values), as computed by KernelTuner
+    - threads: the thread block size (tuple with 3 values), as computed by KernelTuner
+    - params: dictionary with the values of the tuning params for this configuration.
+    """
+
+    def __init__(self, kernel_name, kernel_source, lang, defines=None, call_function=None):
+        super().__init__(kernel_name, kernel_source, lang, defines)
+        if isinstance(kernel_source, list):
+            raise ValueError("KernelSourceFn only supports a single kernel source")
+       
+        if self.lang == Language.GENERIC_PYTHON: 
+            if call_function is None:
+                raise ValueError("call_function must be supplied for language Generic Python")
+            if not callable(call_function):
+                raise TypeError(f"call_function of type {type(call_function)} is not a callable object.")
+        self.call_function = call_function
+
+        if not isinstance(kernel_name, str):
+            raise TypeError("kernel_name should be a string, got ", type(kernel_name))
+        
+        source_ast = get_kernel_ast(kernel_name, kernel_source)
+        if isinstance(source_ast, tuple): # Class based kernel
+            self.source_tree = source_ast[0]
+            self.signature = get_arg_names(source_ast[1])
+        else:
+            self.source_tree = source_ast 
+            self.signature = get_arg_names(source_ast)
+
+        self.kernel_fn = self.source_tree # This is where we will store the transformed source.
+        self.import_nodes = self._find_import_nodes(kernel_source)
+        self.dependencies = self._find_dependencies(kernel_source)
+
+
+    def prepare_kernel_instance(self, kernel_options, params, grid, threads):
+        '''
+        Given the dictionary of tuning parameter values for this configuration,
+        generate a kernel instance with these tuning parameters inserted. kernel_options, 
+        grid and threads are not needed for Python kernels.
+        '''
+        # Test: see if removing signature params helps in Triton overhead
+        filtered_params = {}
+        for k, v in params.items():
+            if k not in self.signature:
+                filtered_params[k] = v
+
+        new_kernel_fn, temp_file_path = self.apply_params_to_source_fn(filtered_params)
+        self.kernel_fn = new_kernel_fn
+
+        return PreparedKernelSourceData(
+            temp_files=[temp_file_path],
+            kernel_name=self.kernel_name,
+            kernel_fn=new_kernel_fn,
+            kernel_str=None
+        )
+
+
+    def check_argument_lists(self, kernel_name, arguments):
+        '''
+        Check if the kernel arguments have the correct types.
+        Not implemented for Python, because type hinting is not always supplied.
+        '''
+        logging.debug("Checking of arguments list not supported yet for Python kernels")
+        return True
+
+
+    def apply_params_to_source_fn(self, params):
+        '''
+        Create a module with the kernel imports and local dependencies from the kernel source file.
+        Find instances of the tuning parameters in the kernel and local dependencies and replace these
+        instances by the values of this configuration.
+        Return the new kernel and the path to the new module, where the new kernel lives.
+        '''
+        transformer = ReplaceVars(params)
+        
+        # Create a new module to store the transformed AST in.
+        new_module = ast.Module(body=[], type_ignores=[])
+        
+        # Add imports
+        new_module.body.extend(self.import_nodes)
+        
+        # Add dependencies (functions)
+        for dep_node in self.dependencies.values():
+            dep_node_copy = copy.deepcopy(dep_node)
+            transformed_dep_node = transformer.visit(dep_node_copy) # apply tuning params to dependencies
+            new_module.body.append(transformed_dep_node)
+        
+        # Transform main kernel
+        source_tree_copy = copy.deepcopy(self.source_tree)
+        transformed_tree = transformer.visit(source_tree_copy)
+        
+        # Add transformed main kernel to new module
+        new_module.body.append(transformed_tree)
+        
+        # Fix locations and generate source
+        ast.fix_missing_locations(new_module)
+        new_source = astor.to_source(new_module)
+
+        #print(new_source)
+        
+        # Create a unique module name and write new source to it.
+        module_name = f'temp_kernel_module_{uuid.uuid4().hex}'
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:
+            temp_file.write(new_source)
+            temp_file_path = temp_file.name
+
+        # Register the module in sys.modules before executing it
+        spec = importlib.util.spec_from_file_location(module_name, temp_file_path)
+        temp_module = importlib.util.module_from_spec(spec)       
+        sys.modules[module_name] = temp_module
+        spec.loader.exec_module(temp_module)
+        new_fn = getattr(temp_module, self.kernel_name)
+        
+        return new_fn, temp_file_path
+
+
+    def _find_import_nodes(self, source_file):
+        '''
+        Parse kernel source file to find import statements. Return those
+        statements as AST import nodes. 
+        '''
+        with open(source_file, "r") as f:
+            tree = ast.parse(f.read(), filename=source_file)
+
+        import_nodes = []
+        for node in tree.body:
+            if isinstance(node, (ast.Import, ast.ImportFrom)):
+                import_nodes.append(node)
+
+        return import_nodes
+
+
+    def _find_function_dependecies(self, tree, local_funcs):
+        '''
+        Given an AST and a list of local function names, return the funtions that are invoked
+        somewhere in the file, and are in the local_funcs list.
+        '''
+        class FunctionCallVisitor(ast.NodeVisitor):
+            def __init__(self):
+                self.called = set()
+
+            def visit_Call(self, node):
+                if isinstance(node.func, ast.Name):
+                    name = node.func.id
+                    if name in local_funcs:
+                        self.called.add(name)
+                self.generic_visit(node)
+        
+        visitor = FunctionCallVisitor()
+        visitor.visit(tree)
+        return visitor.called
+
+
+    def _find_dependencies(self, filepath):
+        '''
+        Find all local function dependencies in the file where the kernel source is
+        defined. Return a dicionary indexed by the function node name with the function
+        body as value.
+        '''
+        with open(filepath, "r") as f: 
+            source_code = f.read() 
+        tree = ast.parse(source_code, filename=filepath) 
+
+        # Find the locally defined functions in the source file
+        local_funcs = set()
+        for node in tree.body:
+            if isinstance(node, ast.FunctionDef):
+                local_funcs.add(node.name)
+        
+        # Given source file AST and the set of locally defined functions, find the names of 
+        # the local functions that are invoked somewhere in the AST.
+        called_functions = self._find_function_dependecies(self.source_tree, local_funcs)
+
+        # Traverse the tree one more time to save the dependency functions in a dictionary. 
+        # Skip the main kernel.
+        dependencies = {}
+        for node in tree.body:
+            if (isinstance(node, ast.FunctionDef) and node.name in called_functions and 
+                node.name != self.kernel_name):  # Skip the main kernel itself
+                dependencies[node.name] = node
+
+        return dependencies
+
+
+    def __del__(self):
+        '''
+        Clean up temporary modules when the instance is destroyed
+        '''
+        for key in list(sys.modules.keys()):
+            if key.startswith('temp_kernel_module_'):
+                del sys.modules[key]
+
+
+class ReplaceVars(ast.NodeTransformer):
+    '''
+    AST transformer that replaces occurrences of tuning parameters with constant values.
+    The tuning parameters (params) should be provided as a dictionary.
+    The transformation supports the following cases:
+
+    - Variable reads:
+        occurrence of a variable name that matches a key in `params` are replaced if the 
+        value is being read.
+
+    - Object attribute reads:
+        expressions of the form `self.<param>` are replaced with constants if
+        `<param>` exists in `params` and is being read (not assigned to).
+
+    - Assignments to parameters:
+        assignments like `param = ...` or `self.param = ...` are overridden,
+        replacing the right-hand side with the constant value from `params`.
+
+    Examples:
+        Given `params = {"block_size": 32}`:
+        x = 2 * block_size          ->      x = 2 * 32
+        x = 2 * self.block_size     ->      x = 2 * 32
+        self.block_size = 64        ->      self.block_size = 32
+        block_size = 64             ->      block_size = 32
+
+    '''
+
+    def __init__(self, params: dict):
+        self.params = params
+
+    def visit_Name(self, node: ast.Name) -> Any:
+        if isinstance(node.ctx, ast.Load) and node.id in self.params.keys():
+            return ast.copy_location(
+                ast.Constant(value=self.params[node.id]),
+                node
+            )
+        return node
+    
+
+    def visit_Attribute(self, node: ast.Attribute):
+        self.generic_visit(node)
+    
+        # Replace self.<param> with constant, but only if it's being read
+        if (
+            isinstance(node.value, ast.Name)
+            and node.value.id == "self"
+            and node.attr in self.params
+            and isinstance(node.ctx, ast.Load)  
+        ):
+            return ast.copy_location(ast.Constant(self.params[node.attr]), node)
+        
+        return node
+
+
+    def visit_Assign(self, node: ast.Assign):
+        # Replace assignments like: mock_param = ...
+        if (
+            len(node.targets) == 1
+            and isinstance(node.targets[0], ast.Name)
+            and node.targets[0].id in self.params
+        ):
+            node.value = ast.Constant(value=self.params[node.targets[0].id])
+            return node  
+
+        # Replace assignment like self.<attr> = ...
+        if (
+            len(node.targets) == 1
+            and isinstance(node.targets[0], ast.Attribute)
+            and isinstance(node.targets[0].value, ast.Name)
+            and node.targets[0].value.id == "self"
+            and node.targets[0].attr in self.params
+        ):
+            node.value = ast.Constant(value=self.params[node.targets[0].attr])
+            return node
+
+        return self.generic_visit(node)
+
+    
+
diff --git a/kernel_tuner/kernel_sources/kernel_source_str.py b/kernel_tuner/kernel_sources/kernel_source_str.py
new file mode 100644
index 000000000..095417392
--- /dev/null
+++ b/kernel_tuner/kernel_sources/kernel_source_str.py
@@ -0,0 +1,174 @@
+import kernel_tuner.util as util
+import logging
+
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
+from kernel_tuner.core import wrap_templated_kernel
+from kernel_tuner.kernel_sources.model.prepared_kernel_source_data import PreparedKernelSourceData
+from kernel_tuner.language import Language
+
+
+class KernelSourceStr(KernelSource):
+    """Class that holds the string-based kernel sources.
+
+    There is a primary kernel source for string-based kernels., which can be either a source string,
+    a filename (indicating a file containing the kernel source code),
+    or a callable (generating the kernel source code).
+    There can additionally be (one or multiple) secondary kernel sources, which
+    must be filenames.
+    """
+
+    def __init__(self, kernel_name, kernel_sources, lang, defines=None):
+        super().__init__(kernel_name, kernel_sources, lang, defines)
+    
+    def prepare_kernel_instance(self, kernel_options, params, grid, threads):
+        name, kernel_string, temp_files = self.prepare_list_of_files(
+            kernel_name=kernel_options.kernel_name,
+            params=params,
+            grid=grid,
+            threads=threads,
+            block_size_names=kernel_options.block_size_names,
+        )
+
+        lang_is_cuda = self.lang in [Language.CUDA, Language.NVCUDA]
+
+        if lang_is_cuda and "<" in name and ">" in name:
+            kernel_string, name = wrap_templated_kernel(kernel_string, name)
+
+        return PreparedKernelSourceData(
+            temp_files=temp_files,
+            kernel_name=name,
+            kernel_str=kernel_string,
+            kernel_fn=None,
+        )
+
+
+
+    def get_kernel_string(self, index=0, params=None):
+        """Retrieve the kernel source with the given index and return as a string.
+
+        See util.get_kernel_string() for details.
+
+        :param index: Index of the kernel source in the list of sources.
+        :type index: int
+
+        :param params: Dictionary containing the tunable parameters for this specific
+            kernel instance, only needed when kernel_source is a generator.
+        :type param: dict
+
+        :returns: A string containing the kernel code.
+        :rtype: string
+        """
+        logging.debug("get_kernel_string called")
+
+        if hasattr(self, 'lang') and self.lang == Language.HYPERTUNER:
+            return ""
+
+        kernel_source = self.kernel_sources[index]
+        return util.get_kernel_string(kernel_source, params)
+
+    def prepare_list_of_files(
+        self, kernel_name, params, grid, threads, block_size_names
+    ):
+        """Prepare the kernel string along with any additional files.
+
+        The first file in the list is allowed to include or read in the others
+        The files beyond the first are considered additional files that may also contain tunable parameters
+
+        For each file beyond the first this function creates a temporary file with
+        preprocessors statements inserted. Occurrences of the original filenames in the
+        first file are replaced with their temporary counterparts.
+
+        :param kernel_name: A string specifying the kernel name.
+        :type kernel_name: string
+
+        :param params: A dictionary with the tunable parameters for this particular
+            instance.
+        :type params: dict()
+
+        :param grid: The grid dimensions for this instance. The grid dimensions are
+            also inserted into the code as if they are tunable parameters for
+            convenience.
+        :type grid: tuple()
+
+        :param threads: The thread block dimensions for this instance. The thread block are
+            also inserted into the code as if they are tunable parameters for
+            convenience.
+        :type threads: tuple()
+
+        :param block_size_names: A list of strings that denote the names
+            for the thread block dimensions.
+        :type block_size_names: list(string)
+
+        """
+        temp_files = dict()
+
+        if self.lang == Language.HYPERTUNER:
+            return tuple(["", "", temp_files])
+
+        for i, f in enumerate(self.kernel_sources):
+            if i > 0 and not util.looks_like_a_filename(f):
+                raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames")
+
+            ks = self.get_kernel_string(i, params)
+            # add preprocessor statements
+            n, ks = util.prepare_kernel_string(
+                kernel_name,
+                ks,
+                params,
+                grid,
+                threads,
+                block_size_names,
+                self.lang,
+                self.defines,
+            )
+
+            if i == 0:
+                # primary kernel source
+                name = n
+                kernel_string = ks
+                continue
+
+            # save secondary kernel sources to temporary files
+
+            # generate temp filename with the same extension
+            temp_file = util.get_temp_filename(suffix="." + f.split(".")[-1])
+            temp_files[f] = temp_file
+            util.write_file(temp_file, ks)
+            # replace occurrences of the additional file's name in the first kernel_string with the name of the temp file
+            kernel_string = kernel_string.replace(f, temp_file)
+
+        return name, kernel_string, temp_files
+
+    def get_user_suffix(self, index=0):
+        """Get the suffix of the kernel filename, if the user specified one. Return None otherwise."""
+        if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]):
+            return "." + self.kernel_sources[index].split(".")[-1]
+        return None
+
+    def get_suffix(self, index=0):
+        """Return a suitable suffix for a kernel filename.
+
+        This uses the user-specified suffix if available, or one based on the
+        lang/backend otherwise.
+        """
+        # TODO: Consider delegating this to the backend
+        suffix = self.get_user_suffix(index)
+        if suffix is not None:
+            return suffix
+
+        _suffixes = {Language.CUDA: ".cu", Language.OPENCL: ".cl", Language.C: ".c"}
+        try:
+            return _suffixes[self.lang]
+        except KeyError:
+            return ".c"
+
+    def check_argument_lists(self, kernel_name, arguments):
+        """Check if the kernel arguments have the correct types.
+
+        This is done by calling util.check_argument_list on each kernel string.
+        """
+        for i, f in enumerate(self.kernel_sources):
+            if not callable(f):
+                util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments)
+            else:
+                logging.debug("Checking of arguments list not supported yet for code generators.")
\ No newline at end of file
diff --git a/kernel_tuner/kernel_sources/model/__init__.py b/kernel_tuner/kernel_sources/model/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kernel_tuner/kernel_sources/model/prepared_kernel_source_data.py b/kernel_tuner/kernel_sources/model/prepared_kernel_source_data.py
new file mode 100644
index 000000000..b9c904b38
--- /dev/null
+++ b/kernel_tuner/kernel_sources/model/prepared_kernel_source_data.py
@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class PreparedKernelSourceData:
+    temp_files: Any
+    kernel_name: str
+    kernel_str: Any
+    kernel_fn: Any
\ No newline at end of file
diff --git a/kernel_tuner/kernelbuilder.py b/kernel_tuner/kernelbuilder.py
index 0f3f6154f..e2039e66d 100644
--- a/kernel_tuner/kernelbuilder.py
+++ b/kernel_tuner/kernelbuilder.py
@@ -4,6 +4,7 @@
 from kernel_tuner.interface import Options, _kernel_options
 
 from kernel_tuner.integration import TuneResults
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
 
 class PythonKernel(object):
 
@@ -30,7 +31,7 @@ def __init__(self, kernel_name, kernel_string, problem_size, arguments, params=N
 
         """
         #construct device interface
-        kernel_source = core.KernelSource(kernel_name, kernel_string, lang)
+        kernel_source = KernelSource(kernel_name, kernel_string, lang)
         self.dev = core.DeviceInterface(kernel_source, device=device, quiet=True)
         if not params:
             params = {}
@@ -92,7 +93,7 @@ def run_kernel(self, args):
         :type args: list(np.ndarray or np.generic)
         """
         self.update_gpu_args(args)
-        self.dev.run_kernel(self.func, self.gpu_args, self.kernel_instance)
+        self.dev.run_kernel_check(self.func, self.gpu_args, self.kernel_instance)
         return self.get_gpu_result(args)
 
     def __call__(self, *args):
diff --git a/kernel_tuner/language.py b/kernel_tuner/language.py
new file mode 100644
index 000000000..02948c12b
--- /dev/null
+++ b/kernel_tuner/language.py
@@ -0,0 +1,19 @@
+from enum import Enum
+
+class Language(Enum):
+    CUDA = "CUDA"
+    OPENCL = "OPENCL"
+    C = "C"
+    HIP = "HIP"
+    FORTRAN = "FORTRAN"
+    NVCUDA = "NVCUDA"
+    GENERIC_PYTHON = "GENERIC_PYTHON"
+    CUPY = "CUPY"
+    HYPERTUNER = "HYPERTUNER"
+
+    @classmethod
+    def is_valid(cls, value: str) -> bool:
+        """
+        Test if a language is valid in Kernel Tuner framework.
+        """
+        return value in cls._value2member_map_
\ No newline at end of file
diff --git a/kernel_tuner/observers/generic_python.py b/kernel_tuner/observers/generic_python.py
new file mode 100644
index 000000000..62dfa20c7
--- /dev/null
+++ b/kernel_tuner/observers/generic_python.py
@@ -0,0 +1,32 @@
+import numpy as np
+
+from kernel_tuner.observers.observer import BenchmarkObserver
+
+try:
+    import torch
+except (ImportError, RuntimeError):
+    torch = None
+
+
+class GenericPythonRuntimeObserver(BenchmarkObserver):
+    """Observer that measures time using CUDA events during benchmarking."""
+
+    def __init__(self, dev):
+        if torch is None:
+            raise ImportError("Unable to load torch")
+
+        self.dev = dev
+        self.stream = dev.stream
+        self.start = dev.start
+        self.end = dev.end
+        self.times = []
+
+    def after_finish(self):
+        # Time is measured in milliseconds
+        event_elapsed_time = self.start.elapsed_time(self.end)
+        self.times.append(event_elapsed_time)
+
+    def get_results(self):
+        results = {"time": np.average(self.times), "times": self.times.copy()}
+        self.times = []
+        return results
\ No newline at end of file
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 0163083a1..b0d284b2a 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -133,6 +133,9 @@ def check_argument_type(dtype, kernel_argument):
 
 def check_argument_list(kernel_name, kernel_string, args):
     """Raise an exception if kernel arguments do not match host arguments."""
+    if kernel_string is None:
+        return 
+
     cp = _get_cupy()
     cupy_ndarray = (cp.ndarray,) if cp is not None else ()
     kernel_arguments = list()
@@ -666,6 +669,82 @@ def get_kernel_string(kernel_source, params=None):
     return kernel_string
 
 
+def get_kernel_ast(kernel_name, filepath):
+    '''
+    Util function for Generic Python Backend that returns the kernel function as AST. 
+
+    :param kernel_name: name of the kernel (as passed by the user)
+    :type kernel_name: string
+
+    :param filepath: the path to the file where the kernel lives. (passed by the user as kenrel_source)
+    :type filepath: string or Path containing a filename that points to the kernel source
+
+    :returns: ast.FunctionDef node in case the kernel is a function or a tuple 
+        (ast.ClassDef node, ast.FunctionDef node) in case the kernel is represented as the __call__ function
+        of a class (for Tilus support).
+    '''
+    if isinstance(filepath, Path):
+        source = read_file(filepath)
+    elif isinstance(filepath, str):
+        with open(filepath, "r") as f:
+            source = f.read()
+    else:
+        raise TypeError("Error kernel_source does not specify a path to a file")
+
+    tree = ast.parse(source)
+
+    # Function based kernels
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef) and node.name == kernel_name:
+            return node 
+    
+    # Class based kernels
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ClassDef) and node.name == kernel_name:
+            class_node = node
+            break
+
+    if not class_node:
+         raise ValueError(f"Kernel {kernel_name} not found in {filepath}")
+    
+    # Search for __call__ function within class
+    for node in class_node.body:
+        if isinstance(node, ast.FunctionDef) and node.name == "__call__":
+            call_node = node
+            return (class_node, call_node)
+
+    if not call_node:
+        raise ValueError(f"No __call__ function found inside Class {kernel_name}")
+   
+
+
+def get_arg_names(func_node: ast.FunctionDef):
+    """
+    Util to get the argument names from a Python AST function definition. This is needed to check
+    if there are any tunable parameters in the function signature.
+    """
+    args = func_node.args
+    names = []
+
+    names += [arg.arg for arg in args.args]
+    names += [arg.arg for arg in args.posonlyargs]
+    names += [arg.arg for arg in args.kwonlyargs]
+
+    # *args
+    if args.vararg:
+        names.append(args.vararg.arg)
+
+    # **kwargs
+    if args.kwarg:
+        names.append(args.kwarg.arg)
+
+    # Remove self for classes.
+    if 'self' in names:
+        names.remove('self')
+
+    return names
+
+
 def get_problem_size(problem_size, params):
     """Compute current problem size."""
     if callable(problem_size):
@@ -1014,6 +1093,33 @@ def has_kw_argument(func, name):
     return lambda answer, result_host, atol: v(answer, result_host)
 
 
+def normalize_call_function(v):
+    """Normalize a user-specified call function for language Generic Python.
+
+    The user-specified function has three required positional arguments (kernel_function, 
+    args, kwargs), and three optional keyword arguments: grid, threads and params. The
+    optional keyword arguments should appear in that order. We normalize the function
+    so that it always accepts grid, threads and params.
+
+    Undefined behaviour if the passed function does not match the required signatures.
+    """
+    def has_kw_argument(func, name):
+        sig = signature(func)
+        return name in sig.parameters
+    
+    if v is None:
+        return None
+
+    if has_kw_argument(v, "grid"):
+        if has_kw_argument(v, "threads"):
+            if has_kw_argument(v, "params"):
+                return v
+            return lambda kernel_function, args, kwargs, grid, threads, params: v(kernel_function, args, kwargs, grid, threads)
+        return lambda kernel_function, args, kwargs, grid, threads, params: v(kernel_function, args, kwargs, grid)
+    return lambda kernel_function, args, kwargs, grid, threads, params: v(kernel_function, args, kwargs)
+
+
+
 def parse_restrictions(
     restrictions: list[str], tune_params: dict, monolithic=False, format=None
 ) -> list[tuple[Union[Constraint, str], list[str]]]:
diff --git a/test/context.py b/test/context.py
index 7d94ac99b..ffa283c72 100644
--- a/test/context.py
+++ b/test/context.py
@@ -74,6 +74,12 @@
 except ImportError:
     bayes_opt_gpytorch_present = False
 
+try:
+    import torch
+    gen_python_torch_present = True
+except ImportError:
+    gen_python_torch_present = False
+
 try:
     import pyatf
     pyatf_present = True
@@ -110,6 +116,7 @@
 skip_if_no_hip = pytest.mark.skipif(not hip_present, reason="No HIP Python found")
 skip_if_no_pyatf = pytest.mark.skipif(not pyatf_present, reason="PyATF not installed")
 skip_if_no_methodology = pytest.mark.skipif(not methodology_present, reason="Autotuning Methodology not found")
+skip_if_no_torch = pytest.mark.skipif(not gen_python_torch_present, reason="Torch not installed")
 
 
 def skip_backend(backend: str):
@@ -129,3 +136,5 @@ def skip_backend(backend: str):
         pytest.skip("No nvc++ on PATH")
     elif backend.upper() == "HIP" and not hip_present:
         pytest.skip("HIP Python not installed")
+    elif backend.upper() == "GENERIC_PYTHON" and not gen_python_torch_present:
+        pytest.skip("Torch not installed")
diff --git a/test/test_backend.py b/test/test_backend.py
index e694649c1..af9971a48 100644
--- a/test/test_backend.py
+++ b/test/test_backend.py
@@ -4,8 +4,9 @@
     skip_if_no_cuda,
     skip_if_no_opencl,
     skip_if_no_pycuda,
+    skip_if_no_torch,
 )
-from kernel_tuner.backends import backend, compiler, cupy, nvcuda, opencl, pycuda
+from kernel_tuner.backends import backend, compiler, cupy, nvcuda, opencl, pycuda, generic_python
 
 
 class WrongBackend(backend.Backend):
@@ -45,3 +46,8 @@ def test_opencl_backend():
 @skip_if_no_pycuda
 def test_pycuda_backend():
     dev = pycuda.PyCudaFunctions()
+
+
+@skip_if_no_torch
+def test_generic_python_backend():
+    dev = generic_python.GenericPythonFunctions()
\ No newline at end of file
diff --git a/test/test_generic_python_functions.py b/test/test_generic_python_functions.py
new file mode 100644
index 000000000..b8cddea88
--- /dev/null
+++ b/test/test_generic_python_functions.py
@@ -0,0 +1,129 @@
+from .context import skip_if_no_torch
+from .test_kernel_source_fn import call_mock
+from kernel_tuner.core import DeviceInterface, KernelInstance
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
+import numpy as np
+from pathlib import Path
+import os
+
+try:
+    import torch
+    torch_present = True
+except ImportError:
+    pass
+
+KS_FILE = os.path.join(Path(__file__).resolve().parent, "test_kernel_source_fn.py")
+
+# Helper functions ------------------------------
+
+def value_equal(a, b):
+    # Torch tensors
+    if isinstance(a, torch.Tensor):
+        return torch.equal(a, b)
+
+    # NumPy arrays
+    if isinstance(a, np.ndarray):
+        return np.array_equal(a, b)
+
+    # Fallback (ints, floats, strings, lists, tuples, dicts)
+    return a == b
+
+def get_context():
+    params = {'mock_param': 64}
+    a = 42
+    b = torch.randn(12, device='cuda', dtype=torch.float32)
+    args = [a, b]
+    ks = KernelSource("mock_kernel", KS_FILE, "generic_python", call_function=call_mock)
+    return ks, args, params
+
+
+# Tests ----------------------------------------------
+
+@skip_if_no_torch
+def test_ready_argument_list():
+    ks, args, params = get_context()
+    dev = DeviceInterface(ks)
+
+    gpu_args = dev.ready_argument_list(args)
+
+    assert len(args) == len(gpu_args)
+
+    # arg 0: python scalar
+    assert isinstance(gpu_args[0], int)
+    assert gpu_args[0] == args[0]
+
+    # arg 1: torch cuda tensor
+    assert isinstance(gpu_args[1], torch.Tensor)
+    assert gpu_args[1].is_cuda
+
+    # values equal
+    assert torch.allclose(gpu_args[1], args[1])
+
+    # ensure deep copy
+    assert gpu_args[1] is not args[1]
+
+
+@skip_if_no_torch
+def test_compile():
+    ks, args, params = get_context()
+
+    instance_data = ks.prepare_kernel_instance(
+        kernel_options=None,
+        params=params,
+        grid=None,
+        threads=None,
+    )
+    dev = DeviceInterface(ks)
+    instance = KernelInstance(
+        name="mock_kernel",
+        kernel_source=ks,
+        kernel_string=None,
+        kernel_fn=instance_data.kernel_fn,
+        temp_files=instance_data.temp_files,
+        threads=None,
+        grid=None,
+        params=params,
+        arguments=None,
+    )
+    gpu_args = dev.ready_argument_list(args)
+
+    callable_fn = dev.compile_kernel(instance, verbose=False, gpu_args=gpu_args)
+
+    # The mock function return the mock_param, which should be set to 64.
+    assert callable_fn(*args) == 64
+
+
+@skip_if_no_torch
+def test_gpu_kwargs():
+    params = {'mock_param': 64}
+    a = torch.randn(12, device='cuda', dtype=torch.float32)
+    args = [a] # we do not have to specify the kwarg here
+    ks = KernelSource("kernel_with_kwarg", KS_FILE, "generic_python", call_function=call_mock)
+    dev = DeviceInterface(ks)
+
+    instance_data = ks.prepare_kernel_instance(
+        kernel_options=None,
+        params=params,
+        grid=None,
+        threads=None,
+    )
+    dev = DeviceInterface(ks)
+    instance = KernelInstance(
+        name="kernel_with_kwarg",
+        kernel_source=ks,
+        kernel_string=None,
+        kernel_fn=instance_data.kernel_fn,
+        temp_files=instance_data.temp_files,
+        threads=None,
+        grid=None,
+        params=params,
+        arguments=None,
+    )
+    gpu_args = dev.ready_argument_list(args)
+    callable_fn = dev.compile_kernel(instance, verbose=False, gpu_args=gpu_args)
+
+    kwargs = dev.dev.gpu_kwargs
+    
+    assert kwargs["mock_param"] == 64
+    assert callable_fn(*args, **kwargs) == 64
+
diff --git a/test/test_kernel_source_fn.py b/test/test_kernel_source_fn.py
new file mode 100644
index 000000000..d02be9e4d
--- /dev/null
+++ b/test/test_kernel_source_fn.py
@@ -0,0 +1,172 @@
+import os
+from pathlib import Path
+import pytest
+import inspect
+import ast
+import functools # for an example decorator
+from kernel_tuner.kernel_sources.kernel_source import KernelSource
+# Note: need subclass imports to test for types and instantiate subclasses direclty.
+# Normally, just importing KernelSource is enough.
+from kernel_tuner.kernel_sources.kernel_source_fn import KernelSourceFn
+from kernel_tuner.kernel_sources.kernel_source_str import KernelSourceStr
+
+KS_FILE = Path(__file__).resolve()
+# Helper functions --------------------------------------------
+
+def normalize_ast(src: str):
+    return ast.dump(ast.parse(src), include_attributes=False)
+
+def mock_kernel(a, b):
+    mock_param = 256
+    if a < mock_param:
+        b = 42
+    return mock_param
+
+def kernel_with_kwarg(a, mock_param):
+    return mock_param
+
+# NOTE should kernel_with_kwarg now return the tuning param value for mock param or 5?
+def kernel_with_dependency():
+    mock_param = 42
+    foo = kernel_with_kwarg(mock_param, 5)
+    return mock_kernel(42, 42)
+
+def call_mock(kernel_function, args, kwargs, grid, threads, params):
+        kernel_function(*args, **kwargs)
+
+class TilusLike():
+    def __init__(self):
+        self.mock_param = 32  
+    
+    def __call__(self):
+        return self.mock_param
+
+    def another_function(self):
+        return self.mock_param
+
+@functools.lru_cache()   
+def kernel_with_decorator():
+    mock_param = 42
+    return mock_param
+    
+
+# Tests ------------------------------------------------------
+
+def test_factory_behaviour():
+    # KernelSourceFn should only be created when language generic_python is supplied
+    ks_fn = KernelSource("mock_kernel", KS_FILE, "generic_python", call_function=call_mock)
+    ks_str = KernelSource("vector_add", 'extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {', lang=None)
+    
+    assert isinstance(ks_fn, KernelSourceFn)
+    assert isinstance(ks_str, KernelSourceStr)
+
+
+def test_initiation():
+    '''
+    Test invalid KernelSourceFn initations    
+    '''
+    with pytest.raises(ValueError, match=r"call_function must be supplied for language .*"):
+        KernelSource("mock_kernel", KS_FILE, "generic_python")
+    
+    with pytest.raises(FileNotFoundError, match=r".* No such file or directory: .*"):
+        KernelSource("mock_kernel", "This is a string Kernel", "generic_python", call_function=call_mock)
+
+    with pytest.raises(TypeError, match="Error kernel_source does not specify a path to a file"):
+        KernelSource("mock_kernel", mock_kernel, "generic_python", call_function=call_mock)
+
+    with pytest.raises(ValueError, match=r"KernelSourceFn only supports a single kernel source"):
+        KernelSource("mock_kernel", [KS_FILE, "another file"], "generic_python", call_function=call_mock)
+
+    with pytest.raises(TypeError, match=r".* is not a callable object"):
+        KernelSource("mock_kernel", KS_FILE, "generic_python", call_function="not a function")
+
+
+
+def test_param_subsitution():
+    params = {"mock_param": 128}
+    ks = KernelSourceFn("mock_kernel", KS_FILE, "generic_python", call_function=call_mock)
+    new_kernel_fn, _ = ks.apply_params_to_source_fn(params)
+
+    actual_src = inspect.getsource(new_kernel_fn)
+    expected_src = """
+def mock_kernel(a, b):
+    mock_param = 128
+    if a < 128:
+        b = 42
+    return 128
+"""
+
+    assert normalize_ast(actual_src) == normalize_ast(expected_src)
+
+
+def test_imports():
+    '''
+    Import statements that are present in the file where the function lives
+    should also be present in the file where the new function lives. 
+    '''
+
+    params = {"mock_param": 128}
+    ks = KernelSourceFn("mock_kernel", KS_FILE, "generic_python", call_function=call_mock)
+    _, temp_path = ks.apply_params_to_source_fn(params)
+
+    # Check if imports are present
+    with open(temp_path) as f:
+        full_src = f.read()
+
+    assert "import pytest" in full_src
+    assert "import inspect" in full_src
+
+
+def test_param_substitution_class():
+    '''
+    Tilus uses the __call__ function of a class to define its kernel. Therefore,
+    param substitution should also work classes. Param substiution in other class
+    functions is also supported.
+    '''
+
+    params = {"mock_param": 128}
+    ks = KernelSourceFn("TilusLike", KS_FILE, "generic_python", call_function=call_mock)
+
+    new_kernel_fn, _ = ks.apply_params_to_source_fn(params)
+
+    actual_src = inspect.getsource(new_kernel_fn)
+    expected_src = """
+class TilusLike():
+    def __init__(self):
+        self.mock_param = 128  
+
+    def __call__(self):
+        return 128
+    
+    def another_function(self):
+        return 128
+"""
+    assert normalize_ast(actual_src) == normalize_ast(expected_src)
+
+
+
+def test_decorator():
+    params = {"mock_param": 128}
+    ks = KernelSourceFn("kernel_with_decorator", KS_FILE, "generic_python", call_function=call_mock)
+    new_kernel_fn, _ = ks.apply_params_to_source_fn(params)
+    actual_src = inspect.getsource(new_kernel_fn)
+    expected_src = """
+@functools.lru_cache()   
+def kernel_with_decorator():
+    mock_param = 128
+    return 128
+"""
+    assert hasattr(new_kernel_fn, "__wrapped__")
+    assert normalize_ast(actual_src) == normalize_ast(expected_src)
+
+
+def test_dependencies():
+    params = {"mock_param": 128}
+    ks = KernelSourceFn("kernel_with_dependency", KS_FILE, "generic_python", call_function=call_mock)
+    new_kernel_fn, _ = ks.apply_params_to_source_fn(params)
+    res = new_kernel_fn() # This should not throw an error if the dependency exists in the module.
+
+    assert res == 128
+
+
+
diff --git a/test/test_observers.py b/test/test_observers.py
index 69a94a9bb..d1a27179c 100644
--- a/test/test_observers.py
+++ b/test/test_observers.py
@@ -4,6 +4,7 @@
 from kernel_tuner.observers.nvml import NVMLObserver
 from kernel_tuner.observers.observer import BenchmarkObserver
 from kernel_tuner.observers.register import RegisterObserver
+from kernel_tuner.language import Language
 
 from .context import (
     skip_if_no_cuda,
@@ -59,7 +60,7 @@ def get_results(self):
     result, _ = kernel_tuner.tune_kernel(*env_compiler, observers=[lambda args: MyObserver(args)], compiler_options=["-fopenmp"])
 
     # Check if the observer has correctly received the lang option
-    assert result[0]["observer_args"]["lang"] == "C"
+    assert result[0]["observer_args"]["lang"] == Language.C
 
 @skip_if_no_pycuda
 def test_register_observer_pycuda(env):
diff --git a/test/test_util_functions.py b/test/test_util_functions.py
index ba8727765..66710a9ba 100644
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -672,6 +672,27 @@ def verify2(answer, result_host, atol):
     assert v(1, 2, atol=3)
 
 
+
+def test_normalize_call_function_none():
+    assert normalize_call_function(None) is None
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda f, a, k: f(*a, **k),
+        lambda f, a, k, grid: f(*a, **k),
+        lambda f, a, k, grid, threads: f(*a, **k),
+        lambda f, a, k, grid, threads, params: f(*a, **k),
+    ],
+)
+def test_normalize_call_function(func):
+    v = normalize_call_function(func)
+
+    def kernel(x):
+        return x + 1
+
+    assert v(kernel, (1,), {}, grid=1, threads=2, params=3) == 2
+
 class MockRunner:
     simulation_mode = False