diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml index f86240c7f..dec26fdba 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-python-package.yml @@ -21,7 +21,7 @@ jobs: strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-13] steps: - uses: actions/checkout@v4 diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/generic_python/call_functions.py b/examples/generic_python/call_functions.py new file mode 100644 index 000000000..8c74790e1 --- /dev/null +++ b/examples/generic_python/call_functions.py @@ -0,0 +1,114 @@ +import torch + +def call_tilus(kernel_function, args, kwargs): + kernel_function(*args, **kwargs) + + +def call_triton(kernel_function, args, kwargs, grid, threads, params): + if "num_warps" in params.keys(): + kwargs["num_warps"] = params["num_warps"] + if "num_stages" in params.keys(): + kwargs["num_stages"] = params["num_stages"] + + kernel_function[grid](*args, **kwargs) + + +def call_tilelang(kernel_function, args, kwargs): + compiled_kernel = kernel_function(**kwargs) + compiled_kernel(*args) + + +def call_numba(kernel_function, args, kwargs, grid, threads): + from numba import cuda + + numba_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + numba_args.append(cuda.as_cuda_array(arg)) + else: + numba_args.append(arg) + + kernel_function[grid, threads](*args, **kwargs) + + +def call_cupyx(kernel_function, args, kwargs, grid, threads): + import cupy as cp + + cupy_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + cupy_args.append(cp.from_dlpack(arg)) + else: + cupy_args.append(arg) + kernel_function(grid, threads, tuple(cupy_args)) + + +def call_cute(kernel_function, args, kwargs, grid, threads, params): + import cutlass.cute as cute + from cutlass.cute.runtime import from_dlpack + + # Initialize cache if it does not exist + if not hasattr(call_cute, "custom_cache"): + call_cute.custom_cache = {} + + # Convert Torch tensors to CuTe tensors with correct layout + cute_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + arg_ = from_dlpack(arg) + cute_args.append(arg_) + else: + cute_args.append(arg) + + # Form cache key from tuning parameters + param_keys = sorted(params.keys()) + cache_str = type(kernel_function).__name__ + for k in param_keys: + cache_str += "_" + str(params[k]) + + # Check if kernel exists in cache. Otherwise, compile and save + if cache_str in call_cute.custom_cache: + compiled_kernel = call_cute.custom_cache[cache_str] + else: + compiled_kernel = cute.compile(kernel_function, *cute_args) + call_cute.custom_cache[cache_str] = compiled_kernel + + compiled_kernel(*cute_args, **kwargs) + + +def call_taichi(kernel_function, args, kwargs): + kernel_function(*args, **kwargs) + + +def call_warp(kernel_function, args, kwargs, grid, threads, params): + import warp as wp + + # Convert Torch tensors to Warp args + warp_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + warp_args.append(wp.from_torch(arg)) + else: + warp_args.append(arg) + + # Check if block_dim is in the tuning parameters. Otherwise, use + # the computed thread block dimensions. + if 'block_dim' in params.keys(): + threads_per_block = params['block_dim'] + else: + threads_per_block = threads[0] * threads[1] * threads[2] + + # Check if dim is in the tuning parameters. Otherwise, compute from + # grid and threads. + if 'dim' in params.keys(): + dimensions = params['dim'] + else: + dimensions = [grid[i] * threads[i] for i in range(len(grid))] + + # launch kernel + wp.launch( + kernel_function, + dim=dimensions, + inputs=warp_args, + block_dim=threads_per_block + ) \ No newline at end of file diff --git a/examples/generic_python/cute_vec_add.py b/examples/generic_python/cute_vec_add.py new file mode 100644 index 000000000..2744abb77 --- /dev/null +++ b/examples/generic_python/cute_vec_add.py @@ -0,0 +1,57 @@ +import torch + +import cutlass +import cutlass.cute as cute + +from kernel_tuner import tune_kernel +from call_functions import call_cute + +@cute.kernel +def vec_add_kernel( + gA: cute.Tensor, + gB: cute.Tensor, + gC: cute.Tensor, + size: cute.Int32, +): + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + bdim, _, _ = cute.arch.block_dim() + thread_id = bdim * bidx + tidx + + if thread_id < size: + gC[thread_id] = gA[thread_id] + gB[thread_id] + + +@cute.jit +def vec_add( + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + size: cute.Int32, +): + num_threads_per_block = 256 + + kernel = vec_add_kernel(mA, mB, mC, size) + + kernel.launch( + grid=(cute.ceil_div(size, num_threads_per_block), 1, 1), + block = (num_threads_per_block, 1, 1), + ) + + +def main(): + size = 16384 + a = torch.randn(size, device="cuda", dtype=torch.float16) + b = torch.randn(size, device="cuda", dtype=torch.float16) + c = torch.zeros(size, device="cuda", dtype=torch.float16) + + args = [a, b, c, size] + tune_params = {"num_threads_per_block": [1, 2, 4, 8, 16, 32, 64, 128, 265, 512, 1024]} + answer = [None, None, (a+b).cpu(), None] + + tune_kernel("vec_add", __file__, size, args, tune_params, answer=answer, + lang="generic_python", call_function=call_cute, verbose=True) + + +main() + diff --git a/examples/generic_python/matmul/cupy_matmul.py b/examples/generic_python/matmul/cupy_matmul.py new file mode 100644 index 000000000..1101b5fae --- /dev/null +++ b/examples/generic_python/matmul/cupy_matmul.py @@ -0,0 +1,71 @@ +import cupy as cp +from cupyx import jit +import numpy as np + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_cupyx + + +@jit.rawkernel() +def gemm(a, b, c, M, N, K): + row = jit.blockIdx.y * jit.blockDim.y + jit.threadIdx.y + col = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x + + if row < M and col < N: + acc = 0.0 + for kk in range(K): + acc += a[row, kk] * b[kk, col] + c[row, col] = acc + + +def run(M, N, K): + # float16 matrices on GPU + a = cp.random.random((M, K)).astype(cp.float16) + b = cp.random.random((K, N)).astype(cp.float16) + c = cp.zeros((M, N), dtype=cp.float16) + + # block / grid configuration + block = (16, 16) + grid = ((N + block[0] - 1) // block[0], (M + block[1] - 1) // block[1]) + + # launch kernel + gemm[grid, block](a, b, c, M, N, K) + cp.cuda.Device().synchronize() + + # Correctness verification + c_ref = cp.matmul(a, b) + assert cp.allclose(c, c_ref, rtol=1e-2, atol=1e-1) + + print("Succes") + + +def tune(M, N, K): + # random test data. Here we had to use numpy arrays instead of cupy. + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + + args = [A, B, C, M, N, K] + size = (N, M) + tune_params = {"block_size_x": [2**i for i in range(11)], "block_size_y": [2**i for i in range(11)]} + restrictions = ["block_size_x * block_size_y <= 1024"] + + results, env = tune_kernel( + kernel_name="gemm", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + answer=[None, None, A.dot(B), None, None, None], + atol=1e-1, + call_function=call_cupyx, + lang="generic_python", + restrictions=restrictions, + verbose=True, + ) + + +if __name__ == "__main__": + M, N, K = 1024, 1024, 1024 + tune(M, N, K) + \ No newline at end of file diff --git a/examples/generic_python/matmul/cute_matmul.py b/examples/generic_python/matmul/cute_matmul.py new file mode 100644 index 000000000..d584a6aa8 --- /dev/null +++ b/examples/generic_python/matmul/cute_matmul.py @@ -0,0 +1,1065 @@ +import math +import torch +from typing import Tuple + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.runtime import from_dlpack + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_cute + +# might need export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH to work + +## Basic Matmul ================================================================ + +@cute.kernel +def naive_matmul_kernel(gA: cute.Tensor, gB: cute.Tensor, gC: cute.Tensor, M: int, K: int, N: int): + tx, ty, _ = cute.arch.thread_idx() + bx, by, _ = cute.arch.block_idx() + bdx, bdy, _ = cute.arch.block_dim() + + # Global indices + n = bx * bdx + tx + m = by * bdy + ty + + if m < M and n < N: + acc = cutlass.Float32(0.0) + + for k in range(K): + a = gA[m, k].to(cutlass.Float32) + b = gB[k, n].to(cutlass.Float32) + acc += a * b + + gC[m, n] = acc.to(gC.element_type) + + +@cute.jit +def matmul( + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, +): + block_size_x = 16 + block_size_y = 16 + block = (block_size_x, block_size_y, 1) + + M, K = mA.shape + _, N = mB.shape + + grid = ( + (N + block[0] - 1) // block[0], + (M + block[1] - 1) // block[1], + 1, + ) + + kernel = naive_matmul_kernel(mA, mB, mC, M, K, N) + + kernel.launch(grid=grid, block=block) + + +def run_naive_matmul(M, N, K): + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + c = torch.zeros(M, N, device="cuda", dtype=torch.float16) + c_ref = a @ b + + # Convert to CuTe tensors + a_ = from_dlpack(a, assumed_align=16) + b_ = from_dlpack(b, assumed_align=16) + c_ = from_dlpack(c, assumed_align=16) + + compiled_kernel = cute.compile(matmul, a_, b_, c_) + compiled_kernel(a_, b_, c_) + + assert torch.allclose(c, c_ref, atol=M * 2 **(-11), rtol=1e-2) + print("Succes") + + +def tune_naive_matmul(M, N, K): + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + c = torch.zeros(M, N, device="cuda", dtype=torch.float16) + + args = [a, b, c] + size = M * N + answer = [None, None, (a @ b).cpu()] + tune_params = dict() + tune_params["block_size_x"] = [2**i for i in range(1, 10)] + tune_params["block_size_y"] = [2**i for i in range(1, 10)] + restrictions = ["block_size_x * block_size_y >= 32, block_size_x * block_size_y <= 1024"] + + results, env = tune_kernel("matmul", __file__, size, args, tune_params, lang="generic_python", + call_function=call_cute, answer=answer, atol=M * 2 **(-11), restrictions=restrictions, verbose=False) + + + +# This kernel is copied from the CuTe DSL project: +# https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/tensorop_gemm.py +# +# Original example: tensorop_gemm.py +# Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Modifications in file: +# - Tuning paramters made more explicit + +## Optimized matmul ========================================================== +# Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +""" +A dense GEMM (C = A * B) example for the NVIDIA Ampere architecture using CUTE DSL. +- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M") +- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K") +- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes Ampere's tensor cores for matrix multiply-accumulate (MMA) operations + - Threadblock rasterization to improve data re-use + - Supports multi-stage pipeline to overlap computation and memory access + - Implements shared memory buffering for epilogue to increase coalesced global memory access + +This GEMM works as follows: +1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using asynchronous copies. +2. Perform matrix multiply-accumulate (MMA) operations. +3. Store results from registers (RMEM) to shared memory (SMEM), then to global memory (GMEM). + +The Ampere tensor core instruction used operates as follows: +- Read matrix A from SMEM +- Read matrix B from SMEM +- Perform MMA operation and store the result in Accumulator(register) + +To run this example: + +.. code-block:: bash + + python examples/ampere/tensorop_gemm.py \ + --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1 \ + --ab_dtype Float16 \ + --c_dtype Float16 --acc_dtype Float32 \ + --a_major m --b_major n --c_major n + +The above example command computes with M=8192, N=8192, K=8192, +batch_count=1. The atom layout's shape is 2x2x1 and the input, mma +accumulator, and output data type are set as fp16, fp32 and fp16, +respectively. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/ampere/tensorop_gemm.py \ + --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1 \ + --ab_dtype Float16 \ + --c_dtype Float16 --acc_dtype Float32 \ + --a_major m --b_major n --c_major n \ + --skip_ref_check --iterations 2 + +Constraints: +* Supported input and output data types: fp16 +* Support accumulator data types: f32 +* Default tile shape is set to be 128x128x32 +* Atom layout's MNK shape is set so that tile shape can be divided by MMA + instruction shape +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 8 +""" + + +class TensorOpGemm: + def __init__(self): + self.ab_dtype = cutlass.Float16 + self.c_dtype = cutlass.Float16 + self.acc_dtype = cutlass.Float32 + self.bM = 128 # extracted from cta_tiler for KT support + self.bN = 128 # extracted from cta_tiler for KT support + self.bK = 32 # extracted from cta_tiler for KT support + self.cta_tiler = (self.bM, self.bN, self.bK) + self.num_stages = 3 + atom_lay_M = 2 # extracted from atom_layout_mnk for KT support + atom_lay_N = 2 # extracted from atom_layout_mnk for KT support + atom_lay_K = 1 # extracted from atom_layout_mnk for KT support + self.atom_layout_mnk = (atom_lay_M, atom_lay_N, atom_lay_K) # moved from init parameter to here for KT support + self.num_threads = atom_lay_M * atom_lay_N * atom_lay_K * 32 + + self.mma_inst_shape = (16, 8, 16) + mmaM, mmaN, mmaK = self.mma_inst_shape + + assert self.bM % (atom_lay_M * mmaM) == 0, ( + "bM must be divisible by MMA instruction" + ) + assert self.bN % (atom_lay_N * mmaN) == 0, ( + "bN must be divisible by MMA instruction" + ) + assert atom_lay_K == 1, "this example does not support atom layout K > 1" + assert self.bK % mmaK == 0, "bK must be divisible by MMA instruction" + assert self.num_stages >= 3, "num_stages must be greater than or equal to 3" + + @cute.jit + def __call__( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # The grid divides the problems's M, N, and L dimensions by the + # respective modes of the tile shape (bM, bN, 1). The K dimension is + # handled within a block via a multistage process. + + self.a_major_mode = utils.LayoutEnum.from_tensor(mA) + self.b_major_mode = utils.LayoutEnum.from_tensor(mB) + self.c_major_mode = utils.LayoutEnum.from_tensor(mC) + + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory layout: + # /////////////////////////////////////////////////////////////////////////////// + + # Creates a layout with the size required for the provided tile + # size and num stages (stages are used for K dimension) that is also + # sectioned into 64x8 or 8x32 layout atoms. The swizzle is set so that + # the atom for the shared memory -> register copy does not encounter + # bank conflicts + + # assume the input is 16B align + ab_copy_bits = 128 + sA_layout = self._make_smem_layout_AB( + mA.element_type, + self.a_major_mode, + ab_copy_bits, + (self.cta_tiler[0], self.cta_tiler[2], self.num_stages), + ) + sB_layout = self._make_smem_layout_AB( + mB.element_type, + self.b_major_mode, + ab_copy_bits, + (self.cta_tiler[1], self.cta_tiler[2], self.num_stages), + ) + + # Creates a similar layout but without num_stages or layout atoms + sC_layout = self._make_smem_layout_C( + mC.element_type, + self.c_major_mode, + ab_copy_bits, + (self.cta_tiler[0], self.cta_tiler[1]), + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled copy: + # The majorness of tA/tB/tC follows the majorness of gA/gB/gC, + # enabling merged accesses to global memory for faster data + # transfer between global and shared memory. + # /////////////////////////////////////////////////////////////////////////////// + + # Create a copy atom for a global to shared memory asynchronous copy + atom_async_copy = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp( + cache_mode=cute.nvgpu.cpasync.LoadCacheMode.GLOBAL + ), + mA.element_type, + num_bits_per_copy=ab_copy_bits, + ) + + # Create thread layouts for tiled copy from the copy atom where the + # thread layout simply follows the leading dimension of the tensor + tiled_copy_A = self._make_gmem_tiled_copy_AB( + atom_async_copy, mA.element_type, self.a_major_mode, ab_copy_bits + ) + tiled_copy_B = self._make_gmem_tiled_copy_AB( + atom_async_copy, mB.element_type, self.b_major_mode, ab_copy_bits + ) + + # Creates a synchronous copy atom and thread layouts for the epilogue + c_copy_bits = 128 + atom_sync_copy = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), + mC.element_type, + num_bits_per_copy=c_copy_bits, + ) + tiled_copy_C = self._make_gmem_tiled_copy_C( + atom_sync_copy, mC.element_type, self.c_major_mode, c_copy_bits + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled MMA + # /////////////////////////////////////////////////////////////////////////////// + + # Creates a mma atom with 16x8x16 shape for MNK + op = cute.nvgpu.warp.MmaF16BF16Op( + self.ab_dtype, self.acc_dtype, self.mma_inst_shape + ) + + permutation_mnk = ( + self.atom_layout_mnk[0] * self.mma_inst_shape[0], + # if atom layout's N-mode is 1, to leverage the largest coalesced + # shared memory -> register copy, set the tiled mma's N mode to 16 + self.atom_layout_mnk[1] * self.mma_inst_shape[1] * 2, + self.atom_layout_mnk[2] * self.mma_inst_shape[2], + ) + + # Created a tiled mma that tiles the atom according to specified layout. + # For a 2x2x1 atom layout, the mma atom is duplicated 4 times, twice + # across M and twice across N + tC = cute.make_layout(self.atom_layout_mnk) + tiled_mma = cute.make_tiled_mma( + op, + tC, + permutation_mnk=permutation_mnk, + ) + + # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, l) + grid_dim = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + + # Add threadblock rasterization to improve re-use of data + raster_factor = 1 + grid_dim_n = cute.size(grid_dim[1]) + # Thresholds picked so that it doesn't cause too many no-op CTAs + if grid_dim_n > 5: + raster_factor = 8 + elif grid_dim_n > 2: + raster_factor = 4 + elif grid_dim_n > 1: + raster_factor = 2 + rasterization_remap_grid_dim = ( + cute.size(grid_dim[0]) * raster_factor, + (cute.size(grid_dim[1]) + raster_factor - 1) // raster_factor, + cute.size(grid_dim[2]), + ) + + self.kernel( + mA, + mB, + mC, + sA_layout, + sB_layout, + sC_layout, + tiled_copy_A, + tiled_copy_B, + tiled_copy_C, + tiled_mma, + raster_factor, + epilogue_op, + ).launch( + grid=rasterization_remap_grid_dim, + block=[self.num_threads, 1, 1], + ) + + @cute.kernel + def kernel( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + sA_layout: cute.ComposedLayout, + sB_layout: cute.ComposedLayout, + sC_layout: cute.ComposedLayout, + tiled_copy_A: cute.TiledCopy, + tiled_copy_B: cute.TiledCopy, + tiled_copy_C: cute.TiledCopy, + tiled_mma: cute.TiledMma, + rasterization_factor: cutlass.Int32, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # Thread index, block index + tidx, _, _ = cute.arch.thread_idx() + bidx, bidy, bidz = cute.arch.block_idx() + grid_dim = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + offset_tile_x, offset_tile_y = self.raster_tile( + bidx, bidy, rasterization_factor + ) + # Early exit if CTA is out of range + if grid_dim[0] <= offset_tile_x or grid_dim[1] <= offset_tile_y: + pass + else: + tiler_coord = (offset_tile_x, offset_tile_y, None) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. + # gA: (BLK_M, BLK_N, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N) + # /////////////////////////////////////////////////////////////////////////////// + gA = cute.local_tile( + mA[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, None, 1), + ) + gB = cute.local_tile( + mB[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(None, 1, 1), + ) + gC = cute.local_tile( + mC[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, 1, None), + ) + + # By default, if the tensor k mode does not divide into the tile k + # size, then last tiles in the k dimension are irregular. + # Instead, make the first tiles irregular when k is irregular. + # This allows us to handle the irregular tile first to avoid + # checking for this condition within the mainloop. + + # residual_k is a negative number indicating the amount needed to + # shift the pointer by in dimension k + residual_k = cute.size(mA, mode=[1]) - cutlass.Int32(self.bK) * cute.size( + gA, mode=[2] + ) + + # move the pointer of gA/gB in the `-k` direction + gA = cute.domain_offset((0, residual_k, 0), gA) + gB = cute.domain_offset((0, residual_k, 0), gB) + # input is 16B aligned + gA = cute.make_tensor(gA.iterator.align(16), gA.layout) + gB = cute.make_tensor(gB.iterator.align(16), gB.layout) + + # Construct identity layout for sA and sB (mirrors global tensors, + # used for predication only) + mcA = cute.make_identity_tensor(mA.layout.shape) + mcB = cute.make_identity_tensor(mB.layout.shape) + cA = cute.local_tile( + mcA[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, None, 1), + ) + cB = cute.local_tile( + mcB[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(None, 1, 1), + ) + + cA = cute.domain_offset((0, residual_k, 0), cA) + cB = cute.domain_offset((0, residual_k, 0), cB) + + # /////////////////////////////////////////////////////////////////////////////// + # Create shared memory buffers and get the appropriate fragments for this thread. + # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE) + # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k) + # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE) + # /////////////////////////////////////////////////////////////////////////////// + @cute.struct + class SharedStorageAB: + a: cute.struct.Align[ + cute.struct.MemRange[mA.element_type, cute.cosize(sA_layout)], + 16, + ] + b: cute.struct.Align[ + cute.struct.MemRange[mB.element_type, cute.cosize(sB_layout)], + 16, + ] + + @cute.struct + class SharedStorageC: + c: cute.struct.Align[ + cute.struct.MemRange[mC.element_type, cute.cosize(sC_layout)], + 16, + ] + + # Shared memory buffer + smem = cutlass.utils.SmemAllocator() + # Shared memory allocated for operations with A, B will be + # overwritten for operations on C. This is to improve performance + # by reducing the size of shared memory requested by each block + storage = smem.allocate( + max(SharedStorageAB.size_in_bytes(), SharedStorageC.size_in_bytes()), + byte_alignment=16, + ) + sA = SharedStorageAB(storage).a.get_tensor(sA_layout) + sB = SharedStorageAB(storage).b.get_tensor(sB_layout) + sC = SharedStorageC(storage).c.get_tensor(sC_layout) + + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + thr_copy_C = tiled_copy_C.get_slice(tidx) + tAgA = thr_copy_A.partition_S(gA) + tAsA = thr_copy_A.partition_D(sA) + tBgB = thr_copy_B.partition_S(gB) + tBsB = thr_copy_B.partition_D(sB) + tCsC_epilogue = thr_copy_C.partition_S(sC) + tCgC_epilogue = thr_copy_C.partition_D(gC) + + # Repeat the partitioning with identity layouts + tAcA = thr_copy_A.partition_S(cA) + tBcB = thr_copy_B.partition_S(cB) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when problem_shape isn't a multiple + # of tile_shape + # /////////////////////////////////////////////////////////////////////////////// + + # For predication over the tensors A (M/K), B (N/K), and (in the + # epilogue) C (M/N), we will compute it in a fashion similar to an + # outer product. The predication along one of the dimensions is + # evaluated and stored in a predication tensor. Then, the + # predication for the remaining dimension is handled later via an + # if/else branch at the copy. + # For A and B, predication booleans along M/N are stored in a + # predication tensor and along K is handled via a if/else branch. + + # Allocate predicate tensors for M and N. Predication is checked + # at the granularity of a copy atom, so the predicate tensor does not + # need separate booleans for individual elements within a copy + # atom (for example, the elements of tAgA.shape[0][0].) + tApA = cute.make_rmem_tensor( + cute.make_layout( + ( + tAgA.shape[0][1], + cute.size(tAgA, mode=[1]), + cute.size(tAgA, mode=[2]), + ), + stride=(cute.size(tAgA, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + tBpB = cute.make_rmem_tensor( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=(cute.size(tBsB, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + # Set predicates for M/N bounds + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cute.elem_less( + tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0] + ) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cute.elem_less( + tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0] + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Clear the smem tiles to account for predicated off loads + tAsA.fill(0) + tBsB.fill(0) + cute.arch.sync_threads() + # Start async loads for the first k-tile. Here we take care of the k residue + # via if/else check along the k dimension. Because we shifted the identity tensor + # by the residue_k and because the identity tensor is a coord tensor, the + # values of any identity tensor element that is poison is less than -1 + num_smem_stages = cute.size(tAsA, mode=[3]) + k_tile_count = cute.size(tAgA, mode=[3]) + k_tile_index = cutlass.Int32(0) + + for k in range(tApA.shape[2]): + if cute.elem_less(cutlass.Int32(-1), tAcA[0, 0, k, 0][1]): + cute.copy( + tiled_copy_A, + tAgA[None, None, k, k_tile_index], + tAsA[None, None, k, 0], + pred=tApA[None, None, k], + ) + for k in range(tBpB.shape[2]): + if cute.elem_less(cutlass.Int32(-1), tBcB[0, 0, k, 0][1]): + cute.copy( + tiled_copy_B, + tBgB[None, None, k, k_tile_index], + tBsB[None, None, k, 0], + pred=tBpB[None, None, k], + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + + # Start async loads for rest of the k-tiles + for k_tile in range(1, num_smem_stages - 1): + if k_tile == k_tile_count: + tApA.fill(0) + tBpB.fill(0) + cute.copy( + tiled_copy_A, + tAgA[None, None, None, k_tile_index], + tAsA[None, None, None, k_tile], + pred=tApA, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, k_tile_index], + tBsB[None, None, None, k_tile], + pred=tBpB, + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + + # /////////////////////////////////////////////////////////////////////////////// + # Tile MMA compute thread partitions and allocate accumulators + # /////////////////////////////////////////////////////////////////////////////// + thr_mma = tiled_mma.get_slice(tidx) + tCsA = thr_mma.partition_A(sA) + tCsB = thr_mma.partition_B(sB) + tCsC = thr_mma.partition_C(sC) + tCgC = thr_mma.partition_C(gC) + tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0]) + tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0]) + tCrC = tiled_mma.make_fragment_C(tCgC) + # Clear the accumulator + tCrC.fill(0.0) + + # /////////////////////////////////////////////////////////////////////////////// + # Copy Atom A/B retiling + # /////////////////////////////////////////////////////////////////////////////// + + # Create the copy atoms for the copy from shared memory to register + atom_copy_s2r_A = cute.make_copy_atom( + cute.nvgpu.warp.LdMatrix8x8x16bOp( + self.a_major_mode != utils.LayoutEnum.ROW_MAJOR, 4 + ), + mA.element_type, + ) + atom_copy_s2r_B = cute.make_copy_atom( + cute.nvgpu.warp.LdMatrix8x8x16bOp( + self.b_major_mode != utils.LayoutEnum.ROW_MAJOR, 4 + ), + mB.element_type, + ) + + # Creates the tiled copy so that it matches the thread-value layout + # expected by the tiled mma + tiled_copy_s2r_A = cute.make_tiled_copy_A(atom_copy_s2r_A, tiled_mma) + tiled_copy_s2r_B = cute.make_tiled_copy_B(atom_copy_s2r_B, tiled_mma) + + thr_copy_ldmatrix_A = tiled_copy_s2r_A.get_slice(tidx) + thr_copy_ldmatrix_B = tiled_copy_s2r_B.get_slice(tidx) + tCsA_copy_view = thr_copy_ldmatrix_A.partition_S(sA) + tCrA_copy_view = thr_copy_ldmatrix_A.retile(tCrA) + tCsB_copy_view = thr_copy_ldmatrix_B.partition_S(sB) + tCrB_copy_view = thr_copy_ldmatrix_B.retile(tCrB) + + # Current pipe index in smem to read from / write to + smem_pipe_read = 0 + smem_pipe_write = num_smem_stages - 1 + + tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read] + tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read] + + # /////////////////////////////////////////////////////////////////////////////// + # PREFETCH register pipeline + # /////////////////////////////////////////////////////////////////////////////// + num_k_block = cute.size(tCrA, mode=[2]) + if num_k_block > 1: + # Wait until our first prefetched tile is loaded in + cute.arch.cp_async_wait_group(num_smem_stages - 2) + cute.arch.sync_threads() + # Prefetch the first k-block rmem from the first k-tile + cute.copy( + tiled_copy_s2r_A, + tCsA_p[None, None, 0], + tCrA_copy_view[None, None, 0], + ) + cute.copy( + tiled_copy_s2r_B, + tCsB_p[None, None, 0], + tCrB_copy_view[None, None, 0], + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Mainloop + # 1. Shared memory pipeline (gmem -> smem): + # The default smem pipeline depth is 3, meaning that for shared + # memory buffers, we allocate three times the size described by the + # CTA tiler. We prefetch 2 of these buffers before entering the main + # loop. Considering only the transfer from global memory to shared + # memory, the general structure of the mainloop is: + # (1) copy k-tile from gmem to smem; + # (2) perform gemm computation on k-tile; + # (3) wait for the next copy to finish. + # The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command + # waits for the number of unfinished 'copy' to be <= 1. The advantage + # of this approach is that it allows for simultaneous production + # (i.e., step (1)) and consumption (i.e., step (2)) of smem. + # A common misconception is to prefetch N buffers and rewrite + # the pipeline logic to wait on N-1 pending copies. The disadvantage + # of this approach is that it requires fully consuming a buffer in + # order to open an empty buffer for the next copy. + # 2. Register pipeline (smem -> register): + # Similarly, the register pipeline produces i+1, consumes i, and + # produces i+2... Notably, i and i+1 do not use the same register, + # eliminating dependencies on the same register for better parallelism. + # 3. Combining the smem and register pipelines results in the mainloop. + # /////////////////////////////////////////////////////////////////////////////// + for k_tile in range(k_tile_count): + for k_block in cutlass.range(num_k_block, unroll_full=True): + if k_block == num_k_block - 1: + tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read] + tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read] + cute.arch.cp_async_wait_group(num_smem_stages - 2) + cute.arch.sync_threads() + + # Load A, B from shared memory to registers for k_block + 1 + k_block_next = (k_block + 1) % num_k_block # static + cute.copy( + tiled_copy_s2r_A, + tCsA_p[None, None, k_block_next], + tCrA_copy_view[None, None, k_block_next], + ) + cute.copy( + tiled_copy_s2r_B, + tCsB_p[None, None, k_block_next], + tCrB_copy_view[None, None, k_block_next], + ) + + # Fetch next A: To better interleave global memory access and compute + # instructions, we intentionally use the sequence: copy A, perform GEMM, + # then copy B. + if k_block == 0: + if k_tile + num_smem_stages - 1 < k_tile_count: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, k_tile_index], + tAsA[None, None, None, smem_pipe_write], + pred=tApA, + ) + + # Thread-level register gemm for k_block + cute.gemm( + tiled_mma, + tCrC, + tCrA[None, None, k_block], + tCrB[None, None, k_block], + tCrC, + ) + + # Fetch next B and update smem pipeline read/write + if k_block == 0: + if k_tile + num_smem_stages - 1 < k_tile_count: + cute.copy( + tiled_copy_B, + tBgB[None, None, None, k_tile_index], + tBsB[None, None, None, smem_pipe_write], + pred=tBpB, + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + smem_pipe_write = smem_pipe_read + smem_pipe_read = smem_pipe_read + 1 + if smem_pipe_read == num_smem_stages: + smem_pipe_read = 0 + + # Sync before epilogue + cute.arch.cp_async_wait_group(0) + cute.arch.sync_threads() + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue with fusion + # /////////////////////////////////////////////////////////////////////////////// + tCrD = cute.make_fragment_like(tCrC, self.c_dtype) + tCrD[None] = epilogue_op(tCrC.load()).to(self.c_dtype) + + # Copy results of D back to shared memory + cute.autovec_copy(tCrD, tCsC) + + # Create coord tensor for C + ceilM, ceilN, _ = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + mcC = cute.make_identity_tensor( + ( + cute.size(ceilM) * self.cta_tiler[0], + cute.size(ceilN) * self.cta_tiler[1], + 1, + ) + ) + cC = cute.local_tile( + mcC[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, 1, None), + ) + tCcC = thr_copy_C.partition_S(cC) + + tCrC_epilogue = cute.make_fragment_like(tCsC_epilogue) + # Wait for all writes to shared memory to finish before starting copies + # using the new layouts + cute.arch.sync_threads() + cute.autovec_copy(tCsC_epilogue, tCrC_epilogue) + + # Create predication tensor for m + tCpC = cute.make_rmem_tensor( + cute.make_layout( + ( + tCgC_epilogue.shape[0][1], + cute.size(tCgC_epilogue, mode=[1]), + cute.size(tCgC_epilogue, mode=[2]), + ), + stride=(cute.size(tCgC_epilogue, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + for rest_v in range(tCpC.shape[0]): + for m in range(tCpC.shape[1]): + tCpC[rest_v, m, 0] = cute.elem_less( + tCcC[(0, rest_v), m, 0][0], mC.shape[0] + ) + + # Copy to global memory using better vectorization + for rest_v in range(tCpC.shape[0]): + for n in range(tCpC.shape[2]): + if cute.elem_less(tCcC[(0, rest_v), 0, n][1], mC.shape[1]): + cute.copy( + tiled_copy_C, + tCrC_epilogue[None, None, n], + tCgC_epilogue[None, None, n], + pred=tCpC[None, None, n], + ) + return + + def _make_smem_layout_AB(self, dtype, major_mode, copy_bits, smem_tiler): + major_mode_size = ( + smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0] + ) + major_mode_size = 64 if major_mode_size >= 64 else major_mode_size + + swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits)) + swizzle_bits = min(swizzle_bits, 3) + + layout_atom_outer = ( + cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size)) + ) + layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 3), + 0, + layout_atom_outer, + ) + layout = cute.tile_to_shape(layout_atom, smem_tiler, (0, 1, 2)) + return layout + + def _make_smem_layout_C(self, dtype, major_mode, copy_bits, smem_tiler): + major_mode_size = ( + smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0] + ) + + swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits)) + swizzle_bits = min(swizzle_bits, 3) + + layout_atom_outer = ( + cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size)) + ) + layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 4), + 0, + layout_atom_outer, + ) + + # Due to the thread layout of the mma, remove swizzle in C to + # prevent shared memory fragments owned by an single thread from + # holding swizzles + if major_mode == utils.LayoutEnum.COL_MAJOR: + layout_atom = cute.make_composed_layout( + cute.make_swizzle(0, 3, 4), 0, layout_atom_outer + ) + layout = cute.tile_to_shape( + layout_atom, + smem_tiler, + (0, 1), + ) + return layout + + def _make_gmem_tiled_copy_AB(self, atom_copy, dtype, major_mode, copy_bits): + copy_elems = copy_bits // dtype.width + shape_dim_1 = cute.size(self.bK) // copy_elems + # thread layout for copy + thread_layout = cute.make_layout( + (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1) + ) + if major_mode != utils.LayoutEnum.ROW_MAJOR: + shape_dim_0 = cute.size(self.bM) // copy_elems + thread_layout = cute.make_layout( + (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0) + ) + # Value layout for copy + value_layout = ( + cute.make_layout((1, copy_elems)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((copy_elems, 1)) + ) + return cute.make_tiled_copy_tv(atom_copy, thread_layout, value_layout) + + def _make_gmem_tiled_copy_C(self, atom_copy, dtype, major_mode, copy_bits): + copy_elems = copy_bits // dtype.width + shape_dim_1 = cute.size(self.bN) // copy_elems + # thread layout for copy + thread_layout = cute.make_layout( + (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1) + ) + if major_mode != utils.LayoutEnum.ROW_MAJOR: + shape_dim_0 = cute.size(self.bM) // copy_elems + thread_layout = cute.make_layout( + (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0) + ) + value_layout = ( + cute.make_layout((1, copy_elems)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((copy_elems, 1)) + ) + return cute.make_tiled_copy_tv(atom_copy, thread_layout, value_layout) + + def raster_tile(self, i, j, f): + new_i = i // f + new_j = (i % f) + (j * f) + return (new_i, new_j) + + +# Need a custum call function for the optimized kernel because of the tensor layout +def call_cute_custom(kernel_function, args, kwargs, grid, threads, params): + # Initialize cache if it does not exist + if not hasattr(call_cute_custom, "custom_cache"): + call_cute_custom.custom_cache = {} + + # Convert Torch tensors to CuTe tensors with correct layout + cute_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + cute_tensor = ( + from_dlpack(arg, assumed_align=16) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic( + mode=1, + stride_order=(2, 0, 1), + divisibility=8, + ) + ) + cute_args.append(cute_tensor) + else: + cute_args.append(arg) + + # Form cache key from tuning parameters + param_keys = sorted(params.keys()) + cache_str = type(kernel_function).__name__ + for k in param_keys: + cache_str += "_" + str(params[k]) + + # Check if kernel exists in cache. Otherwise, compile and save + if cache_str in call_cute_custom.custom_cache: + compiled_kernel = call_cute_custom.custom_cache[cache_str] + else: + # Wrap in try-except because CuTe gives TypeError for certain block sizes. This + # stops the tuning process and we do not want this + try: + compiled_kernel = cute.compile(kernel_function, *cute_args) + call_cute_custom.custom_cache[cache_str] = compiled_kernel + except: + raise RuntimeError("Invalid configuration") + + + + compiled_kernel(*cute_args) + + + +def run_optimized(M, N, K, L=1): + a = torch.randn((L, M, K), device="cuda", dtype=torch.float16).permute(1, 2, 0) + b = torch.randn((L, N, K), device="cuda", dtype=torch.float16).permute(1, 2, 0) # b is transposed + c = torch.empty((L, M, N), device="cuda", dtype=torch.float16).permute(1, 2, 0) + + c_ref = torch.matmul( + a.permute(2, 0, 1), + b.permute(2, 1, 0) + ).permute(1, 2, 0) + + def create_cute_tensor(torch_tensor): + cute_tensor = ( + from_dlpack(torch_tensor, assumed_align=16) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic( + mode=1, + stride_order=(2, 0, 1), + divisibility=8, + ) + ) + return cute_tensor + + a_, b_, c_ = create_cute_tensor(a), create_cute_tensor(b), create_cute_tensor(c) + + gemm = TensorOpGemm() + compiled_kernel = cute.compile(gemm, a_, b_, c_) + compiled_kernel(a_, b_, c_) + + assert torch.allclose(c_ref, c, atol= M * 2**(-11), rtol=1e-2) + print("Sucess") + + +def tune_optimized(M, N, K, L=1): + a = torch.randn((L, M, K), device="cuda", dtype=torch.float16).permute(1, 2, 0) + b = torch.randn((L, N, K), device="cuda", dtype=torch.float16).permute(1, 2, 0) # b is transposed + c = torch.empty((L, M, N), device="cuda", dtype=torch.float16).permute(1, 2, 0) + + c_ref = torch.matmul( + a.permute(2, 0, 1), + b.permute(2, 1, 0) + ).permute(1, 2, 0) + + + args = [a, b, c] + tune_params = { + "bM":[16, 32, 64, 128, 256], + "bN":[32, 64, 128, 256], + "bK": [16, 32, 64, 128], # must be divisable by 16 + "num_stages": [3, 4, 5], # restricted to >= 3 by the kernel + "atom_lay_M": [1, 2, 4], + "atom_lay_N": [1, 2], + } + + restrictions = [ + "bM % (atom_lay_M * 16) == 0", "bN % (atom_lay_N * 8) == 0", # layout + "atom_lay_M * atom_lay_N * 32 <= 1024", # number of threads + "num_stages * (bK * (bM + bN)) * 2 <= 49152", # SMEM + "bM >= atom_lay_M * 32", # ensure each atom sees at least 2 MMA tiles per dimension + "bN >= atom_lay_N * 16", # ensure each atom sees at least 2 MMA tiles per dimension + ] + + + + results, env = tune_kernel("TensorOpGemm", __file__, M * N, args, tune_params, verbose=True, restrictions=restrictions, #strategy="bayes_opt", + lang="generic_python", call_function=call_cute_custom, answer=[None, None, c_ref.cpu()], atol=M * 2**(-10)) + + + +if __name__ == "__main__": + m, n, k = 4096, 4096, 4096 + run_naive_matmul(m, n, k) + tune_naive_matmul(m, n, k) + + mnkl = (4096, 4096, 4096, 1) + tune_optimized(m, n, k) + run_optimized(m, n, k) diff --git a/examples/generic_python/matmul/numba_matmul.py b/examples/generic_python/matmul/numba_matmul.py new file mode 100644 index 000000000..71a6b1b3a --- /dev/null +++ b/examples/generic_python/matmul/numba_matmul.py @@ -0,0 +1,260 @@ +import numpy as np +from numba import cuda, float32 + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_numba + + +# Source: https://nvidia.github.io/numba-cuda/user/examples.html#matrix-multiplication +@cuda.jit(cache=True) +def matmul(A, B, C): + i, j = cuda.grid(2) + if i < C.shape[0] and j < C.shape[1]: + tmp = 0. + for k in range(A.shape[1]): + tmp += A[i, k] * B[k, j] + C[i, j] = tmp + + +# Translated to Numba-CUDA from https://github.com/cupy/cupy/blob/main/examples/gemm/sgemm.cu +@cuda.jit(cache=True) +def optimized_matmul(M, N, K, A, B, C): + DIM_X = 16 + DIM_Y = 16 + BLK_M = 64 + BLK_N = 64 + BLK_K = 16 + THR_M = 4 # Should be equal to BLK_M / DIM_X + THR_N = 4 # Should be equal to BLK_N / DIM_Y + + # thread indices + idx = cuda.threadIdx.x + idy = cuda.threadIdx.y + + blx = cuda.blockIdx.x + bly = cuda.blockIdx.y + + # shared memory + sA = cuda.shared.array((BLK_K, BLK_M), np.float16) + sB = cuda.shared.array((BLK_N, BLK_K), np.float16) + + # registers + rC = cuda.local.array((THR_N, THR_M), float32) + rA = cuda.local.array(THR_M, float32) + rB = cuda.local.array(THR_N, float32) + + # init accumulator + for n in range(THR_N): + for m in range(THR_M): + rC[n][m] = 0.0 + + # global indices + base_row = blx * BLK_M + base_col = bly * BLK_N + + # loop over K tiles + for kk in range(0, K, BLK_K): + + # load A tile into shared memory + for i in range(idy, BLK_K, DIM_Y): + for j in range(idx, BLK_M, DIM_X): + row = base_row + j + col = kk + i + if row < M and col < K: + sA[i, j] = A[row, col] + else: + sA[i, j] = 0.0 + + # load B tile + for i in range(idy, BLK_N, DIM_Y): + for j in range(idx, BLK_K, DIM_X): + row = kk + j + col = base_col + i + if row < K and col < N: + sB[i, j] = B[row, col] + else: + sB[i, j] = 0.0 + + cuda.syncthreads() + + # compute + for k in range(BLK_K): + for m in range(THR_M): + rA[m] = float32(sA[k, m * DIM_X + idx]) + + for n in range(THR_N): + rB[n] = float32(sB[n * DIM_Y + idy, k]) + + for n in range(THR_N): + for m in range(THR_M): + rC[n][m] += rA[m] * rB[n] + + cuda.syncthreads() + + # write back + for n in range(THR_N): + col = base_col + n * DIM_Y + idy + for m in range(THR_M): + row = base_row + m * DIM_X + idx + if row < M and col < N: + C[row, col] = np.float16(rC[n][m]) + + +def run_basic(M, N, K): + # create numpy arrays + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + C_ref = A @ B + + # copy to GPU + A_d = cuda.to_device(A) + B_d = cuda.to_device(B) + C_d = cuda.to_device(C) + + # threads per block + threads = (16, 16) + + # compute grid size (ceil division) + blocks = ( + (M + threads[0] - 1) // threads[0], + (N + threads[1] - 1) // threads[1], + ) + + # launch kernel + matmul[blocks, threads](A_d, B_d, C_d) + cuda.synchronize() + + # copy result back + C_result = C_d.copy_to_host() + + # check + np.testing.assert_allclose(C_result, C_ref, rtol=1e-2, atol=M * 2**(-11)) + print("Succes") + + +def run_optimized(M, N, K): + # inputs + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + C_ref = A @ B + + # move to GPU + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + threads_per_block = (16, 16) + blocks_per_grid = ( + (M + 63) // 64, # BLK_M = 64 + (N + 63) // 64, # BLK_N = 64 + ) + + # launch + optimized_matmul[blocks_per_grid, threads_per_block](M, N, K, dA, dB, dC) + cuda.synchronize() + + # copy result back + C_result = dC.copy_to_host() + + # check + np.testing.assert_allclose(C_result, C_ref, rtol=1e-2, atol=M * 2**(-11)) + print("Succes") + + + + +def tune_basic(M, N, K): + # create inputs as normal, but do not copy to device + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + + size = (M, N) + args = [A, B, C] + tune_params = dict() + tune_params["block_size_x"] = [2**i for i in range(1, 10)] + tune_params["block_size_y"] = [2**i for i in range(1, 10)] + + restrictions = ["block_size_x * block_size_y <= 1024"] + + answer = [None, None, A @ B] + atol = M * 2**(-11) + + results, env = tune_kernel( + kernel_name="matmul", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=answer, + atol=atol, + restrictions=restrictions, + call_function=call_numba, + ) + + +def tune_optimized(M, N, K): + # create inputs as normal, but do not copy to device + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + + size = (M, N) + args = [M, N, K, A, B, C] + tune_params = { + "DIM_X": [2**i for i in range(1, 10)], + "DIM_Y": [2**i for i in range(1, 10)], + "BLK_M": [2**i for i in range(1, 10)], + "BLK_N": [2**i for i in range(1, 10)], + "BLK_K": [2**i for i in range(1, 10)], + "THR_M": [2**i for i in range(1, 9)], # Restricted to BLK_M / DIM_X + "THR_N": [2**i for i in range(1, 9)], # Restricted to BLK_N / DIM_Y + } + + answer = [None, None, None, None, None, A @ B] + atol = M * 2**(-11) + + restrictions = [ + "BLK_M % DIM_X == 0", + "BLK_N % DIM_Y == 0", + "THR_M == BLK_M / DIM_X", + "THR_N == BLK_N / DIM_Y", + "DIM_X * DIM_Y <= 1024", + "DIM_x * DIM_Y >= 32", + ] + + results, env = tune_kernel( + kernel_name="optimized_matmul", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=answer, + atol=atol, + restrictions = restrictions, + block_size_names = ["DIM_X", "DIM_Y"], + grid_div_x = ["BLK_M"], + grid_div_y = ["BLK_N"], + strategy = "bayes_opt", + strategy_options = {"max_fevals": 100}, + call_function=call_numba, + ) + + + +if __name__ == "__main__": + M, N, K = 1024, 1024, 1024 + + run_basic(M, N, K) + tune_basic(M, N, K) + + run_optimized(M, N, K) + tune_optimized(M, N, K) + + + + \ No newline at end of file diff --git a/examples/generic_python/matmul/taichi_matmul.py b/examples/generic_python/matmul/taichi_matmul.py new file mode 100644 index 000000000..bb3c53a81 --- /dev/null +++ b/examples/generic_python/matmul/taichi_matmul.py @@ -0,0 +1,63 @@ +import numpy as np +import taichi as ti + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_taichi + +ti.init(arch=ti.gpu) + +BLOCK_DIM = 512 +@ti.kernel +def matmul(A: ti.types.ndarray(dtype=ti.f16, ndim=2), B: ti.types.ndarray(dtype=ti.f16, ndim=2), C: ti.types.ndarray(dtype=ti.f16, ndim=2)): + K_dim = A.shape[1] + + ti.loop_config(block_dim=BLOCK_DIM) + for i, j in C: + sum = 0.0 + for k in range(K_dim): + sum += A[i, k] * B[k, j] + C[i, j] = ti.cast(sum, ti.f16) + + +def run(M, N, K): + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + C_ref = A @ B + + matmul(A, B, C) + + np.testing.assert_allclose(C, C_ref, rtol=1e-2, atol=M * 2**(-11)) + print("Succes") + + +def tune(M, N, K): + A = np.random.rand(M, K).astype(np.float16) + B = np.random.rand(K, N).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + + size = M * N + args = [A, B, C] + tune_params = {"BLOCK_DIM": [2**i for i in range(5, 11)]} + + answer = [None, None, A @ B] + + results, env = tune_kernel( + kernel_name="matmul", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + answer=answer, + lang="generic_python", + call_function=call_taichi, + atol=M * 2**(-11), + block_size_names = ["BLOCK_DIM"], + ) + +if __name__ == "__main__": + run(1024, 1024, 1024) + tune(1024, 1024, 1024) + + + diff --git a/examples/generic_python/matmul/tilelang_matmul.py b/examples/generic_python/matmul/tilelang_matmul.py new file mode 100644 index 000000000..751c0add4 --- /dev/null +++ b/examples/generic_python/matmul/tilelang_matmul.py @@ -0,0 +1,324 @@ +import torch +import tilelang +import tilelang.language as T + +import itertools + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_tilelang + +# https://github.com/tile-ai/tilelang/tree/main/examples/gemm +# num_threads and num_stages added as variables to enable tuning. +@tilelang.jit +def matmul_basic(M:int, N:int, K:int, block_M:int, block_N:int, block_K:int, dtype:str="float16", accum_dtype:str="float32"): + @T.prim_func + def gemm( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + num_threads = 128 + num_stages = 3 + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + # We do use shared memory, even though this is a basic kernel. However, you don't + # really get around this because T.gemm can not handle global memory directly. + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + + # We do use a pipelining optimization here, because this is 'the basic way' + # of writing for loops in TileLang. + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + + T.copy(C_local, C[by * block_M, bx * block_N]) + + return gemm + + + +# This kernel is copied from the TileLang project: +# https://github.com/tile-ai/tilelang/tree/main/examples/gemm/example_gemm.py +# +# Original example: example_gemm.py +# Copyright (c) the TileLang authors +# +# Modifications in file: +# - num_threads and num_stages added as metaparameters +# - Removed annotated memory layout for tiles A and B +# - dummy parameter to trigger fresh compilation when timing repeated tuning +@tilelang.jit +def matmul_opt(M, N, K, block_M, block_N, block_K, num_threads=128, num_stages=3, dummy=0, dtype=T.float16, accum_dtype=T.float): + @T.prim_func + def main( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + # Allocate shared and local fragments + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Enable swizzle-based rasterization for better L2 locality + panel_size = 10 + T.use_swizzle(panel_size=panel_size, enable=True) + + # Clear the local accumulation buffer + T.clear(C_local) + + # Pipelined iteration over K dimension + for idx in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + # Copy tile of A + T.copy(A[by * block_M, idx * block_K], A_shared) + + # Parallel copy tile of B + for ko, j in T.Parallel(block_K, block_N): + B_shared[ko, j] = B[idx * block_K + ko, bx * block_N + j] + + # Perform local GEMM on the shared-memory tiles + T.gemm(A_shared, B_shared, C_local) + + # Copy the result tile back + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + + + +def get_configs(kt=False): + """ + Generate a list of kernel tuning configuration dictionaries for a tiled matrix-multiply. + This function is used for tuning experiments with the built-in tuner + + Returns: + List[dict]: A list of configuration dictionaries + """ + + block_M = [32, 64, 128, 256] + block_N = [32, 64, 128, 256] + block_K = [16, 32, 64, 128] + num_threads = [256]#[64, 128, 256, 512] + num_stages = [3]#[0, 1, 2, 3, 4] + panel_size = [10]#[4, 6, 8, 10] + + + _configs = list( + itertools.product( + block_M, + block_N, + block_K, + num_threads, + num_stages, + panel_size, + ) + ) + + if kt: + configs = { + "block_M": block_M, + "block_N": block_N, + "block_K": block_K, + "num_stages": num_stages, + "num_threads": num_threads, + "panel_size": panel_size, + } + else: + configs = [ + { + "block_M": c[0], + "block_N": c[1], + "block_K": c[2], + "num_threads": c[3], + "num_stages": c[4], + "panel_size": c[5], + } + for c in _configs + ] + return configs + + + +# For autotuning experiment +# https://github.com/tile-ai/tilelang/blob/main/examples/gemm/example_gemm_autotune.py +@tilelang.autotune(configs=get_configs(), warmup=1, rep=32) +@tilelang.jit +def matmul_opt_autotune(M, N, K, block_M, block_N, block_K, num_threads=128, num_stages=3, panel_size=10, dummy=0, dtype=T.float16, accum_dtype=T.float): + @T.prim_func + def main( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + # Allocate shared and local fragments + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Enable swizzle-based rasterization for better L2 locality + T.use_swizzle(panel_size=panel_size, enable=True) + + # Clear the local accumulation buffer + T.clear(C_local) + + # Pipelined iteration over K dimension + for idx in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + # Copy tile of A + T.copy(A[by * block_M, idx * block_K], A_shared) + + # Parallel copy tile of B + for ko, j in T.Parallel(block_K, block_N): + B_shared[ko, j] = B[idx * block_K + ko, bx * block_N + j] + + # Perform local GEMM on the shared-memory tiles + T.gemm(A_shared, B_shared, C_local) + + # Copy the result tile back + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + + +def run_basic(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.zeros(M, N, device="cuda", dtype=torch.float16) + C_ref = A @ B + + kernel = matmul_basic( + M, N, K, + block_M=64, + block_N=64, + block_K=32, + ) + + kernel(A, B, C) + + assert torch.allclose(C, C_ref, atol=M * 2 **(-11), rtol=1e-2) + print("Succes") + + +def run_opt(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.zeros(M, N, device="cuda", dtype=torch.float16) + C_ref = A @ B + + kernel = matmul_opt( + M, N, K, + dummy=0, + block_M=64, + block_N=64, + block_K=32, + num_stages=3, + num_threads=128, + ) + + kernel(A, B, C) + + assert torch.allclose(C, C_ref, atol=M * 2 **(-11), rtol=1e-2) + print("Succes") + + +def tune_basic(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + size = (M, N) + + + args = [A, B, C] + + tune_params = dict() + tune_params["block_M"] = [2**i for i in range(5, 9)] + tune_params["block_N"] = [2**i for i in range(5, 9)] + tune_params["block_K"] = [2**i for i in range(4, 8)] + tune_params["num_threads"] = [64, 128, 256, 512] + tune_params["num_stages"] = [0, 1, 2, 3, 4] + tune_params["M"] = [M] + tune_params["N"] = [N] + tune_params["K"] = [K] + + restrictions = [ + "2 * num_stages * block_K * (block_M + block_N) <= 49152" + ] + + results, env = tune_kernel( + kernel_name="matmul_basic", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, C_ref.cpu()], + atol=M * 2**(-11), + strategy = "bayes_opt", + strategy_options = {"max_fevals": 100}, + call_function=call_tilelang, + verbose=True, + restrictions=restrictions, + ) + + +def tune_opt(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + size = (M, N) + + args = [A, B, C] + + tune_params = dict() + tune_params["block_M"] = [2**i for i in range(5, 9)] + tune_params["block_N"] = [2**i for i in range(5, 9)] + tune_params["block_K"] = [2**i for i in range(4, 8)] + tune_params["num_threads"] = [64, 128, 256, 512] + tune_params["num_stages"] = [0, 1, 2, 3, 4] + tune_params["panel_size"] = [4, 6, 8, 10] + tune_params["M"] = [M] + tune_params["N"] = [N] + tune_params["K"] = [K] + + restrictions = [ + "2 * num_stages * block_K * (block_M + block_N) <= 49152" + ] + + results, env = tune_kernel( + kernel_name="matmul_opt", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, C_ref.cpu()], + atol=M * 2**(-11), + strategy = "bayes_opt", + strategy_options = {"max_fevals": 100}, + call_function=call_tilelang, + verbose=True, + restrictions=restrictions, + ) + + + + +if __name__ == "__main__": + M, N, K = 4096, 4096, 4096 + run_basic(M, N, K) + run_opt(M, N, K) + + tune_basic(M, N, K) + tune_opt(M, N, K) + + diff --git a/examples/generic_python/matmul/tilus_matmul.py b/examples/generic_python/matmul/tilus_matmul.py new file mode 100644 index 000000000..c65cdeaa7 --- /dev/null +++ b/examples/generic_python/matmul/tilus_matmul.py @@ -0,0 +1,263 @@ +import torch + +import tilus +from tilus import float16, float32, int32 +from tilus.utils import cdiv + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_tilus + + +# This kernel is copied from the Tilus project: +# https://github.com/NVIDIA/tilus/blob/main/examples/matmul/matmul_v0.py +# +# Original example: matmul_v0.py +# Copyright (c) the Tilus authors +class MatmulBasic(tilus.Script): + def __init__(self): + super().__init__() + # we define three hyperparameters: ``block_m``, ``block_n``, and ``block_k`` to determine the tile size on + # m, n, and k dimensions for each `thread block` of the kernel. + self.block_m = 64 + self.block_n = 64 + self.block_k = 16 + + def __call__( + self, + m_size: int32, n_size: int, k_size: int, # Matrix dimensions + a_ptr: ~float16, b_ptr: ~float16, c_ptr: ~float16, # Matrix pointers + ): + self.attrs.blocks = [ + cdiv(m_size, self.block_m), # the x dimension size of the grid + cdiv(n_size, self.block_n), # the y dimension size of the grid + ] + num_warps = 1 # added for tuning + self.attrs.warps = num_warps # the number of warps per thread block, must be a compile-time known integer + + # define two int32 variables to store the offsets of the m and n dimensions for the current thread block. + offset_m: int32 = self.block_m * self.blockIdx.x + offset_n: int32 = self.block_n * self.blockIdx.y + + # create two global tensors `ga` and `gb` to represent the input matrices A and B, respectively. + ga = self.global_view(a_ptr, dtype=float16, shape=[m_size, k_size]) + gb = self.global_view(b_ptr, dtype=float16, shape=[k_size, n_size]) + + # create a register tensor `acc` to accumulate the results of the matrix multiplication. + acc = self.register_tensor( + dtype=float32, shape=[self.block_m, self.block_n], init=0.0 + ) + + # iterate over the k dimension in blocks of size `block_k`. + for k in range(cdiv(k_size, self.block_k)): + # calculate the offset for the current block in the k dimension + offset_k = k * self.block_k + + # load a block of matrix A and B into register tensors `a` and `b`. + a = self.load_global( + ga, offsets=[offset_m, offset_k], shape=[self.block_m, self.block_k] + ) + b = self.load_global( + gb, offsets=[offset_k, offset_n], shape=[self.block_k, self.block_n] + ) + + # perform the dot product: acc = a @ b + acc + self.dot(a, b, acc, out=acc) + + # after the loop, we cast the accumulated result `acc` to float16 type and store it back to the output matrix C. + acc_f16 = self.cast(acc, dtype=float16) + gc = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size]) + self.store_global(gc, acc_f16, offsets=[offset_m, offset_n]) + + + + +# This kernel is copied from the Tilus project: +# https://nvidia.github.io/tilus/stable/tutorials/matmul-ampere/matmul/matmul_v4.html +# +# Original example: matmul_v4.py +# Copyright (c) the Tilus authors +# +# Modifications in file: +# - Removed auto-tuning decorators +# - Added default values (None) for the __init__ parameters +class MatmulOpt(tilus.Script): + def __init__(self, num_warps=None, block_m=None, block_n=None, block_k=None, num_stages=None): + super().__init__() + self.block_m = block_m + self.block_n = block_n + self.block_k = block_k + self.num_warps = num_warps + self.num_stages = num_stages + + def __call__( + self, + m_size: int32, n_size: int, k_size: int, + a_ptr: ~float16, b_ptr: ~float16, c_ptr: ~float16, + ): + self.attrs.blocks = [cdiv(m_size, self.block_m), cdiv(n_size, self.block_n)] + self.attrs.warps = self.num_warps + + block_m, block_n, block_k = self.block_m, self.block_n, self.block_k + offset_m: int32 = block_m * self.blockIdx.x + offset_n: int32 = block_n * self.blockIdx.y + + ga = self.global_view(a_ptr, dtype=float16, shape=[m_size, k_size]) + gb = self.global_view(b_ptr, dtype=float16, shape=[k_size, n_size]) + sa = self.shared_tensor(dtype=float16, shape=[self.num_stages, block_m, block_k]) + sb = self.shared_tensor(dtype=float16, shape=[self.num_stages, block_k, block_n]) + acc = self.register_tensor(dtype=float32, shape=[block_m, block_n], init=0.0) + + for stage in range(self.num_stages - 1): + offset_k = stage * self.block_k + self.copy_async(src=ga, dst=sa[stage], offsets=[offset_m, offset_k]) + self.copy_async(src=gb, dst=sb[stage], offsets=[offset_k, offset_n]) + self.copy_async_commit_group() + + self.copy_async_wait_group(n=self.num_stages - 2) + self.sync() + + current_stage: int32 = 0 + preload_stage: int32 = self.num_stages - 1 + for offset_k in self.range(0, k_size, block_k, unroll=self.num_stages): + # computation for current tile + a = self.load_shared(sa[current_stage]) + b = self.load_shared(sb[current_stage]) + self.dot(a, b, acc, out=acc) + + # preload the next tile of A and B into shared memory + preload_offset_k = offset_k + (self.num_stages - 1) * block_k + self.copy_async( + src=ga, + dst=sa[preload_stage], + offsets=[offset_m, preload_offset_k], + ) + self.copy_async( + src=gb, + dst=sb[preload_stage], + offsets=[preload_offset_k, offset_n], + ) + self.copy_async_commit_group() + + # update the stage + current_stage = (current_stage + 1) % self.num_stages + preload_stage = (preload_stage + 1) % self.num_stages + self.copy_async_wait_group(n=self.num_stages - 2) + self.sync() + + self.free_shared(sa) + self.free_shared(sb) + + casted_acc = self.cast(acc, dtype=float16) + gc = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size]) + self.store_global(gc, casted_acc, offsets=[offset_m, offset_n]) + + + +def run_basic(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + matmul = MatmulBasic() + torch.cuda.synchronize() + matmul(M, N, K, A, B, C) + torch.cuda.synchronize() + + torch.testing.assert_close(C_ref, C, atol=M * 2**(-11), rtol=1e-2) + print("Succes") + + +def run_optimized(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + matmul = MatmulOpt(num_warps=4, block_m=64, block_n=64, block_k=16, num_stages=3) + torch.cuda.synchronize() + + matmul(M, N, K, A, B, C) + torch.cuda.synchronize() + + torch.testing.assert_close(C_ref, C, atol=M * 2**(-11), rtol=1e-2) + print("Succes") + + +def tune_basic(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + size = (M, N) + + args = [M, N, K, A, B, C] + + tune_params = dict() + tune_params["block_m"] = [2**i for i in range(5, 9)] + tune_params["block_n"] = [2**i for i in range(5, 9)] + tune_params["block_k"] = [2**i for i in range(4, 8)] + tune_params["num_warps"] = [2, 4, 8, 16] + + + results, env = tune_kernel( + kernel_name="MatmulBasic", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, None, None, None, C_ref.cpu()], + atol=M * 2**(-11), + strategy = "bayes_opt", + strategy_options = {"max_fevals": 200}, + call_function=call_tilus, + ) + + +def tune_opt(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + size = (M, N) + + args = [M, N, K, A, B, C] + + tune_params = dict() + tune_params["block_m"] = [2**i for i in range(5, 9)] + tune_params["block_n"] = [2**i for i in range(5, 9)] + tune_params["block_k"] = [2**i for i in range(4, 8)] + tune_params["num_warps"] = [2, 4, 8, 16] + tune_params["num_stages"] = [2, 3, 4, 5] + + # Shared memory restriction + restrictions = ["2 * num_stages * block_k * (block_m + block_n) <= 49152"] + + results, env = tune_kernel( + kernel_name="MatmulOpt", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, None, None, None, C_ref.cpu()], + atol=M * 2**(-11), + restrictions=restrictions, + strategy = "bayes_opt", + strategy_options = {"max_fevals": 200}, + call_function=call_tilus, + ) + + + + + +if __name__ == "__main__": + M, N, K = 4096, 4096, 4096 + run_basic(M, N, K) + run_optimized(M, N, K) + tune_basic(M, N, K) + tune_opt(M, N, K) \ No newline at end of file diff --git a/examples/generic_python/matmul/triton_matmul.py b/examples/generic_python/matmul/triton_matmul.py new file mode 100644 index 000000000..d5522c627 --- /dev/null +++ b/examples/generic_python/matmul/triton_matmul.py @@ -0,0 +1,312 @@ +import torch +import triton +import triton.language as tl + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_triton + +@triton.jit +def matmul_basic( + a_ptr, b_ptr, c_ptr, # Pointers + M, N, K, # Matrix sizes + stride_am, stride_ak, # Strides + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_SIZE_M: tl.constexpr, # Tile sizes + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + # Each program computes one BLOCK_SIZE_M x BLOCK_SIZE_N tile of C + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + + # Compute row/column indices for the tile + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + + # Create pointers to A and B tiles + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + # Accumulator + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Loop over K dimension + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load( + a_ptrs, + mask=(offs_am[:, None] < M) & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0, + ) + + b = tl.load( + b_ptrs, + mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K) & (offs_bn[None, :] < N), + other=0.0, + ) + + accumulator = tl.dot(a, b, accumulator) + + # advance K + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + # Store result + c = accumulator.to(tl.float16) + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=mask) + + + + +# This kernel is copied from the Triton project: +# https://github.com/triton-lang/triton/blob/main/python/tutorials/03-matrix-multiplication.py +# +# Original example: 03-matrix-multiplication.py +# Copyright (c) the Triton authors +# +# Modifications in file: +# - Removed auto-tuning decorators +# - Removed activation function +@triton.jit +def matmul_opt( + # Pointers to matrices + a_ptr, b_ptr, c_ptr, + # Matrix dimensions + M, N, K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` + # by to get the element one row down (A has M rows). + stride_am, stride_ak, # + stride_bk, stride_bn, # + stride_cm, stride_cn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, # + GROUP_SIZE_M: tl.constexpr, # +): + """Kernel for computing the matmul C = A x B. + A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + # See above `L2 Cache Optimizations` section for details. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ----------------------------------------------------------- + # Add some integer bound assumptions. + # This helps to guide integer analysis in the backend to optimize + # load/store offset address calculation + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + # See above `Pointer Arithmetic` section for details + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + # We accumulate along the K dimension. + accumulator = tl.dot(a, b, accumulator) + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + c = accumulator.to(tl.float16) + + # ----------------------------------------------------------- + # Write back the block of the output matrix C with masks. + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + + +def run_basic(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + BLOCK_SIZE_M = 64 + BLOCK_SIZE_N = 64 + BLOCK_SIZE_K = 32 + + grid = ( + triton.cdiv(M, BLOCK_SIZE_M), + triton.cdiv(N, BLOCK_SIZE_N), + ) + + matmul_basic[grid]( + A, B, C, + M, N, K, + A.stride(0), A.stride(1), + B.stride(0), B.stride(1), + C.stride(0), C.stride(1), + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_K=BLOCK_SIZE_K, + ) + + assert torch.allclose(C, C_ref, rtol=1e-2, atol= M * 2**(-11)) + + print("Passed") + + +def run_opt(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + BLOCK_SIZE_M = 64 + BLOCK_SIZE_N = 64 + BLOCK_SIZE_K = 32 + GROUP_SIZE_M = 32 + + grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),) + + matmul_opt[grid]( + A, B, C, + M, N, K, + A.stride(0), A.stride(1), + B.stride(0), B.stride(1), + C.stride(0), C.stride(1), + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_K=BLOCK_SIZE_K, + GROUP_SIZE_M=GROUP_SIZE_M, + ) + + assert torch.allclose(C, C_ref, rtol=1e-2, atol= M * 2**(-11)) + + print("Passed") + + +def tune_basic(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + size = (M, N) + + args = [A, B, C, + M, N, K, + A.stride(0), A.stride(1), + B.stride(0), B.stride(1), + C.stride(0), C.stride(1), + ] + + tune_params = dict() + tune_params["BLOCK_SIZE_M"] = [2**i for i in range(5, 9)] + tune_params["BLOCK_SIZE_N"] = [2**i for i in range(5, 9)] + tune_params["BLOCK_SIZE_K"] = [2**i for i in range(4, 8)] + tune_params["num_warps"] = [2, 4, 8, 16] + tune_params["num_stages"] = [1, 2, 3, 4, 5] + + results, env = tune_kernel( + kernel_name="matmul_basic", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, C_ref.cpu(), None, None, None, None, None, None, None, None, None], + atol=M * 2**(-11), + block_size_names = ["BLOCK_SIZE_M", "BLOCK_SIZE_M"], + strategy = "bayes_opt", + strategy_options = {"max_fevals": 100}, + call_function=call_triton, + ) + + +def tune_opt(M, N, K): + A = torch.randn(M, K, device="cuda", dtype=torch.float16) + B = torch.randn(K, N, device="cuda", dtype=torch.float16) + C = torch.empty((M, N), device='cuda', dtype=torch.float16) + C_ref = A @ B + + size = M * N + + args = [A, B, C, + M, N, K, + A.stride(0), A.stride(1), + B.stride(0), B.stride(1), + C.stride(0), C.stride(1), + ] + + tune_params = dict() + tune_params["BLOCK_SIZE_M"] = [2**i for i in range(5, 9)] + tune_params["BLOCK_SIZE_N"] = [2**i for i in range(5, 9)] + tune_params["BLOCK_SIZE_K"] = [2**i for i in range(4, 8)] + tune_params["GROUP_SIZE_M"] = [4, 6, 8, 10] + tune_params["num_warps"] = [2, 4, 8, 16] + tune_params["num_stages"] = [1, 2, 3, 4, 5] + tune_params["M_dim"] = [M] + + restrictions = ["GROUP_SIZE_M * BLOCK_SIZE_M < M_dim"] # Otherwise grouped ordering has no effect + + results, env = tune_kernel( + kernel_name="matmul_opt", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, C_ref.cpu(), None, None, None, None, None, None, None, None, None], + atol=M * 2**(-11), + block_size_names = ["BLOCK_SIZE_M", "BLOCK_SIZE_M"], + grid_div_x = ["BLOCK_SIZE_M", "BLOCK_SIZE_N"], + restrictions = restrictions, + strategy = "bayes_opt", + strategy_options = {"max_fevals": 100}, + call_function=call_triton, + ) + + + +if __name__ == "__main__": + M, N, K = 4096, 4096, 4096 + run_basic(M, N, K) + run_opt(M, N, K) + + tune_basic(M, N, K) + tune_opt(M, N, K) diff --git a/examples/generic_python/matmul/warp_matmul.py b/examples/generic_python/matmul/warp_matmul.py new file mode 100644 index 000000000..c4a218b21 --- /dev/null +++ b/examples/generic_python/matmul/warp_matmul.py @@ -0,0 +1,90 @@ +import numpy as np +import warp as wp + +from kernel_tuner import tune_kernel +from examples.generic_python.call_functions import call_warp + +wp.init() +wp.config.enable_backward = False + + +@wp.kernel() +def gemm( + A: wp.array2d(dtype=wp.float16), B: wp.array2d(dtype=wp.float16), C: wp.array2d(dtype=wp.float16) +): + i, j = wp.tid() + M = A.shape[0] + N = B.shape[1] + K = A.shape[1] + + if i >= M or j >= N: + return + + # compute dot product + sum = wp.float32(0.0) + for k in range(K): + sum += wp.float32(A[i, k]) * wp.float32(B[k, j]) + + # write result + C[i, j] = wp.float16(sum) + + +def run_gemm(M, N, K): + rng = np.random.default_rng(42) + A = rng.random((M, K)).astype(np.float16) + B = rng.random((K, N)).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + + A_wp = wp.array(A) + B_wp = wp.array(B) + C_wp = wp.array(C) + + wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) + + np.testing.assert_allclose(C_wp.numpy(), A @ B, rtol=1e-2, atol=M * 2**(-11)) + + print("Succes") + + +def tune(M, K, N): + rng = np.random.default_rng(42) + A = rng.random((M, K)).astype(np.float16) + B = rng.random((K, N)).astype(np.float16) + C = np.zeros((M, N), dtype=np.float16) + + size = (M, N) + + tune_params = dict() + tune_params["block_dim"] = [2**i for i in range(5, 11)] + tune_params["dim"] = [size] + + args = [A, B, C] + answer = [None, None, A @ B] + + results, env = tune_kernel( + kernel_name="gemm", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=answer, + call_function=call_warp, + ) + + + +if __name__ == "__main__": + M, N, K = 1024, 1024, 1024 + + run_gemm(M, N, K) + tune(M, N, K) + + + + + + + + + diff --git a/examples/generic_python/numba_vec_add.py b/examples/generic_python/numba_vec_add.py new file mode 100644 index 000000000..3bc90de86 --- /dev/null +++ b/examples/generic_python/numba_vec_add.py @@ -0,0 +1,42 @@ +from numba import cuda +import numpy as np + +from kernel_tuner import tune_kernel +from call_functions import call_numba + + +@cuda.jit +def f(a, b, c): + tid = cuda.grid(1) + size = len(c) + + if tid < size: + c[tid] = a[tid] + b[tid] + + +def tune(): + + N = 100000 + + a = np.random.random(N) + b = np.random.random(N) + c = np.zeros(N) + c_expect = a + b + + args = [a, b, c] + tune_params = {"block_size_x": [2**i for i in range(10)]} + + + results, env = tune_kernel( + kernel_name="f", + kernel_source=__file__, + problem_size=N, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, c_expect], + call_function=call_numba, + ) + +if __name__ == "__main__": + tune() \ No newline at end of file diff --git a/examples/generic_python/tilelang_vec_add.py b/examples/generic_python/tilelang_vec_add.py new file mode 100644 index 000000000..38b4ab309 --- /dev/null +++ b/examples/generic_python/tilelang_vec_add.py @@ -0,0 +1,44 @@ +import tilelang +import tilelang.language as T +import torch + +from kernel_tuner import tune_kernel +from call_functions import call_tilelang + + +@tilelang.jit # infers target from tensors at first call +def add(N: int, dtype: str = 'float32', block: int = 256,): + + @T.prim_func + def add_kernel( + A: T.Tensor((N,), dtype), + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), + ): + with T.Kernel(T.ceildiv(N, block), threads=block) as bx: + for i in T.Parallel(block): + gi = bx * block + i + # Optional — LegalizeSafeMemoryAccess inserts a guard when an access may be OOB + C[gi] = A[gi] + B[gi] + + return add_kernel + + +def tune(): + N = 1 << 20 + A = torch.randn(N, device='cuda', dtype=torch.float32) + B = torch.randn(N, device='cuda', dtype=torch.float32) + C = torch.empty(N, device='cuda', dtype=torch.float32) + + args = [A, B, C] + tune_params = dict() + tune_params["block"] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] + tune_params["N"] = [N] # Not a tune param, but enables an easier call function + + answer = [None, None, (A + B).cpu()] + + res, env = tune_kernel("add", __file__, N, args, tune_params, lang="generic_python", + call_function=call_tilelang, answer=answer) + +if __name__ == "__main__": + tune() \ No newline at end of file diff --git a/examples/generic_python/tilus_vec_add.py b/examples/generic_python/tilus_vec_add.py new file mode 100644 index 000000000..8ee96aa07 --- /dev/null +++ b/examples/generic_python/tilus_vec_add.py @@ -0,0 +1,101 @@ +import tilus +from tilus import float32, int32 +from tilus.utils import cdiv +import torch + +from kernel_tuner import tune_kernel, run_kernel +from call_functions import call_tilus + + + + +class VecAddV(tilus.Script): + def __init__(self, block_size_x=None, num_warps=None): + super().__init__() + self.block_size_x = block_size_x # number of threads per block + + def __call__( + self, + n_size: int32, # size of the vectors + a_ptr: ~float32, # input vector A + b_ptr: ~float32, # input vector B + c_ptr: ~float32 # output vector C + ): + + # compute the number of blocks needed + self.attrs.blocks = [cdiv(n_size, self.block_size_x)] + self.attrs.warps = 4 # number of warps per block + + # calculate the offset for this block + offset: int32 = self.block_size_x * self.blockIdx.x + + # create global views for input/output vectors + ga = self.global_view(a_ptr, dtype=float32, shape=[n_size]) + gb = self.global_view(b_ptr, dtype=float32, shape=[n_size]) + gc = self.global_view(c_ptr, dtype=float32, shape=[n_size]) + + a = self.load_global(ga, offsets=[offset], shape=[self.block_size_x]) + b = self.load_global(gb, offsets=[offset], shape=[self.block_size_x]) + c = a + b + self.store_global(gc, c, offsets=[offset]) + + + + + +def tune(size): + a = torch.randn(size, dtype=torch.float32).cuda() + b = torch.randn(size, dtype=torch.float32).cuda() + c = torch.empty(size, dtype=torch.float32).cuda() + c_expect = a + b + + + args = [size, a, b, c] + tune_params = dict() + tune_params["block_size_x"] = [32, 64, 128, 256, 512, 1024] + tune_params["num_warps"] = [4, 8] + + + results, env = tune_kernel( + kernel_name="VecAddV", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, None, c_expect.cpu()], + call_function=call_tilus, + verbose=True, + ) + + +def run(size): + a = torch.randn(size, dtype=torch.float32).cuda() + b = torch.randn(size, dtype=torch.float32).cuda() + c = torch.empty(size, dtype=torch.float32).cuda() + c_expect = a + b + + + args = [size, a, b, c] + + results = run_kernel( + kernel_name="VecAddV", + kernel_source=__file__, + problem_size=size, + arguments=args, + params={"block_size_x": 32}, + lang="generic_python", + call_function=call_tilus, + ) + + c_expect = c_expect.cpu() + + assert torch.allclose(results[-1], c_expect) + + + +if __name__ == "__main__": + size = 10000000 + tune(size) + run(size) + diff --git a/examples/generic_python/triton_vec_add.py b/examples/generic_python/triton_vec_add.py new file mode 100644 index 000000000..cafba79a9 --- /dev/null +++ b/examples/generic_python/triton_vec_add.py @@ -0,0 +1,66 @@ +import numpy as np +import torch +import triton +import triton.language as tl + +from kernel_tuner import tune_kernel, run_kernel +from call_functions import call_triton + +@triton.jit +def add_op(x, y): + return x + y + +@triton.jit +def add_kernel(x_ptr, # *Pointer* to first input vector. + y_ptr, # *Pointer* to second input vector. + output_ptr, # *Pointer* to output vector. + n_elements, # Size of the vector. + block_size_x: tl.constexpr, # Number of elements each program should process. + # note: `constexpr` so it can be used as a shape value. + ): + pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0. + block_start = pid * block_size_x + offsets = block_start + tl.arange(0, block_size_x) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + y = tl.load(y_ptr + offsets, mask=mask) + output = add_op(x, y) + tl.store(output_ptr + offsets, output, mask=mask) + + + +def tune(): + size = 10000000 + + a = torch.randn(size, device='cuda', dtype=torch.float32) + b = torch.randn(size, device='cuda', dtype=torch.float32) + c = torch.empty_like(b) + n = torch.tensor(size, dtype=torch.int32) + c_expect = a + b + + args = [a, b, c, size] + tune_params = dict() + tune_params["block_size_x"] = [2**i for i in range(11)] + + + result = run_kernel("add_kernel", __file__, size, args, {"block_size_x": 256}, + lang="generic_python", call_function=call_triton) + assert np.allclose(c_expect.cpu(), result[2]) + + + + results, env = tune_kernel( + kernel_name="add_kernel", + kernel_source=__file__, + problem_size=size, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, c_expect.cpu(), None], + call_function=call_triton, + ) + + + +if __name__ == "__main__": + tune() \ No newline at end of file diff --git a/examples/generic_python/warp_vec_add.py b/examples/generic_python/warp_vec_add.py new file mode 100644 index 000000000..81d7fb19e --- /dev/null +++ b/examples/generic_python/warp_vec_add.py @@ -0,0 +1,71 @@ +import warp as wp +import numpy as np +import torch + +from kernel_tuner import tune_kernel +from call_functions import call_warp + + +wp.init() + + +@wp.func +def add_op(x: float, y: float): + return x + y + +@wp.kernel +def vec_add(a: wp.array(dtype=float), + b: wp.array(dtype=float), + c: wp.array(dtype=float), + n: int): + + work_per_thread = 8 + tid = wp.tid() + base = tid * work_per_thread + + for i in range(work_per_thread): + idx = base + i + if idx < n: + c[idx] = add_op(a[idx], b[idx]) + + +def call_warp(kernel_function, args, kwargs, grid, threads, params): + warp_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + warp_args.append(wp.from_torch(arg)) + else: + warp_args.append(arg) + dim = params['size'] + wp.launch(kernel=kernel_function, dim=dim, inputs=warp_args) + + +def tune(): + n = 1024 + + # Create host arrays + a = np.arange(n, dtype=np.float32) + b = np.arange(n, 0, -1, dtype=np.float32) + c = np.zeros(n, dtype=np.float32) + c_expect = a + b + + + tune_params = dict() + tune_params["work_per_thread"] = [2**i for i in range(10)] + tune_params["size"] = [n] + args = [a, b, c, n] + + results, env = tune_kernel( + kernel_name="vec_add", + kernel_source=__file__, + problem_size=n, + arguments=args, + tune_params=tune_params, + lang="generic_python", + answer=[None, None, c_expect, None], + call_function=call_warp, + ) + + +if __name__ == "__main__": + tune() \ No newline at end of file diff --git a/kernel_tuner/backends/generic_python.py b/kernel_tuner/backends/generic_python.py new file mode 100644 index 000000000..2247d5e87 --- /dev/null +++ b/kernel_tuner/backends/generic_python.py @@ -0,0 +1,391 @@ +import logging +import inspect +import copy +import traceback # for compile error handling +import re +import warnings +import builtins +import numpy as np + +from kernel_tuner.backends.backend import GPUBackend +from kernel_tuner.observers.generic_python import GenericPythonRuntimeObserver + +try: + import torch +except ImportError: + torch = None + + +class GenericPythonFunctions(GPUBackend): + """Class that groups the Python DSL functions on maintains state about the device.""" + + def __init__(self, device=0, iterations=7, compiler_options=None, observers=None): + """Instantiate GenericPythonFunctions object used for interacting with the device. + Currently, only CUDA devices are supported. + + Instantiating this object will inspect and store certain device properties at + runtime, which are used during compilation and/or execution of kernels by the + kernel tuner. Compiler options are not supported for GenericPython. + + :param device: Number of CUDA device to use for this context + :type device: int + + :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. + :type iterations: int + """ + + if not torch: + logging.error("Unable to import Torch") + raise ImportError("Unable to import Torch") + + self.device_id = torch.cuda.current_device() + self.device_properties = torch.cuda.get_device_properties(self.device_id) + self.name = torch.cuda.get_device_name(self.device_id) + self.max_threads = 10**18 # 'inf' to support tile based programming models, which can use less threads then the size of a tile. + + env = dict() + env["device_name"] = self.name + env["max_threads"] = self.max_threads + env["iterations"] = iterations + env["compiler_options"] = compiler_options + self.env = env + + self.stream = torch.cuda.default_stream() + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + + # setup observers + self.observers = observers or [] + self.observers.append(GenericPythonRuntimeObserver(self)) + for obs in self.observers: + obs.register_device(self) + + self.units = {"time": "ms", "power": "s,mW", "energy": "J"} + + # Variables to be filled in at compile time, needed for running the kernel after compilation. + self.call_function = None + self.signature = None + self.gpu_kwargs = None + + super().__init__(device=device, iterations=iterations, compiler_options=compiler_options, observers=observers) + + def ready_argument_list(self, arguments): + """Ready argument list to be passed to the kernel. Converts arguments to Torch GPU Tensors or Python + Scalars. Arguments of built-in Python types are left untouched. + + :param arguments: List of arguments to be passed to the kernel. + The order should match the argument list on the kernel. + Allowed values are: + - numpy.ndarray, and/or numpy.int32, numpy.float32, and so on. + - Torch Tensors + - Built-in Python types. + :type arguments: list + + :returns: A list of arguments that can be passed to the kernel. + :rtype: list( Torch.Tensor on GPU, int, float, bool, etc. ) + """ + torch_args = [] + + for arg in arguments: + if isinstance(arg, torch.Tensor): + if arg.dim() == 0: # Scalar tensor, convert to Python scalar + torch_args.append(arg.item()) + else: + if arg.is_cuda: # already on GPU, need deep copy to not overwrite + torch_args.append(arg.clone()) + else: # Copy from CPU to GPU + torch_args.append(arg.contiguous().to("cuda")) + elif isinstance(arg, np.ndarray): # Convert Numpy CPU array to Torch GPU Tensor + torch_args.append(torch.from_numpy(arg).to("cuda")) + elif isinstance(arg, np.generic): # Numpy scalar, convert to Python scalar + torch_args.append(arg.item()) + elif isinstance(arg, (int, float, bool, str)): # Is already Python + torch_args.append(arg) + elif type(arg) in vars(builtins).values(): + torch_args.append(arg) + else: + raise TypeError("Unknown argument type: ", type(arg), ". Accepted types are Torch tenors, NumPy arrays and scalars and built-in Python types.") + + return torch_args + + + def compile(self, kernel_instance, gpu_args=None): + """Compile the kernel by executing it once. This enforces that the kernel is cached. + + :param kernel_instance: The kernel instance containing information such as the kernel_source, + grid, threads, params, etc. + :type kernel_instance: KernelInstance + + :param gpu_args: arguments to be passed to the kernel. + :type gpu_args: list(any) + + :returns: A kernel that can be called with the user-defined call function. + :rtype: callable + """ + if kernel_instance.kernel_fn is None: + raise ValueError("kernel_fn is None, currently Generic Python only supports callable kernel_source") + + if gpu_args is None: + raise ValueError("gpu_args is None, Generic Python needs gpu args to compile the kernel") + + # The first time we compile, we also set the call function and the signature + # We need this later to run the kernel. + if self.call_function is None or self.signature is None: + self.call_function = kernel_instance.kernel_source.call_function + self.signature = kernel_instance.kernel_source.signature + + grid = kernel_instance.grid + threads = kernel_instance.threads + params = kernel_instance.params + + # If the kernel source is a class, we use the __call__ function as the kernel_function. + if inspect.isclass(kernel_instance.kernel_fn): + kernel_function = kernel_instance.kernel_fn() + elif callable(kernel_instance.kernel_fn): + kernel_function = kernel_instance.kernel_fn + else: + raise TypeError("kernel function is not a class or function") + + # Tuning params can contain kernel arguments. In such cases, create keyword arguments with + # the values of the tuning params. + self.gpu_kwargs = {} + if params is not None: + for arg_name in self.signature: + if arg_name in params: + self.gpu_kwargs[arg_name] = params[arg_name] + + # Call the user-defined call function in order to compile the kernel. + self.synchronize() + self.call_function(kernel_function, gpu_args, self.gpu_kwargs, grid, threads, params) + self.synchronize() + + return kernel_function + + def start_event(self): + """Records the event that marks the start of a measurement.""" + self.start.record() + + def stop_event(self): + """Records the event that marks the end of a measurement.""" + self.end.record() + + def kernel_finished(self): + """Returns True if the kernel has finished, False otherwise.""" + return self.end.query() + + def run_kernel(self, func, gpu_args, threads, grid, params=None): + """Runs the Python kernel passed as 'func'. + + :param func: A cached Python kernel for this specific kernel configuration + :type func: callable + + :param gpu_args: A list of arguments to the kernel, order should match the + order in the code. + :type gpu_args: list(any) + + :param threads: A tuple listing the number of threads in each dimension of + the thread block + :type threads: tuple(int, int, int) + + :param grid: A tuple listing the number of thread blocks in each dimension + of the grid + :type grid: tuple(int, int, int) + + :param params: A dictionary with the tuning params for this specific kernel + configuration + :type params: dict + """ + self.call_function(func, gpu_args, self.gpu_kwargs, grid, threads, params) + + + def synchronize(self): + """Halts execution until device has finished its tasks.""" + torch.cuda.synchronize() + + + def memset(self, allocation, value, size): + """This method must implement setting the memory to a value on the device. + Not implemented: Python DSLs usually do not perform explicit memory handling.""" + pass + + + def memcpy_dtoh(self, dest, src): + """This method must implement a device to host copy. + Not implemented: Python DSLs usually do not perform explicit memory handling.""" + pass + + + def memcpy_htod(self, dest, src): + """This method must implement a host to device copy. + Not implemented: Python DSLs usually do not perform explicit memory handling.""" + pass + + + def copy_constant_memory_args(self, cmem_args): + raise NotImplementedError("Generic Python does not support constant memory") + + + def copy_shared_memory_args(self, smem_args): + raise NotImplementedError("Generic Python does not support shared memory") + + + def copy_texture_memory_args(self, texmem_args): + raise NotImplementedError("Generic Python does not support texture memory") + + + def refresh_memory(self, gpu_memory, host_arguments, should_sync): + """Refresh the GPU memory with the untouched host arguments. We overwrite the standard function + because Python DSLs do usually do not manage memory explicitely""" + + for i, host_arg in enumerate(host_arguments): + if should_sync[i]: + gpu_arg = gpu_memory[i] + + # Scalar Python type + if isinstance(gpu_arg, (int, float, bool)): + gpu_memory[i] = host_arg + elif type(gpu_arg) in vars(builtins).values(): + gpu_memory[i] = host_arg + + # GPU tensor + elif isinstance(gpu_arg, torch.Tensor): + if isinstance(host_arg, np.ndarray): + gpu_arg.copy_(torch.as_tensor(host_arg)) + elif isinstance(host_arg, torch.Tensor): + if host_arg.is_cuda and host_arg.device != gpu_arg.device: + gpu_arg.copy_(host_arg.to(gpu_arg.device)) # different gpu's, no direct copy allowed + else: + gpu_arg.copy_(host_arg) + else: + # host_arg is scalar, fill into tensor + gpu_arg.fill_(host_arg) + + + def classify_compile_exception(self, e): + """Best effort to differentiate between a user error and a resource error. + + :param e: the caught exception + :type e: exception + + :returns: "resource_error" , "user_error" or "unknown" + :rtype: string + """ + + RESOURCE_KEYWORDS = ( + # Shared memory + "shared memory", + "smem", + "uses too much shared", + "exceeds shared memory", + "shared memory limit", + + # Registers / occupancy + "too many registers", + "register spill", + "uses too many registers", + "out of registers", + + # Launch configuration + "invalid launch configuration", + "invalid configuration argument", + "threads per block", + "block size", + "grid size", + "num_warps", + "num_ctas", + + # Generic resource exhaustion + "too many resources", + "out of resources", + "exceeds maximum", + "exceeds limit", + + # Compiler-level indicators + "ptxas error", + "ptxas fatal", + "nvcc error", + "cuda error", + "llvm error", + "mlir error", + "lowering failed", + ) + + USER_ERROR_KEYWORDS = ( + # Undefined / missing symbols + "not defined", + "undefined variable", + "without definition", + "unknown variable", + "unbound", + + # Type / shape errors + "type mismatch", + "invalid type", + "cannot convert", + "expected .* but got", + "incompatible types", + + # Indexing / bounds + "index out of bounds", + "out of bounds access", + "invalid index", + + # IR / AST construction + "failed to build", + "invalid expression", + "malformed", + "illegal operation", + ) + + RESOURCE_ORIGINS = ( + "ptxas", + "nvcc", + "cuda", + "llvm", + "mlir", + "cubin", + "fatbin", + ) + + USER_ORIGINS = ( + "transpiler", + "scheduler", + "hidet", + "frontend", + "ast", + ) + + + USER_ERROR_TYPES = ( + NameError, + UnboundLocalError, + AttributeError, + TypeError, + SyntaxError, + IndentationError, + ) + + if isinstance(e, USER_ERROR_TYPES): + return "user_error" + + + def match_any(patterns, text): + return any(re.search(p, text) for p in patterns) + + msg = str(e).lower() + tb = "".join(traceback.format_tb(e.__traceback__)).lower() + + + if match_any(RESOURCE_KEYWORDS, msg): + return "resource_error" + + if match_any(RESOURCE_ORIGINS, msg + tb): + return "resource_error" + + if match_any(USER_ERROR_KEYWORDS, msg): + return "user_error" + + if match_any(USER_ORIGINS, msg + tb): + return "user_error" + + return "unknown" diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 8f9326c2d..f4816528c 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -376,6 +376,7 @@ def memset(self, allocation, value, size): drv.memset_d8(allocation, value, size) def memcpy_dtoh(self, dest, src): + print("dtoh called") """Perform a device to host memory copy. :param dest: A numpy array in host memory to store the data @@ -390,6 +391,7 @@ def memcpy_dtoh(self, dest, src): dest[:] = src def memcpy_htod(self, dest, src): + print("htod called") """Perform a host to device memory copy. :param dest: A GPU memory allocation unit diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 69d952541..1fe5ffbe6 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -16,8 +16,12 @@ def _get_cupy(): import kernel_tuner.util as util from kernel_tuner.accuracy import Tunable +from kernel_tuner.backends.generic_python import GenericPythonFunctions +from kernel_tuner.kernel_sources.kernel_source import KernelSource +from kernel_tuner.observers.nvml import NVMLObserver from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver, OutputObserver, PrologueObserver from kernel_tuner.observers.tegra import TegraObserver +from kernel_tuner.language import Language try: import torch @@ -36,6 +40,7 @@ def _get_cupy(): "name", "kernel_source", "kernel_string", + "kernel_fn", "temp_files", "threads", "grid", @@ -48,13 +53,27 @@ def _get_cupy(): class KernelInstance(_KernelInstance): """Class that represents the specific parameterized instance of a kernel.""" + def __new__(cls, *args, **kwargs): + # Detect old-style calls (without kernel_fn and gpu_kwargs) for old tests + if len(args) == 8: # old version + name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments = args + kernel_fn = None + args = (name, kernel_source, kernel_string, kernel_fn, temp_files, threads, grid, params, arguments) + elif "kernel_fn" not in kwargs and len(args) < 9: + kwargs["kernel_fn"] = None + return super().__new__(cls, *args, **kwargs) + def delete_temp_files(self): """Delete any generated temp files.""" - for v in self.temp_files.values(): - util.delete_temp_file(v) + tmp_files_list = self.temp_files.values() if isinstance(self.temp_files, dict) else self.temp_files + for tmp_file in tmp_files_list: + util.delete_temp_file(tmp_file) def prepare_temp_files_for_error_msg(self): """Prepare temp file with source code, and return list of temp file names.""" + if type(self.kernel_source).__name__ == "KernelSourceFn": + return [] # already done during compilation + temp_filename = util.get_temp_filename(suffix=self.kernel_source.get_suffix()) util.write_file(temp_filename, self.kernel_string) ret = [temp_filename] @@ -62,160 +81,7 @@ def prepare_temp_files_for_error_msg(self): return ret -class KernelSource(object): - """Class that holds the kernel sources. - - There is a primary kernel source, which can be either a source string, - a filename (indicating a file containing the kernel source code), - or a callable (generating the kernel source code). - There can additionally be (one or multiple) secondary kernel sources, which - must be filenames. - """ - - def __init__(self, kernel_name, kernel_sources, lang, defines=None): - if not isinstance(kernel_sources, list): - kernel_sources = [kernel_sources] - self.kernel_sources = kernel_sources - self.kernel_name = kernel_name - self.defines = defines - if lang is None: - if callable(self.kernel_sources[0]): - raise TypeError("Please specify language when using a code generator function") - kernel_string = self.get_kernel_string(0) - lang = util.detect_language(kernel_string) - - # The validity of lang is checked later, when creating the DeviceInterface - self.lang = lang.upper() - - def get_kernel_string(self, index=0, params=None): - """Retrieve the kernel source with the given index and return as a string. - - See util.get_kernel_string() for details. - - :param index: Index of the kernel source in the list of sources. - :type index: int - - :param params: Dictionary containing the tunable parameters for this specific - kernel instance, only needed when kernel_source is a generator. - :type param: dict - - :returns: A string containing the kernel code. - :rtype: string - """ - logging.debug("get_kernel_string called") - - if hasattr(self, 'lang') and self.lang.upper() == "HYPERTUNER": - return "" - - kernel_source = self.kernel_sources[index] - return util.get_kernel_string(kernel_source, params) - - def prepare_list_of_files( - self, kernel_name, params, grid, threads, block_size_names - ): - """Prepare the kernel string along with any additional files. - - The first file in the list is allowed to include or read in the others - The files beyond the first are considered additional files that may also contain tunable parameters - - For each file beyond the first this function creates a temporary file with - preprocessors statements inserted. Occurrences of the original filenames in the - first file are replaced with their temporary counterparts. - - :param kernel_name: A string specifying the kernel name. - :type kernel_name: string - - :param params: A dictionary with the tunable parameters for this particular - instance. - :type params: dict() - - :param grid: The grid dimensions for this instance. The grid dimensions are - also inserted into the code as if they are tunable parameters for - convenience. - :type grid: tuple() - - :param threads: The thread block dimensions for this instance. The thread block are - also inserted into the code as if they are tunable parameters for - convenience. - :type threads: tuple() - - :param block_size_names: A list of strings that denote the names - for the thread block dimensions. - :type block_size_names: list(string) - - """ - temp_files = dict() - - if self.lang.upper() == "HYPERTUNER": - return tuple(["", "", temp_files]) - - for i, f in enumerate(self.kernel_sources): - if i > 0 and not util.looks_like_a_filename(f): - raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames") - - ks = self.get_kernel_string(i, params) - # add preprocessor statements - n, ks = util.prepare_kernel_string( - kernel_name, - ks, - params, - grid, - threads, - block_size_names, - self.lang, - self.defines, - ) - - if i == 0: - # primary kernel source - name = n - kernel_string = ks - continue - - # save secondary kernel sources to temporary files - - # generate temp filename with the same extension - temp_file = util.get_temp_filename(suffix="." + f.split(".")[-1]) - temp_files[f] = temp_file - util.write_file(temp_file, ks) - # replace occurrences of the additional file's name in the first kernel_string with the name of the temp file - kernel_string = kernel_string.replace(f, temp_file) - - return name, kernel_string, temp_files - - def get_user_suffix(self, index=0): - """Get the suffix of the kernel filename, if the user specified one. Return None otherwise.""" - if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]): - return "." + self.kernel_sources[index].split(".")[-1] - return None - - def get_suffix(self, index=0): - """Return a suitable suffix for a kernel filename. - - This uses the user-specified suffix if available, or one based on the - lang/backend otherwise. - """ - # TODO: Consider delegating this to the backend - suffix = self.get_user_suffix(index) - if suffix is not None: - return suffix - - _suffixes = {"CUDA": ".cu", "OpenCL": ".cl", "C": ".c"} - try: - return _suffixes[self.lang] - except KeyError: - return ".c" - - def check_argument_lists(self, kernel_name, arguments): - """Check if the kernel arguments have the correct types. - - This is done by calling util.check_argument_list on each kernel string. - """ - for i, f in enumerate(self.kernel_sources): - if not callable(f): - util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments) - else: - logging.debug("Checking of arguments list not supported yet for code generators.") +# KernelSource has been moved to kernel_sources/kernel_source_str.py to support Generic Python def instantiate_observer(observer, args): @@ -278,6 +144,7 @@ def __init__( logging.debug("DeviceInterface instantiated, lang=%s", lang) + # Ensure observers is a list observers = observers or [] @@ -286,7 +153,7 @@ def __init__( observer_args = dict(device=device, platform=platform, compiler=compiler, lang=lang) observers = [instantiate_observer(ob, observer_args) for ob in observers] - if lang.upper() == "CUDA": + if lang == Language.CUDA: from kernel_tuner.backends.pycuda import PyCudaFunctions dev = PyCudaFunctions( device, @@ -294,7 +161,7 @@ def __init__( iterations=iterations, observers=observers, ) - elif lang.upper() == "CUPY": + elif lang == Language.CUPY: from kernel_tuner.backends.cupy import CupyFunctions dev = CupyFunctions( device, @@ -302,7 +169,7 @@ def __init__( iterations=iterations, observers=observers, ) - elif lang.upper() == "NVCUDA": + elif lang == Language.NVCUDA: from kernel_tuner.backends.nvcuda import CudaFunctions dev = CudaFunctions( device, @@ -310,7 +177,7 @@ def __init__( iterations=iterations, observers=observers, ) - elif lang.upper() == "OPENCL": + elif lang == Language.OPENCL: from kernel_tuner.backends.opencl import OpenCLFunctions dev = OpenCLFunctions( device, @@ -319,7 +186,7 @@ def __init__( iterations=iterations, observers=observers, ) - elif lang.upper() in ["C", "FORTRAN"]: + elif lang == Language.C or lang == Language.FORTRAN: from kernel_tuner.backends.compiler import CompilerFunctions dev = CompilerFunctions( compiler=compiler, @@ -327,7 +194,7 @@ def __init__( iterations=iterations, observers=observers, ) - elif lang.upper() == "HIP": + elif lang == Language.HIP: from kernel_tuner.backends.hip import HipFunctions dev = HipFunctions( device, @@ -335,16 +202,23 @@ def __init__( iterations=iterations, observers=observers, ) - elif lang.upper() == "HYPERTUNER": + elif lang == Language.HYPERTUNER: from kernel_tuner.backends.hypertuner import HypertunerFunctions dev = HypertunerFunctions( iterations=iterations, compiler_options=compiler_options ) self.requires_warmup = False + elif lang == Language.GENERIC_PYTHON: + dev = GenericPythonFunctions( + device, + compiler_options=compiler_options, + iterations=iterations, + observers=observers + ) else: raise NotImplementedError( - "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet" + "Sorry, support for languages other than CUDA, OpenCL, HIP, C, Fortran and Generic Python is not implemented yet" ) self.dev = dev @@ -387,17 +261,23 @@ def __init__( if not quiet: print("Using: " + self.dev.name) - def benchmark_prologue(self, func, gpu_args, threads, grid, result): + def run_kernel_bench(self, func, gpu_args, threads, grid, stream=None, params=None): + if isinstance(self.dev, GenericPythonFunctions): # Generic Python needs params to compile. + self.dev.run_kernel(func, gpu_args, threads, grid, params=params) + else: + self.dev.run_kernel(func, gpu_args, threads, grid) + + def benchmark_prologue(self, func, gpu_args, threads, grid, result, params=None): """Benchmark prologue one kernel execution per PrologueObserver.""" for obs in self.prologue_observers: self.dev.synchronize() obs.before_start() - self.dev.run_kernel(func, gpu_args, threads, grid) + self.run_kernel_bench(func, gpu_args, threads, grid, stream=None, params=params) self.dev.synchronize() obs.after_finish() result.update(obs.get_results()) - def benchmark_default(self, func, gpu_args, threads, grid, result): + def benchmark_default(self, func, gpu_args, threads, grid, result, params=None): """Benchmark one kernel execution for 'iterations' at a time.""" self.dev.synchronize() for _ in range(self.iterations): @@ -405,7 +285,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result): obs.before_start() self.dev.synchronize() self.dev.start_event() - self.dev.run_kernel(func, gpu_args, threads, grid) + self.run_kernel_bench(func, gpu_args, threads, grid, stream=None, params=params) self.dev.stop_event() for obs in self.benchmark_observers: obs.after_start() @@ -417,11 +297,13 @@ def benchmark_default(self, func, gpu_args, threads, grid, result): for obs in self.benchmark_observers: obs.after_finish() + #time.sleep(0.1) # prevent termal throttling + for obs in self.benchmark_observers: result.update(obs.get_results()) - def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration): + def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration, params=None): """Benchmark continuously for at least 'duration' seconds.""" iterations = int(np.ceil(duration / (result["time"] / 1000))) self.dev.synchronize() @@ -429,7 +311,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration): obs.before_start() self.dev.start_event() for _ in range(iterations): - self.dev.run_kernel(func, gpu_args, threads, grid) + self.run_kernel_bench(func, gpu_args, threads, grid, stream=None, params=params) self.dev.stop_event() for obs in self.continuous_observers: obs.after_start() @@ -479,8 +361,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett result = {} try: - self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result) - self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result) + self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result, instance.params) + self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result, instance.params) if self.continuous_observers: duration = 1 @@ -488,7 +370,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett obs.results = result duration = max(duration, obs.continuous_duration) - self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration) + self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration, instance.params) except Exception as e: # some launches may fail because too many registers are required @@ -518,6 +400,7 @@ def check_kernel_output( ): """Runs the kernel once and checks the result against answer.""" logging.debug("check_kernel_output") + cp = _get_cupy() # if not using custom verify function, check if the length is the same if answer: @@ -526,7 +409,6 @@ def check_kernel_output( should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)] else: - cp = _get_cupy() cupy_ndarray = (cp.ndarray,) if cp is not None else () should_sync = [ isinstance(arg, (np.ndarray, torch.Tensor, DeviceArray) + cupy_ndarray) @@ -538,33 +420,44 @@ def check_kernel_output( self.dev.refresh_memory(gpu_args, instance.arguments, should_sync) # run the kernel - check = self.run_kernel(func, gpu_args, instance) + self.dev.synchronize() + check = self.run_kernel_check(func, gpu_args, instance) + self.dev.synchronize() if not check: # runtime failure occurred that should be ignored, skip correctness check return # retrieve gpu results to host memory result_host = [] - for i, arg in enumerate(instance.arguments): - if should_sync[i]: - cp = _get_cupy() - cupy_ndarray = (cp.ndarray,) if cp is not None else () - if isinstance(arg, (np.ndarray,) + cupy_ndarray): - result_host.append(np.zeros_like(arg)) - self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) - elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): - if not answer[i].is_cuda: - # if the answer is on the host, copy gpu output to host as well - result_host.append(torch.zeros_like(answer[i])) - self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) + if instance.kernel_source is not None and instance.kernel_source.lang == Language.GENERIC_PYTHON: + # arguments are either Torch tensors or built-in Python types + for i, arg in enumerate(gpu_args): + if should_sync[i]: + if isinstance(arg, torch.Tensor): + result_host.append(arg.cpu()) else: - result_host.append(gpu_args[i].tensor) + result_host.append(arg) + else: + result_host.append(None) + else: + for i, arg in enumerate(instance.arguments): + if should_sync[i]: + if isinstance(arg, (np.ndarray, cp.ndarray)): + result_host.append(np.zeros_like(arg)) + self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) + elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): + if not answer[i].is_cuda: + # if the answer is on the host, copy gpu output to host as well + result_host.append(torch.zeros_like(answer[i])) + self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) + else: + result_host.append(gpu_args[i].tensor) + else: + # We should sync this argument, but we do not know how to transfer this type of argument + # What do we do? Should we throw an error? + result_host.append(None) else: - # We should sync this argument, but we do not know how to transfer this type of argument - # What do we do? Should we throw an error? result_host.append(None) - else: - result_host.append(None) # Call the output observers for obs in self.output_observers: @@ -582,6 +475,7 @@ def check_kernel_output( correct = True if not correct: + print("expected: ", answer, "\ngot: ", result_host) raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): @@ -597,7 +491,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, instance_string = util.get_instance_string(params) logging.debug("compile_and_benchmark " + instance_string) - instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): result[to.objective] = util.InvalidConfig() @@ -608,7 +501,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, try: # compile the kernel start_compilation = time.perf_counter() - func = self.compile_kernel(instance, verbose) + func = self.compile_kernel(instance, verbose, gpu_args) if not func: result[to.objective] = util.CompilationFailedConfig() else: @@ -651,42 +544,67 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # clean up any temporary files, if no error occurred instance.delete_temp_files() + # For Python DSLs, the compilation time also includes one kernel run, so we subtract the runtime + if self.lang == Language.GENERIC_PYTHON: + last_compilation_time -= result["time"] or 0 + result["compile_time"] = last_compilation_time or 0 result["verification_time"] = last_verification_time or 0 result["benchmark_time"] = last_benchmark_time or 0 return result - def compile_kernel(self, instance, verbose): + def compile_kernel(self, instance, verbose, gpu_args=None): """Compile the kernel for this specific instance.""" logging.debug("compile_kernel " + instance.name) # compile kernel_string into device func func = None - try: - func = self.dev.compile(instance) - except Exception as e: - # compiles may fail because certain kernel configurations use too - # much shared memory for example, the desired behavior is to simply - # skip over this configuration and try the next one - shared_mem_error_messages = [ - "uses too much shared data", - "local memory limit exceeded", - r"local memory \(\d+\) exceeds limit \(\d+\)", - ] - error_message = str(e.stderr) if hasattr(e, "stderr") else str(e) - if any(re.search(msg, error_message) for msg in shared_mem_error_messages): - logging.debug( - "compile_kernel failed due to kernel using too much shared memory" - ) - if verbose: - print( - f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used" + + # Python DSLs have different error handling, so we make a case distinction. + if isinstance(self.dev, GenericPythonFunctions): + try: + func = self.dev.compile(instance, gpu_args) + except Exception as e: + exception_type = self.dev.classify_compile_exception(e) + if exception_type == "user_error": + error_message = str(e.stderr) if hasattr(e, "stderr") else str(e) + print("compile_kernel failed due to error: " + error_message) + print("Error while compiling:", instance.name) + raise e + else: + logging.debug( + "compile_kernel failed due to kernel using too many resources" ) - else: - print("compile_kernel failed due to error: " + error_message) - print("Error while compiling:", instance.name) - raise e + if verbose: + print( + f"skipping config {util.get_instance_string(instance.params)} reason: \n{e}" + ) + else: # All other languages + try: + func = self.dev.compile(instance) + except Exception as e: + # compiles may fail because certain kernel configurations use too + # much shared memory for example, the desired behavior is to simply + # skip over this configuration and try the next one + shared_mem_error_messages = [ + "uses too much shared data", + "local memory limit exceeded", + r"local memory \(\d+\) exceeds limit \(\d+\)", + ] + error_message = str(e.stderr) if hasattr(e, "stderr") else str(e) + if any(re.search(msg, error_message) for msg in shared_mem_error_messages): + logging.debug( + "compile_kernel failed due to kernel using too much shared memory" + ) + if verbose: + print( + f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used" + ) + else: + print("compile_kernel failed due to error: " + error_message) + print("Error while compiling:", instance.name) + raise e return func @staticmethod @@ -725,20 +643,22 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) params, kernel_options.block_size_names, ) - if np.prod(threads) > self.dev.max_threads: + + if kernel_source.lang is not Language.GENERIC_PYTHON and np.prod(threads) > self.dev.max_threads: if verbose: print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block") return util.InvalidConfig() # obtain the kernel_string and prepare additional files, if any - name, kernel_string, temp_files = kernel_source.prepare_list_of_files( - kernel_options.kernel_name, + instance_data = kernel_source.prepare_kernel_instance( + kernel_options, params, grid, threads, - kernel_options.block_size_names, ) + name = instance_data.kernel_name + kernel_string=instance_data.kernel_str # check for templated kernel if kernel_source.lang in ["CUDA"] and "<" in name and ">" in name: kernel_string, name = wrap_templated_kernel(kernel_string, name) @@ -747,7 +667,17 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) # collect everything we know about this instance and return it - return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) + return KernelInstance( + name=name, + kernel_source=kernel_source, + kernel_string=kernel_string, + kernel_fn=instance_data.kernel_fn, + temp_files=instance_data.temp_files, + threads=threads, + grid=grid, + params=params, + arguments=arguments, + ) def get_environment(self): """Return dictionary with information about the environment.""" @@ -784,14 +714,15 @@ def ready_argument_list(self, arguments): return gpu_args - def run_kernel(self, func, gpu_args, instance): + + def run_kernel_check(self, func, gpu_args, instance): """Run a compiled kernel instance on a device.""" logging.debug("run_kernel %s", instance.name) logging.debug("thread block dims (%d, %d, %d)", *instance.threads) logging.debug("grid dims (%d, %d, %d)", *instance.grid) try: - self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid) + self.run_kernel_bench(func, gpu_args, instance.threads, instance.grid, stream=None, params=instance.params) except Exception as e: if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e): logging.debug("ignoring runtime failure due to too many resources required") @@ -800,6 +731,7 @@ def run_kernel(self, func, gpu_args, instance): logging.debug("encountered unexpected runtime failure: " + str(e)) raise e return True + def _preprocess_gpu_arguments(old_arguments, params): diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index ad9a250db..3c5f3fd85 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -40,10 +40,12 @@ import kernel_tuner.core as core import kernel_tuner.util as util from kernel_tuner.file_utils import get_input_file, get_t4_metadata, get_t4_results, import_class_from_file +from kernel_tuner.kernel_sources.kernel_source import KernelSource from kernel_tuner.util import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.searchspace import Searchspace +from kernel_tuner.language import Language try: import torch @@ -128,9 +130,10 @@ def __deepcopy__(self, _): ( "kernel_source", ( - """The CUDA, OpenCL, HIP, or C kernel code. + """The CUDA, OpenCL, HIP, C or Python DSL kernel code. It is allowed for the code to be passed as a string, a filename, a function that returns a string of code, or a list when the code needs auxilliary files. + In the case of a kernel in a Python DSL such as Triton, only a filename is accepted. To support combined host and device code tuning, a list of filenames can be passed. The first file in the list should be the @@ -154,7 +157,7 @@ def __deepcopy__(self, _): """Specifies the language used for GPU kernels. The kernel_tuner automatically detects the language, but if it fails, you may specify the language using this argument, currently supported: "CUDA", "CuPy", - "nvcuda", "OpenCL", "HIP", or "C".""", + "nvcuda", "OpenCL", "HIP", "C", or "Generic_Python.""", "string", ), ), @@ -201,7 +204,9 @@ def __deepcopy__(self, _): "arguments", ( """A list of kernel arguments, use numpy arrays for - arrays, use numpy.int32 or numpy.float32 for scalars.""", + arrays, use numpy.int32 or numpy.float32 for scalars. When the language + Generic Python is used, Torch Tensors and built-in Python data types are also + excepted""", "list", ), ), @@ -305,6 +310,22 @@ def __deepcopy__(self, _): "dict", ), ), + ( + "call_function", + ( + """When the language Generic Python is used, a call function that calls the kernel in the Python + DSL must be specified. The function must take the following positional arguments: + :kernel_function: the callable function with the tuning parameters inserted. + :args: list of kernel arguments, as provided by the user in the argument. + :kwargs: dictionary of kernel keyword arguments. If a tuning parameter is in the kernel signature, + the tuning parameter will be added as a keyword argument. + Optionally, the following arguments can be used. The order and name of the arguments must match. + :grid: the launch grid (tuple with 3 values), as computed by KernelTuner + :threads: the thread block size (tuple with 3 values), as computed by KernelTuner + :params: dictionary with the values of the tuning params for the specific configuration.""", + "function", + ), + ), ] ) @@ -621,12 +642,14 @@ def tune_kernel( observers=None, objective=None, objective_higher_is_better=None, + call_function=None, ): + start_overhead_time = perf_counter() if log: logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log) - kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines) + kernelsource = KernelSource(kernel_name, kernel_source, lang, defines, util.normalize_call_function(call_function)) _check_user_input(kernel_name, kernelsource, arguments, block_size_names) @@ -670,6 +693,10 @@ def tune_kernel( logging.debug("tuning_options: %s", util.get_config_string(tuning_options)) logging.debug("device_options: %s", util.get_config_string(device_options)) + # the user-specific call function may of may not have optional grid, threads and params + # arguments. We normalize it so that it always accepts all arguments. + kernel_options.call_function = util.normalize_call_function(kernel_options.call_function) + # check whether the selected strategy and options are valid strategy_string = strategy if strategy: @@ -851,11 +878,12 @@ def run_kernel( block_size_names=None, quiet=False, log=None, + call_function=None, ): if log: logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log) - kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines) + kernelsource = KernelSource(kernel_name, kernel_source, lang, defines, util.normalize_call_function(call_function)) _check_user_input(kernel_name, kernelsource, arguments, block_size_names) @@ -864,6 +892,10 @@ def run_kernel( kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()]) device_options = Options([(k, opts[k]) for k in _device_options.keys()]) + # the user-specific call function may of may not have optional grid, threads and params + # arguments. We normalize it so that it always accepts all arguments. + kernel_options.call_function = util.normalize_call_function(kernel_options.call_function) + # detect language and create the right device function interface dev = core.DeviceInterface(kernelsource, iterations=1, **device_options) @@ -883,8 +915,11 @@ def run_kernel( # see if the kernel arguments have correct type util.check_argument_list(instance.name, instance.kernel_string, arguments) - # compile the kernel - func = dev.compile_kernel(instance, False) + # compile the kernel, Extra logic needed to check if we have a python function + if lang.upper() == "TRITON" or lang.upper() == "GENERIC_PYTHON": + func = dev.compile_kernel(instance, False, gpu_args) + else: + func = dev.compile_kernel(instance, False) if func is None: raise RuntimeError("cannot compile kernel, too much shared memory used") @@ -903,19 +938,29 @@ def run_kernel( instance.delete_temp_files() # run the kernel - if not dev.run_kernel(func, gpu_args, instance): + if not dev.run_kernel_check(func, gpu_args, instance): raise RuntimeError("runtime error occured, too many resources requested") + + # copy data in GPU memory back to the host results = [] - for i, arg in enumerate(arguments): - if numpy.isscalar(arg): - results.append(arg) - elif isinstance(arg, torch.Tensor): - results.append(arg.cpu()) - else: - results.append(numpy.zeros_like(arg)) - dev.memcpy_dtoh(results[-1], gpu_args[i]) + if instance.kernel_source.lang == Language.GENERIC_PYTHON: + for arg in gpu_args: + if isinstance(arg, torch.Tensor): + results.append(arg.cpu()) + else: + results.append(arg) + else: + for i, arg in enumerate(arguments): + if numpy.isscalar(arg): + results.append(arg) + elif isinstance(arg, torch.Tensor): + results.append(arg.cpu()) + else: + results.append(numpy.zeros_like(arg)) + dev.memcpy_dtoh(results[-1], gpu_args[i]) + return results diff --git a/kernel_tuner/kernel_sources/__init__.py b/kernel_tuner/kernel_sources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kernel_tuner/kernel_sources/kernel_source.py b/kernel_tuner/kernel_sources/kernel_source.py new file mode 100644 index 000000000..51fb75d9b --- /dev/null +++ b/kernel_tuner/kernel_sources/kernel_source.py @@ -0,0 +1,82 @@ +import kernel_tuner.util as util + +from abc import abstractmethod + +from kernel_tuner.language import Language +from kernel_tuner.kernel_sources.model.prepared_kernel_source_data import PreparedKernelSourceData + + +class KernelSourceFactory(type): + ''' + Factory to dynamically determine if a KernelSource should be a KernelSourceStr or a KernelSourceFn. + if lang=generic_python, KernelSourceFn will be used. In all other cases, an instance of KernelSourceStr + is created. + + We create a kernelsouce by calling KernelSource(...). The call method for KernelSource is replaced by + the call method in KernelSourceFactory, because KernelSource uses that class as metaclass. Inside the + call method, a call to create either KernelSourceStr or KernelSourceFn is performed. This triggers the + __init__ call of the corresponding subclass. In both subclasses, we call super().__init__, which triggers + the __init__ call of the KernelSource class to initalize some common variables. + ''' + def __call__(cls, kernel_name, kernel_sources, lang, defines=None, call_function=None): + if lang == None: + language = None + else: + try: + language = Language(lang.upper()) + except ValueError: + raise TypeError(f"Supported languages are {[l.value for l in Language]}") + + # Determine if we need to create a KernelSourceStr or a KernelSourceFn + if (language and language == Language.GENERIC_PYTHON): + ks_str = False + else: + ks_str = True + + from kernel_tuner.kernel_sources.kernel_source_str import KernelSourceStr + from kernel_tuner.kernel_sources.kernel_source_fn import KernelSourceFn + if cls is KernelSource: + if ks_str: + return KernelSourceStr(kernel_name, kernel_sources, lang, defines) + else: + return KernelSourceFn(kernel_name, kernel_sources, lang, defines, call_function) + + # Else, normal behaviour for subclasses + if ks_str: + return super().__call__(kernel_name, kernel_sources, lang, defines) + else: + return super().__call__(kernel_name, kernel_sources, lang, defines, call_function) + + + +class KernelSource(metaclass=KernelSourceFactory): + + def __init__(self, kernel_name, kernel_sources, lang, defines=None): + if not isinstance(kernel_sources, list): + kernel_sources = [kernel_sources] + self.kernel_sources = kernel_sources + self.kernel_name = kernel_name + self.defines = defines + + if lang is None: + if callable(self.kernel_sources[0]): + raise TypeError("Please specify language when using a code generator function") + kernel_string = self.get_kernel_string(0) + language = util.detect_language(kernel_string) + self.lang = Language(language.upper()) + else: + try: + self.lang = Language(lang.upper()) + except ValueError: + raise TypeError(f"Supported languages are {[l.value for l in Language]}") + + + @abstractmethod + def prepare_kernel_instance(self, kernel_options, params, grid, threads) -> PreparedKernelSourceData: + raise NotImplementedError("create_kernel_instance not implemented") + + @abstractmethod + def check_argument_lists(self, kernel_name, arguments): + raise NotImplementedError("check_argument_lists not implemented") + + diff --git a/kernel_tuner/kernel_sources/kernel_source_fn.py b/kernel_tuner/kernel_sources/kernel_source_fn.py new file mode 100644 index 000000000..f806d5b87 --- /dev/null +++ b/kernel_tuner/kernel_sources/kernel_source_fn.py @@ -0,0 +1,304 @@ +import inspect +import ast +import copy +import uuid +import sys +import logging + +import astor +import tempfile +import importlib.util + +from typing import Any + +from kernel_tuner.language import Language +from kernel_tuner.kernel_sources.kernel_source import KernelSource +from kernel_tuner.kernel_sources.model.prepared_kernel_source_data import PreparedKernelSourceData +from kernel_tuner.util import get_kernel_ast, get_arg_names + + +class KernelSourceFn(KernelSource): + """ + Class that holds the Python-function-based kernel sources. + + There is a primary kernel source for function-based kernels in Python. The kernel_source + must be a path to the file where the kernel with kernel_name lives. The kernel can be + decorated by a JIT decorator. + + A call function to specify how the kernel should be launched must be supplied. The call + function must take the following arguments: + - kernel_function: the callable function with the tuning parameters inserted. + - args: list of kernel arguments, as provided by the user in the argument. + - kwargs: dictionary of kernel keyword arguments. If a tuning parameter is in the kernel signature, + the tuning parameter will be added as a keyword argument. + - grid: the launch grid (tuple with 3 values), as computed by KernelTuner + - threads: the thread block size (tuple with 3 values), as computed by KernelTuner + - params: dictionary with the values of the tuning params for this configuration. + """ + + def __init__(self, kernel_name, kernel_source, lang, defines=None, call_function=None): + super().__init__(kernel_name, kernel_source, lang, defines) + if isinstance(kernel_source, list): + raise ValueError("KernelSourceFn only supports a single kernel source") + + if self.lang == Language.GENERIC_PYTHON: + if call_function is None: + raise ValueError("call_function must be supplied for language Generic Python") + if not callable(call_function): + raise TypeError(f"call_function of type {type(call_function)} is not a callable object.") + self.call_function = call_function + + if not isinstance(kernel_name, str): + raise TypeError("kernel_name should be a string, got ", type(kernel_name)) + + source_ast = get_kernel_ast(kernel_name, kernel_source) + if isinstance(source_ast, tuple): # Class based kernel + self.source_tree = source_ast[0] + self.signature = get_arg_names(source_ast[1]) + else: + self.source_tree = source_ast + self.signature = get_arg_names(source_ast) + + self.kernel_fn = self.source_tree # This is where we will store the transformed source. + self.import_nodes = self._find_import_nodes(kernel_source) + self.dependencies = self._find_dependencies(kernel_source) + + + def prepare_kernel_instance(self, kernel_options, params, grid, threads): + ''' + Given the dictionary of tuning parameter values for this configuration, + generate a kernel instance with these tuning parameters inserted. kernel_options, + grid and threads are not needed for Python kernels. + ''' + # Test: see if removing signature params helps in Triton overhead + filtered_params = {} + for k, v in params.items(): + if k not in self.signature: + filtered_params[k] = v + + new_kernel_fn, temp_file_path = self.apply_params_to_source_fn(filtered_params) + self.kernel_fn = new_kernel_fn + + return PreparedKernelSourceData( + temp_files=[temp_file_path], + kernel_name=self.kernel_name, + kernel_fn=new_kernel_fn, + kernel_str=None + ) + + + def check_argument_lists(self, kernel_name, arguments): + ''' + Check if the kernel arguments have the correct types. + Not implemented for Python, because type hinting is not always supplied. + ''' + logging.debug("Checking of arguments list not supported yet for Python kernels") + return True + + + def apply_params_to_source_fn(self, params): + ''' + Create a module with the kernel imports and local dependencies from the kernel source file. + Find instances of the tuning parameters in the kernel and local dependencies and replace these + instances by the values of this configuration. + Return the new kernel and the path to the new module, where the new kernel lives. + ''' + transformer = ReplaceVars(params) + + # Create a new module to store the transformed AST in. + new_module = ast.Module(body=[], type_ignores=[]) + + # Add imports + new_module.body.extend(self.import_nodes) + + # Add dependencies (functions) + for dep_node in self.dependencies.values(): + dep_node_copy = copy.deepcopy(dep_node) + transformed_dep_node = transformer.visit(dep_node_copy) # apply tuning params to dependencies + new_module.body.append(transformed_dep_node) + + # Transform main kernel + source_tree_copy = copy.deepcopy(self.source_tree) + transformed_tree = transformer.visit(source_tree_copy) + + # Add transformed main kernel to new module + new_module.body.append(transformed_tree) + + # Fix locations and generate source + ast.fix_missing_locations(new_module) + new_source = astor.to_source(new_module) + + #print(new_source) + + # Create a unique module name and write new source to it. + module_name = f'temp_kernel_module_{uuid.uuid4().hex}' + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file: + temp_file.write(new_source) + temp_file_path = temp_file.name + + # Register the module in sys.modules before executing it + spec = importlib.util.spec_from_file_location(module_name, temp_file_path) + temp_module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = temp_module + spec.loader.exec_module(temp_module) + new_fn = getattr(temp_module, self.kernel_name) + + return new_fn, temp_file_path + + + def _find_import_nodes(self, source_file): + ''' + Parse kernel source file to find import statements. Return those + statements as AST import nodes. + ''' + with open(source_file, "r") as f: + tree = ast.parse(f.read(), filename=source_file) + + import_nodes = [] + for node in tree.body: + if isinstance(node, (ast.Import, ast.ImportFrom)): + import_nodes.append(node) + + return import_nodes + + + def _find_function_dependecies(self, tree, local_funcs): + ''' + Given an AST and a list of local function names, return the funtions that are invoked + somewhere in the file, and are in the local_funcs list. + ''' + class FunctionCallVisitor(ast.NodeVisitor): + def __init__(self): + self.called = set() + + def visit_Call(self, node): + if isinstance(node.func, ast.Name): + name = node.func.id + if name in local_funcs: + self.called.add(name) + self.generic_visit(node) + + visitor = FunctionCallVisitor() + visitor.visit(tree) + return visitor.called + + + def _find_dependencies(self, filepath): + ''' + Find all local function dependencies in the file where the kernel source is + defined. Return a dicionary indexed by the function node name with the function + body as value. + ''' + with open(filepath, "r") as f: + source_code = f.read() + tree = ast.parse(source_code, filename=filepath) + + # Find the locally defined functions in the source file + local_funcs = set() + for node in tree.body: + if isinstance(node, ast.FunctionDef): + local_funcs.add(node.name) + + # Given source file AST and the set of locally defined functions, find the names of + # the local functions that are invoked somewhere in the AST. + called_functions = self._find_function_dependecies(self.source_tree, local_funcs) + + # Traverse the tree one more time to save the dependency functions in a dictionary. + # Skip the main kernel. + dependencies = {} + for node in tree.body: + if (isinstance(node, ast.FunctionDef) and node.name in called_functions and + node.name != self.kernel_name): # Skip the main kernel itself + dependencies[node.name] = node + + return dependencies + + + def __del__(self): + ''' + Clean up temporary modules when the instance is destroyed + ''' + for key in list(sys.modules.keys()): + if key.startswith('temp_kernel_module_'): + del sys.modules[key] + + +class ReplaceVars(ast.NodeTransformer): + ''' + AST transformer that replaces occurrences of tuning parameters with constant values. + The tuning parameters (params) should be provided as a dictionary. + The transformation supports the following cases: + + - Variable reads: + occurrence of a variable name that matches a key in `params` are replaced if the + value is being read. + + - Object attribute reads: + expressions of the form `self.` are replaced with constants if + `` exists in `params` and is being read (not assigned to). + + - Assignments to parameters: + assignments like `param = ...` or `self.param = ...` are overridden, + replacing the right-hand side with the constant value from `params`. + + Examples: + Given `params = {"block_size": 32}`: + x = 2 * block_size -> x = 2 * 32 + x = 2 * self.block_size -> x = 2 * 32 + self.block_size = 64 -> self.block_size = 32 + block_size = 64 -> block_size = 32 + + ''' + + def __init__(self, params: dict): + self.params = params + + def visit_Name(self, node: ast.Name) -> Any: + if isinstance(node.ctx, ast.Load) and node.id in self.params.keys(): + return ast.copy_location( + ast.Constant(value=self.params[node.id]), + node + ) + return node + + + def visit_Attribute(self, node: ast.Attribute): + self.generic_visit(node) + + # Replace self. with constant, but only if it's being read + if ( + isinstance(node.value, ast.Name) + and node.value.id == "self" + and node.attr in self.params + and isinstance(node.ctx, ast.Load) + ): + return ast.copy_location(ast.Constant(self.params[node.attr]), node) + + return node + + + def visit_Assign(self, node: ast.Assign): + # Replace assignments like: mock_param = ... + if ( + len(node.targets) == 1 + and isinstance(node.targets[0], ast.Name) + and node.targets[0].id in self.params + ): + node.value = ast.Constant(value=self.params[node.targets[0].id]) + return node + + # Replace assignment like self. = ... + if ( + len(node.targets) == 1 + and isinstance(node.targets[0], ast.Attribute) + and isinstance(node.targets[0].value, ast.Name) + and node.targets[0].value.id == "self" + and node.targets[0].attr in self.params + ): + node.value = ast.Constant(value=self.params[node.targets[0].attr]) + return node + + return self.generic_visit(node) + + + diff --git a/kernel_tuner/kernel_sources/kernel_source_str.py b/kernel_tuner/kernel_sources/kernel_source_str.py new file mode 100644 index 000000000..095417392 --- /dev/null +++ b/kernel_tuner/kernel_sources/kernel_source_str.py @@ -0,0 +1,174 @@ +import kernel_tuner.util as util +import logging + +from kernel_tuner.kernel_sources.kernel_source import KernelSource +from kernel_tuner.core import wrap_templated_kernel +from kernel_tuner.kernel_sources.model.prepared_kernel_source_data import PreparedKernelSourceData +from kernel_tuner.language import Language + + +class KernelSourceStr(KernelSource): + """Class that holds the string-based kernel sources. + + There is a primary kernel source for string-based kernels., which can be either a source string, + a filename (indicating a file containing the kernel source code), + or a callable (generating the kernel source code). + There can additionally be (one or multiple) secondary kernel sources, which + must be filenames. + """ + + def __init__(self, kernel_name, kernel_sources, lang, defines=None): + super().__init__(kernel_name, kernel_sources, lang, defines) + + def prepare_kernel_instance(self, kernel_options, params, grid, threads): + name, kernel_string, temp_files = self.prepare_list_of_files( + kernel_name=kernel_options.kernel_name, + params=params, + grid=grid, + threads=threads, + block_size_names=kernel_options.block_size_names, + ) + + lang_is_cuda = self.lang in [Language.CUDA, Language.NVCUDA] + + if lang_is_cuda and "<" in name and ">" in name: + kernel_string, name = wrap_templated_kernel(kernel_string, name) + + return PreparedKernelSourceData( + temp_files=temp_files, + kernel_name=name, + kernel_str=kernel_string, + kernel_fn=None, + ) + + + + def get_kernel_string(self, index=0, params=None): + """Retrieve the kernel source with the given index and return as a string. + + See util.get_kernel_string() for details. + + :param index: Index of the kernel source in the list of sources. + :type index: int + + :param params: Dictionary containing the tunable parameters for this specific + kernel instance, only needed when kernel_source is a generator. + :type param: dict + + :returns: A string containing the kernel code. + :rtype: string + """ + logging.debug("get_kernel_string called") + + if hasattr(self, 'lang') and self.lang == Language.HYPERTUNER: + return "" + + kernel_source = self.kernel_sources[index] + return util.get_kernel_string(kernel_source, params) + + def prepare_list_of_files( + self, kernel_name, params, grid, threads, block_size_names + ): + """Prepare the kernel string along with any additional files. + + The first file in the list is allowed to include or read in the others + The files beyond the first are considered additional files that may also contain tunable parameters + + For each file beyond the first this function creates a temporary file with + preprocessors statements inserted. Occurrences of the original filenames in the + first file are replaced with their temporary counterparts. + + :param kernel_name: A string specifying the kernel name. + :type kernel_name: string + + :param params: A dictionary with the tunable parameters for this particular + instance. + :type params: dict() + + :param grid: The grid dimensions for this instance. The grid dimensions are + also inserted into the code as if they are tunable parameters for + convenience. + :type grid: tuple() + + :param threads: The thread block dimensions for this instance. The thread block are + also inserted into the code as if they are tunable parameters for + convenience. + :type threads: tuple() + + :param block_size_names: A list of strings that denote the names + for the thread block dimensions. + :type block_size_names: list(string) + + """ + temp_files = dict() + + if self.lang == Language.HYPERTUNER: + return tuple(["", "", temp_files]) + + for i, f in enumerate(self.kernel_sources): + if i > 0 and not util.looks_like_a_filename(f): + raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames") + + ks = self.get_kernel_string(i, params) + # add preprocessor statements + n, ks = util.prepare_kernel_string( + kernel_name, + ks, + params, + grid, + threads, + block_size_names, + self.lang, + self.defines, + ) + + if i == 0: + # primary kernel source + name = n + kernel_string = ks + continue + + # save secondary kernel sources to temporary files + + # generate temp filename with the same extension + temp_file = util.get_temp_filename(suffix="." + f.split(".")[-1]) + temp_files[f] = temp_file + util.write_file(temp_file, ks) + # replace occurrences of the additional file's name in the first kernel_string with the name of the temp file + kernel_string = kernel_string.replace(f, temp_file) + + return name, kernel_string, temp_files + + def get_user_suffix(self, index=0): + """Get the suffix of the kernel filename, if the user specified one. Return None otherwise.""" + if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]): + return "." + self.kernel_sources[index].split(".")[-1] + return None + + def get_suffix(self, index=0): + """Return a suitable suffix for a kernel filename. + + This uses the user-specified suffix if available, or one based on the + lang/backend otherwise. + """ + # TODO: Consider delegating this to the backend + suffix = self.get_user_suffix(index) + if suffix is not None: + return suffix + + _suffixes = {Language.CUDA: ".cu", Language.OPENCL: ".cl", Language.C: ".c"} + try: + return _suffixes[self.lang] + except KeyError: + return ".c" + + def check_argument_lists(self, kernel_name, arguments): + """Check if the kernel arguments have the correct types. + + This is done by calling util.check_argument_list on each kernel string. + """ + for i, f in enumerate(self.kernel_sources): + if not callable(f): + util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments) + else: + logging.debug("Checking of arguments list not supported yet for code generators.") \ No newline at end of file diff --git a/kernel_tuner/kernel_sources/model/__init__.py b/kernel_tuner/kernel_sources/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kernel_tuner/kernel_sources/model/prepared_kernel_source_data.py b/kernel_tuner/kernel_sources/model/prepared_kernel_source_data.py new file mode 100644 index 000000000..b9c904b38 --- /dev/null +++ b/kernel_tuner/kernel_sources/model/prepared_kernel_source_data.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass +from typing import Any + + +@dataclass +class PreparedKernelSourceData: + temp_files: Any + kernel_name: str + kernel_str: Any + kernel_fn: Any \ No newline at end of file diff --git a/kernel_tuner/kernelbuilder.py b/kernel_tuner/kernelbuilder.py index 0f3f6154f..e2039e66d 100644 --- a/kernel_tuner/kernelbuilder.py +++ b/kernel_tuner/kernelbuilder.py @@ -4,6 +4,7 @@ from kernel_tuner.interface import Options, _kernel_options from kernel_tuner.integration import TuneResults +from kernel_tuner.kernel_sources.kernel_source import KernelSource class PythonKernel(object): @@ -30,7 +31,7 @@ def __init__(self, kernel_name, kernel_string, problem_size, arguments, params=N """ #construct device interface - kernel_source = core.KernelSource(kernel_name, kernel_string, lang) + kernel_source = KernelSource(kernel_name, kernel_string, lang) self.dev = core.DeviceInterface(kernel_source, device=device, quiet=True) if not params: params = {} @@ -92,7 +93,7 @@ def run_kernel(self, args): :type args: list(np.ndarray or np.generic) """ self.update_gpu_args(args) - self.dev.run_kernel(self.func, self.gpu_args, self.kernel_instance) + self.dev.run_kernel_check(self.func, self.gpu_args, self.kernel_instance) return self.get_gpu_result(args) def __call__(self, *args): diff --git a/kernel_tuner/language.py b/kernel_tuner/language.py new file mode 100644 index 000000000..02948c12b --- /dev/null +++ b/kernel_tuner/language.py @@ -0,0 +1,19 @@ +from enum import Enum + +class Language(Enum): + CUDA = "CUDA" + OPENCL = "OPENCL" + C = "C" + HIP = "HIP" + FORTRAN = "FORTRAN" + NVCUDA = "NVCUDA" + GENERIC_PYTHON = "GENERIC_PYTHON" + CUPY = "CUPY" + HYPERTUNER = "HYPERTUNER" + + @classmethod + def is_valid(cls, value: str) -> bool: + """ + Test if a language is valid in Kernel Tuner framework. + """ + return value in cls._value2member_map_ \ No newline at end of file diff --git a/kernel_tuner/observers/generic_python.py b/kernel_tuner/observers/generic_python.py new file mode 100644 index 000000000..62dfa20c7 --- /dev/null +++ b/kernel_tuner/observers/generic_python.py @@ -0,0 +1,32 @@ +import numpy as np + +from kernel_tuner.observers.observer import BenchmarkObserver + +try: + import torch +except (ImportError, RuntimeError): + torch = None + + +class GenericPythonRuntimeObserver(BenchmarkObserver): + """Observer that measures time using CUDA events during benchmarking.""" + + def __init__(self, dev): + if torch is None: + raise ImportError("Unable to load torch") + + self.dev = dev + self.stream = dev.stream + self.start = dev.start + self.end = dev.end + self.times = [] + + def after_finish(self): + # Time is measured in milliseconds + event_elapsed_time = self.start.elapsed_time(self.end) + self.times.append(event_elapsed_time) + + def get_results(self): + results = {"time": np.average(self.times), "times": self.times.copy()} + self.times = [] + return results \ No newline at end of file diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 0163083a1..b0d284b2a 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -133,6 +133,9 @@ def check_argument_type(dtype, kernel_argument): def check_argument_list(kernel_name, kernel_string, args): """Raise an exception if kernel arguments do not match host arguments.""" + if kernel_string is None: + return + cp = _get_cupy() cupy_ndarray = (cp.ndarray,) if cp is not None else () kernel_arguments = list() @@ -666,6 +669,82 @@ def get_kernel_string(kernel_source, params=None): return kernel_string +def get_kernel_ast(kernel_name, filepath): + ''' + Util function for Generic Python Backend that returns the kernel function as AST. + + :param kernel_name: name of the kernel (as passed by the user) + :type kernel_name: string + + :param filepath: the path to the file where the kernel lives. (passed by the user as kenrel_source) + :type filepath: string or Path containing a filename that points to the kernel source + + :returns: ast.FunctionDef node in case the kernel is a function or a tuple + (ast.ClassDef node, ast.FunctionDef node) in case the kernel is represented as the __call__ function + of a class (for Tilus support). + ''' + if isinstance(filepath, Path): + source = read_file(filepath) + elif isinstance(filepath, str): + with open(filepath, "r") as f: + source = f.read() + else: + raise TypeError("Error kernel_source does not specify a path to a file") + + tree = ast.parse(source) + + # Function based kernels + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name == kernel_name: + return node + + # Class based kernels + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and node.name == kernel_name: + class_node = node + break + + if not class_node: + raise ValueError(f"Kernel {kernel_name} not found in {filepath}") + + # Search for __call__ function within class + for node in class_node.body: + if isinstance(node, ast.FunctionDef) and node.name == "__call__": + call_node = node + return (class_node, call_node) + + if not call_node: + raise ValueError(f"No __call__ function found inside Class {kernel_name}") + + + +def get_arg_names(func_node: ast.FunctionDef): + """ + Util to get the argument names from a Python AST function definition. This is needed to check + if there are any tunable parameters in the function signature. + """ + args = func_node.args + names = [] + + names += [arg.arg for arg in args.args] + names += [arg.arg for arg in args.posonlyargs] + names += [arg.arg for arg in args.kwonlyargs] + + # *args + if args.vararg: + names.append(args.vararg.arg) + + # **kwargs + if args.kwarg: + names.append(args.kwarg.arg) + + # Remove self for classes. + if 'self' in names: + names.remove('self') + + return names + + def get_problem_size(problem_size, params): """Compute current problem size.""" if callable(problem_size): @@ -1014,6 +1093,33 @@ def has_kw_argument(func, name): return lambda answer, result_host, atol: v(answer, result_host) +def normalize_call_function(v): + """Normalize a user-specified call function for language Generic Python. + + The user-specified function has three required positional arguments (kernel_function, + args, kwargs), and three optional keyword arguments: grid, threads and params. The + optional keyword arguments should appear in that order. We normalize the function + so that it always accepts grid, threads and params. + + Undefined behaviour if the passed function does not match the required signatures. + """ + def has_kw_argument(func, name): + sig = signature(func) + return name in sig.parameters + + if v is None: + return None + + if has_kw_argument(v, "grid"): + if has_kw_argument(v, "threads"): + if has_kw_argument(v, "params"): + return v + return lambda kernel_function, args, kwargs, grid, threads, params: v(kernel_function, args, kwargs, grid, threads) + return lambda kernel_function, args, kwargs, grid, threads, params: v(kernel_function, args, kwargs, grid) + return lambda kernel_function, args, kwargs, grid, threads, params: v(kernel_function, args, kwargs) + + + def parse_restrictions( restrictions: list[str], tune_params: dict, monolithic=False, format=None ) -> list[tuple[Union[Constraint, str], list[str]]]: diff --git a/test/context.py b/test/context.py index 7d94ac99b..ffa283c72 100644 --- a/test/context.py +++ b/test/context.py @@ -74,6 +74,12 @@ except ImportError: bayes_opt_gpytorch_present = False +try: + import torch + gen_python_torch_present = True +except ImportError: + gen_python_torch_present = False + try: import pyatf pyatf_present = True @@ -110,6 +116,7 @@ skip_if_no_hip = pytest.mark.skipif(not hip_present, reason="No HIP Python found") skip_if_no_pyatf = pytest.mark.skipif(not pyatf_present, reason="PyATF not installed") skip_if_no_methodology = pytest.mark.skipif(not methodology_present, reason="Autotuning Methodology not found") +skip_if_no_torch = pytest.mark.skipif(not gen_python_torch_present, reason="Torch not installed") def skip_backend(backend: str): @@ -129,3 +136,5 @@ def skip_backend(backend: str): pytest.skip("No nvc++ on PATH") elif backend.upper() == "HIP" and not hip_present: pytest.skip("HIP Python not installed") + elif backend.upper() == "GENERIC_PYTHON" and not gen_python_torch_present: + pytest.skip("Torch not installed") diff --git a/test/test_backend.py b/test/test_backend.py index e694649c1..af9971a48 100644 --- a/test/test_backend.py +++ b/test/test_backend.py @@ -4,8 +4,9 @@ skip_if_no_cuda, skip_if_no_opencl, skip_if_no_pycuda, + skip_if_no_torch, ) -from kernel_tuner.backends import backend, compiler, cupy, nvcuda, opencl, pycuda +from kernel_tuner.backends import backend, compiler, cupy, nvcuda, opencl, pycuda, generic_python class WrongBackend(backend.Backend): @@ -45,3 +46,8 @@ def test_opencl_backend(): @skip_if_no_pycuda def test_pycuda_backend(): dev = pycuda.PyCudaFunctions() + + +@skip_if_no_torch +def test_generic_python_backend(): + dev = generic_python.GenericPythonFunctions() \ No newline at end of file diff --git a/test/test_generic_python_functions.py b/test/test_generic_python_functions.py new file mode 100644 index 000000000..b8cddea88 --- /dev/null +++ b/test/test_generic_python_functions.py @@ -0,0 +1,129 @@ +from .context import skip_if_no_torch +from .test_kernel_source_fn import call_mock +from kernel_tuner.core import DeviceInterface, KernelInstance +from kernel_tuner.kernel_sources.kernel_source import KernelSource +import numpy as np +from pathlib import Path +import os + +try: + import torch + torch_present = True +except ImportError: + pass + +KS_FILE = os.path.join(Path(__file__).resolve().parent, "test_kernel_source_fn.py") + +# Helper functions ------------------------------ + +def value_equal(a, b): + # Torch tensors + if isinstance(a, torch.Tensor): + return torch.equal(a, b) + + # NumPy arrays + if isinstance(a, np.ndarray): + return np.array_equal(a, b) + + # Fallback (ints, floats, strings, lists, tuples, dicts) + return a == b + +def get_context(): + params = {'mock_param': 64} + a = 42 + b = torch.randn(12, device='cuda', dtype=torch.float32) + args = [a, b] + ks = KernelSource("mock_kernel", KS_FILE, "generic_python", call_function=call_mock) + return ks, args, params + + +# Tests ---------------------------------------------- + +@skip_if_no_torch +def test_ready_argument_list(): + ks, args, params = get_context() + dev = DeviceInterface(ks) + + gpu_args = dev.ready_argument_list(args) + + assert len(args) == len(gpu_args) + + # arg 0: python scalar + assert isinstance(gpu_args[0], int) + assert gpu_args[0] == args[0] + + # arg 1: torch cuda tensor + assert isinstance(gpu_args[1], torch.Tensor) + assert gpu_args[1].is_cuda + + # values equal + assert torch.allclose(gpu_args[1], args[1]) + + # ensure deep copy + assert gpu_args[1] is not args[1] + + +@skip_if_no_torch +def test_compile(): + ks, args, params = get_context() + + instance_data = ks.prepare_kernel_instance( + kernel_options=None, + params=params, + grid=None, + threads=None, + ) + dev = DeviceInterface(ks) + instance = KernelInstance( + name="mock_kernel", + kernel_source=ks, + kernel_string=None, + kernel_fn=instance_data.kernel_fn, + temp_files=instance_data.temp_files, + threads=None, + grid=None, + params=params, + arguments=None, + ) + gpu_args = dev.ready_argument_list(args) + + callable_fn = dev.compile_kernel(instance, verbose=False, gpu_args=gpu_args) + + # The mock function return the mock_param, which should be set to 64. + assert callable_fn(*args) == 64 + + +@skip_if_no_torch +def test_gpu_kwargs(): + params = {'mock_param': 64} + a = torch.randn(12, device='cuda', dtype=torch.float32) + args = [a] # we do not have to specify the kwarg here + ks = KernelSource("kernel_with_kwarg", KS_FILE, "generic_python", call_function=call_mock) + dev = DeviceInterface(ks) + + instance_data = ks.prepare_kernel_instance( + kernel_options=None, + params=params, + grid=None, + threads=None, + ) + dev = DeviceInterface(ks) + instance = KernelInstance( + name="kernel_with_kwarg", + kernel_source=ks, + kernel_string=None, + kernel_fn=instance_data.kernel_fn, + temp_files=instance_data.temp_files, + threads=None, + grid=None, + params=params, + arguments=None, + ) + gpu_args = dev.ready_argument_list(args) + callable_fn = dev.compile_kernel(instance, verbose=False, gpu_args=gpu_args) + + kwargs = dev.dev.gpu_kwargs + + assert kwargs["mock_param"] == 64 + assert callable_fn(*args, **kwargs) == 64 + diff --git a/test/test_kernel_source_fn.py b/test/test_kernel_source_fn.py new file mode 100644 index 000000000..d02be9e4d --- /dev/null +++ b/test/test_kernel_source_fn.py @@ -0,0 +1,172 @@ +import os +from pathlib import Path +import pytest +import inspect +import ast +import functools # for an example decorator +from kernel_tuner.kernel_sources.kernel_source import KernelSource +# Note: need subclass imports to test for types and instantiate subclasses direclty. +# Normally, just importing KernelSource is enough. +from kernel_tuner.kernel_sources.kernel_source_fn import KernelSourceFn +from kernel_tuner.kernel_sources.kernel_source_str import KernelSourceStr + +KS_FILE = Path(__file__).resolve() +# Helper functions -------------------------------------------- + +def normalize_ast(src: str): + return ast.dump(ast.parse(src), include_attributes=False) + +def mock_kernel(a, b): + mock_param = 256 + if a < mock_param: + b = 42 + return mock_param + +def kernel_with_kwarg(a, mock_param): + return mock_param + +# NOTE should kernel_with_kwarg now return the tuning param value for mock param or 5? +def kernel_with_dependency(): + mock_param = 42 + foo = kernel_with_kwarg(mock_param, 5) + return mock_kernel(42, 42) + +def call_mock(kernel_function, args, kwargs, grid, threads, params): + kernel_function(*args, **kwargs) + +class TilusLike(): + def __init__(self): + self.mock_param = 32 + + def __call__(self): + return self.mock_param + + def another_function(self): + return self.mock_param + +@functools.lru_cache() +def kernel_with_decorator(): + mock_param = 42 + return mock_param + + +# Tests ------------------------------------------------------ + +def test_factory_behaviour(): + # KernelSourceFn should only be created when language generic_python is supplied + ks_fn = KernelSource("mock_kernel", KS_FILE, "generic_python", call_function=call_mock) + ks_str = KernelSource("vector_add", 'extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {', lang=None) + + assert isinstance(ks_fn, KernelSourceFn) + assert isinstance(ks_str, KernelSourceStr) + + +def test_initiation(): + ''' + Test invalid KernelSourceFn initations + ''' + with pytest.raises(ValueError, match=r"call_function must be supplied for language .*"): + KernelSource("mock_kernel", KS_FILE, "generic_python") + + with pytest.raises(FileNotFoundError, match=r".* No such file or directory: .*"): + KernelSource("mock_kernel", "This is a string Kernel", "generic_python", call_function=call_mock) + + with pytest.raises(TypeError, match="Error kernel_source does not specify a path to a file"): + KernelSource("mock_kernel", mock_kernel, "generic_python", call_function=call_mock) + + with pytest.raises(ValueError, match=r"KernelSourceFn only supports a single kernel source"): + KernelSource("mock_kernel", [KS_FILE, "another file"], "generic_python", call_function=call_mock) + + with pytest.raises(TypeError, match=r".* is not a callable object"): + KernelSource("mock_kernel", KS_FILE, "generic_python", call_function="not a function") + + + +def test_param_subsitution(): + params = {"mock_param": 128} + ks = KernelSourceFn("mock_kernel", KS_FILE, "generic_python", call_function=call_mock) + new_kernel_fn, _ = ks.apply_params_to_source_fn(params) + + actual_src = inspect.getsource(new_kernel_fn) + expected_src = """ +def mock_kernel(a, b): + mock_param = 128 + if a < 128: + b = 42 + return 128 +""" + + assert normalize_ast(actual_src) == normalize_ast(expected_src) + + +def test_imports(): + ''' + Import statements that are present in the file where the function lives + should also be present in the file where the new function lives. + ''' + + params = {"mock_param": 128} + ks = KernelSourceFn("mock_kernel", KS_FILE, "generic_python", call_function=call_mock) + _, temp_path = ks.apply_params_to_source_fn(params) + + # Check if imports are present + with open(temp_path) as f: + full_src = f.read() + + assert "import pytest" in full_src + assert "import inspect" in full_src + + +def test_param_substitution_class(): + ''' + Tilus uses the __call__ function of a class to define its kernel. Therefore, + param substitution should also work classes. Param substiution in other class + functions is also supported. + ''' + + params = {"mock_param": 128} + ks = KernelSourceFn("TilusLike", KS_FILE, "generic_python", call_function=call_mock) + + new_kernel_fn, _ = ks.apply_params_to_source_fn(params) + + actual_src = inspect.getsource(new_kernel_fn) + expected_src = """ +class TilusLike(): + def __init__(self): + self.mock_param = 128 + + def __call__(self): + return 128 + + def another_function(self): + return 128 +""" + assert normalize_ast(actual_src) == normalize_ast(expected_src) + + + +def test_decorator(): + params = {"mock_param": 128} + ks = KernelSourceFn("kernel_with_decorator", KS_FILE, "generic_python", call_function=call_mock) + new_kernel_fn, _ = ks.apply_params_to_source_fn(params) + actual_src = inspect.getsource(new_kernel_fn) + expected_src = """ +@functools.lru_cache() +def kernel_with_decorator(): + mock_param = 128 + return 128 +""" + assert hasattr(new_kernel_fn, "__wrapped__") + assert normalize_ast(actual_src) == normalize_ast(expected_src) + + +def test_dependencies(): + params = {"mock_param": 128} + ks = KernelSourceFn("kernel_with_dependency", KS_FILE, "generic_python", call_function=call_mock) + new_kernel_fn, _ = ks.apply_params_to_source_fn(params) + res = new_kernel_fn() # This should not throw an error if the dependency exists in the module. + + assert res == 128 + + + diff --git a/test/test_observers.py b/test/test_observers.py index 69a94a9bb..d1a27179c 100644 --- a/test/test_observers.py +++ b/test/test_observers.py @@ -4,6 +4,7 @@ from kernel_tuner.observers.nvml import NVMLObserver from kernel_tuner.observers.observer import BenchmarkObserver from kernel_tuner.observers.register import RegisterObserver +from kernel_tuner.language import Language from .context import ( skip_if_no_cuda, @@ -59,7 +60,7 @@ def get_results(self): result, _ = kernel_tuner.tune_kernel(*env_compiler, observers=[lambda args: MyObserver(args)], compiler_options=["-fopenmp"]) # Check if the observer has correctly received the lang option - assert result[0]["observer_args"]["lang"] == "C" + assert result[0]["observer_args"]["lang"] == Language.C @skip_if_no_pycuda def test_register_observer_pycuda(env): diff --git a/test/test_util_functions.py b/test/test_util_functions.py index ba8727765..66710a9ba 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -672,6 +672,27 @@ def verify2(answer, result_host, atol): assert v(1, 2, atol=3) + +def test_normalize_call_function_none(): + assert normalize_call_function(None) is None + +@pytest.mark.parametrize( + "func", + [ + lambda f, a, k: f(*a, **k), + lambda f, a, k, grid: f(*a, **k), + lambda f, a, k, grid, threads: f(*a, **k), + lambda f, a, k, grid, threads, params: f(*a, **k), + ], +) +def test_normalize_call_function(func): + v = normalize_call_function(func) + + def kernel(x): + return x + 1 + + assert v(kernel, (1,), {}, grid=1, threads=2, params=3) == 2 + class MockRunner: simulation_mode = False