import numpy as np
from numba import cuda

import mpi4py

# Defer MPI initialisation/finalisation so Init_thread can be called manually
mpi4py.rc.initialize = False
mpi4py.rc.finalize = False
from mpi4py import MPI

from matmul.utils import create_block, read_config
import argparse
import importlib

from line_profiler import profile

@profile
def main_cpu(params: dict):
    SIZE = params["size"]
    md = importlib.import_module("matmul")
    routine = getattr(md, params["function"]["routine"])
    bs = params["function"]["block_size"]

    # Initialise MPI with multithreading enabled; a provided level higher
    # than the requested one is acceptable, so only complain if it is lower
    provided = MPI.Init_thread(MPI.THREAD_FUNNELED)
    if provided < MPI.THREAD_FUNNELED:
        print("Unable to provide required thread level")

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    npes = comm.Get_size()

    # Share the rows as evenly as possible: the first `rest` ranks get one extra
    rest = SIZE % npes
    n_loc = SIZE // npes + (rank < rest)

    workloads = np.array([SIZE // npes + (i < rest) for i in range(npes)], dtype=int)

    # Index of this rank's first row within the global matrix
    row_offset = np.cumsum(workloads)[rank - 1] if rank > 0 else 0
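    # For example (illustrative): with SIZE = 10 and npes = 4, rest = 2, so
    # workloads = [3, 3, 2, 2] and the row offsets are 0, 3, 6 and 8; every
    # row is owned by exactly one rank and sum(workloads) == SIZE.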

    # Initialise the local blocks: A holds consecutive values so results are
    # easy to inspect, B is this rank's row-slice of the identity matrix
    A = np.arange(1, SIZE * n_loc + 1, dtype=np.float64).reshape((n_loc, SIZE)) + (row_offset * SIZE)
    B = np.zeros((n_loc, SIZE), dtype=np.float64)
    C = np.zeros((n_loc, SIZE), dtype=np.float64)
    for i in range(n_loc):
        B[i, i + row_offset] = 1
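    # Since the assembled B is the SIZE x SIZE identity, the product should
    # reproduce A in C, which gives a simple built-in correctness check.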

    # Compute quantities for Allgatherv and allocate the required memory
    ncols = workloads[0]
    rcvcounts = workloads * ncols
    displacements = np.cumsum(rcvcounts) - rcvcounts

    B_block = np.empty((n_loc, ncols), dtype=np.float64)
    B_col = np.empty((SIZE, ncols), dtype=np.float64)
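    # Each rank contributes an (n_loc x ncols) block, i.e. workloads[i] * ncols
    # contiguous doubles; the gathered blocks stack row-wise into B_col, which
    # then holds a full SIZE x ncols column-slice of B.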

    t_tot = 0
    start = 0
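    # Block-column algorithm: at step i every rank gathers the current column
    # block of B (row-slices stitched together across ranks) and multiplies
    # its local rows of A by it, filling the matching columns of C.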
    for i in range(npes):
        # Once the ranks holding one extra row are exhausted the block width
        # shrinks, so the Allgatherv counts and buffers must be recomputed
        if i == rest:
            ncols = workloads[i]
            rcvcounts = workloads * ncols
            displacements = np.cumsum(rcvcounts) - rcvcounts

            B_block = np.empty((n_loc, ncols), dtype=np.float64)
            B_col = np.empty((SIZE, ncols), dtype=np.float64)

        # Create a contiguous block from B to communicate
        create_block(B, B_block, start, ncols)
        # Gather all pieces of B from the other processes
        comm.Allgatherv([B_block, MPI.DOUBLE], [B_col, rcvcounts, displacements, MPI.DOUBLE])

        t1 = MPI.Wtime()
        # Multiply the local rows of A by the gathered column-slice of B
        routine(A, B_col, C[:, start:start + ncols], bs)
        t2 = MPI.Wtime()
        t_tot += (t2 - t1)

        start += ncols

    print(t_tot)

    if params["print"]:
        if rank == 0:
            print(C)
            for i in range(1, npes):
                # Lowercase recv allocates and returns the received block
                # itself, so no buffer needs to be pre-allocated here
                block = comm.recv(source=i, tag=i)
                print(block)
        else:
            comm.send(C, dest=0, tag=rank)

    MPI.Finalize()

@profile
def main_gpu(params: dict):
    SIZE = params["size"]
    md = importlib.import_module("matmul")
    routine = getattr(md, params["function"]["routine"])
    bs = params["function"]["block_size"]

    # Initialise MPI with multithreading enabled, as in main_cpu
    provided = MPI.Init_thread(MPI.THREAD_FUNNELED)
    if provided < MPI.THREAD_FUNNELED:
        print("Unable to provide required thread level")

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    npes = comm.Get_size()

    # Same even row distribution as in main_cpu
    rest = SIZE % npes
    n_loc = SIZE // npes + (rank < rest)

    workloads = np.array([SIZE // npes + (i < rest) for i in range(npes)], dtype=int)

    row_offset = np.cumsum(workloads)[rank - 1] if rank > 0 else 0

    # Initialise the local blocks: A holds consecutive values, B is this
    # rank's row-slice of the identity matrix
    A = np.arange(1, SIZE * n_loc + 1, dtype=np.float64).reshape((n_loc, SIZE)) + (row_offset * SIZE)
    B = np.zeros((n_loc, SIZE), dtype=np.float64)
    C = np.zeros((n_loc, SIZE), dtype=np.float64)
    for i in range(n_loc):
        B[i, i + row_offset] = 1

    # Compute quantities for Allgatherv and allocate the required memory
    ncols = workloads[0]
    rcvcounts = workloads * ncols
    displacements = np.cumsum(rcvcounts) - rcvcounts

    B_block = np.empty((n_loc, ncols), dtype=np.float64)
    B_col = np.empty((SIZE, ncols), dtype=np.float64)

    # Select a GPU round-robin over ranks (assuming the ranks per node do not
    # outnumber the visible devices) and move the persistent arrays to it
    num_devices = len(cuda.gpus)
    cuda.select_device(rank % num_devices)
    a_d = cuda.to_device(A)
    c_d = cuda.to_device(C)

    # 2D launch configuration covering the full n_loc x SIZE output block
    nthreads = bs
    blocks_per_grid = ((n_loc + nthreads - 1) // nthreads, (SIZE + nthreads - 1) // nthreads)
    threads_per_block = (nthreads, nthreads)
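    # For example (illustrative): with bs = 16 each block has 16 x 16 = 256
    # threads, and the grid is rounded up so that every element of the local
    # C block is covered; the kernel is expected to bounds-check the excess.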

    t_tot = 0
    start = 0
    for i in range(npes):
        # Once the ranks holding one extra row are exhausted the block width
        # shrinks, so the Allgatherv counts and buffers must be recomputed
        if i == rest:
            ncols = workloads[i]
            rcvcounts = workloads * ncols
            displacements = np.cumsum(rcvcounts) - rcvcounts

            B_block = np.empty((n_loc, ncols), dtype=np.float64)
            B_col = np.empty((SIZE, ncols), dtype=np.float64)

        # Create a contiguous block from B to communicate
        create_block(B, B_block, start, ncols)
        # Gather all pieces of B from the other processes
        comm.Allgatherv([B_block, MPI.DOUBLE], [B_col, rcvcounts, displacements, MPI.DOUBLE])

        # Move the gathered slice of B to the device (a fresh upload each step)
        b_d = cuda.to_device(B_col)

        # Time only the kernel itself using CUDA events
        t1 = cuda.event(timing=True)
        t2 = cuda.event(timing=True)
        t1.record()
        # Multiply into the matching column-slice of C on the device
        routine[blocks_per_grid, threads_per_block](a_d, b_d, c_d[:, start:start + ncols])
        t2.record()
        t2.synchronize()

        # event_elapsed_time returns milliseconds; convert to seconds
        t_tot += cuda.event_elapsed_time(t1, t2) / 1000

        start += ncols

    # Move the final result back to the host
    C = c_d.copy_to_host()

    print(t_tot)

    if params["print"]:
        if rank == 0:
            print(C)
            for i in range(1, npes):
                # Lowercase recv allocates and returns the received block
                # itself, so no buffer needs to be pre-allocated here
                block = comm.recv(source=i, tag=i)
                print(block)
        else:
            comm.send(C, dest=0, tag=rank)

    MPI.Finalize()


cpu_routines = ['matmul',
                'matmul_numba_serial',
                'matmul_numba_cpu',
                'matmul_numba_block_serial',
                'matmul_numba_block_cpu']

gpu_routines = ['matmul_numba_gpu',
                'matmul_numba_block_gpu']

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="Path to the config yaml file")
    args = parser.parse_args()

    if not args.config:
        raise RuntimeError("Please specify a yaml config file with `--config <filename>`.")
    params = read_config(args.config)
    routine = params["function"]["routine"]

    if params["device"] == "cpu":
        if routine not in cpu_routines:
            raise ValueError(f"Specified routine '{routine}' is incompatible with device 'cpu'. Compatible routines are {cpu_routines}.")
        main_cpu(params)
    elif params["device"] == "gpu":
        if routine not in gpu_routines:
            raise ValueError(f"Specified routine '{routine}' is incompatible with device 'gpu'. Compatible routines are {gpu_routines}.")
        main_gpu(params)
    else:
        raise ValueError(f"Parameter `device` can be either 'cpu' or 'gpu', instead got {params['device']}.")
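
# Example usage (illustrative; the script and config file names are
# hypothetical, but the keys match those read above):
#
#   # config.yaml
#   size: 1000
#   device: cpu
#   print: false
#   function:
#     routine: matmul_numba_cpu
#     block_size: 64
#
#   $ mpirun -n 4 python main.py --config config.yaml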