Skip to content

Commit ad6d8c3

Browse files
Lots of stuff:
- Implemented matmul routines: serial, parallel, CPU, GPU, and blocked; implemented a script to run the multiplications; added config-file support to run.py; added tests for shared-memory and distributed (MPI) execution, all with CPU and GPU; updated requirements, pyproject and .gitignore
1 parent f5b5891 commit ad6d8c3

11 files changed

Lines changed: 687 additions & 6 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,5 @@ data/
175175
*.data
176176
# lightning logs
177177
lightning_logs/
178+
# random stuff
179+
debugging/

experiments/config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
device: gpu
size: 20
function:
  routine: matmul_numba_gpu
  block_size: 16
print: True

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
44

55

66
[project]
7-
name = "pyclassify"
7+
name = "matmul"
88
version = "0.0.1"
99
description = "Gabriele Codega"
1010
readme = "README.md"
11-
requires-python = ">=3.9"
11+
requires-python = ">=3.11"
1212
license = { file = "LICENSE" }
1313
authors = [{ name = "Gabriele Codega", email = "[email protected]" }]
1414
dynamic = ["dependencies"]
@@ -21,4 +21,4 @@ exclude = ["scripts", "tests", "shell", "experiments"]
2121
dependencies = { file = ["requirements.txt"] }
2222

2323
[project.optional-dependencies]
24-
test = ["pytest"]
24+
test = ["pytest"]

requirements.txt

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,35 @@
1-
exceptiongroup==1.2.2
1+
filelock==3.17.0
2+
fsspec==2025.2.0
23
iniconfig==2.0.0
4+
Jinja2==3.1.5
5+
line_profiler==4.2.0
6+
llvmlite==0.44.0
7+
MarkupSafe==3.0.2
8+
mpi4py==4.0.3
9+
mpmath==1.3.0
10+
networkx==3.4.2
11+
numba==0.61.0
12+
numpy==2.1.3
13+
nvidia-cublas-cu12==12.4.5.8
14+
nvidia-cuda-cupti-cu12==12.4.127
15+
nvidia-cuda-nvrtc-cu12==12.4.127
16+
nvidia-cuda-runtime-cu12==12.4.127
17+
nvidia-cudnn-cu12==9.1.0.70
18+
nvidia-cufft-cu12==11.2.1.3
19+
nvidia-curand-cu12==10.3.5.147
20+
nvidia-cusolver-cu12==11.6.1.9
21+
nvidia-cusparse-cu12==12.3.1.170
22+
nvidia-cusparselt-cu12==0.6.2
23+
nvidia-ml-py==12.570.86
24+
nvidia-nccl-cu12==2.21.5
25+
nvidia-nvjitlink-cu12==12.4.127
26+
nvidia-nvtx-cu12==12.4.127
327
packaging==24.2
428
pluggy==1.5.0
5-
pytest==8.3.4
6-
tomli==2.2.1
29+
pytest==8.3.5
30+
PyYAML==6.0.2
31+
scipy==1.15.2
32+
sympy==1.13.1
33+
torch==2.6.0
34+
triton==3.2.0
35+
typing_extensions==4.12.2

scripts/run.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
import numpy as np
2+
from numba import cuda
3+
4+
import mpi4py
5+
6+
mpi4py.rc.initialize = False
7+
mpi4py.rc.finalize = False
8+
from mpi4py import MPI
9+
10+
from matmul.utils import create_block, read_config
11+
import argparse
12+
import importlib
13+
14+
from line_profiler import profile
15+
16+
@profile
def main_cpu(params: dict):
    """Distributed (MPI) matrix multiplication on the CPU.

    Each rank owns a horizontal slab of A, B and C (rows split as evenly as
    possible across ranks). The product is built one column panel at a time:
    every rank contributes its slab of the current panel of B via Allgatherv,
    then multiplies its slab of A against the gathered panel.

    Parameters
    ----------
    params : dict
        Config with keys ``size`` (global matrix size), ``function.routine``
        (name of a routine exported by the ``matmul`` package),
        ``function.block_size`` and ``print``.
    """
    SIZE = params["size"]
    md = importlib.import_module("matmul")
    routine = getattr(md, params["function"]["routine"])
    bs = params["function"]["block_size"]

    # Initialise MPI with multithreading enabled and share work among processes.
    status = MPI.Init_thread(MPI.THREAD_FUNNELED)
    # BUG FIX: Init_thread returns the *provided* level, which may legitimately
    # exceed the requested one; only warn when the provided level is lower.
    if status < MPI.THREAD_FUNNELED:
        print("Unable to provide required thread level")

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    npes = comm.Get_size()

    # The first `rest` ranks get one extra row each.
    rest = SIZE % npes
    n_loc = SIZE // npes + (rank < rest)

    workloads = np.array([SIZE // npes + (i < rest) for i in range(npes)], dtype=int)

    # Global index of this rank's first row.
    row_offset = np.cumsum(workloads)[rank - 1] if rank > 0 else 0

    # Initialise matrices somehow: A is a global ramp 1..SIZE^2, B is the
    # identity, so the expected product C equals A (handy for eyeballing).
    A = np.arange(1, SIZE * n_loc + 1, dtype=np.float64).reshape((n_loc, SIZE)) + (row_offset * SIZE)
    B = np.zeros((n_loc, SIZE), dtype=np.float64)
    C = np.zeros((n_loc, SIZE), dtype=np.float64)
    for i in range(n_loc):
        B[i, i + row_offset] = 1

    # Compute quantities for Allgatherv and allocate required memory.
    ncols = workloads[0]
    rcvcounts = workloads * ncols
    displacements = np.cumsum(rcvcounts) - rcvcounts

    B_block = np.empty((n_loc, ncols), dtype=np.float64)
    B_col = np.empty((SIZE, ncols), dtype=np.float64)

    t_tot = 0
    start = 0
    for i in range(npes):
        # Panels shrink by one column after the first `rest` ranks, so the
        # Allgatherv bookkeeping must be recomputed exactly once.
        if i == rest:
            ncols = workloads[i]
            rcvcounts = workloads * ncols
            displacements = np.cumsum(rcvcounts) - rcvcounts

            B_block = np.empty((n_loc, ncols), dtype=np.float64)
            B_col = np.empty((SIZE, ncols), dtype=np.float64)

        # Create a contiguous block from B to communicate.
        create_block(B, B_block, start, ncols)
        # Gather all pieces of the current panel of B from the other processes.
        comm.Allgatherv([B_block, MPI.DOUBLE], [B_col, rcvcounts, displacements, MPI.DOUBLE])

        t1 = MPI.Wtime()
        # Multiply A by the gathered panel, writing into the matching slice of C.
        routine(A, B_col, C[:, start:start + ncols], bs)
        t2 = MPI.Wtime()
        t_tot += (t2 - t1)

        start += ncols

    # NOTE(review): every rank prints its own timing; prefix with rank if the
    # interleaved output gets confusing.
    print(t_tot)

    if params["print"]:
        if rank == 0:
            print(C)
            for i in range(1, npes):
                # BUG FIX: comm.recv returns a new object, so the previous
                # dead np.zeros pre-allocation was removed.
                block = comm.recv(source=i, tag=i)
                print(block)
        else:
            comm.send(C, dest=0, tag=rank)

    MPI.Finalize()
93+
94+
@profile
def main_gpu(params: dict):
    """Distributed (MPI) matrix multiplication on the GPU via Numba CUDA.

    Work distribution mirrors :func:`main_cpu`: each rank owns a slab of rows,
    gathers one column panel of B per iteration with Allgatherv, and launches
    the configured CUDA kernel on its slab. Ranks are assigned GPUs round-robin.

    Parameters
    ----------
    params : dict
        Config with keys ``size``, ``function.routine`` (a CUDA kernel exported
        by the ``matmul`` package), ``function.block_size`` and ``print``.
    """
    SIZE = params["size"]
    md = importlib.import_module("matmul")
    routine = getattr(md, params["function"]["routine"])
    bs = params["function"]["block_size"]

    # Initialise MPI with multithreading enabled and share work among processes.
    status = MPI.Init_thread(MPI.THREAD_FUNNELED)
    # BUG FIX: Init_thread returns the *provided* level, which may legitimately
    # exceed the requested one; only warn when the provided level is lower.
    if status < MPI.THREAD_FUNNELED:
        print("Unable to provide required thread level")

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    npes = comm.Get_size()

    # The first `rest` ranks get one extra row each.
    rest = SIZE % npes
    n_loc = SIZE // npes + (rank < rest)

    workloads = np.array([SIZE // npes + (i < rest) for i in range(npes)], dtype=int)

    # Global index of this rank's first row.
    row_offset = np.cumsum(workloads)[rank - 1] if rank > 0 else 0

    # Initialise matrices somehow: A is a global ramp, B is the identity,
    # so the expected product C equals A.
    A = np.arange(1, SIZE * n_loc + 1, dtype=np.float64).reshape((n_loc, SIZE)) + (row_offset * SIZE)
    B = np.zeros((n_loc, SIZE), dtype=np.float64)
    C = np.zeros((n_loc, SIZE), dtype=np.float64)
    for i in range(n_loc):
        B[i, i + row_offset] = 1

    # Compute quantities for Allgatherv and allocate required memory.
    ncols = workloads[0]
    rcvcounts = workloads * ncols
    displacements = np.cumsum(rcvcounts) - rcvcounts

    B_block = np.empty((n_loc, ncols), dtype=np.float64)
    B_col = np.empty((SIZE, ncols), dtype=np.float64)

    # Select a GPU (round-robin over ranks) and move arrays to device.
    num_devices = len(cuda.gpus)
    cuda.select_device(rank % num_devices)
    a_d = cuda.to_device(A)
    c_d = cuda.to_device(C)

    nthreads = bs
    blocks_per_grid = ((n_loc + nthreads - 1) // nthreads, (SIZE + nthreads - 1) // nthreads)
    threads_per_block = (nthreads, nthreads)

    t_tot = 0
    start = 0
    for i in range(npes):
        # Panels shrink by one column after the first `rest` ranks, so the
        # Allgatherv bookkeeping must be recomputed exactly once.
        if i == rest:
            ncols = workloads[i]
            rcvcounts = workloads * ncols
            displacements = np.cumsum(rcvcounts) - rcvcounts

            B_block = np.empty((n_loc, ncols), dtype=np.float64)
            B_col = np.empty((SIZE, ncols), dtype=np.float64)

        # Create a contiguous block from B to communicate.
        create_block(B, B_block, start, ncols)
        # Gather all pieces of the current panel of B from the other processes.
        comm.Allgatherv([B_block, MPI.DOUBLE], [B_col, rcvcounts, displacements, MPI.DOUBLE])

        # Move the gathered panel of B to the device.
        b_d = cuda.to_device(B_col)

        # Time the kernel with CUDA events (kernel launches are asynchronous,
        # so host-side timers would not measure the device work).
        t1 = cuda.event(timing=True)
        t2 = cuda.event(timing=True)
        t1.record()
        # Multiply, writing into the matching column slice of C on device.
        routine[blocks_per_grid, threads_per_block](a_d, b_d, c_d[:, start:start + ncols])
        t2.record()
        t2.synchronize()

        # event_elapsed_time is in milliseconds; accumulate seconds.
        t_tot += (cuda.event_elapsed_time(t1, t2) / 1000)

        start += ncols
    # Move the final result back to the host.
    C = c_d.copy_to_host()

    print(t_tot)

    if params["print"]:
        if rank == 0:
            print(C)
            for i in range(1, npes):
                # BUG FIX: comm.recv returns a new object, so the previous
                # dead np.zeros pre-allocation was removed.
                block = comm.recv(source=i, tag=i)
                print(block)
        else:
            comm.send(C, dest=0, tag=rank)

    MPI.Finalize()
190+
191+
192+
# Routines compatible with each device class; used to validate the config
# before dispatching to main_cpu / main_gpu.
cpu_routines = ['matmul',
                'matmul_numba_serial',
                'matmul_numba_cpu',
                'matmul_numba_block_serial',
                'matmul_numba_block_cpu']

gpu_routines = ['matmul_numba_gpu',
                'matmul_numba_block_gpu']

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="Path to the config yaml file")
    # FIX: do not shadow the ArgumentParser with its parse result.
    args = parser.parse_args()

    if not args.config:
        raise RuntimeError("Please specify a yaml config file with `--config <filename>`.")
    params = read_config(args.config)
    routine = params["function"]["routine"]

    # Dispatch on the requested device, rejecting incompatible routines early.
    if params["device"] == "cpu":
        if routine not in cpu_routines:
            raise ValueError(f"Specified routine '{routine}' is incompatible with device 'cpu'. Compatible routines are {cpu_routines}.")
        main_cpu(params)
    elif params["device"] == "gpu":
        if routine not in gpu_routines:
            raise ValueError(f"Specified routine '{routine}' is incompatible with device 'gpu'. Compatible routines are {gpu_routines}.")
        main_gpu(params)
    else:
        raise ValueError(f"Parameter `device` can be either 'cpu' or 'gpu', instead got {params['device']}.")

src/matmul/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""Public API of the ``matmul`` package.

Re-exports the dense matrix-multiplication routines implemented in
``matmul.routines``; ``__all__`` pins the wildcard-import surface.
"""

from .routines import (
    matmul,
    matmul_numba_serial,
    matmul_numba_cpu,
    matmul_numba_gpu,
    matmul_numba_block_serial,
    matmul_numba_block_cpu,
    matmul_numba_block_gpu,
)

__all__ = [
    'matmul',
    'matmul_numba_serial',
    'matmul_numba_cpu',
    'matmul_numba_gpu',
    'matmul_numba_block_serial',
    'matmul_numba_block_cpu',
    'matmul_numba_block_gpu',
]

0 commit comments

Comments
 (0)