Skip to content

Commit ad873e6

Browse files
Modifications to block routines: reorder inner loops to i-k-j and declare C-contiguous array layouts
1 parent: ad6d8c3 · commit: ad873e6

6 files changed

Lines changed: 60 additions & 52 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -177,3 +177,5 @@ data/
177177
lightning_logs/
178178
# random stuff
179179
debugging/
180+
# logs
181+
logs/

experiments/config.yaml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1-
device: gpu
2-
size: 20
1+
device: cpu
2+
size: 512
33
function:
4-
routine: matmul_numba_gpu
5-
block_size: 16
6-
print: True
4+
routine: matmul_numba_block_serial
5+
block_size: 24
6+
print: False

scripts/run.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,9 @@ def main_cpu(params: dict):
3737
row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
3838

3939
# initialise matrices somehow
40-
A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE)) + (row_offset * SIZE)
41-
B = np.zeros((n_loc,SIZE), dtype=np.float64)
42-
C = np.zeros((n_loc,SIZE), dtype=np.float64)
40+
A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE),order='C') + (row_offset * SIZE)
41+
B = np.zeros((n_loc,SIZE), dtype=np.float64,order='C')
42+
C = np.zeros((n_loc,SIZE), dtype=np.float64,order='C')
4343
for i in range(n_loc):
4444
B[i, i+row_offset] = 1
4545

@@ -48,8 +48,8 @@ def main_cpu(params: dict):
4848
rcvcounts = workloads*ncols
4949
displacements = np.cumsum(rcvcounts) - rcvcounts
5050

51-
B_block = np.empty((n_loc,ncols), dtype=np.float64)
52-
B_col = np.empty((SIZE,ncols), dtype=np.float64)
51+
B_block = np.empty((n_loc,ncols), dtype=np.float64,order='C')
52+
B_col = np.empty((SIZE,ncols), dtype=np.float64,order='C')
5353

5454
t_tot = 0
5555
start = 0
@@ -60,8 +60,8 @@ def main_cpu(params: dict):
6060
rcvcounts = workloads*ncols
6161
displacements = np.cumsum(rcvcounts) - rcvcounts
6262

63-
B_block = np.empty((n_loc,ncols), dtype=np.float64)
64-
B_col = np.empty((SIZE,ncols), dtype=np.float64)
63+
B_block = np.empty((n_loc,ncols), dtype=np.float64,order='C')
64+
B_col = np.empty((SIZE,ncols), dtype=np.float64,order='C')
6565

6666
# create a contiguous block from B to communicate
6767
create_block(B, B_block, start, ncols)

shell/submit.sh

File mode changed: 100644 → 100755 (made executable)
Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
3+
rank=$OMPI_COMM_WORLD_RANK
4+
5+
export NUMBA_NUM_THREADS=1
6+
7+
# kernprof -lz -o "logs/time/gpu/256_rank_$rank.lprof" scripts/run.py --config experiments/config
8+
valgrind --tool=cachegrind --cache-sim=yes --cachegrind-out-file="logs/memory/512_naive_rank_$rank.log" python scripts/run.py --config experiments/config

src/matmul/routines.py

Lines changed: 36 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -9,61 +9,59 @@ def matmul(A,B,C,_):
99
tmp += A[i,k] * B[k,j]
1010
C[i,j] = tmp
1111

12-
@njit(void(float64[:,:],float64[:,:],float64[:,:],numba.optional(int32)), cache=True)
12+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],numba.optional(int32)), cache=True)
1313
def matmul_numba_serial(A,B,C,_):
1414
for i in range(A.shape[0]):
15-
for j in range(B.shape[1]):
16-
tmp = 0.
17-
for k in range(A.shape[-1]):
18-
tmp += A[i,k] * B[k,j]
19-
C[i,j] = tmp
15+
for k in range(A.shape[-1]):
16+
for j in range(B.shape[1]):
17+
C[i,j] += A[i,k] * B[k,j]
2018

21-
@njit(void(float64[:,:],float64[:,:],float64[:,:],numba.optional(int32)), parallel=True, nogil=True, cache=True)
19+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],numba.optional(int32)), parallel=True, nogil=True, cache=True)
2220
def matmul_numba_cpu(A,B,C,_):
2321
for i in prange(A.shape[0]):
24-
for j in range(B.shape[1]):
25-
tmp = 0.
26-
for k in range(A.shape[1]):
27-
tmp += A[i,k] * B[k,j]
28-
C[i,j] = tmp
22+
for k in range(A.shape[1]):
23+
for j in range(B.shape[1]):
24+
C[i,j] += A[i,k] * B[k,j]
2925

3026

3127

32-
@njit(void(float64[:,:],float64[:,:],float64[:,:],int32), parallel=True, nogil=True, cache=True)
28+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],int32), parallel=True, nogil=True, cache=True)
3329
def matmul_numba_block_cpu(A,B,C, bs=64):
34-
niblocks = (A.shape[0]//bs) + ((A.shape[0] % bs) > 0)
30+
N = A.shape[0]
31+
M = B.shape[1]
32+
K = A.shape[1]
33+
niblocks = (N//bs) + ((N % bs) > 0)
3534
for ii in prange(0,niblocks):
3635
i0 = ii*bs
37-
imax = i0+bs if i0+bs < A.shape[0] else A.shape[0]
38-
for jj in range(0,B.shape[1],bs):
39-
jmax = jj+bs if jj+bs < B.shape[1] else B.shape[1]
40-
for kk in range(0,A.shape[-1],bs):
41-
kmax = kk+bs if kk+bs < A.shape[-1] else A.shape[-1]
36+
imax = min(i0+bs,N)
37+
for kk in range(0,K,bs):
38+
kmax = min(kk+bs,K)
39+
for jj in range(0,M,bs):
40+
jmax = min(jj+bs,M)
4241
for i in range(i0,imax):
43-
for j in range(jj,jmax):
44-
tmp = 0.
45-
for k in range(kk,kmax):
46-
tmp += A[i,k] * B[k,j]
47-
C[i,j] += tmp
42+
for k in range(kk,kmax):
43+
for j in range(jj,jmax):
44+
C[i,j] += A[i,k] * B[k,j]
4845

49-
@njit(void(float64[:,:],float64[:,:],float64[:,:],int32), parallel=False, nogil=True, cache=True)
46+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],int32), parallel=False, nogil=True, cache=True)
5047
def matmul_numba_block_serial(A,B,C, bs=64):
51-
niblocks = (A.shape[0]//bs) + ((A.shape[0] % bs) > 0)
48+
N = A.shape[0]
49+
M = B.shape[1]
50+
K = A.shape[1]
51+
niblocks = (N//bs) + ((N % bs) > 0)
5252
for ii in range(0,niblocks):
5353
i0 = ii*bs
54-
imax = i0+bs if i0+bs < A.shape[0] else A.shape[0]
55-
for jj in range(0,B.shape[1],bs):
56-
jmax = jj+bs if jj+bs < B.shape[1] else B.shape[1]
57-
for kk in range(0,A.shape[-1],bs):
58-
kmax = kk+bs if kk+bs < A.shape[-1] else A.shape[-1]
54+
imax = min(i0+bs,N)
55+
for kk in range(0,K,bs):
56+
kmax = min(kk+bs,K)
57+
for jj in range(0,M,bs):
58+
jmax = min(jj+bs,M)
5959
for i in range(i0,imax):
60-
for j in range(jj,jmax):
61-
tmp = 0.
62-
for k in range(kk,kmax):
63-
tmp += A[i,k] * B[k,j]
64-
C[i,j] += tmp
60+
for k in range(kk,kmax):
61+
for j in range(jj,jmax):
62+
C[i,j] += A[i,k] * B[k,j]
6563

66-
@cuda.jit(void(float64[:,:],float64[:,:],float64[:,:]), cache=True)
64+
@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True)
6765
def matmul_numba_gpu(A,B,C):
6866
i, j = cuda.grid(ndim=2)
6967
if i < C.shape[0] and j < C.shape[1]:
@@ -73,7 +71,7 @@ def matmul_numba_gpu(A,B,C):
7371
C[i,j] = tmp
7472

7573
BLOCK_SIZE = 16
76-
@cuda.jit(void(float64[:,:],float64[:,:],float64[:,:]), cache=True)
74+
@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True)
7775
def matmul_numba_block_gpu(A,B,C):
7876

7977
bi = cuda.blockIdx.y

test/test_shared.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ def test_matmul_numba_block_cpu():
4040
B = np.eye(size,dtype=np.float64)
4141
C = np.zeros((size,size),dtype=np.float64)
4242

43-
matmul_numba_block_cpu(A,B,C,64)
43+
matmul_numba_block_cpu(A,B,C,6)
4444

4545
assert np.allclose(A,C)
4646

@@ -50,7 +50,7 @@ def test_matmul_numba_block_serial():
5050
B = np.eye(size,dtype=np.float64)
5151
C = np.zeros((size,size),dtype=np.float64)
5252

53-
matmul_numba_block_cpu(A,B,C,64)
53+
matmul_numba_block_serial(A,B,C,6)
5454

5555
assert np.allclose(A,C)
5656

0 commit comments

Comments
 (0)