Skip to content

Commit ad873e6

Browse files
Modifications to block routines: reorder inner loops to i-k-j and declare C-contiguous array layouts
1 parent: ad6d8c3 · commit: ad873e6

6 files changed

Lines changed: 60 additions & 52 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -177,3 +177,5 @@ data/
177177
lightning_logs/
178178
# random stuff
179179
debugging/
180+
# logs
181+
logs/

experiments/config.yaml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1-
device: gpu
2-
size: 20
1+
device: cpu
2+
size: 512
33
function:
4-
routine: matmul_numba_gpu
5-
block_size: 16
6-
print: True
4+
routine: matmul_numba_block_serial
5+
block_size: 24
6+
print: False

scripts/run.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,9 @@ def main_cpu(params: dict):
3737
row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
3838

3939
# initialise matrices somehow
40-
A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE)) + (row_offset * SIZE)
41-
B = np.zeros((n_loc,SIZE), dtype=np.float64)
42-
C = np.zeros((n_loc,SIZE), dtype=np.float64)
40+
A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE),order='C') + (row_offset * SIZE)
41+
B = np.zeros((n_loc,SIZE), dtype=np.float64,order='C')
42+
C = np.zeros((n_loc,SIZE), dtype=np.float64,order='C')
4343
for i in range(n_loc):
4444
B[i, i+row_offset] = 1
4545

@@ -48,8 +48,8 @@ def main_cpu(params: dict):
4848
rcvcounts = workloads*ncols
4949
displacements = np.cumsum(rcvcounts) - rcvcounts
5050

51-
B_block = np.empty((n_loc,ncols), dtype=np.float64)
52-
B_col = np.empty((SIZE,ncols), dtype=np.float64)
51+
B_block = np.empty((n_loc,ncols), dtype=np.float64,order='C')
52+
B_col = np.empty((SIZE,ncols), dtype=np.float64,order='C')
5353

5454
t_tot = 0
5555
start = 0
@@ -60,8 +60,8 @@ def main_cpu(params: dict):
6060
rcvcounts = workloads*ncols
6161
displacements = np.cumsum(rcvcounts) - rcvcounts
6262

63-
B_block = np.empty((n_loc,ncols), dtype=np.float64)
64-
B_col = np.empty((SIZE,ncols), dtype=np.float64)
63+
B_block = np.empty((n_loc,ncols), dtype=np.float64,order='C')
64+
B_col = np.empty((SIZE,ncols), dtype=np.float64,order='C')
6565

6666
# create a contiguous block from B to communicate
6767
create_block(B, B_block, start, ncols)

shell/submit.sh

File mode changed: 100644 → 100755 (made executable)
Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
3+
rank=$OMPI_COMM_WORLD_RANK
4+
5+
export NUMBA_NUM_THREADS=1
6+
7+
# kernprof -lz -o "logs/time/gpu/256_rank_$rank.lprof" scripts/run.py --config experiments/config
8+
valgrind --tool=cachegrind --cache-sim=yes --cachegrind-out-file="logs/memory/512_naive_rank_$rank.log" python scripts/run.py --config experiments/config

src/matmul/routines.py

Lines changed: 36 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -9,61 +9,59 @@ def matmul(A,B,C,_):
99
tmp += A[i,k] * B[k,j]
1010
C[i,j] = tmp
1111

12-
@njit(void(float64[:,:],float64[:,:],float64[:,:],numba.optional(int32)), cache=True)
12+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],numba.optional(int32)), cache=True)
1313
def matmul_numba_serial(A,B,C,_):
1414
for i in range(A.shape[0]):
15-
for j in range(B.shape[1]):
16-
tmp = 0.
17-
for k in range(A.shape[-1]):
18-
tmp += A[i,k] * B[k,j]
19-
C[i,j] = tmp
15+
for k in range(A.shape[-1]):
16+
for j in range(B.shape[1]):
17+
C[i,j] += A[i,k] * B[k,j]
2018

21-
@njit(void(float64[:,:],float64[:,:],float64[:,:],numba.optional(int32)), parallel=True, nogil=True, cache=True)
19+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],numba.optional(int32)), parallel=True, nogil=True, cache=True)
2220
def matmul_numba_cpu(A,B,C,_):
2321
for i in prange(A.shape[0]):
24-
for j in range(B.shape[1]):
25-
tmp = 0.
26-
for k in range(A.shape[1]):
27-
tmp += A[i,k] * B[k,j]
28-
C[i,j] = tmp
22+
for k in range(A.shape[1]):
23+
for j in range(B.shape[1]):
24+
C[i,j] += A[i,k] * B[k,j]
2925

3026

3127

32-
@njit(void(float64[:,:],float64[:,:],float64[:,:],int32), parallel=True, nogil=True, cache=True)
28+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],int32), parallel=True, nogil=True, cache=True)
3329
def matmul_numba_block_cpu(A,B,C, bs=64):
34-
niblocks = (A.shape[0]//bs) + ((A.shape[0] % bs) > 0)
30+
N = A.shape[0]
31+
M = B.shape[1]
32+
K = A.shape[1]
33+
niblocks = (N//bs) + ((N % bs) > 0)
3534
for ii in prange(0,niblocks):
3635
i0 = ii*bs
37-
imax = i0+bs if i0+bs < A.shape[0] else A.shape[0]
38-
for jj in range(0,B.shape[1],bs):
39-
jmax = jj+bs if jj+bs < B.shape[1] else B.shape[1]
40-
for kk in range(0,A.shape[-1],bs):
41-
kmax = kk+bs if kk+bs < A.shape[-1] else A.shape[-1]
36+
imax = min(i0+bs,N)
37+
for kk in range(0,K,bs):
38+
kmax = min(kk+bs,K)
39+
for jj in range(0,M,bs):
40+
jmax = min(jj+bs,M)
4241
for i in range(i0,imax):
43-
for j in range(jj,jmax):
44-
tmp = 0.
45-
for k in range(kk,kmax):
46-
tmp += A[i,k] * B[k,j]
47-
C[i,j] += tmp
42+
for k in range(kk,kmax):
43+
for j in range(jj,jmax):
44+
C[i,j] += A[i,k] * B[k,j]
4845

49-
@njit(void(float64[:,:],float64[:,:],float64[:,:],int32), parallel=False, nogil=True, cache=True)
46+
@njit(void(float64[:,::1],float64[:,::1],float64[:,:],int32), parallel=False, nogil=True, cache=True)
5047
def matmul_numba_block_serial(A,B,C, bs=64):
51-
niblocks = (A.shape[0]//bs) + ((A.shape[0] % bs) > 0)
48+
N = A.shape[0]
49+
M = B.shape[1]
50+
K = A.shape[1]
51+
niblocks = (N//bs) + ((N % bs) > 0)
5252
for ii in range(0,niblocks):
5353
i0 = ii*bs
54-
imax = i0+bs if i0+bs < A.shape[0] else A.shape[0]
55-
for jj in range(0,B.shape[1],bs):
56-
jmax = jj+bs if jj+bs < B.shape[1] else B.shape[1]
57-
for kk in range(0,A.shape[-1],bs):
58-
kmax = kk+bs if kk+bs < A.shape[-1] else A.shape[-1]
54+
imax = min(i0+bs,N)
55+
for kk in range(0,K,bs):
56+
kmax = min(kk+bs,K)
57+
for jj in range(0,M,bs):
58+
jmax = min(jj+bs,M)
5959
for i in range(i0,imax):
60-
for j in range(jj,jmax):
61-
tmp = 0.
62-
for k in range(kk,kmax):
63-
tmp += A[i,k] * B[k,j]
64-
C[i,j] += tmp
60+
for k in range(kk,kmax):
61+
for j in range(jj,jmax):
62+
C[i,j] += A[i,k] * B[k,j]
6563

66-
@cuda.jit(void(float64[:,:],float64[:,:],float64[:,:]), cache=True)
64+
@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True)
6765
def matmul_numba_gpu(A,B,C):
6866
i, j = cuda.grid(ndim=2)
6967
if i < C.shape[0] and j < C.shape[1]:
@@ -73,7 +71,7 @@ def matmul_numba_gpu(A,B,C):
7371
C[i,j] = tmp
7472

7573
BLOCK_SIZE = 16
76-
@cuda.jit(void(float64[:,:],float64[:,:],float64[:,:]), cache=True)
74+
@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True)
7775
def matmul_numba_block_gpu(A,B,C):
7876

7977
bi = cuda.blockIdx.y

test/test_shared.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ def test_matmul_numba_block_cpu():
4040
B = np.eye(size,dtype=np.float64)
4141
C = np.zeros((size,size),dtype=np.float64)
4242

43-
matmul_numba_block_cpu(A,B,C,64)
43+
matmul_numba_block_cpu(A,B,C,6)
4444

4545
assert np.allclose(A,C)
4646

@@ -50,7 +50,7 @@ def test_matmul_numba_block_serial():
5050
B = np.eye(size,dtype=np.float64)
5151
C = np.zeros((size,size),dtype=np.float64)
5252

53-
matmul_numba_block_cpu(A,B,C,64)
53+
matmul_numba_block_serial(A,B,C,6)
5454

5555
assert np.allclose(A,C)
5656

0 commit comments

Comments
 (0)