Matrix Multiplication with GPU
If you haven't checked out alkomp, I would recommend doing so before reading on. This post demonstrates how straightforward it is to write GPU compute code with alkomp. The cool results, as well as the Jupyter notebook source, can be found at the bottom.
Here we take the classic example of matrix multiplication:
$$\begin{bmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8\end{bmatrix} \times \begin{bmatrix} 1 & 2 \\ 3 & 4 \\ 5 & 6 \\ 7 &8 \end{bmatrix}$$
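Written out, each entry of the product is the dot product of a row of the first matrix with a column of the second:
$$\begin{bmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8\end{bmatrix} \times \begin{bmatrix} 1 & 2 \\ 3 & 4 \\ 5 & 6 \\ 7 & 8 \end{bmatrix} = \begin{bmatrix} 50 & 60 \\ 114 & 140 \end{bmatrix}$$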
import numpy as np
import alkompy as ay
import time
import matplotlib.pyplot as plt
To start, let us write out the shader, the GPU code that will run on each GPU "thread". Each buffer stores the data of one matrix, with the first two elements holding the matrix dimensions:
$$\begin{bmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8\end{bmatrix} \rightarrow \begin{bmatrix} 2 & 4 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \end{bmatrix}$$
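To make the packing concrete, here is the same layout produced with NumPy. The names M and packed are only for illustration; the helper functions below do the equivalent for the square matrices we benchmark:

# Pack the 2 x 4 example matrix: dimensions first, then the row-major data.
M = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32)
packed = np.insert(M.flatten(), 0, M.shape)
# packed -> [2. 4. 1. 2. 3. 4. 5. 6. 7. 8.]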
code = """
#version 450
layout(set = 0, binding = 0) readonly buffer AMatrix {
    vec2 size;
    float data[];
} aMatrix;

layout(set = 0, binding = 1) readonly buffer BMatrix {
    vec2 size;
    float data[];
} bMatrix;

layout(set = 0, binding = 2) buffer RMatrix {
    vec2 size;
    float data[];
} rMatrix;

void main() {
    rMatrix.size = vec2(aMatrix.size.x, bMatrix.size.y);

    // Each invocation computes one cell of the result matrix.
    ivec2 resultCell = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y);
    float result = 0.0;
    for (int i = 0; i < aMatrix.size.y; i++) {
        // Row resultCell.x of A dotted with column resultCell.y of B.
        int a = i + resultCell.x * int(aMatrix.size.y);
        int b = resultCell.y + i * int(bMatrix.size.y);
        result += aMatrix.data[a] * bMatrix.data[b];
    }

    int index = resultCell.y + resultCell.x * int(bMatrix.size.y);
    rMatrix.data[index] = result;
}
"""
Compile the shader code into SPIR-V.
shader = ay.compile_glsl(code)
Helper functions to build random square matrices and pack them into flat buffers in the layout the shader expects.
def rand_matrix_a_and_b(n):
    A = np.random.rand(n, n).astype(np.float32)
    B = np.random.rand(n, n).astype(np.float32)
    return A, B

def prepare_for_gpu(A, B, n):
    # Flatten to row-major order and prepend the matrix dimensions,
    # matching the buffer layout the shader expects.
    a = A.flatten()
    b = B.flatten()
    a = np.insert(a, 0, [n, n])
    b = np.insert(b, 0, [n, n])
    # Result buffer: two elements for the dimensions plus n*n values.
    r = np.zeros(n * n + 2, dtype=np.float32)
    return a, b, r
Example with random 10 by 10 matrices
n = 10
A, B = rand_matrix_a_and_b(n)
a, b, r = prepare_for_gpu(A, B, n)
Send data to GPU
device = ay.Device(0)
a_gpu = device.to_device(a)
b_gpu = device.to_device(b)
r_gpu = device.to_device(r)
Run the shader on the GPU, dispatching an $n \times n$ grid so each invocation computes one element of the result
device.run("main", shader, (n, n, 1), [a_gpu, b_gpu, r_gpu])
Get the result
res = device.get(r_gpu)
Check the result against NumPy's matrix product. The first two elements of res hold the dimensions, so we skip them before reshaping.
np.isclose(res[2:].reshape((n,n)), (A.dot(B)))
array([[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True],
[ True, True, True, True, True, True, True, True, True,
True]])
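If you prefer a single pass/fail flag rather than the elementwise array, np.allclose collapses the comparison:

# Single boolean: True when every entry matches within tolerance.
np.allclose(res[2:].reshape((n, n)), A.dot(B))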
CPU vs GPU
Here we create test square matrices from $2 \times 2$ up to $1012 \times 1012$ and record the time it takes to run the multiplication on the GPU and on the CPU.
num_size = [2**i for i in range(1, 10)] + list(range(512, 1024, 100))
prep_time = []
gpu_time = []
cpu_time = []

for n in num_size:
    print(n)

    # Preparation
    start_time = time.time()
    A, B = rand_matrix_a_and_b(n)
    a, b, r = prepare_for_gpu(A, B, n)
    total_time = time.time() - start_time
    prep_time.append(total_time)

    # GPU
    start_time = time.time()
    a_gpu = device.to_device(a)
    b_gpu = device.to_device(b)
    r_gpu = device.to_device(r)
    device.run("main", shader, (n, n, 1), [a_gpu, b_gpu, r_gpu])
    # res = device.get(r_gpu)
    total_time = time.time() - start_time
    gpu_time.append(total_time)

    # CPU
    start_time = time.time()
    res = A.dot(B)
    total_time = time.time() - start_time
    cpu_time.append(total_time)
2 4 8 16 32 64 128 256 512 512 612 712 812 912 1012
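Note that the GPU timing above excludes copying the result back to the host, since the device.get call is commented out. If you want that transfer included, a sketch of the timed region could look like this, assuming (as in the check earlier) that device.get copies the result buffer back to host memory; the variable names are only illustrative:

# Time the dispatch together with reading the result buffer back.
start_time = time.time()
device.run("main", shader, (n, n, 1), [a_gpu, b_gpu, r_gpu])
res_gpu = device.get(r_gpu)
gpu_time_with_readback = time.time() - start_time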
plt.figure(dpi=100)
plt.plot(num_size, cpu_time, '-x', label='CPU')
plt.plot(num_size, gpu_time, '-x', label='GPU')
plt.xlabel('Matrix Dimension')
plt.ylabel('Seconds')
plt.title("Matrix Multiplication: CPU vs GPU")
plt.legend()
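Because the matrix sizes span roughly three orders of magnitude, a log-log view can make the small-size behavior easier to read; these two optional lines can be added to the same plotting cell:

# Optional: log-log axes spread out the small and large matrix sizes.
plt.xscale('log')
plt.yscale('log')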
Download the Jupyter Notebook.