Matrix Multiplication with GPU

Published 2020-11-02


If you haven't checked out alkomp, I would recommend doing so before reading on. This post demonstrates how straightforward it is to write GPU compute code with alkomp. The cool results, as well as the Jupyter notebook source, can be found at the bottom.

As our example, we take the classic problem of matrix multiplication:

$$\begin{bmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8\end{bmatrix} \times \begin{bmatrix} 1 & 2 \\ 3 & 4 \\ 5 & 6 \\ 7 &8 \end{bmatrix}$$
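
For reference, working this product out by hand gives

$$\begin{bmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8\end{bmatrix} \times \begin{bmatrix} 1 & 2 \\ 3 & 4 \\ 5 & 6 \\ 7 & 8 \end{bmatrix} = \begin{bmatrix} 50 & 60 \\ 114 & 140 \end{bmatrix}$$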

import numpy as np
import alkompy as ay
import time
import matplotlib.pyplot as plt

To start, let us write out the shader -- the GPU code that will run on each GPU "thread" (invocation). Each buffer stores the data of one matrix, with the first two elements holding the matrix dimensions and the remaining elements holding the row-major data:

$$\begin{bmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8\end{bmatrix} = \begin{bmatrix} 2 & 4 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \end{bmatrix}$$
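
As a quick illustration (this is not part of the shader), the same flat layout can be built in NumPy with the np.insert call that prepare_for_gpu uses further down:

example = np.array([[1, 2, 3, 4],
                    [5, 6, 7, 8]], dtype=np.float32)

# Prepend the dimensions (rows, cols) to the row-major data.
flat = np.insert(example.flatten(), 0, example.shape)
print(flat)  # [2. 4. 1. 2. 3. 4. 5. 6. 7. 8.]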

code = """

#version 450

  layout(set = 0, binding = 0) readonly buffer AMatrix {
      vec2 size;
      float data[];
  } aMatrix;

  layout(set = 0, binding = 1) readonly buffer BMatrix {
      vec2 size;
      float data[];
  } bMatrix;

  layout(set = 0, binding = 2) buffer RMatrix {
      vec2 size;
      float data[];
  } rMatrix;

  void main() {
    rMatrix.size = vec2(aMatrix.size.x, bMatrix.size.y);

    ivec2 resultCell = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y);
    float result = 0.0;
    for (int i = 0; i < aMatrix.size.y; i++) {
      int a = i + resultCell.x * int(aMatrix.size.y);
      int b = resultCell.y + i * int(bMatrix.size.y);
      result += aMatrix.data[a] * bMatrix.data[b];
    }

    int index = resultCell.y + resultCell.x * int(bMatrix.size.y);
    rMatrix.data[index] = result;
  }

"""

Compile the shader code into SPIR-V.

shader = ay.compile_glsl(code)

Helper functions to build random square matrices.

def rand_matrix_a_and_b(n):
    A = np.random.rand(n,n).astype(np.float32)
    B = np.random.rand(n,n).astype(np.float32)
    return A, B

def prepare_for_gpu(A, B, n):

    a = A.flatten()
    b = B.flatten()

    a = np.insert(a, 0, [n,n])
    b = np.insert(b, 0, [n,n])
    r = np.zeros(n * n + 2, dtype=np.float32)

    return a, b, r

Example with random 10 by 10 matrices

n = 10

A, B = rand_matrix_a_and_b(n)
a, b, r = prepare_for_gpu(A, B, n)

Send the data to the GPU.

device = ay.Device(0)
a_gpu = device.to_device(a)
b_gpu = device.to_device(b)
r_gpu = device.to_device(r)

Run the shader on the GPU, dispatching an $n \times n \times 1$ grid of invocations -- one per cell of the result matrix.

device.run("main", shader, (n, n, 1), [a_gpu, b_gpu, r_gpu])

Get the result

res = device.get(r_gpu)

Check the result against NumPy's matrix product, skipping the two-element size header.

np.isclose(res[2:].reshape((n,n)), (A.dot(B)))
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])
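
Since np.isclose returns an element-wise array, a compact way to check the whole matrix (shown here just for convenience) is to collapse it with .all():

np.isclose(res[2:].reshape((n, n)), A.dot(B)).all()  # True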

CPU vs GPU

Here we create square test matrices, from $2 \times 2$ up to $1012 \times 1012$, and record the time it takes to run on the GPU and on the CPU.

num_size = [2**i for i in range(1, 10)] + list(range(512, 1024, 100))  

prep_time = []
gpu_time = []
cpu_time = []

for n in num_size:
    print(n)

    # Preparation
    start_time = time.time()

    A, B = rand_matrix_a_and_b(n)
    a, b, r = prepare_for_gpu(A, B, n)
    total_time = time.time() - start_time
    prep_time.append(total_time)

    # GPU
    start_time = time.time()

    a_gpu = device.to_device(a)
    b_gpu = device.to_device(b)
    r_gpu = device.to_device(r)

    device.run("main", shader, (n, n, 1), [a_gpu, b_gpu, r_gpu])
    # res = device.get(r_gpu)

    total_time = time.time() - start_time
    gpu_time.append(total_time)

    # CPU
    start_time = time.time()
    res = A.dot(B)

    total_time = time.time() - start_time
    cpu_time.append(total_time)


2
4
8
16
32
64
128
256
512
512
612
712
812
912
1012
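
Note that the GPU timing above leaves the readback (device.get(r_gpu)) commented out, so copying the result back to host memory is not included. A variant that times the full round trip, reusing the same calls as above, would look like this:

start_time = time.time()

a_gpu = device.to_device(a)
b_gpu = device.to_device(b)
r_gpu = device.to_device(r)

device.run("main", shader, (n, n, 1), [a_gpu, b_gpu, r_gpu])
res = device.get(r_gpu)  # include the copy back to the host

gpu_round_trip = time.time() - start_time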

Plot the timings.

plt.figure(dpi=100)
plt.plot(num_size, cpu_time, '-x', label='CPU')
plt.plot(num_size, gpu_time, '-x', label='GPU')
plt.xlabel('Matrix Dimension')
plt.ylabel('Seconds')
plt.title("Matrix Multiplication: CPU vs GPU")
plt.legend()

[Figure: Matrix Multiplication: CPU vs GPU -- runtime in seconds against matrix dimension]

Download the Jupyter Notebook.