Introduction

We want to multiply two matrices A[2000,2000] and B[2000,600], producing a result matrix C[2000,600]. In bytes that is:

  • Matrix A: 16 MB
  • Matrices B and C: 4.8 MB each

Our baseline code multiplies these matrices 10 times.
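These sizes follow directly from rows × columns × sizeof(float):

$$2000 \times 2000 \times 4 \text{ bytes} = 16{,}000{,}000 \text{ bytes} \approx 16 \text{ MB}$$

$$2000 \times 600 \times 4 \text{ bytes} = 4{,}800{,}000 \text{ bytes} \approx 4.8 \text{ MB}$$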

/*
  Now we make the matrix much bigger
  g++ -pg seq_matrix_big_mul.c -o seq_matrix_big_mul
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>

int num_rows_A = 2000; int num_rows_B = 2000; int num_rows_C = 2000;
int num_cols_A = 2000; int num_cols_B = 600; int num_cols_C = 600;

// Allocate on the heap with malloc so the allocation time is also part of the measurement
float *A = (float*) malloc(sizeof(float) * num_rows_A * num_cols_A);
float *B = (float*) malloc(sizeof(float) * num_rows_B * num_cols_B);
float *C = (float*) malloc(sizeof(float) * num_rows_C * num_cols_C);

void matrix_2d_mul_float(float *A, float *B, float *C, int num_rows_A, int num_cols_A, int num_cols_B) {
  int num_cols_C = num_cols_B;
  // Iterate on each row of A
  //#pragma omp parallel for schedule(dynamic,1) collapse(2)
  for(int i=0; i<num_rows_A; i++) {
    // Iterate on each column of B
    for (int k=0; k<num_cols_B; k++) {
      // The accumulator lives inside the loop body, so it is private when the pragma is enabled
      float sum = 0;
      // Do the multiply-add between a row of A and a column of B
      for (int j=0; j<num_cols_A; j++){
        // A[i][j] == A[i*num_cols_A+j]
        // B[j][k] == B[j*num_cols_B+k]
        //sum += A[i][j]*B[j][k];
        sum += A[i*num_cols_A+j]*B[j*num_cols_B+k];
      }
      // C[i][k] == C[i*num_cols_C+k]
      C[i*num_cols_C+k]=sum;
    }
  }
}

void fillRand(float *vec, int minValue, int maxValue, int sizeVec) {
  srand(time(NULL));
  for (int idx = 0; idx < sizeVec; idx++) {
    // Random value in the range [minValue, maxValue]
    vec[idx] = rand() % (maxValue - minValue + 1) + minValue;
  }
}

int main() {
  // Get size in bytes for our vectors
  int numBytesA = sizeof(float) * num_rows_A * num_cols_A;
  int numBytesB = sizeof(float) * num_rows_B * num_cols_B;
  printf("Size in bytes A: %d\n",numBytesA);
  printf("Size in bytes B: %d\n",numBytesB);

  // Fill arrays
  fillRand(A, 1, 100, num_rows_A * num_cols_A);
  fillRand(B, 1, 100, num_rows_B * num_cols_B);

  // Call sequential function
  //ProfilerStart("nameOfProfile.log");
  for (int idxLoop=0; idxLoop < 10; idxLoop++) {
    // Multiply A and B, storing the result in C
    matrix_2d_mul_float(A,B,C,num_rows_A,num_cols_A,num_cols_B);
    printf("Matrix multiplication done %d\n",idxLoop);
  }
  // Free memory
  free(A);free(B);free(C);
  return 0;
}

Executing sequentially

Executing this sequentially is slow: roughly 30 seconds to complete the 10 matrix multiplications.
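To see why, count the floating-point operations: each element of C needs one multiply and one add per inner-loop iteration, so one multiplication costs

$$2 \times 2000 \times 2000 \times 600 = 4.8 \times 10^9 \text{ FLOPs}$$

or about 48 GFLOP for the 10 runs, which takes tens of seconds on a single core running at a few GFLOP/s.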

# Compile
g++ -g -O1 seq_matrix_big_mul.c -o seq_matrix_big_mul

# Check execution time
sudo perf stat ./seq_matrix_big_mul

Size in bytes A: 16000000
Size in bytes B: 4800000
Matrix multiplication done 0
Matrix multiplication done 1
Matrix multiplication done 2
Matrix multiplication done 3
Matrix multiplication done 4
Matrix multiplication done 5
Matrix multiplication done 6
Matrix multiplication done 7
Matrix multiplication done 8
Matrix multiplication done 9

 Performance counter stats for './seq_matrix_big_mul':

      30472.045987      task-clock (msec)         #    1.000 CPUs utilized          
                82      context-switches          #    0.003 K/sec                  
                14      cpu-migrations            #    0.000 K/sec                  
             1,195      page-faults               #    0.039 K/sec                  
    91,004,382,307      cycles                    #    2.986 GHz                    
                 0      stalled-cycles-frontend   #    0.00% frontend cycles idle   
                 0      stalled-cycles-backend    #    0.00% backend  cycles idle   
   216,486,556,184      instructions              #    2.38  insns per cycle        
    24,137,052,216      branches                  #  792.105 M/sec                  
        12,256,039      branch-misses             #    0.05% of all branches        

      30.473137262 seconds time elapsed

Taking measurements with gprof, each call to matrix_2d_mul_float takes about 9.5 seconds.

g++ -pg seq_matrix_big_mul.c -o seq_matrix_big_mul
./seq_matrix_big_mul
gprof ./seq_matrix_big_mul gmon.out > timeProf.txt

Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls   s/call   s/call  name
100.52     95.63    95.63       10     9.56     9.56  matrix_2d_mul_float(float*, float*, float*, int, int, int)
  0.03     95.66     0.03        2     0.02     0.02  fillRand(float*, int, int, int)
  0.00     95.66     0.00        1     0.00     0.00  _GLOBAL__sub_I_num_rows_A
  0.00     95.66     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)

Executing on our CPU cores (7 cores)

Now we change matrix_2d_mul_float to run its loop iterations in parallel by enabling the #pragma omp parallel for schedule(dynamic,1) collapse(2) directive shown earlier. This gives roughly a 2x speed-up (from 30 s down to about 14.6 s). That is nice, but could we do better on the GPU? A sketch of the parallel loop is shown below.
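For reference, this is roughly what the parallel version of the function looks like; the only change relative to the sequential code is that the pragma is no longer commented out (compile with -fopenmp):

void matrix_2d_mul_float(float *A, float *B, float *C, int num_rows_A, int num_cols_A, int num_cols_B) {
  int num_cols_C = num_cols_B;
  // Distribute the (i,k) iterations over the available CPU cores
  #pragma omp parallel for schedule(dynamic,1) collapse(2)
  for (int i=0; i<num_rows_A; i++) {
    for (int k=0; k<num_cols_B; k++) {
      float sum = 0; // private to each (i,k) iteration, so there is no data race
      for (int j=0; j<num_cols_A; j++) {
        sum += A[i*num_cols_A+j]*B[j*num_cols_B+k];
      }
      C[i*num_cols_C+k]=sum;
    }
  }
}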

# Compile
g++ -g -O1 -fopenmp seq_matrix_big_mul.c -o seq_matrix_big_mul

# Check execution time
sudo perf stat ./seq_matrix_big_mul

Size in bytes A: 16000000
Size in bytes B: 4800000
Matrix multiplication done 0
Matrix multiplication done 1
Matrix multiplication done 2
Matrix multiplication done 3
Matrix multiplication done 4
Matrix multiplication done 5
Matrix multiplication done 6
Matrix multiplication done 7
Matrix multiplication done 8
Matrix multiplication done 9

 Performance counter stats for './seq_matrix_big_mul':

     115167.655752      task-clock (msec)         #    7.860 CPUs utilized          
            26,806      context-switches          #    0.233 K/sec                  
                11      cpu-migrations            #    0.000 K/sec                  
             1,903      page-faults               #    0.017 K/sec                  
   343,022,318,959      cycles                    #    2.978 GHz                    
                 0      stalled-cycles-frontend   #    0.00% frontend cycles idle   
                 0      stalled-cycles-backend    #    0.00% backend  cycles idle   
   265,276,470,662      instructions              #    0.77  insns per cycle        
    24,270,361,665      branches                  #  210.739 M/sec                  
        13,057,570      branch-misses             #    0.05% of all branches        

      14.651747684 seconds time elapsed

First, check the transfer time

Basically we need to know how long it takes to send 20.8 MB (matrices A and B) to the GPU and to receive the 4.8 MB result matrix back.

leo@leodev:~/work/learningOpenCl/chap1$ nvprof ./checkTransferMatMul 20800000 4800000
Checking GPU transfer...
Sending 20800000 bytes
Sending 4800000 bytes
==4451== NVPROF is profiling process 4451, command: ./checkTransferMatMul 20800000 4800000
==4451== Profiling application: ./checkTransferMatMul 20800000 4800000
==4451== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 82.27%  6.7186ms         1  6.7186ms  6.7186ms  6.7186ms  [CUDA memcpy HtoD]
 17.73%  1.4480ms         1  1.4480ms  1.4480ms  1.4480ms  [CUDA memcpy DtoH]

==4451== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 90.92%  96.428ms         1  96.428ms  96.428ms  96.428ms  cudaMalloc
  8.04%  8.5233ms         2  4.2617ms  2.0535ms  6.4698ms  cudaMemcpy
  0.76%  805.41us        83  9.7030us     838ns  350.05us  cuDeviceGetAttribute
  0.19%  201.07us         1  201.07us  201.07us  201.07us  cudaFree
  0.05%  56.781us         1  56.781us  56.781us  56.781us  cuDeviceTotalMem
  0.04%  44.070us         1  44.070us  44.070us  44.070us  cuDeviceGetName
  0.00%  2.9340us         2  1.4670us  1.0480us  1.8860us  cuDeviceGetCount
  0.00%  1.9560us         2     978ns     908ns  1.0480us  cuDeviceGet

From the results the whole transfer takes about 8 ms (on a system with a Quadro GPU); on a system with a Titan X (Maxwell) the same transfer takes 2.6 ms. So if our GPU computes each matrix multiplication in less than the roughly 9 seconds of one sequential call, we get a speed-up.
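The source of checkTransferMatMul is not shown in this chapter; a minimal sketch of such a transfer benchmark, assuming plain cudaMemcpy calls that nvprof can time, could look like this:

/*
  Hypothetical sketch of a transfer benchmark like checkTransferMatMul.
  nvcc checkTransferMatMul.cu -o checkTransferMatMul
*/
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main(int argc, char **argv) {
  // Bytes to send (host to device) and to receive (device to host)
  size_t bytesHtoD = (argc > 1) ? atol(argv[1]) : 20800000;
  size_t bytesDtoH = (argc > 2) ? atol(argv[2]) : 4800000;
  printf("Checking GPU transfer...\n");
  printf("Sending %zu bytes\n", bytesHtoD);
  printf("Sending %zu bytes\n", bytesDtoH);

  char *h_buf = (char*) malloc(bytesHtoD);
  char *d_buf;
  cudaMalloc((void**)&d_buf, bytesHtoD);

  // nvprof reports these as [CUDA memcpy HtoD] and [CUDA memcpy DtoH]
  cudaMemcpy(d_buf, h_buf, bytesHtoD, cudaMemcpyHostToDevice);
  cudaMemcpy(h_buf, d_buf, bytesDtoH, cudaMemcpyDeviceToHost);

  cudaFree(d_buf);
  free(h_buf);
  return 0;
}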

Naive way on the GPU (using global memory)

So we have less than 9 seconds to multiply the matrices A[2000,2000] and B[2000,600] on the GPU. Let's start with the naive implementation, but first let's take another look at the CPU code.

void matrix_2d_mul_float(float *A, float *B, float *C, int num_rows_A, int num_cols_A, int num_cols_B) {
  int num_cols_C = num_cols_B;
  // Iterate on each row of A
  for(int i=0; i<num_rows_A; i++) {
    // Iterate on each column of B
    for (int k=0; k<num_cols_B; k++) {
      float sum = 0;
      // Do the multiply-add between a row of A and a column of B
      for (int j=0; j<num_cols_A; j++){
        // A[i][j] == A[i*num_cols_A+j]
        // B[j][k] == B[j*num_cols_B+k]
        //sum += A[i][j]*B[j][k];
        sum += A[i*num_cols_A+j]*B[j*num_cols_B+k];
      }
      // C[i][k] == C[i*num_cols_C+k]
      C[i*num_cols_C+k]=sum;
    }
  }
}

You can observe that there is no communication between the loop iterations, so each GPU thread can compute one element of C independently, executing something like this:

float sum = 0;
// Do the multiply-add between a row of A and a column of B
for (int j=0; j<num_cols_A; j++){
  // A[i][j] == A[i*num_cols_A+j]
  // B[j][k] == B[j*num_cols_B+k]
  //sum += A[i][j]*B[j][k];
  sum += A[i*num_cols_A+j]*B[j*num_cols_B+k];
}
// C[i][k] == C[i*num_cols_C+k]
C[i*num_cols_C+k]=sum;

As there is no communication between the threads, the remaining issue is that each thread reads global memory 2*num_cols_A times (one element of A and one of B per inner-loop iteration) plus one write to C. Accessing global memory is expensive, especially when multiple threads access it at the same time; the resulting congestion makes the kernel slower.
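Summed over all threads, this is a lot of global-memory traffic for our problem size:

$$2000 \times 600 \times (2 \times 2000 + 1) \approx 4.8 \times 10^9 \text{ global memory accesses per multiplication}$$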

CUDA

__global__ void matrix_2d_mul_float_gpu(float *A, float *B, float *C, int num_rows_A, int num_cols_A, int num_cols_B) {
  // Same index computation for any 2D kernel
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= num_rows_A || k >= num_cols_B) return;

  // Sum is kept in a register (local to each thread)
  float sum = 0;

  // This loop reads global memory 2*num_cols_A times
  for (int j=0; j<num_cols_A; j++){
    // A[i][j] == A[i*num_cols_A+j]
    // B[j][k] == B[j*num_cols_B+k]
    //sum += A[i][j]*B[j][k];
    sum += A[i*num_cols_A+j]*B[j*num_cols_B+k];
  }

  // And one more global write for C
  C[i*num_cols_B+k]=sum;
}

OpenCL

__kernel void matrix_2d_mul_float_gpu(__global float* A, __global float* B, __global float* C, int num_rows_A, int num_cols_A, int num_cols_B)
{
  int i = get_global_id(0);
  int k = get_global_id(1);
  if (i >= num_rows_A || k >= num_cols_B) return;

  // Sum is kept in a register (local to each thread)
  float sum = 0;

  // This loop reads global memory 2*num_cols_A times
  for (int j=0; j<num_cols_A; j++){
    // A[i][j] == A[i*num_cols_A+j]
    // B[j][k] == B[j*num_cols_B+k]
    //sum += A[i][j]*B[j][k];
    sum += A[i*num_cols_A+j]*B[j*num_cols_B+k];
  }

  // And one more global write for C
  C[i*num_cols_B+k]=sum;
}

In this case each thread computes one entry of the result C(i,k) by reading one row of A and one column of B.
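The host-side launch is not shown in the chapter; a minimal sketch of how the CUDA kernel might be launched, assuming a 16x16 thread block and device pointers d_A, d_B and d_C already filled with cudaMemcpy, looks like this:

// Hypothetical launch of the naive kernel: one thread per element of C.
// The bounds check inside the kernel discards the extra threads of the last blocks.
dim3 block(16, 16);
dim3 grid((num_cols_B + block.x - 1) / block.x,   // k dimension (columns of C)
          (num_rows_A + block.y - 1) / block.y);  // i dimension (rows of C)
matrix_2d_mul_float_gpu<<<grid, block>>>(d_A, d_B, d_C,
                                         num_rows_A, num_cols_A, num_cols_B);
cudaDeviceSynchronize();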

Ok let's check how the naive version performed.

==2543== Profiling application: ./matrix_big_mul
==2543== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 89.60%  226.14ms        10  22.614ms  20.433ms  23.861ms  matrix_2d_mul_float_gpu(float*, float*, float*, int, int, int)
  8.20%  20.684ms        20  1.0342ms  456.25us  1.6279ms  [CUDA memcpy HtoD]
  2.20%  5.5601ms        10  556.01us  457.15us  982.74us  [CUDA memcpy DtoH]

Each matrix multiplication on the GPU took about 23 ms (kernel only); adding the CPU/GPU transfer time we get a total of roughly 48 ms. This means a speed-up of $$\frac{9560}{48}\approx 199$$.

So even the naive implementation is about 199 times faster than the original sequential version.

Now let's compare with the multi-threaded CPU version (which took 14.6 seconds in total).

laraujo@lindev:~/work/learningOpenCl/chap2$ sudo perf stat ./matrix_big_mul
Size in bytes A: 16000000
Size in bytes B: 4800000
Matrix multiplication done 0
Matrix multiplication done 1
Matrix multiplication done 2
Matrix multiplication done 3
Matrix multiplication done 4
Matrix multiplication done 5
Matrix multiplication done 6
Matrix multiplication done 7
Matrix multiplication done 8
Matrix multiplication done 9

 Performance counter stats for './matrix_big_mul':

        348.548602      task-clock (msec)         #    0.998 CPUs utilized          
                24      context-switches          #    0.069 K/sec                  
                 2      cpu-migrations            #    0.006 K/sec                  
             3,387      page-faults               #    0.010 M/sec                  
     1,355,696,720      cycles                    #    3.890 GHz                    
   <not supported>      stalled-cycles-frontend  
   <not supported>      stalled-cycles-backend   
     1,593,889,784      instructions              #    1.18  insns per cycle        
       348,741,095      branches                  # 1000.552 M/sec                  
           363,462      branch-misses             #    0.10% of all branches        

       0.349369499 seconds time elapsed

Comparing with the multi-threaded CPU version, 14.6 s against roughly 0.35 s, the GPU program is about 42x faster.

This looks good, and the execution timeline in the NVIDIA Visual Profiler confirms it: the kernel execution time dominates the CPU/GPU transfers, and we are still much faster than the CPU.

So what's the problem with the naive implementation? Let's use the NVIDIA Visual Profiler to analyse the "Kernel Performance Limiter" and the "Kernel Latency".

This shows that we're doing more memory operations than actual computation; the profiler also reports "Kernel performance is bounded by memory bandwidth".

Now let's check what is stalling our kernel.

Here our biggest "Stall Reason" is "Execution Dependency", which means our threads spend most of their time waiting for data.

Making it faster by tiling

In the case of matrix multiplication we can divide the matrices into tiles and process the tiles in parallel. While doing this we bring each tile into shared memory, which is much faster to read than global memory (where the kernel pointers A, B and C live), and the running sum for each element of C can be kept in a register (a local variable), which is faster still.

In the end we just accumulate the partial results of the tiles.

Inside each tile the calculation remains the same multiply-add.

This is the new version:

__global__ void matrix_2d_mul_float_gpu(float *A, float *B, float *C, int num_rows_A, int num_cols_A, int num_cols_B) {
  // Create shared variables (Available to all threads on the same block)
  __shared__ float A_tile[N_THREADS][N_THREADS];
  __shared__ float B_tile[N_THREADS][N_THREADS];
  // Block index
  int bx = blockIdx.x; int by = blockIdx.y;
  // Thread index
  int tx = threadIdx.x; int ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = num_cols_A * N_THREADS * by;
  // Index of the last sub-matrix of A processed by the block
  int aEnd   = aBegin + num_cols_A - 1;
  // Index of the first sub-matrix of B processed by the block
  int bBegin = N_THREADS * bx;
  int bStep  = N_THREADS * num_cols_B;
  int aStep  = N_THREADS;

  float sum = 0;

  // Loop over all tiles of A and B needed to compute this block's tile of C
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Each thread loads one element of the A tile and one of the B tile
    // (the B tile is stored transposed; the indexing below compensates for that)
    A_tile[ty][tx] = A[a + num_cols_A * ty + tx];
    B_tile[tx][ty] = B[b + num_cols_B * tx + ty];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two tiles; each thread accumulates one element of C
    for (int k = 0; k < N_THREADS; ++k)
      sum += A_tile[ty][k] * B_tile[k][tx];

    // Wait for all threads to finish with the current tiles before loading new ones
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = num_cols_B * N_THREADS * by + N_THREADS * bx;
  C[c + num_cols_B * ty + tx] = sum;
}
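The kernel above assumes that N_THREADS is a compile-time constant and that the matrix dimensions are multiples of N_THREADS (there is no bounds check). A hypothetical launch for our 2000x2000 and 2000x600 matrices, assuming N_THREADS is defined as 8 (both 2000 and 600 are divisible by 8) and device pointers d_A, d_B and d_C, could look like this:

#define N_THREADS 8

// One thread block computes one N_THREADS x N_THREADS tile of C
dim3 block(N_THREADS, N_THREADS);
dim3 grid(num_cols_B / N_THREADS,   // tiles along the columns of C
          num_rows_A / N_THREADS);  // tiles along the rows of C
matrix_2d_mul_float_gpu<<<grid, block>>>(d_A, d_B, d_C,
                                         num_rows_A, num_cols_A, num_cols_B);
cudaDeviceSynchronize();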

Comparing the kernel execution time, we are now faster by a factor of roughly 4.5 (from about 23 ms down to 5 ms per multiplication). Let's run the profiler again to check the balance between computation and memory.

Our kernel duration dropped to about 5 ms, but we're still bounded by memory bandwidth; now, however, the pressure is on shared memory rather than on L2/global memory, and shared memory is much faster.

Get more information

There is a compile option, -lineinfo, that allows the NVIDIA profiler to annotate your source code and show where you are losing time (or stalled waiting on memory).

nvcc matrix_big_mul_tile.cu -o matrix_big_mul_tile -lineinfo

Below we can see with "Kernel Profile PC Sampling" that the selected instruction (line 51) spends most of its time on "Execution Dependency".

Line 44, in turn, spends most of its time waiting for data.

The bad news is that this tool only works with CUDA. For more information on using the NVIDIA Visual Profiler you may refer here, and to the code which shows a simple transpose algorithm being optimized.
