My custom Load Generator code for GPU (c & cuda)

Below is a piece of Cuda code that I wrote which does a simple vector add on a GPU. It is particularly useful in comparing the single thread, multi thread and multi grid performances of the GPU. By resizing the size of the array this code can also be used for GPU stress testing.

Let’s first start with a non-cuda just plain C version of the code. It simple does a out[i] = a[i] + b[i] vector addition. The program will display the total bytes reserved on the memory for out, a and b arrays. When the array addition is completed it simply prints “PASSED“.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#define N 500000000
#define MAX_ERR 1e-6
void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i++){
        out[i] = a[i] + b[i];
    }
}
int main(){
    float *a, *b, *out;
    // Allocate memory
    int toplam_alan = sizeof(float) * N;
    printf ("a, b ve out herbiri için toplam %d bytes ayrildi\n", toplam_alan);
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    // Initialize array
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    // Main function
    vector_add(out, a, b, N);
    // Verification
    for(int i = 0; i < N; i++){
//        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");
}

Here is a single thread cuda version.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 500000000
#define MAX_ERR 1e-6
__global__ void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i ++){
        out[i] = a[i] + b[i];
    }
}
int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    // Allocate host memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
    // Executing kernel
    vector_add<<<1,1>>>(d_out, d_a, d_b, N);
    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
    // Verification
    for(int i = 0; i < N; i++){
//        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");
    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    // Deallocate host memory
    free(a);
    free(b);
    free(out);
}

To do this operation with a multi-thread cuda use the following.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 500000000
#define MAX_ERR 1e-6
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int index = threadIdx.x;
    int stride = blockDim.x;
    for(int i = index; i < n; i += stride){
        out[i] = a[i] + b[i];
    }
}
int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    // Allocate host memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
    // Executing kernel
    vector_add<<<1,256>>>(d_out, d_a, d_b, N);
    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
    // Verification
    for(int i = 0; i < N; i++){
//        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");
    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    // Deallocate host memory
    free(a);
    free(b);
    free(out);
}

Here is the multi-grid version. This version can utilize multiple blocks, allowing it to handle very large arrays. Each thread computes a single element and the global index (tid) ensures that all elements are covered, making it more efficient for highly parallel workloads.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 500000000
#define MAX_ERR 1e-6
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Handling arbitrary vector size
    if (tid < n){
        out[tid] = a[tid] + b[tid];
    }
}
int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    // Allocate host memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
    // Executing kernel
    int block_size = 256;
    int grid_size = ((N + block_size) / block_size);
    vector_add<<<grid_size,block_size>>>(d_out, d_a, d_b, N);
    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
    // Verification
    for(int i = 0; i < N; i++){
//        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");
    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    // Deallocate host memory
    free(a);
    free(b);
    free(out);
}

Here are some figures that I got on performance profiling Sabancı University toSUn HPC cluster’s GPUs.

non_cuda_just_c:        real    0m6.827s
single thread cuda:     real    0m34.421s
multi_thread_cuda:      real    0m7.308s
multi_grid_cuda:        real    0m6.771s

If you are testing these codes in an HPC environment make sure you first load the cuda module.

module load cuda/9.2

Author: Serdar Acir