CUDA: Understanding the behavior of variables in the registers file in a loop with a dot product example

Question

I am very new to CUDA programming. Currently I have difficulties in understanding the behavior of the following program to calculate dot product of two vectors.

The dot product kernel, dotProd calculates the product of each element and reduce the the results to a shorter vector of length blockDim.x*gridDim.x. Then the results in the vector *out is copied back to Host for further reduction.

The second version, dotProdWithSharedMem is copied from the CUDA By Example book, see here.

My questions are:

When the kernel is initiated with enough threads (nThreadsPerBlock*nblocks >= vector_length), the result of dotProd matches the one calculated by CPU, but the result of dotProdWithSharedMem is different from the two. What can be the possible causes? A possible output of $ dot_prod.o 17 512:

    Number of threads per block : 256 
    Number of blocks in the grid: 512 
    Total number of threads     : 131072 
    Length of vectors           : 131072 

    GPU using registers: 9.6904191971, time consummed: 0.56154 ms
    GPU using shared   : 9.6906833649, time consummed: 0.04473 ms
    CPU result         : 9.6904191971, time consummed: 0.28504 ms

When the kernel is initiated with not enough threads (nThreadsPerBlock*nblocks < vector_length), the GPU results seem to be less accurate. However the while loop is supposed to handle this problem. I guess there might be something happen to the registers variable temp in the loop, otherwise the result should remain the same as in question 1. A possible output of $ dot_prod.o 17 256:

Number of threads per block : 256 
Number of blocks in the grid: 256 
Total number of threads     : 65536 
Length of vectors           : 131072 

GPU using registers: 9.6906890869, time consummed: 0.31478 ms
GPU using shared   : 9.6906604767, time consummed: 0.03530 ms
CPU result         : 9.6904191971, time consummed: 0.28404 ms

I don't quite understand the size of the cache in dotProdWithSharedMem. Why it is of nThreadsPerBlock elements other than the total number of threads nThreadsPerBlock * nblocks? I think that should be the right number of temp values, is this correct?

The code:

#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>


#define PI (float) 3.141592653589793

const size_t nThreadsPerBlock = 256;


static void HandleError(cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess) {
    printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
            file, line );
    exit( EXIT_FAILURE );
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


__global__ void dotProd(int length, float *u, float *v, float *out) {
    unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
    float temp = 0;

    while (tid < length) {
        temp += u[tid] * v[tid];
        tid  += blockDim.x * gridDim.x;
    }
    out[tid_const] = temp;
}


__global__ void dotProdWithSharedMem(int length, float *u, float *v, float *out) {
    __shared__ float cache[nThreadsPerBlock];
    unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned cid = threadIdx.x;

    float temp = 0;
    while (tid < length) {
        temp += u[tid] * v[tid];
        tid  += blockDim.x * gridDim.x;
    }

    cache[cid] = temp;
    __syncthreads();

    int i = blockDim.x/2;
    while (i != 0) {
        if (cid < i) {
            cache[cid] += cache[cid + i];
        }
        __syncthreads();
        i /= 2;
    }

    if (cid == 0) {
        out[blockIdx.x] = cache[0];
    }
}


int main(int argc, char* argv[]) {

    size_t vec_len  = 1 << std::stoi(argv[1]);
    size_t size     = vec_len * sizeof(float);
    size_t nblocks  = std::stoi(argv[2]);
    size_t size_out   = nThreadsPerBlock*nblocks*sizeof(float);
    size_t size_out_2 = nblocks*sizeof(float);

    float *u     = (float *)malloc(size);
    float *v     = (float *)malloc(size);
    float *out   = (float *)malloc(size_out);
    float *out_2 = (float *)malloc(size_out_2);

    float *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays

    float res_gpu = 0;
    float res_gpu_2 = 0;
    float res_cpu = 0;

    dim3 dimGrid(nblocks, 1, 1);
    dim3 dimBlocks(nThreadsPerBlock, 1, 1);

    // Initiate values
    for(size_t i=0; i<vec_len; ++i) {
        u[i] = std::sin(i*PI*1E-2);
        v[i] = std::cos(i*PI*1E-2);
    }

    HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
    HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
    HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
    HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
    HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
    HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );


    auto t1_gpu = std::chrono::system_clock::now();
    dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
    cudaDeviceSynchronize();
    HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
    // Reduction
    for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
        res_gpu += out[i];
    }


    auto t2_gpu = std::chrono::system_clock::now();
    // GPU version with shared memory
    dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
    cudaDeviceSynchronize();
    HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
    // Reduction
    for(size_t i=0; i<nblocks; ++i) {
        res_gpu_2 += out_2[i];
    }
    auto t3_gpu = std::chrono::system_clock::now();


    // CPU version for result-check
    for(size_t i=0; i<vec_len; ++i) {
        res_cpu += u[i] * v[i];
    }
    auto t2_cpu = std::chrono::system_clock::now();


    double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
    double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
    double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();

    printf("Number of threads per block : %i \n", nThreadsPerBlock);
    printf("Number of blocks in the grid: %i \n", nblocks);
    printf("Total number of threads     : %i \n", nThreadsPerBlock*nblocks);
    printf("Length of vectors           : %i \n\n", vec_len);
    printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
    printf("GPU using shared   : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
    printf("CPU result         : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);

    cudaFree(dev_u);
    cudaFree(dev_v);
    cudaFree(dev_out);
    cudaFree(dev_out_2);
    free(u);
    free(v);
    free(out);
    free(out_2);

    return 0;
}

Thank you for your patience for having done reading this LONG post! Any help will be deeply appreciated!

Niko

score 3 · Accepted Answer · answered Feb 12 '19 at 05:34

You're exploring the limits of float precision combined with the variation associated with floating point order of operations. The actual "accuracy" here will depend on the exact data and exact order of operations. The different algorithms will have different order of operations, and therefore different results.

You may want to read this paper.

One of the assumptions you seem to be making is that the CPU result is the accurate one without any justification for that assumption.

If we define "accuracy" as the difference (i.e. "closeness") between the result and the numerically correct result, I suspect that the shared memory result is the more accurate one.

If we convert your code to use double type instead of float type, we observe that:

The result of all 3 approaches are much closer (identical in the printout).
The double results don't match any of the float case.
The shared memory result from the float case is actually the result that is closest to the double case results.

Here's a test case demonstrating this:

$ cat t397.cu
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>

#ifndef USE_DOUBLE
typedef float ft;
#else
typedef double ft;
#endif
#define PI (ft) 3.141592653589793

const size_t nThreadsPerBlock = 256;


static void HandleError(cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess) {
    printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
            file, line );
    exit( EXIT_FAILURE );
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


__global__ void dotProd(int length, ft *u, ft *v, ft *out) {
    unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
    ft temp = 0;

    while (tid < length) {
        temp += u[tid] * v[tid];
        tid  += blockDim.x * gridDim.x;
    }
    out[tid_const] = temp;
}


__global__ void dotProdWithSharedMem(int length, ft *u, ft *v, ft *out) {
    __shared__ ft cache[nThreadsPerBlock];
    unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned cid = threadIdx.x;

    ft temp = 0;
    while (tid < length) {
        temp += u[tid] * v[tid];
        tid  += blockDim.x * gridDim.x;
    }

    cache[cid] = temp;
    __syncthreads();

    int i = blockDim.x/2;
    while (i != 0) {
        if (cid < i) {
            cache[cid] += cache[cid + i];
        }
        __syncthreads();
        i /= 2;
    }

    if (cid == 0) {
        out[blockIdx.x] = cache[0];
    }
}


int main(int argc, char* argv[]) {

    size_t vec_len  = 1 << std::stoi(argv[1]);
    size_t size     = vec_len * sizeof(ft);
    size_t nblocks  = std::stoi(argv[2]);
    size_t size_out   = nThreadsPerBlock*nblocks*sizeof(ft);
    size_t size_out_2 = nblocks*sizeof(ft);

    ft *u     = (ft *)malloc(size);
    ft *v     = (ft *)malloc(size);
    ft *out   = (ft *)malloc(size_out);
    ft *out_2 = (ft *)malloc(size_out_2);

    ft *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays

    ft res_gpu = 0;
    ft res_gpu_2 = 0;
    ft res_cpu = 0;

    dim3 dimGrid(nblocks, 1, 1);
    dim3 dimBlocks(nThreadsPerBlock, 1, 1);

    // Initiate values
    for(size_t i=0; i<vec_len; ++i) {
        u[i] = std::sin(i*PI*1E-2);
        v[i] = std::cos(i*PI*1E-2);
    }

    HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
    HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
    HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
    HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
    HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
    HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );


    auto t1_gpu = std::chrono::system_clock::now();
    dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
    cudaDeviceSynchronize();
    HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
    // Reduction
    for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
        res_gpu += out[i];
    }


    auto t2_gpu = std::chrono::system_clock::now();
    // GPU version with shared memory
    dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
    cudaDeviceSynchronize();
    HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
    // Reduction
    for(size_t i=0; i<nblocks; ++i) {
        res_gpu_2 += out_2[i];
    }
    auto t3_gpu = std::chrono::system_clock::now();


    // CPU version for result-check
    for(size_t i=0; i<vec_len; ++i) {
        res_cpu += u[i] * v[i];
    }
    auto t2_cpu = std::chrono::system_clock::now();


    double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
    double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
    double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();

    printf("Number of threads per block : %i \n", nThreadsPerBlock);
    printf("Number of blocks in the grid: %i \n", nblocks);
    printf("Total number of threads     : %i \n", nThreadsPerBlock*nblocks);
    printf("Length of vectors           : %i \n\n", vec_len);
    printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
    printf("GPU using shared   : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
    printf("CPU result         : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);

    cudaFree(dev_u);
    cudaFree(dev_v);
    cudaFree(dev_out);
    cudaFree(dev_out_2);
    free(u);
    free(v);
    free(out);
    free(out_2);

    return 0;
}
$ nvcc -std=c++11 t397.cu -o t397
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads     : 131072
Length of vectors           : 131072

GPU using registers: 9.6904191971, time consummed: 0.89290 ms
GPU using shared   : 9.6906833649, time consummed: 0.04289 ms
CPU result         : 9.6904191971, time consummed: 0.41527 ms
$ nvcc -std=c++11 t397.cu -o t397 -DUSE_DOUBLE
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads     : 131072
Length of vectors           : 131072

GPU using registers: 9.6913433287, time consummed: 1.33016 ms
GPU using shared   : 9.6913433287, time consummed: 0.05032 ms
CPU result         : 9.6913433287, time consummed: 0.41275 ms
$

Thank you very much for this excellent answer Robert. Indeed, I didn't do enough verification and validation for the correctness of the three results. BTW, I was about to ask you about my third question, but suddenly noticed that objects in shared memory only exist in blocks. They cannot receive values from threads that are in other blocks. The article you referred is very helpful. Have a nice day :) — Niko Z., Feb 12 '19 at 05:55

CUDA: Understanding the behavior of variables in the registers file in a loop with a dot product example

1 Answers1

Linked