Why do malloc() and calloc() not seem work when using CUDA?

Question

The dynamic memory allocation using malloc()/calloc() seems to be not working properly, when used in CUDA.

As for checking, I wrote the following code using calloc(). The array seems to be allocated with required memory and I could also assign some values. But I could see only the garbage values, when I print the matrix elements from the Kernel. I thought it could be a problem with cudaMemcpy() but, instead of **A, if I put like, A[5][5], the code works perfect.

And the memset() usage leads to 'core dumped' error.

Could anyone help in getting along with malloc()/calloc() with no errors?

#include<stdio.h>

__global__ void threads(int* dA)
{
 int gi=threadIdx.x+(blockIdx.x*blockDim.x);
 int gj=threadIdx.y+(blockIdx.y*blockDim.y);

 printf("global Id in X= %d, in Y =%d, E= %d\n", gi,gj,dA[gi*5+gj]);
}

int main(int argc, char** argv)
{
 int **A, *dA;
 int R=5, C=4;
 int size=R*C*sizeof(int);

 A=(int **)calloc(R, sizeof(int*));

 for(int i=0; i<R; i++)
    A[i]=(int *)calloc(C, sizeof(int));

// memset(A, 0, size);

 for(int i=0; i<R; i++)
   {
   for(int j=0; j<C; j++)
      A[i][j]=i*C+j;
   }

printf(" \n Before \n");
for(int i=0; i<R; i++)
   {
    for(int j=0; j<C; j++)
        printf("%d ",A[i][j]);
    printf("\n");
   }

cudaMalloc((int**) &dA, size);
cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice);

dim3 nblocks(R,C);
dim3 nthreads(1);

threads<<<nblocks, nthreads>>>(dA);
cudaDeviceSynchronize();

cudaFree(dA);
free(A);
return 0;
}

What does it mean that `malloc()/calloc()` does not properly work in CUDA? From the code above (which I haven't tested), it seems that you are trying to copy a double pointer from host to device. The correct way to do this has been posted by talonmies in his answer to [cuda 2D array problem](http://stackoverflow.com/questions/6137218/cuda-2d-array-problem). — Vitality, Nov 13 '13 at 13:29
You're not doing any [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) — Robert Crovella, Nov 13 '13 at 14:38
@all: Thank you for your answers. I understood my mistake and now I want to deal with the flattened array while sending them to the Kernels. I've to deal with array of size 200x600 or sort of..! Is it fine to use such a lengthy 1D array in computations? — user86927, Nov 14 '13 at 06:16
Depends on your device memory, If you have enough memory you can use without any problem. I use 512x512x256 flattened into 1d array. — Sagar Masuti, Nov 14 '13 at 07:07

score 3 · Accepted Answer · edited May 23 '17 at 12:34

The problem with your code isn't related to the use of malloc and calloc which are host functions. The problem is that you are not correctly dealing with double pointers and how they are passed to a CUDA kernel. As pointed out by Robert Crovella, a proper error checking would have let you gain a better insight in what was missing by your implementation.

Below there is a working version of your program. It is nothing more than an application of the answer provided by talonmies in cuda 2D array problem.

#include<stdio.h>
#include<conio.h>

inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
    if (code != 0) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }       
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

__global__ void threads(int* dA[]) {

    int gi=blockIdx.x;
    int gj=blockIdx.y;

    printf("global Id in X= %i, in Y =%i, E= %i\n", gi, gj, dA[gi][gj]);

}

int main(int argc, char** argv)
{

    int **A, *dA;
    int R=5, C=4;
    int size=R*C*sizeof(int);

    A=(int**)calloc(R,sizeof(int*));
    for(int i=0; i<R; i++) A[i]=(int*)calloc(C,sizeof(int));
    for(int i=0; i<R; i++) for(int j=0; j<C; j++) A[i][j]=i*C+j;

    printf("Before transfer \n");
    for(int i=0; i<R; i++) { for(int j=0; j<C; j++) { printf("%d ",A[i][j]); } printf("\n"); }
    printf("\n");

    // --- Create an array of R pointers on the host
    int** h_A = (int**)malloc(R*sizeof(int*));
    for(int i=0; i<R;i++){
        // --- For each array pointer, allocate space for C ints on the device
        GPUerrchk(cudaMalloc((void**)&h_A[i], C*sizeof(int)));
        // --- Copy the rows of A from host to device at the address determined by h_A[i]
        GPUerrchk(cudaMemcpy(h_A[i], &A[i][0], C*sizeof(int), cudaMemcpyHostToDevice));
    }

    // --- Create an array of R pointers on the device
    int **d_A; GPUerrchk(cudaMalloc((void***)&d_A, R*sizeof(int*)));
    // --- Copy the addresses of the rows of the device matrix from host to device
    GPUerrchk(cudaMemcpy(d_A, h_A, R*sizeof(int*), cudaMemcpyHostToDevice));

    dim3 nblocks(R,C);
    dim3 nthreads(1);

    printf("After transfer \n");
    threads<<<nblocks, nthreads>>>(d_A);
    GPUerrchk(cudaPeekAtLastError());

    cudaDeviceSynchronize();

    getch();

    return 0;

}

As also underlined in cuda 2D array problem, it is always better to flatten the 2D array to 1D to avoid this cumbersome array handling.

Why do malloc() and calloc() not seem work when using CUDA?

1 Answers1