The dynamic memory allocation using malloc()/calloc() seems to be not working properly, when used in CUDA.
As for checking, I wrote the following code using calloc(). The array seems to be allocated with required memory and I could also assign some values. But I could see only the garbage values, when I print the matrix elements from the Kernel. I thought it could be a problem with cudaMemcpy() but, instead of **A, if I put like, A[5][5], the code works perfect.
And the memset() usage leads to 'core dumped' error.
Could anyone help in getting along with malloc()/calloc() with no errors?
#include<stdio.h>
__global__ void threads(int* dA)
{
 int gi=threadIdx.x+(blockIdx.x*blockDim.x);
 int gj=threadIdx.y+(blockIdx.y*blockDim.y);
 printf("global Id in X= %d, in Y =%d, E= %d\n", gi,gj,dA[gi*5+gj]);
}
int main(int argc, char** argv)
{
 int **A, *dA;
 int R=5, C=4;
 int size=R*C*sizeof(int);
 A=(int **)calloc(R, sizeof(int*));
 for(int i=0; i<R; i++)
    A[i]=(int *)calloc(C, sizeof(int));
// memset(A, 0, size);
 for(int i=0; i<R; i++)
   {
   for(int j=0; j<C; j++)
      A[i][j]=i*C+j;
   }
printf(" \n Before \n");
for(int i=0; i<R; i++)
   {
    for(int j=0; j<C; j++)
        printf("%d ",A[i][j]);
    printf("\n");
   }
cudaMalloc((int**) &dA, size);
cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice);
dim3 nblocks(R,C);
dim3 nthreads(1);
threads<<<nblocks, nthreads>>>(dA);
cudaDeviceSynchronize();
cudaFree(dA);
free(A);
return 0;
}
 
     
     
    