I have a CUDA application in which I am trying to use constant memory. The data in constant memory is only recognized inside the kernel when the kernel is written in the same file as the main function. If I define the kernel function in a different file, the constant memory reads as 0, although the rest of the operation works properly. I am providing a simple dummy program that illustrates the problem. The program has a 48x48 matrix divided into 16x16 blocks, and I store random numbers from 1 to 50 in it. Inside the kernel I add the numbers stored in constant memory to each row of a block. The code is given below:
Header File:
#include <windows.h>
#include <dos.h>
#include <stdio.h>
#include <conio.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <curand.h>
#include <curand_kernel.h>
/*
 * Table of 16 integer offsets held in constant memory. The host fills it with
 * cudaMemcpyToSymbol and test_kernel_1 reads it as test_cons[threadIdx.x].
 *
 * NOTE(review): this line is a definition, not just a declaration. Presumably
 * every .cu file that includes this header gets its own private copy when the
 * project is built without relocatable device code, so a cudaMemcpyToSymbol
 * issued from one file would not be visible to a kernel compiled in another
 * (that kernel would read its own zero-initialized copy). Confirm against the
 * build flags; the usual remedies are to keep the definition, the
 * cudaMemcpyToSymbol call and the kernel in one .cu file, or to declare the
 * symbol `extern` here, define it once, and build with -rdc=true.
 */
__constant__ int test_cons[16];
/* Kernel prototype: first argument is the input matrix, second the output matrix
 * (both device pointers to int, as used by the definition and the launch in main). */
__global__ void test_kernel_1(int *,int *);
Main Program :
/*
 * Evaluates a CUDA runtime call and terminates with a diagnostic if it did not
 * return cudaSuccess. Without this, a failed cudaMemcpyToSymbol / cudaMalloc /
 * launch goes unnoticed and only shows up as wrong numbers in the output.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

/*
 * Host driver: fills a 48x48 matrix with random values in [1, 50], uploads a
 * 16-entry offset table to the constant symbol test_cons, launches
 * test_kernel_1 over a 3x3 grid of 16x16 blocks, and prints the input matrix
 * followed by the result matrix.
 *
 * Returns 0 on success; exits with status 1 on allocation or CUDA failure.
 */
int main(int argc,char *argv[])
{
    const int    dim   = 48;                         /* matrix is dim x dim; must match the kernel's row stride */
    const int    tile  = 16;                         /* block edge; equals the length of test_cons */
    const size_t bytes = (size_t)dim*dim*sizeof(int);
    int *mat,*dev_mat,*res,*dev_res;
    int i,j;
    int test[16]   = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};

    (void)argc;
    (void)argv;

    /* Upload the offsets that the kernel adds to each column of a block. */
    CUDA_CHECK(cudaMemcpyToSymbol(test_cons,test,sizeof(test)));

    mat = (int *)malloc(bytes);
    res = (int *)malloc(bytes);
    if(mat == NULL || res == NULL)
    {   fprintf(stderr,"host allocation of %lu bytes failed\n",(unsigned long)bytes);
        free(mat);
        free(res);
        exit(1);
    }
    memset(res,0,bytes);

    srand((unsigned int)time(NULL));
    for(i=0;i<dim;i++)
    {   for(j=0;j<dim;j++)
        {   /* rand()%50 is 0..49, so +1 gives the intended 1..50 range
               (the previous rand()%(50-1)+1 could never produce 50). */
            mat[i*dim+j] = rand()%50+1;
            printf("%d\t",mat[i*dim+j] );
        }
        printf("\n");
    }

    CUDA_CHECK(cudaMalloc((void **)&dev_mat,bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_res,bytes));
    CUDA_CHECK(cudaMemcpy(dev_mat,mat,bytes,cudaMemcpyHostToDevice));

    /* Host-side names deliberately differ from the device built-ins gridDim/blockDim. */
    dim3 grid(dim/tile,dim/tile,1);
    dim3 block(tile,tile,1);
    test_kernel_1<<< grid,block>>>(dev_mat,dev_res);
    CUDA_CHECK(cudaGetLastError());                  /* reports an invalid launch configuration */

    /* Blocking copy: waits for the kernel and reports any fault raised while it ran. */
    CUDA_CHECK(cudaMemcpy(res,dev_res,bytes,cudaMemcpyDeviceToHost));

    printf("\n\n\n\n");
    for(i=0;i<dim;i++)
    {   for(j=0;j<dim;j++)
        {   printf("%d\t",res[i*dim+j] );
        }
        printf("\n");
    }

    CUDA_CHECK(cudaFree(dev_mat));
    CUDA_CHECK(cudaFree(dev_res));
    free(mat);
    free(res);
    return 0;
}
Kernel Function :
/*
 * Adds a per-column offset from constant memory to every element of a
 * 48-wide row-major matrix: dev_res[r][c] = dev_mat[r][c] + test_cons[threadIdx.x].
 *
 * Launch layout: 2D grid of 2D blocks, one thread per element. blockDim.x must
 * not exceed 16, because test_cons has only 16 entries and is indexed by
 * threadIdx.x. No shared memory is used.
 *
 * dev_mat  device pointer to the 48x48 input matrix
 * dev_res  device pointer to the 48x48 output matrix
 *
 * NOTE(review): test_cons must be the very symbol the host filled with
 * cudaMemcpyToSymbol. When building without relocatable device code
 * (-rdc=true), a __constant__ variable is private to the .cu file that defines
 * it, so this kernel has to be compiled in the same file as that copy;
 * otherwise it reads a separate, zero-initialized instance.
 */
__global__ void test_kernel_1(int *dev_mat,int* dev_res)
{
    const int width = 48;   /* matrix edge and row stride */
    int row = blockIdx.y*blockDim.y+threadIdx.y;
    int col = blockIdx.x*blockDim.x +threadIdx.x;

    /* Threads that fall outside the matrix (grid larger than the data) do nothing. */
    if (row >= width || col >= width)
        return;

    int idx = row*width+col;
    dev_res[idx] = dev_mat[idx] + test_cons[threadIdx.x];
}
When I define the kernel function in the same file as the main program, the constant memory values are correct; if the kernel is in a different file, the test_cons[threadIdx.x] values become 0.
I came across this link, which discusses much the same problem, but I do not fully understand it. It would be very helpful if someone could explain why this is happening and what I need to do to avoid this problem. Any help would be highly appreciated. Thanks.
 
     
     
    