I am running a cuda vec addtion program and getting zeros as the output of its sum later. I have tried debugging but am not able to get to the problem at hand. It should be adding the numbers but is rather simply printing out zeros which I am not able to understand why is happening.
I have tried doing everything to the code and still I am not getting any output.
using namespace std;
 __global__ void vecADDKernal(double *A, double *B, double *C, int n){
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    if(id<n) C[id] = A[id] + B[id];
}
int main( ){
    int n = 1048576;
    int size = n*sizeof(double);
    double *d_A, *d_B;
    double *d_C;
    double *h_A, *h_B, *h_C;
    h_A = (double*)malloc(size);
    h_B = (double*)malloc(size);
    h_C = (double*)malloc(size);
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);
    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_A[i] = 2*i;
        h_B[i] = 3*i;
    }
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    int blockSize = 256;
    // Number of thread blocks in grid
    int gridSize = ceil(n/blockSize);
    vecADDKernal<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    double sum = 0;
    for(int a = 0; a<n; a++) {
        sum = h_C[a];
        cout<<h_C[a]<<endl;
    }
    cout<<"HI "<< sum <<endl;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
 
     
    