I'm a beginner at CUDA programming. The program below looks simple, yet it doesn't work.
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
// Kernel: stores the sum of the first two elements of `t` into the third.
//
// t - pointer to at least three ints; must be dereferenceable from device
//     code. t[0] and t[1] are read, t[2] is overwritten.
//
// No thread or block index is used, so every launched thread performs the
// same write; the caller in this file launches a single thread.
__global__ void add(int *t)
{
    const int lhs = t[0];
    const int rhs = t[1];
    t[2] = lhs + rhs;
}
// Reports a failed CUDA runtime call on stderr.
//
// status - return code of the CUDA call (or of cudaGetLastError()).
// what   - short label for the operation, used in the diagnostic.
//
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Copies {1, 2, 0} to the device, runs `add` with a single thread so that
// element 2 becomes element 0 + element 1, copies the array back and prints
// element 2 on stdout.
//
// Every CUDA call is checked. Without the checks a failed allocation, copy or
// kernel launch goes unnoticed and the program prints the initial 0 instead
// of the sum.
//
// Returns 0 on success and 1 if any CUDA call failed (details on stderr).
int main(int argc, char **argv)
{
    int sum_cpu[3], *sum_gpu = 0;
    sum_cpu[0] = 1;
    sum_cpu[1] = 2;
    sum_cpu[2] = 0;

    if (!cudaOk(cudaMalloc((void**)&sum_gpu, 3 * sizeof(int)), "cudaMalloc"))
        return 1;

    bool ok = cudaOk(cudaMemcpy(sum_gpu, sum_cpu, 3 * sizeof(int), cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");

    if (ok) {
        add<<<1, 1>>>(sum_gpu);
        // A kernel launch returns nothing: launch failures are reported by
        // cudaGetLastError(), failures during execution by the next
        // synchronizing call.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "add kernel execution");
    }

    ok = ok && cudaOk(cudaMemcpy(sum_cpu, sum_gpu, 3 * sizeof(int), cudaMemcpyDeviceToHost),
                      "cudaMemcpy (device to host)");

    // Only print a result that was actually produced by the device.
    if (ok)
        std::cout << sum_cpu[2];

    // Release the device buffer on both the success and the failure path.
    ok = cudaOk(cudaFree(sum_gpu), "cudaFree") && ok;

    return ok ? 0 : 1;
}
I'm compiling it like this
nvcc main.cu
It compiles, but the returned value is 0. I tried printing from within the kernel and it won't print, so I assume it doesn't execute. Can you explain why?
 
     
     
    