I am trying to add 2 arrays using CUDA , but it didn't work .
I did all that it should be done:
1) I parallelized the VectorAdd function
2) I allocated memory to the GPu and moved the data to the GPU
3) And last thing i modified the function VectorAdd to run on the GPU
This is the code :
#define SIZE 1024
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    int i = threadIdx.x ;
    if(i < n)
        c[i] = a[i] + b[i];
}
int main()
{
    int *a , *b , *c;
    int *d_a , *d_b , *d_c;
    a = (int *)malloc(SIZE * sizeof(int));
    b = (int *)malloc(SIZE * sizeof(int));
    c = (int *)malloc(SIZE * sizeof(int));
    cudaMalloc( &d_a , SIZE * sizeof(int) );
    cudaMalloc( &d_b , SIZE * sizeof(int) );
    cudaMalloc( &d_c , SIZE * sizeof(int) );
    for ( int i = 0 ; i < SIZE ; ++i)
    {
        a[i] = i ;
        b[i] = i ;
        c[i] = 0 ;
    }
    cudaMemcpy(d_a, a, SIZE *sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE *sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE *sizeof(int), cudaMemcpyHostToDevice);
    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
    for(int i = 0 ; i < 10 ; ++i)
    {
        printf("C[%d] =  %d\n", i, c[i]);
    }
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
The output on the console is this :
c[0] = 0 , c[1] = 0 , c[2] = 0 , c[3] = 0 , c[4] = 0 ....
Why is that it should be :
c[0] = 0 ; c[1] = 2 ; c[2] = 4 ....
 
     
     
    