I try to code a simple example with cuda C, I follow a screencast about this but I have wrong result
this is an the example :
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
#define SIZE    1024
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    int i = threadIdx.x;
    if (i < n){
        c[i] = a[i] + b[i];
    }
}
int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");
    }
    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));
    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));
    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }
    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d\n", i, c[i]);
    free(a);
    free(b);
    free(c);
    enter code here
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
the result is :
c[0]=0
c[1]=0
c[2]=0
c[3]=0
c[4]=0
c[5]=0
c[6]=0
c[7]=0
c[8]=0
c[9]=0
but I expect this result :
c[0]=0
c[1]=2
c[2]=4
c[3]=6
c[4]=8
c[5]=10
c[6]=12
c[7]=14
c[8]=16
c[9]=18
please any one can help about this !
 
     
    