I use reduction logic in code by referring How to find the sum of array in CUDA by reduction.
But It is giving some errors. I am not getting my mistake, could you please help me out??
required specification: 1.Cuda toolkit v6.5 2. graphics: GTX 210 (compute capability 1.2) 3. visual studio 2013
#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>
#define SIZE 10
#define N 100
__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
    __shared__ int sdata[256];
    int i = threadIdx.x + (blockIdx.x*blockDim.x);
    sdata[threadIdx.x] = d_a[i];
    __syncthreads();
    if (i<SIZE)
    for (i = 2; i<SIZE; i++)
    {
        int counter = 0;
        for (int j = 2; j<d_a[i]; j++)
        {
            if (d_a[i] % j == 0)
            {
            counter = 1; break;
            }
        }
if (counter == 0)
        {
            d_b[i] = d_a[i];
        }
    }
    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * threadIdx.x;;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (threadIdx.x == 0)
        atomicAdd(d_c, sdata[0]);
}
}
int main()
{
    clock_t tic = clock();
    int *a, *b, *summation=0, sum = 0,count=-1;       //declare summation as double/long if needed
    int *d_a, *d_b, *d_c;
    //int blocks, block_size = 512;
    int size = N * sizeof(int); 
    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    summation = (int *)malloc(SIZE*sizeof(int));
    cudaMalloc((void**)&d_a, SIZE * sizeof(int));
    cudaMalloc((void**)&d_b, SIZE * sizeof(int));
    cudaMalloc((void**)&d_c, SIZE * sizeof(int));
    for (int i = 1; i<SIZE; i++)
    {
        a[i] = i;
        b[i] = 0;
    }
    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    /*blocks = SIZE / block_size;
    if (SIZE% block_size != 0)
        blocks++;   */
    dim3 blocksize(256); // create 1D threadblock
    dim3 gridsize(N / blocksize.x);  //create 1D grid
    vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);
    //cudaThreadSynchronize();
    cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    for (int m = 0; m < SIZE; m++)
    {
        if (b[m] != 0)
        {
            printf("\n prime no is:%d", b[m]);
            count = count + 1;
        }
    }
    printf("\n\n Total prime no. are: %d", count);
/*      for (int j = 1; j<SIZE; j++)
    {
        sum = sum + b[j];
    }*/
    printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);
    clock_t toc = clock();
    printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    free(a);    free(b);    free(summation);
    cudaFree(d_a);      cudaFree(d_b);      cudaFree(d_c);
    getchar();  return 0;
}
 
     
     
    