I'm implementing a function to find the sum of an array by using reduction, my array have 32*32 elements and its values is 0 ... 1023. The my expected sum value is 523776, but my reult is 15872, it wrong. Here is my code:
#include <stdio.h>
#include <cuda.h>
#define w 32
#define h 32
#define N w*h
__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);
int main( void ) {
    int a[N], b[N]; // copies of a, b, c
    int *dev_a, *dev_b; // device copies of a, b, c
    int size = N * sizeof( int ); // we need space for 512 integers
    // allocate device copies of a, b, c
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );
    fill_array( a, N );
    // copy inputs to device
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(w+blocksize.x-1)/blocksize.x;
    gridsize.y=(h+blocksize.y-1)/blocksize.y;
    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
    // copy device result back to host copy of c
    cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );
    printf("Reduced sum of Array elements = %d \n", b[0]);
    cudaFree( dev_a );
    cudaFree( dev_b );
    return 0;
}
__global__ void reduce(int *g_idata, int *g_odata) {
    __shared__ int sdata[256];
    // each thread loads one element from global to shared mem
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (int s=1; s < blockDim.x; s *=2)
    {
        int index = 2 * s * threadIdx.x;;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (threadIdx.x == 0)
        atomicAdd(g_odata,sdata[0]);
}
// CPU function to generate a vector of random integers
void fill_array (int *a, int n)
{
    for (int i = 0; i < n; i++)
        a[i] = i;
}
 
     
    