I'm newbie in CUDA C... I want to sum elements of array(with reduce) in 1 block and 267 threads use the shared memory. I read a book "CUDA by example, an introdution to General-Purpose to GPU Programming". According to some recomendations from her, i write my version of program:
__global__ void
conva(int* a, int* out) 
{
    __shared__ int cache[534];
    int cacheIndex = threadIdx.x;
    for(int n=0; n<2;++n) {
        cache[cacheIndex+n] = a[cacheIndex+n];
        int i = blockDim.x/2;
        while (i != 0) {
            if (cacheIndex < i)
                cache[cacheIndex + n] += cache[cacheIndex + n + i];
            __syncthreads();
            i /= 2;
        } 
    }
    //need or not this __syncthreads(), I don't know
    __syncthreads();
     if (cacheIndex == 0)
         out = &cache[0];
}
int main(int argc, char** argv)
{
    //enter array for sum
    int convolution[534];
    for(int i=0; i<534; ++i) 
        convolution[i] = 1;
    //variable in which we take a sum from device
    int summa = 0;
    //it we copy on device from host
    int* tash;
    int* convolution_gpu;
    cudaMalloc((void**)(&convolution_gpu), 534*sizeof(int));
    cudaMalloc((void**)(&tash), sizeof(int));
    cudaMemcpy(convolution_gpu, convolution, 534*sizeof(int),  cudaMemcpyHostToDevice );
    //call core with 1 block and 267 threads
    conva<<<1, 267>>>(convolution_gpu, tash);
    cudaMemcpy(&summa, tash, sizeof(int), cudaMemcpyDeviceToHost);
    //and here I want 534 but I have garbage(may be)
    std::cout<<summa<<std::endl;
    cudaFree(convolution_gpu);
    cudaFree(tash);
    getchar();
}
Tell please, where here is error and help me to resolve her... (sorry for my english)
 
    