I am trying to compute a histogram using some shared memory to improve performance. However I am running into a problem that I don't seem to figure out. Here is the kernel code i am having problem with. i am sure I am missing something silly but i can't locate it.
__global__
 void histogram_kernel_shared(const unsigned int* const d_vals,
                    unsigned int* d_histo,
                    const unsigned int numElems) {
    unsigned int gid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int lid = threadIdx.x;
    unsigned int bin = d_vals[gid];
    __syncthreads();
    __shared__ unsigned int local_bin[1024];
    local_bin[lid] = d_histo[lid];
    __syncthreads();
    if(local_bin[lid] != d_histo[lid])
        printf("After copy to local. block = %u, lid = %u, local_bin = %u, d_histo = %u \n", blockIdx.x, lid, local_bin[lid], d_histo[lid]);
    __syncthreads();
    // If I comment out this line everything works fine.
    d_histo[lid] = local_bin[lid];  
    // Even this leads to some wrong answers. Printouts on the next printf.
    // d_histo[lid] = d_histo[lid];  
     __syncthreads();
    if(local_bin[lid] != d_histo[lid])
        printf("copy back. block = %u, lid = %u, local_bin = %u, d_histo = %u \n", blockIdx.x, lid, local_bin[lid], d_histo[lid]);
    __syncthreads();
    atomicAdd(&d_histo[bin], static_cast<unsigned int>(1));
    __syncthreads();
    // atomicAdd(&local_bin[bin], static_cast<unsigned int>(1));
    __syncthreads();
}
the kernel is launched as follows
threads = 1024;
blocks = numElems/threads;
histogram_kernel_shared<<<blocks, threads>>>(d_vals, d_histo, numElems);
number of elements is 10,240,000
and number of Bins is 1024.
What is bugging me is why should the assignment d_histo[lid] = local_bin[lid]; make difference here. Code runs fine without it. But nothing should change by that assignemtn since I just copied the value as local_bin[lid] = d_histo[lid]; and even more why does  local_bin[lid] = d_histo[lid]; gives garbage values as well?   
My guess is that something else is wrong somewhere else giving some odd kind of UB but Where?
Thanks for the help.
 
    