I've written a piece of code to call the kernel in the book GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
However the results that I get are a bunch of negative numbers instead of prefix scan.
Is my kernel call wrong or is there something wrong with the code from the GPU Gems 3 book?
Here is my code:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int  *g_idata, int n, int dim)
{
    extern __shared__ int temp[];// allocated on invocation
    int thid = threadIdx.x;
    int offset = 1;
    temp[2*thid] = g_idata[2*thid]; // load input into shared memory
    temp[2*thid+1] = g_idata[2*thid+1];
    for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
    {
    __syncthreads();
    if (thid < d)
    {
    int ai = offset*(2*thid+1)-1;
    int bi = offset*(2*thid+2)-1;
    temp[bi] += g_idata[ai];
    }
    offset *= 2;
    }
    if (thid == 0) { temp[n - 1] = 0; } // clear the last element
    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
    int ai = offset*(2*thid+1)-1;
    int bi = offset*(2*thid+2)-1;
    int t = temp[ai];
    temp[ai] = temp[bi];
    temp[bi] += t;
    }
    }
    __syncthreads();
    g_odata[2*thid] = temp[2*thid]; // write results to device memory
    g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int  *h_in,int num_items)
{
    int j;
    for(j=0;j<num_items;j++)
        h_in[j]=j;
        printf(" input: ");
            printf("\n\n");
}
int main(int argc, char** argv)
{
    int num_items = 512;
    int*  h_in = new int[num_items];
    // Initialize problem 
    Initialize(h_in, num_items);
    int *d_in = NULL;
    cudaMalloc((void**)&d_in, sizeof(int) * num_items);
    if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
    // Allocate device output array
    int *d_out = NULL;
    cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
    kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
    int* h_out= new int[num_items+1];
    if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
    int i;
    printf(" \n");
    for(i=0;i<num_items;i++)
    printf(" ,%d ",h_out[i]);
    // Cleanup
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (d_in) cudaFree(d_in);
    if (d_out) cudaFree(d_out);
    printf("\n\n");
    return 0;
}
 
     
     
    