This is my first attempt at a CUDA program. Here is what it is supposed to do:
1. Receive a 1D Pixel array from host memory.
2. Each Pixel is processed by one thread: this is thread-safe because only "val" is read and only "newval" is written. Wait for sync.
3. Each Pixel is processed by one thread: copy "newval" back into "val".
4. Write the array back to host memory.
5. Repeat steps 2-4 for several different frames.
What happens, however, is that only a handful of the roughly 32,000 elements in the output arrays have sensible values at all; the rest are zero.
I've removed the calculations for brevity.
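In case it helps, here is a cut-down version of the Pixel struct with just the fields the kernels use (I'm treating both as plain floats; anything else isn't relevant here):
// Cut-down Pixel: only the fields the kernels below touch
struct Pixel
{
    float val;    // current value, read by kernel()
    float newval; // next value, written by kernel() and copied back by copykernel()
};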
__global__ void kernel(Pixel *array, float dt)
{
    const unsigned int tid = threadIdx.x;
    Pixel *point = array + tid;
    //DO A BUNCH OF CALCULATIONS ON PIXEL KIND OF LIKE THIS
    point->newval = point->val + foo;
}
__global__ void copykernel(Pixel *array)
{
    const unsigned int tid = threadIdx.x;
    Pixel *point = array + tid;
    //COPY THE NEWVALS OVER TO THE OLD VALS IN PREPARATION FOR THE NEXT FRAME
    point->val = point->newval;
}
extern "C" bool runIt(const int argc, const char **argv, Pixel *inarray, Pixel **outarrays, int arraysize, int numframes, float dt)
{
    int memsize = arraysize*sizeof(Pixel);
    int i=0;
    Pixel *array;
    cudaMalloc((void **) &array, memsize);                       //DEVICE COPY OF THE PIXEL ARRAY
    cudaMemcpy(array, inarray, memsize, cudaMemcpyHostToDevice); //UPLOAD THE INITIAL FRAME
    int numthreads = arraysize;
    dim3 grid(1,1,1);                                            //ONE BLOCK
    dim3 threads(numthreads,1,1);                                //ONE THREAD PER PIXEL
    for(i=0;i<numframes;i++)
    {
        kernel<<<grid, threads>>>((Pixel *) array, dt);          //CALCULATION PASS
        cudaThreadSynchronize();
        copykernel<<<grid, threads>>>((Pixel *) array);          //COPY NEWVAL INTO VAL FOR THE NEXT FRAME
        cudaThreadSynchronize();
        cudaMemcpy(array, outarrays[i], memsize, cudaMemcpyDeviceToHost);
    }
    cudaFree(array);
    return true;
}
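In case the problem is actually on the host side, this is roughly the shape of the code that calls runIt (a simplified paraphrase, not the literal code; the constants are just example values, though arraysize really is around 32000):
// Simplified paraphrase of the call site.
// Pixel and runIt are declared/defined as shown above.
int main(int argc, char **argv)
{
    int arraysize = 32000;                      // roughly the real size
    int numframes = 4;                          // "several frames"
    float dt = 0.1f;                            // example timestep

    Pixel *inarray = new Pixel[arraysize];      // filled with initial values elsewhere
    Pixel **outarrays = new Pixel*[numframes];  // one result buffer per frame
    for (int i = 0; i < numframes; i++)
        outarrays[i] = new Pixel[arraysize];

    runIt(argc, (const char **)argv, inarray, outarrays, arraysize, numframes, dt);

    // ... inspect outarrays, then clean up ...
    return 0;
}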
I have a suspicion that I'm setting up the parameters for the device incorrectly, or else I'm getting one of the device-specific keywords wrong or forgetting a crucial step. Does anything jump out at you?