C/CUDA - Modifying CUDA/GL interop example to store image in a memory buffer

Question

I am trying to store the image, which is generated by the CUDA-OpenGL interop example in the 'CUDA-By Example' textbook, into a memory buffer that can store to images.

I want to store two images, one which is a green "X" and another which is an orangish "X", in a memory buffer. When I render the pBuffer with OpenGL, I should get a green "X" image like the example output, however, I just get a black screen. I am not sure why I am not getting the right output. Could someone please tell me what's wrong?

I obtained the code for the memory buffer from A Memory buffer for multiple images

    #include "book.h"
    #include "cpu_bitmap.h"
    #include "cuda.h"
    #include <cuda_gl_interop.h>

    PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
    PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
    PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
    PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

    #define     DIM    512

    #define IMAGESIZE_MAX (DIM*DIM)  // MY CHANGE

    GLuint  bufferObj;
    cudaGraphicsResource *resource;

    // based on ripple code, but uses uchar4 which is the type of data
    // graphic inter op uses. see screenshot - basic2.png

    __global__ void kernel( uchar4 *ptr1)   
    {
   // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr1[offset].x = 0;
    ptr1[offset].y = green;
    ptr1[offset].z = 0;
    ptr1[offset].w = 255;

    }

    // MY CODE

    __global__ void kernel2( uchar4 *ptr2) 
    {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr2[offset].x = 1000;
    ptr2[offset].y = green;
    ptr2[offset].z = 0;
    ptr2[offset].w = 255;
    }

    __global__ void copy ( uchar4 *pBuffer, uchar4 *Ptr, uchar4 *Ptr2, size_t size, int a )
    {

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = x + y * blockDim.x * gridDim.x ;
    int bdx = idx;

    if (a==1)
    {
    while ( idx < DIM*DIM)
    {
    pBuffer[idx] = Ptr[idx] ;
    __syncthreads();
    if (idx==DIM*DIM)
    {
     break;
    }
    }
    }


    if (a==2)
    {
    while ( (idx < DIM*DIM) && (bdx < DIM*DIM) )
    {
    uchar4 temp = Ptr2[bdx];
    __syncthreads();

    pBuffer[idx+4] = temp;
    __syncthreads();

    if ((idx==DIM*DIM) && (bdx==DIM*DIM))
    {
    break;
    }
    }  
    }

    }


    void key_func( unsigned char key, int x, int y ) 
    {
    switch (key) 
    {
    case 27:
        // clean up OpenGL and CUDA
        ( cudaGraphicsUnregisterResource( resource ) );
        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
        glDeleteBuffers( 1, &bufferObj );
        exit(0);
    }
    }

    void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    }


    int main( int argc, char **argv ) 
    {
    cudaDeviceProp  prop;
    int dev;

    memset( &prop, 0, sizeof( cudaDeviceProp ) );
    prop.major = 1;
    prop.minor = 0;
    ( cudaChooseDevice( &dev, &prop ) );

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide:  Interoperability with OpenGL
    //     requires that the CUDA device be specified by
    //     cudaGLSetGLDevice() before any other runtime calls.

    ( cudaGLSetGLDevice( dev ) );

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( DIM, DIM );
    glutCreateWindow( "bitmap" );

    glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers( 1, &bufferObj );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,NULL, GL_DYNAMIC_DRAW_ARB );

    // REGISTER THE GL BufferObj and CUDA Resource
    ( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );

    // MY MODIFIED CODE

    uchar4 *devPtr; 
    size_t size;
    size_t sizeTotal = 0;
    cudaMalloc ( (uchar4 **)&devPtr,  size);

    uchar4 *devPtr2; 

    cudaMalloc ( (uchar4 **)&devPtr2,  size); 

    uchar4 *pBuffer;

    (cudaMalloc ( (uchar4 **)&pBuffer,  size));

    uchar4 *pBufferCurrent;

    (cudaMalloc ( (uchar4 **)&pBufferCurrent,  size));

    uchar4 *pBufferImage;

    (cudaMalloc ( (uchar4 **)&pBufferImage,  size));

    // REGISTER THE C BUFFER and CUDA Resource

    HANDLE_ERROR( 
    cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage, 
                                          &size, 
                                          resource) );

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);


    kernel<<<grids,threads>>>(devPtr);
    kernel2<<<grids,threads>>>(devPtr2);

    int a = 1;
    do 
    {

    if (a==1)
    {
    copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);  
    }

    if(a==2)
    {
    copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
    }

    a++;

    } while (a<=2); 

    cudaGraphicsUnmapResources( 1, &resource, NULL ) );

    // set up GLUT and kick off main loop
    glutKeyboardFunc( key_func );
    glutDisplayFunc( draw_func );
    glutMainLoop();
   }

score 2 · Answer 1 · edited May 23 '17 at 10:31

Start by doing proper cuda error checking on all your cuda API calls (e.g. cudaMemcpy, etc.) and kernel calls.

When you do that you'll discover that your kernels are not running successfully. These types of things won't work:

uchar4 *devPtr;   // you've just created an unallocated NULL host pointer
size_t img1_size = IMAGESIZE_MAX;

kernel<<<grids,threads>>>(devPtr);  // this kernel will fail


uchar4 *devPtr2;   // you've just created an unallocated NULL host pointer
size_t img2_size = IMAGESIZE_MAX;

kernel2<<<grids,threads>>>(devPtr2);  // this kernel will fail

devPtr and devPtr2 in the above code are NULL pointers. You haven't allocated any storage associated with them. Furthermore, since you are passing them to device kernels, they need to be allocated with cudaMalloc or similar API function, in order for the pointers to be usable in device code.

Since they are not allocated with cudaMalloc, as soon as you try to dereference those pointers in device code, you'll create a kernel fault. This will be evident if you do error checking, as you will have "unspecified launch failure" or similar report from those kernels.

I think there are probably a number of other problems in your code, but first you should do proper cuda error checking and at least get your code to the point where everything you've written is, in fact, running.

And the code you've posted doesn't actually compile.

After fixing the compile errors I also discovered that you have another infinite loop:

cudaMalloc ( (uchar4 **)&pBufferCurrent,  sizeTotal + sizeof(size) + size); 
cudaMalloc ( (uchar4 **)&pBuffer,  sizeTotal + sizeof(size) + size);

do 
{

if (!pBufferCurrent)
{
break;
}

pBuffer = pBufferCurrent;

pBufferCurrent += sizeTotal;

imageget ( pBufferCurrent + sizeof(size), size, devPtr);

sizeTotal += (sizeof(size) + size);

} while (a==1);

Since a is initialized to 1 in your loop, and nothing in the loop modifies a, the loop will never exit based on the while condition. Since pBufferCurrent is also never zero if it's been properly set up by cudaMalloc, the break will never be taken.

If you malloc or cudaMalloc a pointer called pBufferCurrent, it's hard for me to imagine under what circumstances this would ever make sense:

pBufferCurrent += sizeTotal;

and although this is legal, I don't see how this makes sense:

pBuffer = pBufferCurrent;

You just created an allocation for pBuffer using cudaMalloc, but the first thing you do is throw it away?

I allocated memory for the two pointers devptr and devptr2 using cudaMalloc and I used the "HANDLE_ERROR" CUDA error checking function that came along with the example code. The code actually compiles for me but I am not sure why the code is not compiling for you? — Eagle, Aug 01 '13 at 01:29
I don't see any statements in the code you've posted that are using `cudaMalloc` to allocate the pointers `devptr` and `devptr2`. Are you looking at the code you've posted in this question? Regarding compilation, the reason is because the code you're compiling is not what you've posted in this question. Look at the code *in the question*, specifically at the last call to `cudaGraphicsResourceGetMappedPointer`. Does that line contain a semicolon terminator? It does not. It will not compile correctly. — Robert Crovella, Aug 01 '13 at 01:33
I'm sorry, I had forgotten to add an a++ statement in the do-while loop. I am editing the code that I pasted now. I am trying my best to write a program for a memory buffer that can store images coming from an OCT system. I realize that I am a beginner in C/CUDA programming and I am trying to learn as fast as I can. It would be great if you could help me write a memory buffer program that is correct. I am stumped here. Every program I write either has an infinite loop to it or some other problem — Eagle, Aug 01 '13 at 03:25
So, I made a few edits and added a few more lines of code especially in the do-while loop and the kernel, according to what you told me earlier. I looked to see if there were any points in the code where an infinite loop could be the problem and I couldn't find any. Does this piece of code look all right to you? — Eagle, Aug 01 '13 at 18:57
SO is not a chat or dialog site. It is a Question/Answer site. The problem with continually editing this question, posing new questions, with wholesale code modifications, is that: 1. my previous answer becomes irrelevant. 2. The question is useless for future visitors, because it is internally inconsistent (my previous answer does not match your "new" question/code), and thus incomprehensible. Furthermore, as the question already appears to be answered, others may not even bother to look at it, so you're less likely to get help. The correct methodology is to post a new question. — Robert Crovella, Aug 01 '13 at 19:12

C/CUDA - Modifying CUDA/GL interop example to store image in a memory buffer

1 Answers1