C/CUDA - Help needed to write a program to store images in a buffer

Question

I am a new to CUDA programming and I need help in writing a program to store images in a memory buffer. I tried modifying the code in the CUDA-OpenGL interop example, given in the CUDA-By Example book, to store 2 images one after another in a buffer. How should I write the program if I tried to avoid infinite loops but I am not sure if I succeeded? Any help in writing a correct program would be very much appreciated!

#include "book.h"
#include "cpu_bitmap.h"
#include "cuda.h"
#include <cuda_gl_interop.h>

PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
 if (code != cudaSuccess) 
{
   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) system ("pause");
}
}

#define     DIM    512

#define IMAGESIZE_MAX (DIM*DIM) 

GLuint  bufferObj;
cudaGraphicsResource *resource;

// based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png

__global__ void kernel( uchar4 *ptr1) 
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr1[offset].x = 0;
    ptr1[offset].y = green;
    ptr1[offset].z = 0;
    ptr1[offset].w = 255;    

}

__global__ void kernel2( uchar4 *ptr2) 
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );
    unsigned char orange = 1000; 
    // accessing uchar4 vs unsigned char*
    ptr2[offset].x = orange;
    ptr2[offset].y = green;
    ptr2[offset].z = 0;
    ptr2[offset].w = 255;

}

__global__ void copy ( uchar4 *pBuffer, uchar4 *Ptr )
{

   int x = threadIdx.x + blockIdx.x * blockDim.x;
   int y = threadIdx.y + blockIdx.y * blockDim.y;
   int idx = x + y * blockDim.x * gridDim.x ;
   while ( idx != DIM*DIM)
   {
    pBuffer[idx] = Ptr[idx] ;
    __syncthreads();

    }

}    

__global__ void copy2 ( uchar4 *pBuffer, uchar4 *Ptr2 )
{  
int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = x + y * blockDim.x * gridDim.x ;
    int bdx = idx;

    while ( (idx < DIM*DIM) && (bdx < DIM*DIM) )
    {
   uchar4 temp = Ptr2[bdx];
   __syncthreads();

   pBuffer[idx+4] = temp;
   __syncthreads();

   if ((idx==DIM*DIM) && (bdx==DIM*DIM))
    {
     break;
    }
    }  


}



void key_func( unsigned char key, int x, int y ) {
    switch (key) {
     case 27:
        // clean up OpenGL and CUDA
        ( cudaGraphicsUnregisterResource( resource ) );
        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
        glDeleteBuffers( 1, &bufferObj );
        exit(0);
    }
}

void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    }


    int main( int argc, char **argv ) {
    cudaDeviceProp  prop;
    int dev;

    (memset( &prop, 0, sizeof( cudaDeviceProp ) ));
    prop.major = 1;
    prop.minor = 0;
    HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide:  Interoperability with OpenGL
    //     requires that the CUDA device be specified by
    //     cudaGLSetGLDevice() before any other runtime calls.

    HANDLE_ERROR(  cudaGLSetGLDevice( dev ) );

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( DIM, DIM );
    glutCreateWindow( "bitmap" );

    glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers( 1, &bufferObj );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,
              NULL, GL_DYNAMIC_DRAW_ARB );

// REGISTER THE GL BufferObj and CUDA Resource

    HANDLE_ERROR(( cudaGraphicsGLRegisterBuffer( &resource, 
                                  bufferObj, 
                                  cudaGraphicsMapFlagsNone ) ));

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );


    uchar4* devPtr;
    size_t  size = DIM*DIM;
    size_t  sizet = 2*DIM*DIM;

    gpuErrchk(cudaMalloc ( (uchar4 **)&devPtr,  size)); 

    uchar4 *devPtr2; 

    gpuErrchk(cudaMalloc ( (uchar4 **)&devPtr2,  size)); 

uchar4 *pBuffer;

gpuErrchk(cudaMalloc ( (uchar4 **)&pBuffer,  size));

uchar4 *pBufferCurrent;

gpuErrchk(cudaMalloc ( (uchar4 **)&pBuffer,  size));


uchar4 *pBufferImage;
gpuErrchk(cudaMalloc ( (uchar4 **)&pBufferImage,  sizet));

    // REGISTER THE C BUFFER and CUDA Resource
    HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage,  
                                          &size, 
                                          resource) );

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);
    kernel<<<grids,threads>>>( devPtr );
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );

    kernel2<<<grids,threads>>>(devPtr2);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );    
    int a = 1;
do 
{


if (a==1)
{
copy<<< 512, 512>>>(pBufferImage, devPtr);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}

if(a==2)
{
copy2<<< 512, 512>>>(pBufferImage, devPtr2);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
a++;

} while (a<=2); 

HANDLE_ERROR ( cudaGraphicsUnmapResources( 1, &resource, NULL ) );

// set up GLUT and kick off main loop
glutKeyboardFunc( key_func );
glutDisplayFunc( draw_func );
glutMainLoop();

}

When you're having trouble with a cuda code, please instrument it for proper cuda error checking. Please study [this link](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api). Then make sure that *EVERY* time you use `cudaMalloc`, `cudaMemcpy`, or another API call, that you are properly checking for errors. It's OK to use your `HANDLE_ERROR` macro, but you need to use it on *EVERY* line where you use an API call. And you should also study how to get the error from a kernel launch (e.g. `copy<<< ... >>>` ) and check those. — Robert Crovella, Aug 01 '13 at 19:36
When pasting code into a question, please indent it appropriately. It makes it easier to read. It's also better if you are more specific with your question. What are you having trouble with specifically? From your perspective, what is not working? What is the expected behavior, and what is the behavior you are actually seeing? Answering these will help others to help you. — Robert Crovella, Aug 01 '13 at 19:38
This line near the end of the code that you have posted: `cudaGraphicsUnmapResources( 1, &resource, NULL ) );` does not compile. You had similar issues with your previous question. It seems like you're not being successful with copy/paste, or else you have this issue in your actual code. Have you actually tried to compile the code you have posted in this question? — Robert Crovella, Aug 01 '13 at 20:01
whenever you compile cuda code and see a warning like this: `warning: variable "size" is used before it's value is set` it frequently indicates a definite problem in your code. You should not ignore those warnings. — Robert Crovella, Aug 01 '13 at 20:04
@RobertCrovella I'll paste the code again. I am not sure why it is not compiling for you. It compiles for me but, when I included the error checking macros, I have errors on certain lines, especially with the `gpuErrchk( cudaDeviceSynchronize() );`. What I want to do with the code is to store two images, one green and one orange-ish, one after another in the buffer. I want to render the buffer, rotate and translate it such that I can see the two images like they are placed in a stack. Currently, I cannot see anything because the error checks point to three errors in the do-while loop. — Eagle, Aug 01 '13 at 20:21
@RobertCrovella Yeah, I didn't ignore them because the error check macros you suggested pointed me to them. So, I initialized them to `DIM*DIM*. Let me paste the code again. — Eagle, Aug 01 '13 at 20:24
I'm puzzled that you can't look at the code that is posted in this question and see that this line cannot compile: `cudaGraphicsUnmapResources( 1, &resource, NULL ) );` I'm also puzzled that you cannot look at the code you have posted and see that this line: ` cudaMalloc ( (uchar4 **)&devPtr, size);` has no cuda error checking. — Robert Crovella, Aug 01 '13 at 20:25
@RobertCrovella I did apply the `gpuErrchk` to the `cudamalloc` API calls. They are in the code that I edited and pasted. Also, the error check functions, when I run the program after compiling and linking, show errors in the do-while loop under `cudaDeviceSynchronize()` and `cudaPeekAtLastError()` — Eagle, Aug 01 '13 at 20:52
The program you started with (from the first section of chapter 8 in the cuda by example book) draws a particular green wavy line pattern. Are you saying that you want the green wavy line pattern, plus a similar orange pattern that is rotated by 90 degrees, to be displayed? — Robert Crovella, Aug 01 '13 at 21:53
@RobertCrovella Yeah, sort of. I don't want the orange image to be rotated by any degree; it should be generated as is. I just want to store both those images in a buffer and then render the buffer containing those two images in OpenGL. I will use OpenGL functions to rotate and translate the buffer. By doing so, I can visualize the buffer as a stack and add further images to the buffer (by changing the size of the buffer and storing the images). This is also what I want to do with OCT images. I am just using this example as a template. — Eagle, Aug 02 '13 at 00:12
Moreover, when I allocate `size_t size = DIM*DIM` in the example code and render the green image, the bottom half of the image is corrupted and it is not "green" all over. I am not sure why that is happening. If I remove the `DIM*DIM` part, then it renders correctly. — Eagle, Aug 02 '13 at 00:15

Robert Crovella · Accepted Answer · 2013-08-02T02:09:27.890

Here's some code I wrote that is a modification of the CUDA by examples code contained here which I believe is effectively what you started with. I used two kernels, just as you have, to generate either a green or an orange image. It will initially start with the green image displayed, but you can toggle between green and orange images using the space bar. ESC key will exit the app.

#include "book.h"
#include "cpu_bitmap.h"

//#include "cuda.h"
#include <cuda_gl_interop.h>

int which_image;
PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

#define     DIM    512

GLuint  bufferObj;
cudaGraphicsResource *resource;

dim3    mgrids(DIM/16,DIM/16);
dim3    mthreads(16,16);

// based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
__global__ void kernel_gr( uchar4 *ptr ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 *
                            sin( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr[offset].x = 0;
    ptr[offset].y = green;
    ptr[offset].z = 0;
    ptr[offset].w = 255;
}

__global__ void kernel_or( uchar4 *ptr ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   orange = 128 + 127 *
                            sin( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr[offset].x = orange;
    ptr[offset].y = orange/2;
    ptr[offset].z = 0;
    ptr[offset].w = 255;
}

static void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
}

static void key_func( unsigned char key, int x, int y ) {
    switch (key) {
        case 32:
    // do work with the memory dst being on the GPU, gotten via mapping

            HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
            uchar4* devPtr;
            size_t  size;
            HANDLE_ERROR(
              cudaGraphicsResourceGetMappedPointer( (void**)&devPtr,
                                              &size,
                                              resource) );

            if (which_image == 1){
              kernel_or<<<mgrids,mthreads>>>( devPtr );
              HANDLE_ERROR(cudaPeekAtLastError());
              HANDLE_ERROR(cudaDeviceSynchronize());
              printf("orange\n");
              which_image = 2;
              }
            else {
              kernel_gr<<<mgrids,mthreads>>>( devPtr );
              HANDLE_ERROR(cudaPeekAtLastError());
              HANDLE_ERROR(cudaDeviceSynchronize());
              printf("green\n");
              which_image = 1;
              }

            HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );
            draw_func();
            break;
        case 27:
            // clean up OpenGL and CUDA
            HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
            glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
            glDeleteBuffers( 1, &bufferObj );
            exit(0);
    }
}



int main( int argc, char **argv ) {
    cudaDeviceProp  prop;
    int dev;

    memset( &prop, 0, sizeof( cudaDeviceProp ) );
    prop.major = 1;
    prop.minor = 0;
    HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide:  Interoperability with OpenGL
    //     requires that the CUDA device be specified by
    //     cudaGLSetGLDevice() before any other runtime calls.

    HANDLE_ERROR( cudaGLSetGLDevice( dev ) );

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( DIM, DIM );
    glutCreateWindow( "bitmap" );

    glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers( 1, &bufferObj );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4,
                  NULL, GL_DYNAMIC_DRAW_ARB );

    HANDLE_ERROR(
        cudaGraphicsGLRegisterBuffer( &resource,
                                      bufferObj,
                                      cudaGraphicsMapFlagsNone ) );

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
    uchar4* devPtr;
    size_t  size;
    HANDLE_ERROR(
        cudaGraphicsResourceGetMappedPointer( (void**)&devPtr,
                                              &size,
                                              resource) );

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);
    kernel_gr<<<grids,threads>>>( devPtr );
    HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );
    which_image = 1;
    // set up GLUT and kick off main loop
    glutKeyboardFunc( key_func );
    glutDisplayFunc( draw_func );
    glutMainLoop();
}

Not sure if it will be useful, I'm still not understanding what you want to accomplish entirely. I don't really know what this means:

I just want to store both those images in a buffer and then render the buffer containing those two images in OpenGL.

You want to be able to see one image at a time, and switch images? Or you want to be able to see both images at the same time? If the latter, please explain. Do you want one at the top of the window and one at the bottom of the window? Both of them blended together?

EDIT: It seems to me that you may be wanting some sort of 3D visualization of multiple images, since the question and answer with you about what you want hasn't been productive (at least I still can't get a handle on what you want to see VISUALLY, ignoring what goes on under the hood.) You haven't tagged this question with OpenGL, so no OpenGL experts are looking at it. Furthermore, you've made statements like: "I will use OpenGL functions to rotate and translate the buffer. " If what you're trying to do is create a 3D visualization of a set of images that a user can interact with, this is not the sample code you want to start with. This is a basic 2D image display code. Trying to expand the buffer to hold multiple images is the least of your difficulties in creating some sort of 3D visualization in OpenGL. And you will not get to some kind of 3D multi-image display using this sample code.

I suspect that the CUDA-OpenGL interop portion of what you're trying to do is not difficult. I've shown with the example program how you can get 2 different images, generated by 2 different kernels, displayed under user control. So the problem of how to take an image from CUDA and display it, or get it into a buffer that can be displayed, I think is pretty well illustrated.

My suggestion is this: Leave the CUDA-OpenGL interop portion aside. Write an OpenGL program that does what you want, with arbitrary images (generate them however you like, no need to use CUDA.) If you need help with that, pose questions on SO, and tag them with OpenGL so that people who will know how to do it can help you. Then, when you have a prototype of what you want to display visually, you can inject the CUDA portion. And I suspect that part will be pretty simple at that point.

Thank you very much for the code! I will test it out on my system. What I meant by storing both images was this: Imagine the buffer as a bookshelf of 10 rows which can store an image one by one in each of the 10 rows. These 10 images should be stored one after the other in the buffer. When you render this buffer, you will see these 10 images placed one below the other in an orderly fashion. By, rotating and translating the buffer, we can see any of the 10 images placed in them and pull out any one of the images from the buffer. — Eagle, Aug 02 '13 at 01:45
I realized something when I was looking through the code. I don't think the OpenGL API calls support memory allocated on the device by CUDA. My guess is that we can pass data from the host to device memory, do any necessary computation in CUDA and transfer it back to the host. Rendering must only be supported in host memory by OpenGL. This is probably the reason why they never used `cudaMalloc` for `uchar4 *devPtr` in the example code. — Eagle, Aug 02 '13 at 16:24

C/CUDA - Help needed to write a program to store images in a buffer

1 Answers1