I have the following kernel
   __global__ void filter(unsigned char *image, unsigned char *out, int n, int m)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        int offset = x + y * blockDim.x * gridDim.x;
        int sumx, sumy, sumz, k, l;
        __shared__ float shared[16][16];
        shared[threadIdx.x][threadIdx.y] = image[offset];
            out[offset] = shared[threadIdx.x][threadIdx.y]; 
    }
which I am calling like filter<<<dimGrid, dimBlock>>>(dev_image, dev_out, n, m);. 
The strange thing is that even if I comment the call to the kernel and compile, the image remain the same. Any idea why this is happening? Isn't the memory on the gpu freed?
void Draw()
{
    unsigned char *image, *out;
    int n, m;
    unsigned char *dev_image, *dev_out;
    image = readppm("maskros512.ppm", &n, &m);
    out = (unsigned char*) malloc(n*m*3);
    printf("%d %d\n",n,m );
    cudaMalloc( (void**)&dev_image, n*m*3);
    cudaMalloc( (void**)&dev_out, n*m*3);
    cudaMemcpy( dev_image, image, n*m*3, cudaMemcpyHostToDevice);
    dim3 threads( 1, 256 );
    dim3 blocks( 32, 32 );
    filter<<<blocks, threads>>>(dev_image, dev_out, n, m);
    cudaMemcpy( out, dev_out, n*m*3, cudaMemcpyDeviceToHost );
    cudaFree(dev_image);
    cudaFree(dev_out);
    glClearColor( 0.0, 0.0, 0.0, 1.0 );
    glClear( GL_COLOR_BUFFER_BIT );
    glRasterPos2f(-1, -1);
    glDrawPixels( n, m, GL_RGB, GL_UNSIGNED_BYTE, image );
    glRasterPos2i(0, -1);
    glDrawPixels( n, m, GL_RGB, GL_UNSIGNED_BYTE, out );
    glFlush();
}
 
     
    