I'm trying to write a simple wrapper class to move data to/from device memory, but I keep getting "invalid argument" errors in my call to cudaMempcy when I try to copy data back from device to host.
This is my code:
#include <iostream>
#define gpu_chk(ans) { gpu_assert( ( ans ), __FILE__, __LINE__ ); }
inline void gpu_assert( cudaError_t code, const char *file, int line, bool abort=true ) {
   if ( code != cudaSuccess ) {
      fprintf( stderr,"GPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
      if( abort ) exit( code );
   }
}
class DevMatrix {
    int nrow;
    int ncol;
    double* dptr;
public:
    DevMatrix( int nrow, int ncol ) : nrow( nrow ), ncol( ncol ) {
        gpu_chk( cudaMalloc( (void**) &dptr, nrow * ncol * sizeof( double ) ) );
    }
    ~DevMatrix() {
        gpu_chk( cudaFree( dptr ) );
    }
    __host__ __device__ double* get() {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
        return dptr;
#else
        double* hptr;
        gpu_chk( cudaMemcpy( hptr, dptr, nrow * ncol * sizeof( double ), cudaMemcpyDeviceToHost ) );
        return hptr;
#endif
    }
};
__global__ void akernel( DevMatrix dm ) {
    int i = blockIdx.x;
    int j = threadIdx.x;
    int idx = ( gridDim.x * i ) + j;
    double* d = dm.get();
    d[idx] = -1;
}
#define ROWS 2
#define COLS 2
int main() {
    DevMatrix dm( ROWS, COLS );
    akernel<<<ROWS,COLS>>>( dm );
    double* hptr = dm.get();
    for( int i = 0; i < ROWS; i++ ) {
        for( int j = 0; j < COLS; j++ ) {
            int idx = ( i * ROWS ) + j;
            std::cout << hptr[idx] << std::endl;
        }
    }
    return 0;
}
Following answers to other "invalid argument" questions, I've tried different combinations like hptr, &hptr, etc.
Running the above in cuda-gdb, I can see that hptr and ptr have what I think to be the correct type, namely:
(cuda-gdb) p hptr
$1 = (double *) 0x7fffffffdd30
(cuda-gdb) p dptr
$2 = (double *) 0xb00a80000
But I keep getting the same error every time. What is wrong with the above code?
 
    