cudaMemcpy error when copying from device to host after device class member function alters value of device variable

Question

I am confused as to the behavior of the CUDA code I have written. I am in the midst of writing tests for my __device__ functions in a class called DimmedGridGPU. This class is templated on an int DIM and the function I have trouble with is meant to return the value of the grid at the point nearest the input value, x. I have this kernel namespace for unit testing purposes, to call each __device__ function in isolation.

The desired behavior of this code would be to return the value 3.0 from the do_get_value(x, grid_) call, and set d_target[0] to this value, then transfer it back to the host side for unit test assertions. The whole of the kernel seems to function properly, but when I do the final transfer back to the host side, I receive a cudaErrorInvalidValue error, and I do not understand why.

Here is a minimal example of the code, preserving the structure of the class and its features:

#include <cuda_runtime.h>
#include <fstream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
     fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}


template <int DIM>
class DimmedGridGPU{

public:
  size_t grid_size_;//total size of grid
  int b_derivatives_;//if derivatives are going to be used
  int b_interpolate_;//if interpolation should be used on the grid
  double* grid_;//the grid values
  double* grid_deriv_;//derivatives    
  double dx_[DIM];//grid spacing
  double min_[DIM];//grid minimum
  double max_[DIM];//maximum
  int grid_number_[DIM];//number of points on grid
  int b_periodic_[DIM];//if a dimension is periodic
  int* d_b_interpolate_;
  int* d_b_derivatives_;


  DimmedGridGPU(const double* min, 
        const double* max, 
        const double* bin_spacing, 
        const int* b_periodic, 
        int b_derivatives, 
        int b_interpolate) :   b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){
    
    size_t i;

    for(i = 0; i < DIM; i++) {
      min_[i] = min[i];
      max_[i] = max[i];
      b_periodic_[i] = b_periodic[i];

      grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
      dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
      //add one to grid points if 
      grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
      //increment dx to compensate
      if(!b_periodic_[i])
    max_[i] += dx_[i];
    }

    grid_size_ = 1;
    for(i = 0; i < DIM; i++)
      grid_size_ *= grid_number_[i];
    gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
    if(b_derivatives_) {
      gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
      if(!grid_deriv_) {
    printf("Out of memory!! gpugrid.cuh:initialize");   
      }
    }
    
    gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
  }

  ~DimmedGridGPU(){
    gpuErrchk(cudaDeviceSynchronize());
    if(grid_ != NULL){
      gpuErrchk(cudaFree(grid_));
      grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
    }
    
    if(grid_deriv_ != NULL){
      gpuErrchk(cudaFree(grid_deriv_));
      grid_deriv_ = NULL;
    }
      
    gpuErrchk(cudaDeviceReset());
  }
//gets the value of the grid closest to x
  __host__ __device__ double do_get_value( double* x, double* grid_) {

    size_t index[DIM];
    get_index(x, index);
    printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
    printf("but multi2one(index) gives us %d\n", multi2one(index));
    double value = grid_[multi2one(index)];
    printf("and value to be returned is %f\n", value);
    return value;
  }
//gets grid's 1D index from an array of coordinates
   __host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
    size_t i;
    double xi;
    printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
    for(i = 0; i < DIM; i++) {
      xi = x[i];
      printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
      if(b_periodic_[i]){
    xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
      }
      result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
    }
  }
//takes a multidimensional index to a 1D index
  __host__ __device__ size_t multi2one(const size_t index[DIM]) const {
    size_t result = index[DIM-1];

    size_t i;    
    for(i = DIM - 1; i > 0; i--) {
      result = result * grid_number_[i-1] + index[i-1];
    }
    
    return result;
    
  }

};

__host__ __device__ int gpu_int_floor(double number) {
  return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}


namespace kernels{
  template <int DIM>
  __global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM>  g){
    target_arr[0] = g.do_get_value(x, grid_);
    printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
    return;
  }
}


int main(){
  using namespace kernels;
  double min[] = {0};
  double max[] = {10};
  double bin_spacing[] = {1};
  int periodic[] = {0};
  DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
  for(int i = 0; i < 11; i++){
    g.grid_[i] = i;
    printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
  }
  gpuErrchk(cudaDeviceSynchronize());
  double x[] = {3.5};
  
  double* d_x;
  gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
  double target[] = {5.0};
  double* d_target;
  gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
  gpuErrchk(cudaDeviceSynchronize());
  get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
  gpuErrchk(cudaDeviceSynchronize());
  gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
  printf("and after GPU stuff, target[0] is now %f\n", target[0]);
  return(0);
}

So, why does this line (the last cudaMemcpy) throw an error "CudaErrorInvalidValue", when the print statements I have included clearly demonstrate that the correct values are being used on the device, and the value returned by the do_get_value(x, grid_) call is correct?

I have already tried using cudaMemcpyFromSymbol, thinking that perhaps the assignment was creating a symbol instead of passing and changing a value somehow, but that is not the case, as d_target is not a valid symbol.

Here is sample output from my code:

g.grid_[0] is now 0.000000

g.grid_[1] is now 1.000000

g.grid_[2] is now 2.000000

g.grid_[3] is now 3.000000

g.grid_[4] is now 4.000000

g.grid_[5] is now 5.000000

g.grid_[6] is now 6.000000

g.grid_[7] is now 7.000000

g.grid_[8] is now 8.000000

g.grid_[9] is now 9.000000

g.grid_[10] is now 10.000000

get_index was called on the GPU in 1 dimension(s)

xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000

do_get_value was called on the GPU!, and index[0] is now 3

but multi2one(index) gives us 3

and value to be returned is 3.000000

get_value_kernel has set target[0] to be 3.000000

GPUassert: "cudaErrorInvalidValue": invalid argument gpugrid.cu 166

Robert Crovella · Accepted Answer · 2018-10-13T21:39:53.317

So, why does this line (the last cudaMemcpy) throw an error "CudaErrorInvalidValue"...?

The problem revolves around your destructor:

  ~DimmedGridGPU(){

The destructor is getting called in places you probably aren't expecting. To convince yourself of this, add a printf statement to the destructor. Note where it appears in the printout:

$ ./t955
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
Destructor!
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument t955.cu 167

Given that, it should be pretty evident that calling cudaDeviceReset() in that destructor now seems like a bad idea. The cudaDeviceReset() wipes out all device allocations, so then when you attempt to do this:

gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));

d_target is no longer a valid allocation on the device, so when you attempt to use it as the device target for cudaMemcpy, the runtime checks this pointer value (which is not changed by the device reset) and determines that the pointer value no longer corresponds to a valid allocation, and throws a runtime error.

Just like in C++ when you pass an object to a function (or a kernel in this case) as a pass-by-value parameter, the copy constructor for that object gets called. It stands to reason when that object copy goes out of scope, the destructor for it will be called.

I would suggest that putting such global-scope affecting functions as cudaDeviceReset() in an object destructor might be a fragile programming paradigm, but that is perhaps a matter of opinion. I assume you now have enough information to go about fixing the issue.

To avoid the next possible question, simply commenting out that call to cudaDeviceReset() in your destructor may not be sufficient to make all problems disappear (although this particular one will). Now that you know that this destructor is being called at least twice in the ordinary execution of this program, you may want to think carefully about what else is going on in that destructor, and perhaps strip more things out of it, or else rearchitect your class altogether.

For example, note that cudaDeviceReset() is not the only function that can cause trouble in a destructor for objects used this way. Similarly, cudaFree() may have unintended consequences on the original object, when used in a destructor called on the object-copy.

Thanks! I definitely should have been more careful with my destructor. I will be able to move forward with this now. I appreciate the help. — R. Barrett, Mar 17 '17 at 14:12

cudaMemcpy error when copying from device to host after device class member function alters value of device variable

1 Answers1

Linked

Related

cudaMemcpy error when copying from device to host after __device__ class member function alters value of device variable

1 Answers1

Linked

Related

cudaMemcpy error when copying from device to host after device class member function alters value of device variable