What is a correct way to implement memcpy inside a CUDA kernel?

Question

I am implementing a PDE solver (Lax-Friedrichs) in CUDA that I previously wrote in C. Please find the C code below:

void solve(int M, double u[M+3][M+3], double unp1[M+3][M+3], double params[3]){
 int i;
 int j;
 int n;


 for (n=0; n<params[0]; n++){
        for (i=0; i<M+2; i++)
           for(j=0; j<M+2; j++){
              unp1[i][j] = 0.25*(u[i+1][j] + u[i-1][j] + u[i][j+1] + u[i][j-1])
                 - params[1]*(u[i+1][j] - u[i-1][j])
                 - params[2]*(u[i][j+1] - u[i][j-1]);
           }
 
        memcpy(u, unp1, pow(M+3,2)*sizeof(double));
 
 
        /*Periodic Boundary Conditions*/
        for (i=0; i<M+2; i++){
           u[0][i] = u[N+1][i];
           u[i][0] = u[i][N+1];
           u[N+2][i] = u[1][i];
           u[i][N+2] = u[i][1];
        }
     }
  }

And it works fine. But when I am trying to implement it in CUDA I do not get the same data. Unfortunately I cannot exactly pinpoint the exact problem since I am a totally beginner to the whole parallel programming thing, but I think it might have to do with the u[i*(N+3) + j] = unp1[i*(N+3) + j] on the solver, since I cannot really perform a memcpy inside the kernel, because it doesn't change anything, I don't know how to proceed. I took a look at This previous answer, but it unfortunately couldn't help solving my problem. Here is the solver in CUDA I am trying to code:

#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>

/*Configuration of the grid*/
const int N = 100; //Number of nodes  
const double xmin = -1.0;
const double ymin = -1.0;
const double xmax = 1.0;
const double ymax = 1.0;
const double tmax = 0.5;

/*Configuration of the simulation physics*/
const double dx = (xmax - xmin)/N;
const double dy = (ymax - ymin)/N;
const double dt = 0.009;
const double vx = 1.0;
const double vy = 1.0;


__global__ void initializeDomain(double *x, double *y){
    /*Initializes the grid of size (N+3)x(N+3) to better accomodate Boundary Conditions*/
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int j=index ; j<N+3; j+=stride){
        x[j] = xmin + (j-1)*dx;
        y[j] = ymin + (j-1)*dy;
    }
}


__global__ void initializeU(double *x, double *y, double *u0){
    
    double sigma_x = 2.0;
    double sigma_y = 6.0;
    
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    
    for (int i = index_x; i < N+3; i += stride_x)
        for (int j = index_y; j < N+3; j+= stride_y){
            u0[i*(N+3) + j] = exp(-200*(pow(x[i],2)/(2*pow(sigma_x,2)) + pow(y[j],2)/(2*pow(sigma_y,2))));
            u0[i*(N+3) + j] *= 1/(2*M_PI*sigma_x*sigma_y);
               //u[i*(N+3) + j] = u0[i*(N+3) + j];
               //unp1[i*(N+3) + j] = u0[i*(N+3) + j];
    }
}


void initializeParams(double params[3]){
    params[0] = round(tmax/dt);
    params[1] = vx*dt/(2*dx);
    params[2] = vy*dt/(2*dy);
}


__global__ void solve(double *u, double *unp1, double params[3]){

     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
     int stride_x = blockDim.x * gridDim.x;
     int index_y = blockIdx.y * blockDim.y + threadIdx.y;
     int stride_y = blockDim.y * gridDim.y; 

     for (int i = index_x; i < N+2; i += stride_x)
          for (int j = index_y; j < N+2; j += stride_y){
               unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
               - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
               - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
          }

}


__global__ void bc(double *u){
     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
     int stride_x = blockDim.x * gridDim.x;


     /*Also BC are set on parallel */
     for (int i = index_x; i < N+2; i += stride_x){
          u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
          u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
          u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
          u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
     }
}


int main(){
     int i;
     int j;

     double *x = (double *)malloc((N+3)*sizeof(double));
     double *y = (double *)malloc((N+3)*sizeof(double));

     double *d_x, *d_y;
     cudaMalloc(&d_x, (N+3)*sizeof(double));
     cudaMalloc(&d_y, (N+3)*sizeof(double));

     initializeDomain<<<1, 1>>>(d_x, d_y);
     cudaDeviceSynchronize();

     cudaMemcpy(x, d_x, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(y, d_y, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);

     FILE *fout1 = fopen("data_x.csv", "w");
    FILE *fout2 = fopen("data_y.csv", "w");

    for (i=0; i<N+3; i++){
        if (i==N+2){
            fprintf(fout1, "%.5f", x[i]);
            fprintf(fout2, "%.5f", y[i]);
        }
        else{
            fprintf(fout1, "%.5f, ", x[i]);
            fprintf(fout2, "%.5f, ", y[i]);
        }
    }


     dim3 Block2D(1,1);
     dim3 ThreadsPerBlock(1,1);

     double *d_u0;
     double *u0 = (double *)malloc((N+3)*(N+3)*sizeof(double));
     cudaMalloc(&d_u0, (N+3)*(N+3)*sizeof(double));

     initializeU<<<Block2D, ThreadsPerBlock>>>(d_x, d_y, d_u0);
     cudaDeviceSynchronize();
     cudaMemcpy(u0, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);

     /*Initialize parameters*/
     double params[3];
     initializeParams(params);

     /*Allocate memory for u and unp1 on device for the solver*/
     double *d_u, *d_unp1;
     cudaMalloc(&d_u, (N+3)*(N+3)*sizeof(double));
     cudaMalloc(&d_unp1, (N+3)*(N+3)*sizeof(double));
     cudaMemcpy(d_u, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
     cudaMemcpy(d_unp1, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
     
     /*Solve*/
     for (int n=0; n<params[0]; n++){
          solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
          double *temp = d_u;
          d_u = d_unp1;
          d_unp1 = temp;
          bc<<<1,1>>>(d_u);
          cudaDeviceSynchronize();
     }

     /*Copy results on host*/
     double *u = (double *)malloc((N+3)*(N+3)*sizeof(double));
     cudaMemcpy(u, d_u, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);

     FILE *fu = fopen("data_u.csv", "w");
    for (i=0; i<N+3; i++){
          for(j=0; j<N+3; j++)
            if (j==N+2)
                fprintf(fu, "%.5f", u[i*(N+3) + j]);
            else
                fprintf(fu, "%.5f, ", u[i*(N+3) + j]);
        fprintf(fu, "\n");
    }
     fclose(fu);

     free(x);
     free(y);
     free(u0);
     free(u);
     cudaFree(d_x);
     cudaFree(d_y);
     cudaFree(d_u0);
     cudaFree(d_u);
     cudaFree(d_unp1);

     return 0;
}

I unfortunately keep having the same issue: The data I get is 0.0000.

Robert Crovella · Accepted Answer · 2021-06-16T14:45:23.323

One thing that is tripping you up is that your original algorithm has an ordering that is required for correctness:

Update unp from u
copy unp to u
enforce boundary condition
repeat

Your algorithm requires that step 1 be completed entirely before step 2 begins, and likewise for step 2 before step 3. Your CUDA realization (putting steps 1 and 3, or 1,2,3) in a single kernel does not preserve or guarantee that ordering. CUDA threads can execute in any order. If you apply that rigorously to your code (for example, imagine that thread with index 0 executes completely before any other thread begins. That would be valid CUDA execution) then you will see that your kernel design does not preserve the ordering required.

So do something like this:

Create a solve kernel that is just the first step:

 __global__ void solve(double *u, double *unp1, double params[3]){

      int index_x = blockIdx.x * blockDim.x + threadIdx.x;
      int stride_x = blockDim.x * gridDim.x;
      int index_y = blockIdx.y * blockDim.y + threadIdx.y;
      int stride_y = blockDim.y * gridDim.y;

      for (int i = index_x; i < N+2; i += stride_x)
          for (int j = index_y; j < N+2; j += stride_y){
               unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
               - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
               - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
               u[i*(N+3) + j] = unp1[i*(N+3) + j];
          }
 }

don't bother with the memcpy operation. the better way to do that is swapping pointers (in host code).

Create a separate kernel to enforce the boundary:

 __global__ void bc(double *u, double *unp1, double params[3]){

      int index_x = blockIdx.x * blockDim.x + threadIdx.x;
      int stride_x = blockDim.x * gridDim.x;
      int index_y = blockIdx.y * blockDim.y + threadIdx.y;
      int stride_y = blockDim.y * gridDim.y;
      /*Also BC are set on parallel */
      for (int i = index_x; i < N+2; i += stride_x){
           u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
           u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
           u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
           u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
      }
 }

Modify your host code to call these kernels in sequence, with the pointer swap in-between:

   /*Solve*/
   for(int n = 0; n<params[0]; n++){
        solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
        double *temp = d_u;
        d_u = d_unp1;
        d_unp1 = temp;
        bc<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
        cudaDeviceSynchronize();
   }

(coded in browser, not tested)

This will enforce the ordering that your algorithm requires.

NOTE: As identified in the comments below, the solve kernel as depicted above (and in OP's original post, and in their posted CPU code version) has indexing errors at least associated with i-1 and j-1 indexing patterns. These should be fixed otherwise the code is broken. Fixing them requires some decision as to what to do for the edge cases, which OP provides no guidance on, therefore I have left that code as-is.

Thank you very much for your answer, I understand what you mean. Unfortunately, I still get only 0.0000 as data points though. I edited out the line `u[i*(N+3) + j] = unp1[i*(N+3) + j];` on the kernel solve() (because I imagine this would defeat the purpose of the swap-in) and created the bc kernel as you advised (using only a 1D block, so on the host code it is now `bc<<<1,1>>>(d_u)`. I edited the first post for clarification, you can find the EDIT at the end of the post — rungekutta, Jun 15 '21 at 23:24
I can't debug a partial code for you. If you are asking for debugging assistance (rather than what to do about `memcpy`) you are supposed to provide a [mcve]. See item 1 [here](https://stackoverflow.com/help/on-topic), note use of the word "must". — Robert Crovella, Jun 15 '21 at 23:49
I would also encourage you to use [proper CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) and run your code with `compute-sanitizer`. — Robert Crovella, Jun 16 '21 at 00:01
I updated my first post with a reproducible example in case you would like to compile it. As it can be seen, I start having problems from the solve() kernel onwards. Before that, I get the correct data. Thanks again! — rungekutta, Jun 16 '21 at 03:37
Run your code with `compute-sanitizer` like I suggested. Even better, first compile with `-lineinfo`. Your `solve` kernel is making illegal, out-of-bounds accesses (global reads) at this line: `unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) ...` It seems fairly evident to me that `i-1` will give illegal indexing for the threads whose `i` value starts at 0. likewise for `j-1` indexing. You might want to fix your CPU code also. — Robert Crovella, Jun 16 '21 at 04:39
True. This is indeed the problem. I just fixed that and mark your solution as valid. Thanks. — rungekutta, Jun 16 '21 at 04:52

What is a correct way to implement memcpy inside a CUDA kernel?

1 Answers1