I am implementing a PDE solver (Lax-Friedrichs) in CUDA that I previously wrote in C. Please find the C code below:
void solve(int M, double u[M+3][M+3], double unp1[M+3][M+3], double params[3]){
 int i;
 int j;
 int n;
 for (n=0; n<params[0]; n++){
        for (i=0; i<M+2; i++)
           for(j=0; j<M+2; j++){
              unp1[i][j] = 0.25*(u[i+1][j] + u[i-1][j] + u[i][j+1] + u[i][j-1])
                 - params[1]*(u[i+1][j] - u[i-1][j])
                 - params[2]*(u[i][j+1] - u[i][j-1]);
           }
 
        memcpy(u, unp1, pow(M+3,2)*sizeof(double));
 
 
        /*Periodic Boundary Conditions*/
        for (i=0; i<M+2; i++){
           u[0][i] = u[N+1][i];
           u[i][0] = u[i][N+1];
           u[N+2][i] = u[1][i];
           u[i][N+2] = u[i][1];
        }
     }
  }
And it works fine. But when I am trying to implement it in CUDA I do not get the same data. Unfortunately I cannot exactly pinpoint the exact problem since I am a totally beginner to the whole parallel programming thing, but I think it might have to do with the u[i*(N+3) + j] = unp1[i*(N+3) + j] on the solver, since I cannot really perform a memcpy inside the kernel, because it doesn't change anything, I don't know how to proceed. I took a look at This previous answer, but it unfortunately couldn't help solving my problem. Here is the solver in CUDA I am trying to code:
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>
/*Configuration of the grid*/
const int N = 100; //Number of nodes  
const double xmin = -1.0;
const double ymin = -1.0;
const double xmax = 1.0;
const double ymax = 1.0;
const double tmax = 0.5;
/*Configuration of the simulation physics*/
const double dx = (xmax - xmin)/N;
const double dy = (ymax - ymin)/N;
const double dt = 0.009;
const double vx = 1.0;
const double vy = 1.0;
__global__ void initializeDomain(double *x, double *y){
    /*Initializes the grid of size (N+3)x(N+3) to better accomodate Boundary Conditions*/
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int j=index ; j<N+3; j+=stride){
        x[j] = xmin + (j-1)*dx;
        y[j] = ymin + (j-1)*dy;
    }
}
__global__ void initializeU(double *x, double *y, double *u0){
    
    double sigma_x = 2.0;
    double sigma_y = 6.0;
    
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    
    for (int i = index_x; i < N+3; i += stride_x)
        for (int j = index_y; j < N+3; j+= stride_y){
            u0[i*(N+3) + j] = exp(-200*(pow(x[i],2)/(2*pow(sigma_x,2)) + pow(y[j],2)/(2*pow(sigma_y,2))));
            u0[i*(N+3) + j] *= 1/(2*M_PI*sigma_x*sigma_y);
               //u[i*(N+3) + j] = u0[i*(N+3) + j];
               //unp1[i*(N+3) + j] = u0[i*(N+3) + j];
    }
}
void initializeParams(double params[3]){
    params[0] = round(tmax/dt);
    params[1] = vx*dt/(2*dx);
    params[2] = vy*dt/(2*dy);
}
__global__ void solve(double *u, double *unp1, double params[3]){
     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
     int stride_x = blockDim.x * gridDim.x;
     int index_y = blockIdx.y * blockDim.y + threadIdx.y;
     int stride_y = blockDim.y * gridDim.y; 
     for (int i = index_x; i < N+2; i += stride_x)
          for (int j = index_y; j < N+2; j += stride_y){
               unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
               - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
               - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
          }
}
__global__ void bc(double *u){
     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
     int stride_x = blockDim.x * gridDim.x;
     /*Also BC are set on parallel */
     for (int i = index_x; i < N+2; i += stride_x){
          u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
          u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
          u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
          u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
     }
}
int main(){
     int i;
     int j;
     double *x = (double *)malloc((N+3)*sizeof(double));
     double *y = (double *)malloc((N+3)*sizeof(double));
     double *d_x, *d_y;
     cudaMalloc(&d_x, (N+3)*sizeof(double));
     cudaMalloc(&d_y, (N+3)*sizeof(double));
     initializeDomain<<<1, 1>>>(d_x, d_y);
     cudaDeviceSynchronize();
     cudaMemcpy(x, d_x, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(y, d_y, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
     FILE *fout1 = fopen("data_x.csv", "w");
    FILE *fout2 = fopen("data_y.csv", "w");
    for (i=0; i<N+3; i++){
        if (i==N+2){
            fprintf(fout1, "%.5f", x[i]);
            fprintf(fout2, "%.5f", y[i]);
        }
        else{
            fprintf(fout1, "%.5f, ", x[i]);
            fprintf(fout2, "%.5f, ", y[i]);
        }
    }
     dim3 Block2D(1,1);
     dim3 ThreadsPerBlock(1,1);
     double *d_u0;
     double *u0 = (double *)malloc((N+3)*(N+3)*sizeof(double));
     cudaMalloc(&d_u0, (N+3)*(N+3)*sizeof(double));
     initializeU<<<Block2D, ThreadsPerBlock>>>(d_x, d_y, d_u0);
     cudaDeviceSynchronize();
     cudaMemcpy(u0, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);
     /*Initialize parameters*/
     double params[3];
     initializeParams(params);
     /*Allocate memory for u and unp1 on device for the solver*/
     double *d_u, *d_unp1;
     cudaMalloc(&d_u, (N+3)*(N+3)*sizeof(double));
     cudaMalloc(&d_unp1, (N+3)*(N+3)*sizeof(double));
     cudaMemcpy(d_u, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
     cudaMemcpy(d_unp1, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
     
     /*Solve*/
     for (int n=0; n<params[0]; n++){
          solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
          double *temp = d_u;
          d_u = d_unp1;
          d_unp1 = temp;
          bc<<<1,1>>>(d_u);
          cudaDeviceSynchronize();
     }
     /*Copy results on host*/
     double *u = (double *)malloc((N+3)*(N+3)*sizeof(double));
     cudaMemcpy(u, d_u, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);
     FILE *fu = fopen("data_u.csv", "w");
    for (i=0; i<N+3; i++){
          for(j=0; j<N+3; j++)
            if (j==N+2)
                fprintf(fu, "%.5f", u[i*(N+3) + j]);
            else
                fprintf(fu, "%.5f, ", u[i*(N+3) + j]);
        fprintf(fu, "\n");
    }
     fclose(fu);
     free(x);
     free(y);
     free(u0);
     free(u);
     cudaFree(d_x);
     cudaFree(d_y);
     cudaFree(d_u0);
     cudaFree(d_u);
     cudaFree(d_unp1);
     return 0;
}
I unfortunately keep having the same issue: The data I get is 0.0000.
 
    