I'm encountering an "unspecified launch failure" when running my program in Cuda . I've checked the errors .
The program is a solver of a differential equation . It iterates TOTAL_ITER times . ROOM_X ans ROOM_Y are the width and height of the matrices .
Here is the header, its name is "sole :
#define ITER_BETWEEN_SAVES 10000
#define TOTAL_ITER 10000
#define ROOM_X 2048
#define ROOM_Y 2048
#define SOURCE_DIM_X 200
#define SOURCE_DIM_Y 1000
#define ALPHA 1.11e-4
#define DELTA_T 10
#define H 0.1
#include <stdio.h>
void Matrix(float* M);
void SolverCPU(float* M1, float* M2);
__global__ void SolverGPU(float* M1, float* M2);
Here is the kernel and a function that fill a matrix :
#include "solver.h"
#include<cuda.h>
void Matrix(float* M)
{
  for (int j = 0; j < SOURCE_DIM_Y; ++j) {
    for (int i = 0; i <  SOURCE_DIM_X; ++i) {
    M[(i+(ROOM_X/2 - SOURCE_DIM_X/2)) + ROOM_X * (j+(ROOM_Y/2 - SOURCE_DIM_Y/2))] = 100;
    }
  }
}
    __global__ void SolverGPU(float* M1,float *M2)  {
   int i =threadIdx.x + blockIdx.x * blockDim.x;
       int j = threadIdx.y + blockIdx.y * blockDim.y;
        float M1_Index = M1[i + ROOM_X * j];
        float M1_IndexUp = M1[i+1 + ROOM_X * j];
        float M1_IndexDown =M1[i-1 + ROOM_X * j];
        float M1_IndexLeft = M1[i + ROOM_X * (j+1)];
        float M1_IndexRight = M1[i + ROOM_X *(j-1)];
        M2[i + ROOM_X * j] = M1_Index + (ALPHA * DELTA_T / (H*H)) * (M1_IndexUp + M1_IndexDown + M1_IndexLeft +M1_IndexRight - 4*M1_Index);     
}
And here is the main
int main(int argc, char* argv[] ){
    float *M1_h, *M1_d,*M2_h, *M2_d;
    int size = ROOM_X * ROOM_Y * sizeof(float);
    cudaError_t err = cudaSuccess;  
    //Allocating Memories on Host
    M1_h = (float *)malloc(size);
    M2_h = (float *)malloc(size);
    //Allocating Memories on Host
    err=cudaMalloc((void**)&M1_d, size);
    if (err != cudaSuccess) { 
        fprintf(stderr, "Failed to allocate array_d ... %s .\n", cudaGetErrorString(err)); 
        exit(EXIT_FAILURE); 
    }
    err=cudaMalloc((void**)&M2_d, size);    
    if (err != cudaSuccess) { 
        fprintf(stderr, "Failed to allocate array_d ... %s .\n", cudaGetErrorString(err)); 
        exit(EXIT_FAILURE); 
    }
    //Filling the Matrix
    Matrix(M1_h);
    //Copy on Device
    err = cudaMemcpy(M1_d, M1_h, size, cudaMemcpyHostToDevice);
    if(err !=0){
        printf("%s-%d\n",cudaGetErrorString(err),1);
        getchar();  
    }
    err=cudaMemcpy(M2_d, M2_h, size, cudaMemcpyHostToDevice);
    if(err !=0){
        printf("%s-%d",cudaGetErrorString(err),2);
        getchar();  
    }
    dim3 dimGrid(64,64);
    dim3 dimBlock(32,32);
    //SolverGPU<< <threadsPerBlock, numBlocks >> >(M1_d,M2_d);
    for(int i=0;i<TOTAL_ITER;i++) { 
    if (i%2==0) 
    SolverGPU<< <dimGrid,dimBlock >> >(M1_d,M2_d);
    else
    SolverGPU<< <dimGrid,dimBlock >> >(M2_d,M1_d);
    }   
    err=cudaMemcpy(M1_h, M1_d, size, cudaMemcpyDeviceToHost);
    if(err !=0){
        printf("%s-%d",cudaGetErrorString(err),3);
        getchar();  
    }   
    cudaFree(M1_d);
    cudaFree(M2_d);
    free(M1_h);
    free(M2_h);
    return 0;   
}
There's no problem at compilation .
Whne I check my errors, the "unspecified launch failure" appears on the memcpy AFTER the kernel .
Ok, so I've read that it's usually due to the kernel which doesn't run properly . But I can't find the error (s) in the kernel ... I guess that's the error is quite simple , but can't figure to find it .
 
    