I'm trying to play around with CUDA (in C on VS 2013, with CUDA 7.5).
I have been trying for hours to make a specific procedure run, without success. So I reduced it to its simplest expression, and I still have problems...
The following code runs smoothly with only 2 loops in the kernel: the "p" value goes up and reaches 14000. But with 3 loops, p stops at around 600 (instead of 14000) and the kernel returns control to the main program without any kind of notice.
#include <stdio.h>
/*
 * Debug kernel: runs three nested counting loops and prints the outer
 * counter on every outer iteration.
 *
 * The kernel does no thread indexing, so every launched thread executes the
 * full loop nest and prints every line.
 *
 * Parameters:
 *   imax - trip count of the middle loop (q)
 *   jmax - trip count of the innermost loop (r)
 *   pmax - trip count of the outer loop (p); also printed as the total
 *
 * NOTE(review): the total work is pmax * imax * jmax iterations in a single
 * thread. On a GPU that also drives a display, the OS watchdog may abort a
 * kernel that runs for more than a few seconds -- presumably the cause of a
 * failure reported only at synchronization time; confirm on the target
 * machine.
 */
__global__ void kernelLoops(long imax, long jmax, long pmax)
{
    long p, q, r;
    // the loops generating problems are p, q, r.
    for (p = 0; p < pmax; ++p) {
        // %ld matches the `long` arguments; %d would misread them on
        // platforms where long is wider than int.
        printf(" %ld /%ld\n", p, pmax);
        for (q = 0; q < imax; ++q) {
            for (r = 0; r < jmax; ++r) {
                // intentionally empty: the loop only burns iterations
            }
        }
    }
}
/*
 * Launches kernelLoops with a single thread, reports the launch status and
 * the execution status, then waits for a key press.
 *
 * Returns 0 when both the launch and the kernel execution succeeded,
 * 1 otherwise.
 */
int main()
{
    long imax = 200;
    long jmax = 200;
    long pmax = 14000;
    setbuf(stdout, NULL); // to get the printf output without delay

    kernelLoops <<< 1, 1 >>>(imax, jmax, pmax);

    // Launch-time errors (e.g. a bad configuration) are reported here; the
    // kernel itself runs asynchronously and may not have finished yet.
    cudaError_t launchStatus = cudaGetLastError();
    printf("%s\n", cudaGetErrorString(launchStatus));

    // Errors raised while the kernel executes are only reported by a
    // synchronizing call, so keep its return value instead of discarding it.
    cudaError_t syncStatus = cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorString(syncStatus));
    // NOTE(review): an "unspecified launch failure" at this point, after the
    // kernel has printed only part of its output, is presumably the display
    // driver's watchdog aborting a long-running kernel -- confirm.

    printf("end of sync\n");
    getchar();

    return (launchStatus == cudaSuccess && syncStatus == cudaSuccess) ? 0 : 1;
}
Can anyone help? What might be happening? Thanks.
