I wanted to determine how many numbers of the form x^2+1 are prime for 1 <= x <= 10^7. I wanted to parallelize the computation with CUDA and measure the difference, so I used a trivial trial-division primality check; I'm not concerned with improving that algorithm.
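For reference, here is a minimal serial version of the same count (a sketch using the same trial division; far too slow at the full bound, but usable with a smaller limit to sanity-check the GPU result; x = 1 gives n = 2, which is prime, but like the kernel below I start at x = 2):

#include <stdio.h>

static int isprime_host(int x)
{
    long long n = (long long)x*x + 1;
    if ( n % 2 == 0 ) return n == 2;    // even n is prime only when n == 2
    for( int p=3; p<=x+1; p+=2 )        // x+1 >= sqrt(x*x+1), so this bound suffices
        if ( n % p == 0 ) return 0;
    return 1;
}

int main()
{
    int count = 0;
    for( int x=2; x<=10000000; x++ )    // same range as the kernel below
        count += isprime_host(x);
    printf("count = %d\n", count);
    return 0;
}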
I arranged a grid and slid it over my interval, recorded the results in the shared memory of each block, performed a reduction on the GPU within each block, and finally performed a reduction on the CPU to get the final result.
My problem is that the output changes when I change the number of blocks and the number of threads per block. Another thing I cannot explain: with 8 blocks and 2048 threads per block the code runs in under 100 ms, but when I halve the threads to 1024 and double the blocks, the memcpy from device to host times out! How can I explain this behaviour, and where does correctness break down?
I'm using an NVIDIA GTX 480 GPU.
My code is:
#include <stdio.h>
#include <stdlib.h>
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define N 10000000
#define BLOCKS 8
#define THREADS 2048
__device__ int isprime(int x)
{
    long long n = (long long)x*x + 1;
    if ( n % 2 == 0 ) return n == 2;    // for odd x, n is even and composite unless n == 2
    for( int p=3; p<=x+1; p+=2 )        // trial division by odd p up to x+1 >= sqrt(x*x+1)
        if ( n % p == 0 ) return 0;
    return 1;
}
__global__ void solve(int n, int* result)
{
    __shared__ int ipc[THREADS];
    int tid = threadIdx.x;
    int x = blockIdx.x*blockDim.x + threadIdx.x + 2;
    // grid-stride loop: slide the whole grid across the interval [2, n]
    int acc = 0;
    while( x <= n )
    {
        if ( isprime(x) ) acc++;
        x += blockDim.x*gridDim.x;
    }
    ipc[tid] = acc;
    __syncthreads();
    // tree reduction within each block (assumes blockDim.x is a power of two)
    for( int s=blockDim.x/2; s>0; s>>=1 )
    {
        if ( tid < s )
        {
            ipc[tid] += ipc[tid+s];
        }
        __syncthreads();
    }
    if ( tid == 0 ) result[blockIdx.x] = ipc[0];
}
int main()
{
    int *dev;
    int res[BLOCKS];
    int ans = 0;
    HANDLE_ERROR( cudaMalloc((void**)&dev, BLOCKS * sizeof(int)) );
    solve<<<BLOCKS, THREADS>>>(N, dev);
    HANDLE_ERROR( cudaMemcpy(res, dev, BLOCKS*sizeof(int), cudaMemcpyDeviceToHost) );
    // final reduction on the CPU over the per-block results
    for( int j=0; j<BLOCKS; j++ )
        ans += res[j];
    printf("ans = %d\n", ans);
    HANDLE_ERROR( cudaFree( dev ) );
    return 0;
}
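One thing I have not done is check the launch itself for errors; HANDLE_ERROR only wraps the API calls. A minimal sketch of how I could instrument the launch, using the standard cudaGetLastError / cudaDeviceSynchronize pattern (nothing here is specific to my kernel):

// Sketch: after the launch, cudaGetLastError() reports launch-configuration
// problems (e.g. an invalid number of threads per block), while
// cudaDeviceSynchronize() waits for the kernel to finish and surfaces errors
// that happen during execution.
solve<<<BLOCKS, THREADS>>>(N, dev);
HANDLE_ERROR( cudaGetLastError() );
HANDLE_ERROR( cudaDeviceSynchronize() );

Even with this in place, I would still like to understand why the block/thread configuration changes the result and why one configuration times out in the memcpy.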