I have written a CUDA test program, because my more complex program was not working. This one isn't working either.
What should it do?
I've written a test program (I think) to add 0.5 to an array of numbers. Or at least that's what it's supposed to do.
Here's the code:
#include <iostream>
#include <cuda.h>
__global__
void cuda_kernel_func(double *in, double *out, int count)
{
    int index = blockIdx.x;
    if(index < count)
    {
        out[index] = in[index] + 0.5;
    }
}
int main()
{
    int num = 10;
    double *out;
    double *d_out;
    double *in;
    double *d_in;
    
    out = (double*)malloc(num * sizeof(double));
    in = (double*)malloc(num * sizeof(double));
    cudaMalloc(&d_out, num * sizeof(double));
    cudaMalloc(&d_in, num * sizeof(double));
    
    for(int i = 0; i < num; ++ i)
    {
        in[i] = (double)i;
    }
    
    cudaMemcpy(d_in, in, num * sizeof(double), cudaMemcpyHostToDevice);
    cuda_kernel_func<<<num, 1>>>(d_in, d_out, num);
    cudaDeviceSynchronize();
    
    cudaMemcpy(out, d_out, num * sizeof(double), cudaMemcpyDeviceToHost);
    
    cudaFree(d_in);
    cudaFree(d_out);
    
    for(int i = 0; i < num; ++ i)
    {
        std::cout << out[i] << " ";
    }
    std::cout << std::endl;
    
    free(in);
    free(out);
    return 0;
}
I am fairly new to CUDA, but not to parallelization or C/C++. I think the code is fairly self-explanatory.
Output:
0 0 0 0 0 0 0 0 0 0
Which isn't very exciting.
 
     
    