I am trying to learn CUDA by writing basic code, which should hopefully put me in a better position to convert my existing C++ code to CUDA (for research).
I need to do a fair bit of complex number manipulations, so I have written this very basic code to multiply an array of complex numbers with a real number in a GPU kernel.
#include <complex>
#include <iostream>
#include <cmath>
#include "cuda.h"
#include "math.h"
#include "cuComplex.h"
#define n   5
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__ void func( double *s, cuDoubleComplex *j, cuDoubleComplex *calc ) {
    int tid = blockIdx.x;
    calc[tid] = cuCmul(j[tid], make_cuDoubleComplex(*s, 0));
}
int main( void ) {
    cuDoubleComplex calc[n+1], *dev_j, *dev_calc;
    double *dev_s, s[n+1] = { 2.0, 2.0, 2.0, 2.0, 2.0 };
    //complex<double> j[n+1]
    cuDoubleComplex j[n+1];
    for (int i = 1; i <= n; i++) {
        j[i] = make_cuDoubleComplex(0, 5);
        cout << "\nJ cout = " << cuCreal(j[i]) << ", " << cuCimag(j[i]);
    }
    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_s, (n+1) * sizeof(double) );
    cudaMalloc( (void**)&dev_j, (n+1) * sizeof(double) );
    cudaMalloc( (void**)&dev_calc, (n+1) * sizeof(double) );
    cudaMemcpy( dev_s, s, (n+1) * sizeof(double), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_j, j, (n+1) * sizeof(double), cudaMemcpyHostToDevice );
    func<<<n,1>>>( dev_s, dev_j, dev_calc );
    //kernel<<<1,1>>>(a_d);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaMemcpy(calc, dev_calc, (n+1) * sizeof(double), cudaMemcpyDeviceToHost) );
    //cudaMemcpy( calc, dev_calc, (n+1) * sizeof(double), cudaMemcpyDeviceToHost );
    for (int i = 1; i <= n; i++) {
        cout << "\nCALC cout = " << cuCreal(calc[i]) << ", " << cuCimag(calc[i]);
    }
    return 0;
}
The final answer is wrong, and I have also identified a few other places where I am not getting expected values.
1) I expected a complex double array of (0, 5i) for all elements of 'j' after the following line of code. However, I am getting all 0s. Why is that?
j[i] = make_cuDoubleComplex(0, 5); 
2) Why can't I print my array using cout? The line of code shown below gives the following error : no operator "<<" matches these operands. How can I fix this without using with printf?
cout << "\nJ = " << j[i];
3) The GPU function 'func' which should give out an array of (0, 10i) as the final answer is giving random values such as these:
CALC = -1.#QNAN0
CALC = -1.#QNAN0
CALC = -9255963134931783100000000...000.. etc
CALC = -9255963134931783100000000...000.. etc
4) For my actual research, the complex array 'j' will be given in the format of complex(double) and not cuDoubleComplex. Can I do similar operations to 'j' array of complex(double)s using the function 'func'? If not, what are my options?
I think I've explained myself well, but feel free to ask any follow-up questions. New to C++ as well as CUDA so be nice :D
 
     
    