I am trying to write a program for matrix calculations using C/CUDA. I have the following program:
In main.cu
#include <cuda.h>
#include <iostream>
#include "teste.cuh"
using std::cout;
int main(void)
{
 const int Ndofs = 2;
 const int Nel   = 4;
 double *Gh   = new double[Ndofs*Nel*Ndofs*Nel];
 double *Gg;
 cudaMalloc((void**)& Gg, sizeof(double)*Ndofs*Nel*Ndofs*Nel);
 for (int ii = 0; ii < Ndofs*Nel*Ndofs*Nel; ii++)
  Gh[ii] = 0.;
 cudaMemcpy(Gh, Gg, sizeof(double)*Ndofs*Nel*Ndofs*Nel, cudaMemcpyHostToDevice);
 integraG<<<256, 256>>>(Nel, Gg);
 cudaMemcpy(Gg, Gh, sizeof(double)*Ndofs*Nel*Ndofs*Nel, cudaMemcpyDeviceToHost);
 for (int ii = 0; ii < Ndofs*Nel*Ndofs*Nel; ii++)
  cout << ii  + 1 << " " << Gh[ii] << "\n";
 return 0;
}
In teste.cuh
#ifndef TESTE_CUH_
#define TESTE_CUH_
__global__ void integraG(const int N, double* G)
{
    const int szmodel = 2*N;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    int idy = threadIdx.y + blockIdx.y*blockDim.y;
    int offset = idx + idy*blockDim.x*gridDim.x;
    int posInit = szmodel*offset;
    G[posInit + 0] = 1;
    G[posInit + 1] = 1;
    G[posInit + 2] = 1;
    G[posInit + 3] = 1;
}
#endif
The result (which should be a matrix filled with 1's) is then copied back to the host array. The problem is that nothing happens: apparently the GPU kernel is never called, and I still get an array full of zeros.
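In case it is useful, here is a minimal error-checking sketch (none of this is in my original program) that I could add right after the kernel launch to see whether the kernel actually runs; cudaGetLastError reports launch failures and cudaDeviceSynchronize surfaces errors raised while the kernel executes:

cudaError_t err = cudaGetLastError();      // reports launch-configuration problems
if (err != cudaSuccess)
    cout << "launch failed: " << cudaGetErrorString(err) << "\n";
err = cudaDeviceSynchronize();             // waits for the kernel and reports runtime errors
if (err != cudaSuccess)
    cout << "kernel error: " << cudaGetErrorString(err) << "\n";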
I am very new to CUDA programming and I am using CUDA by Example (Jason Sanders) as a reference book.
My questions are:
- What is wrong with my code?
- Is this the best way to deal with matrices on the GPU, i.e. working with them in vectorized (flattened) form? (See the sketch after this list for what I mean.)
- Is there another reference that provides more examples of matrix operations on GPUs?
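For context on the second question, by "vectorized form" I mean storing the whole (Ndofs*Nel) x (Ndofs*Nel) matrix in a single flat array in row-major order and computing the 1-D index by hand. A small sketch of what I have in mind (the helper name idx2d is just for illustration, it is not part of my program):

// Hypothetical helper: row-major index of element (row, col) in a flat rows-by-cols matrix.
__host__ __device__ inline int idx2d(int row, int col, int cols)
{
    return row*cols + col;
}

// For example, with n = Ndofs*Nel, element (i, j) of G would be accessed as G[idx2d(i, j, n)].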