It is a simple CUDA code for initializing a big matrix (filling in zeros).
I output the first 1*3 matrix, if the code works. It should be all zeros.
If I set the matrix size to be small, then the program works properly. But when I make the size larger (> 43200 * 2400), what is inside the matrix are all garbage.
I had cudaDeviceSynchronize() append at the end of each CUDA functions already.
I am using NVIDIA Quadro K4200, Xeon E5-2630 with Ubuntu 14.04.
Thanks for anyone helping me here.
Attached below is my full code.
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <cuComplex.h>
#define BLOCK_SIZE 16 // change it to 16 to get maximum performance
// populate the matrix using first row
__global__ void RepmatKernel (cuComplex *Mat, const unsigned int N, const unsigned int Cols) 
{
    unsigned int i = (unsigned int)blockIdx.x * (unsigned int)blockDim.x + (unsigned int)threadIdx.x;
    if (i < N) 
    {
        Mat[i].x = 0;
        Mat[i].y = 0;
    }
}
// main routine
int main ()
{
  const unsigned int Rows = 43200;
  const unsigned int Cols = 2400;
  const unsigned int Num_thrd = 256; // max threads per block 
  unsigned int Mat_size = Rows * Cols; // size of array
  cuComplex *vec; // supposedly the input
  cuComplex *mat_debug; // for debug
  vec = new cuComplex [Cols];
  mat_debug = new cuComplex [Rows*Cols];
  cuComplex *mat_in_d;  // device array
  //input in host array
  for(unsigned int i = 0; i < Cols; i++)
  {
      vec[i].x = 3*i+4;
      vec[i].y = 0.2*i+1;
  }
  const unsigned int size_mat_d =    Rows * Cols * sizeof(cuComplex); 
  //create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
  if (cudaMalloc((void **) &mat_in_d ,  size_mat_d) != cudaSuccess) std::cout<<"Error allocating GPU";
  cudaDeviceSynchronize() ;
  //copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
  cudaMemcpy ( mat_in_d , vec , Cols , cudaMemcpyHostToDevice ) ;
  cudaDeviceSynchronize() ;
// ========================================================================
  cudaMemcpy(mat_debug , mat_in_d , size_mat_d , cudaMemcpyDeviceToHost) ;
  cudaDeviceSynchronize() ;
  std::cout<<"before repmat="<<std::endl;
  std::cout<<"[";
  for(unsigned int i = 0; i < 3; i++)
  {
    std::cout<< mat_debug[i * Cols].x <<"+"<<mat_debug[i * Cols].y <<"i,  ";
    std::cout<<";"<<std::endl;
  }
  std::cout<<"]"<<std::endl;
// ==========================================================================
  RepmatKernel<<<(unsigned int)ceil((float)(Mat_size)/(float)(Num_thrd)),
               (Num_thrd)>>>(mat_in_d,
                     Mat_size,
                     Cols);
  cudaDeviceSynchronize();
// ========================================================================
  cudaMemcpy(mat_debug , mat_in_d , size_mat_d , cudaMemcpyDeviceToHost) ;
  cudaDeviceSynchronize() ;
  std::cout<<"after repmat="<<std::endl;
  std::cout<<"[";
  for(unsigned int i = 0; i < 3; i++)
  {
    std::cout<< mat_debug[i * Cols].x <<"+"<<mat_debug[i * Cols].y <<"i,  ";
    std::cout<<";"<<std::endl;
  }
  std::cout<<"]"<<std::endl;
// ==========================================================================
  cudaFree(mat_in_d);
  delete [] vec; 
  delete [] mat_debug;
  return 0;
}    
 
     
     
    