I'm trying to calculate a matrice multiplication of size N (square matrix) but I'm getting a stack overflow error(I'm new to Cuda ):
if I test the code for N < 300 everything is fine, but if I test it with N> 300 it does not work, and a stack overflow error was displayed but there is enough memory.in my graphics card GF 820M . if N = 300 then 300 * 300 * 4(size of float) = 360000 byte : necessary space in the device to allocate for an array of type float.and here it must allocate for 3 Table to do multiplication .therefore 360000 * 3 = 1080000 bytes and if I control the CudaMalloc nothing is displayed.
I inform you that my main goal is to test for N large enough.How do I solve that? thank you in advance for any help you might be able to provide.
#include <stdio.h>
#include<device_launch_parameters.h>
#include<cuda.h>
#include<time.h>
#include<cuda_runtime.h>
#include <math.h>
__global__ void MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{   // calculate thread id
          unsigned  int row = blockIdx.y*blockDim.y+threadIdx.y;
          unsigned  int col = blockIdx.x*blockDim.x+threadIdx.x;
for (int k = 0 ; k<WIDTH ; k++ )
         { Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ;  }}
int main ()
{       const int i=64 ;
       cudaEvent_t start, stop;
        float time;
       cudaEventCreate(&start);
       cudaEventCreate(&stop);
       const int WIDTH =300;
       cudaError_t cudaStatus;
   float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
   float *array1_d , *array2_d ,*M_result_array_d ; // device array
  // Allocate GPU buffers for 2 vectors (two input, one output) 
    cudaStatus = cudaMalloc((void **) &array1_d , WIDTH*WIDTH*sizeof (float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!"); }  
    cudaStatus = cudaMalloc((void **) &array2_d , WIDTH*WIDTH*sizeof (float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!"); }  
       for ( int i = 0 ; i<WIDTH ; i++ ) { 
           for (int j = 0 ; j<WIDTH ; j++ )
           { array1_h[i][j] = 1 ; array2_h[i][j] = 2 ; }}
  //copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
  cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
  cudaMemcpy ( array2_d , array2_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
  //allocating memory for resultent device array
  cudaStatus = cudaMalloc((void **) &M_result_array_d , WIDTH*WIDTH*sizeof (float) ) ;
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!"); }  
  //calling kernal
  dim3 dimBlock( i,i, 1 ) ;
  dim3 dimGrid ( ((WIDTH-1)/i) +1 , ((WIDTH-1)/i)+1 ,1 ) ;
  cudaEventRecord(start, 0);
MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
    cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);
   cudaEventElapsedTime(&time, start, stop);
   printf ("taille du probleme:%d Time for the kernel: %f \n",WIDTH,time);
  //copy back result_array_d to result_array_h
  cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*WIDTH*sizeof(float) , cudaMemcpyDeviceToHost) ;
  //printf the result array
  for (int i = 0 ; i<WIDTH ; i++ )
  { for (int j = 0 ; j < WIDTH ; j++ )
     {   printf ("%f   ",M_result_array_h[i][j] ) ; }
              printf ("\n") ; } 
    cudaFree(array1_d);
    cudaFree(array2_d);
    cudaFree(M_result_array_h);
  system("pause") ; }
 
     
     
    