MAXIMUM THREAD PER BLOCK

Question

how is it possible that we can use a number of threads per block bigger than the maximum number of thread per block supported by Quadro K500(1024 threads per block) in our CUDA Application and it works ? thanks

Cuda version: 5.0 Device: Quadro K5000 Os: Linux

#include <cuda.h>
#include <stdio.h>
#include <cuda_profiler_api.h>

#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
#include <sstream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/* START PROGRAM */

void inizializzaMatrice (int*,int,int);
void stampaMatrice (int*,int,int);
void flipMatriceCPU (int*,int,int);
void confrontaMatrici (int*,int*,int,int);
__global__ void flipMatriceGPU (int*,int*,int,int);

int main(int argn, char * argv[]){
  dim3 nBlocchi,nThreadPerBlocco;
  int M,N,flag;
  int *in_host, *out_host,*out_DeToHo;
  int *in_device, *out_device;
  int size,sizeSM;
  cudaEvent_t startCPU, stopCPU, startGPU, stopGPU;
    float timeCPU=0,timeGPU=0;


  printf("\n\n******************** RIFLESSIONE ORIZZONTALE DI UNA MATRICE ********************\n\n");
  if(argn<6 || atoi(argv[2])%2==0 ){
    if(argn<6)
      printf("Numero di parametri insufficiente!!!\n");

    else if(atoi(argv[2])%2==0)
      printf("Errore nell'utilizzo di %s. Il numero di colonne <N> deve essere dispari\n",argv[0]);

    printf("Uso corretto: %s <M> <N> <NumThreadPerBlocco.x> <NumThreadPerBlocco.y> <flag per la Stampa>\n", argv[0]);
    printf("Uso dei valori di default ... ...\n\n\n"); 
    nThreadPerBlocco.x=2; 
    nThreadPerBlocco.y=3;
    M=5; N=5; flag=1;
  }
  else {
    M=atoi(argv[1]); 
    N=atoi(argv[2]);
    nThreadPerBlocco.x=atoi(argv[3]);
    nThreadPerBlocco.y=atoi(argv[4]);
    flag=atoi(argv[5]); 
  }


  nBlocchi.x=M/nThreadPerBlocco.x+((M%nThreadPerBlocco.x)==0?0:1);
  nBlocchi.y=N/nThreadPerBlocco.y+((N%nThreadPerBlocco.y)==0?0:1);

  size=M*N*sizeof(int);

//stampa delle info sull'esecuzione del kernel
  printf("Matrix Size = %d * %d\n",M, N);
  printf("Threads per block = %d * %d\n", nThreadPerBlocco.x,nThreadPerBlocco.y); 
  printf("Grid size = %d * %d\n\n\n",nBlocchi.x,nBlocchi.y);

// Allocazione dati sull'host
  in_host=(int*)malloc(size);
  out_host=(int*)malloc(size);
  out_DeToHo=(int*)malloc(size);

//cudaProfilerStart();

// Allocazione dati dul device
  gpuErrchk( cudaMalloc((void**)&in_device,size) );
  gpuErrchk( cudaMalloc((void**)&out_device,size) );

// Inizializzazione dati sull'host

  inizializzaMatrice(in_host,M,N);



  // Flip Matrice CPU
  memcpy(out_host,in_host,size);
  cudaEventCreate(&startCPU);
  cudaEventCreate(&stopCPU);
  cudaEventRecord(startCPU,0);

  flipMatriceCPU(out_host,M,N);
  cudaEventRecord(stopCPU,0);
  cudaEventSynchronize(stopCPU);
  cudaEventElapsedTime(&timeCPU,startCPU,stopCPU);
  printf("CPU time: %f\n",timeCPU/1000);
  cudaEventDestroy(startCPU);
  cudaEventDestroy(stopCPU);

  sizeSM=nThreadPerBlocco.y*nThreadPerBlocco.x*sizeof(int);
// Invocazione del Kernel
  printf("blocks.x: %d, blocks.y: %d,  threads.x: %d, threads.y: %d, smem size: %d\n", nBlocchi.x, nBlocchi.y, nThreadPerBlocco.x, nThreadPerBlocco.y, sizeSM);
gpuErrchk(cudaMemcpy(in_device, in_host, size, cudaMemcpyHostToDevice));  
  cudaEventCreate(&startGPU);
  cudaEventCreate(&stopGPU);
  cudaEventRecord(startGPU,0);

// Copia dei dati dall'host al device

//  gpuErrchk(cudaMemcpy(in_device, in_host, size, cudaMemcpyHostToDevice));

  flipMatriceGPU<<<nBlocchi, nThreadPerBlocco, sizeSM>>>(in_device, out_device, N,M);

  cudaEventRecord(stopGPU,0);
  cudaEventSynchronize(stopGPU);
  cudaEventElapsedTime(&timeGPU,startGPU,stopGPU);
  printf("GPU time: %f \n",timeGPU/1000);
  cudaEventDestroy(startGPU);
  cudaEventDestroy(stopGPU);  

  gpuErrchk( cudaMemcpy(out_DeToHo, out_device, size, cudaMemcpyDeviceToHost) );

// cudaProfilerStop();

// Stampa Matrici

  if (flag==1){

    printf("Matrice di input:\n");
    stampaMatrice(in_host, M, N);

    printf("Matrice di output host CPU:\n");
    stampaMatrice(out_host, M, N);

    printf("Matrice di output device GPU:\n");
    stampaMatrice(out_DeToHo, M, N);

  }

  confrontaMatrici(out_host,out_DeToHo,M,N);
  printf("\n\n********************************************************************************\n\n");
  free(in_host);
  free(out_host);
  free(out_DeToHo);
  cudaFree(in_device);
  cudaFree(out_device);

  exit(0);
}


void inizializzaMatrice(int* matrice, int M, int N) {
  int i,j; for(i=0;i<M;i++)
  for(j=0;j<N;j++) matrice[i*N+j]=i*N+j;
}

void stampaMatrice(int*matrice, int M, int N) {
  int i,j; 
  for(i=0;i<M;i++) {
    for(j=0;j<N;j++)
      printf("%d\t", matrice[i*N+j]);
    printf("\n"); 
  }
}

void flipMatriceCPU(int *matrice, int row, int col){
  int i, j,tmp;
  for ( i = 0; i < row; i++ ) {
    for (  j = 0; j < col/2; j++ ) {
      tmp=matrice[col*i+j];
      matrice[col*i+j] = matrice[col*i+col-j-1];
      matrice[col*i+col-j-1] = tmp;

    }
  }
}

void confrontaMatrici(int* m1, int*m2, int M, int N) {
  int i, j; for(i=0;i<M;i++)
  for(j=0;j<N;j++) if(m1[i*N+j]!=m2[i*N+j]) {
    printf("Host and Device Outputs: ERROR!\n");
    return; 
  }
  if(i==M && j==N)
    printf("Host and Device Outputs OK.\n");
}

__global__ void flipMatriceGPU(int *in, int *out, int col, int row) {
  extern __shared__ int s_data[];
  int indexRow=threadIdx.x + blockIdx.x*blockDim.x; 
  int indexCol=threadIdx.y + blockIdx.y*blockDim.y; 
  int index=indexRow*col+indexCol;

  if(indexCol<col && indexRow<row){
    int index_data=blockDim.y-1-threadIdx.y+threadIdx.x*blockDim.y;
    s_data[index_data]=in[index];
    __syncthreads();

    int outOffset= blockDim.y*(gridDim.y-1-blockIdx.y);
    int outIndex= outOffset + threadIdx.y -(gridDim.y*blockDim.y - col) + indexRow*col;

    if(blockIdx.y==gridDim.y-1){
      outIndex+=gridDim.y*blockDim.y - col;
      out[outIndex]= s_data[(gridDim.y*blockDim.y - col)+(threadIdx.y+threadIdx.x*blockDim.y)];
    }

    else  
      out[outIndex]= s_data[threadIdx.y+threadIdx.x*blockDim.y];
  }
}

It's not possible. Please provide a *complete* sample code that you think is working, along with your compile command, the environmnet (e.g. CUDA version, windows or linux, etc.) as well as the output you are getting when you run it. If you're in doubt about this, I would suggest printing out the threadblock configuration variables (second config argument for the kernel call) immediately prior to the kernel call, and do [rigorous, proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) — Robert Crovella, Jul 14 '14 at 14:26
thanks Robert for the answer, i post the code above. thank you. — FlaGlo, Jul 14 '14 at 14:44
Also as a suggestion: use English names for the functions. Not everyone knows Italian :) — Marco A., Jul 14 '14 at 14:45
Please post the command line you are using to launch the executable *in the question*, not in a dropbox link. — Robert Crovella, Jul 14 '14 at 14:54

score 2 · Answer 1 · edited May 23 '17 at 11:57

Please post everything in the question, not in dropbox links.

When I compile your code and run it as follows:

cuda-memcheck ./t479 20000 20001 2048 1000 0

I get the following:

$ cuda-memcheck ./t479 20000 20001 2048 1000 0
========= CUDA-MEMCHECK


******************** RIFLESSIONE ORIZZONTALE DI UNA MATRICE ********************

Matrix Size = 20000 * 20001
Threads per block = 2048 * 1000
Grid size = 10 * 21


CPU time: 0.216176
blocks.x: 10, blocks.y: 21,  threads.x: 2048, threads.y: 1000, smem size: 8192000
========= Program hit error 9 on CUDA API call to cudaLaunch
GPU time: 0.000037
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so [0x2ef033]
=========     Host Frame:./t479 [0x3b6fe]
=========     Host Frame:./t479 [0x31e2]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
=========     Host Frame:./t479 [0x24e9]
=========
Host and Device Outputs: ERROR!


********************************************************************************

========= ERROR SUMMARY: 1 error
$

So the kernel launch is failing. Why doesn't your program report this directly?

Because you are not doing proper cuda error checking.

Add the following after your kernel call:

gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );

Note that not only are your threads per block out of range in your example, but your shared mem per block is out of range as well.

Note that your results validation is broken as well. I was able to reproduce your "passing" result with your original code by running the following sequence:

./t479 20000 20001 32 32 0
./t479 20000 20001 2048 1000 0

This is due to the fact that the second run of the program is testing the stale results left over from the first run (in device memory). To prove this, add the following line before your kernel launch:

cudaMemset(out_device, 0, size);

and you will no longer be able to get "passing" results this way.

Finally, as instructed previously, this line should not be used (and in fact is unnecessary for this program):

#include <thrust/system/cuda_error.h>

files in thrust/system may change from one CUDA version to the next, and your program should not include these directly.

Thanks Robert and sorry for the links I deleted them. I added in my code your advices and it works. Thank you so much. — FlaGlo, Jul 14 '14 at 19:29

MAXIMUM THREAD PER BLOCK

1 Answers1