How is it possible that we can use a number of threads per block bigger than the maximum number of threads per block supported by the Quadro K5000 (1024 threads per block) in our CUDA application, and it still works? Thanks.
CUDA version: 5.0; Device: Quadro K5000; OS: Linux
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sstream>
#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
/* Error-checking wrapper for CUDA runtime calls: prints the error string with
 * the call site and (by default) aborts the program on failure.
 * Usage: gpuErrchk( cudaMalloc(...) );  or  gpuErrchk( cudaPeekAtLastError() );
 */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
/* `file` is const char* because __FILE__ expands to a string literal;
 * binding a literal to a non-const char* is deprecated in C++. */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
/* START PROGRAM */
void inizializzaMatrice (int*,int,int);
void stampaMatrice (int*,int,int);
void flipMatriceCPU (int*,int,int);
void confrontaMatrici (int*,int*,int,int);
__global__ void flipMatriceGPU (int*,int*,int,int);
/* Horizontal matrix flip, computed on both CPU and GPU and compared.
 * Args: <M> <N> <threads.x> <threads.y> <print flag>; N must be odd.
 * Falls back to built-in defaults when arguments are missing/invalid. */
int main(int argn, char * argv[]){
  dim3 nBlocchi,nThreadPerBlocco;
  int M,N,flag;
  int *in_host, *out_host,*out_DeToHo;
  int *in_device, *out_device;
  int size,sizeSM;
  cudaEvent_t startCPU, stopCPU, startGPU, stopGPU;
  float timeCPU=0,timeGPU=0;
  printf("\n\n******************** RIFLESSIONE ORIZZONTALE DI UNA MATRICE ********************\n\n");
  // Parse command line (defaults when too few args or N is even).
  if(argn<6 || atoi(argv[2])%2==0 ){
    if(argn<6)
      printf("Numero di parametri insufficiente!!!\n");
    else if(atoi(argv[2])%2==0)
      printf("Errore nell'utilizzo di %s. Il numero di colonne <N> deve essere dispari\n",argv[0]);
    printf("Uso corretto: %s <M> <N> <NumThreadPerBlocco.x> <NumThreadPerBlocco.y> <flag per la Stampa>\n", argv[0]);
    printf("Uso dei valori di default ... ...\n\n\n"); 
    nThreadPerBlocco.x=2; 
    nThreadPerBlocco.y=3;
    M=5; N=5; flag=1;
  }
  else {
    M=atoi(argv[1]); 
    N=atoi(argv[2]);
    nThreadPerBlocco.x=atoi(argv[3]);
    nThreadPerBlocco.y=atoi(argv[4]);
    flag=atoi(argv[5]); 
  }
  // atoi() returns 0 on non-numeric input; a zero block dimension would
  // divide by zero in the grid-size computation below.
  if (nThreadPerBlocco.x == 0 || nThreadPerBlocco.y == 0) {
    fprintf(stderr, "Thread-block dimensions must be positive\n");
    exit(1);
  }
  // Ceiling division: enough blocks to cover the whole matrix.
  nBlocchi.x=M/nThreadPerBlocco.x+((M%nThreadPerBlocco.x)==0?0:1);
  nBlocchi.y=N/nThreadPerBlocco.y+((N%nThreadPerBlocco.y)==0?0:1);
  size=M*N*sizeof(int);
// Print kernel-execution info
  printf("Matrix Size = %d * %d\n",M, N);
  printf("Threads per block = %d * %d\n", nThreadPerBlocco.x,nThreadPerBlocco.y); 
  printf("Grid size = %d * %d\n\n\n",nBlocchi.x,nBlocchi.y);
// Host allocations
  in_host=(int*)malloc(size);
  out_host=(int*)malloc(size);
  out_DeToHo=(int*)malloc(size);
//cudaProfilerStart();
// Device allocations
  gpuErrchk( cudaMalloc((void**)&in_device,size) );
  gpuErrchk( cudaMalloc((void**)&out_device,size) );
// Initialize host data
  inizializzaMatrice(in_host,M,N);
  // Flip matrix on the CPU (reference result), timed with CUDA events
  memcpy(out_host,in_host,size);
  cudaEventCreate(&startCPU);
  cudaEventCreate(&stopCPU);
  cudaEventRecord(startCPU,0);
  flipMatriceCPU(out_host,M,N);
  cudaEventRecord(stopCPU,0);
  cudaEventSynchronize(stopCPU);
  cudaEventElapsedTime(&timeCPU,startCPU,stopCPU);
  printf("CPU time: %f\n",timeCPU/1000);
  cudaEventDestroy(startCPU);
  cudaEventDestroy(stopCPU);
  // Dynamic shared memory: one int per thread in the block.
  sizeSM=nThreadPerBlocco.y*nThreadPerBlocco.x*sizeof(int);
// Kernel invocation
  printf("blocks.x: %d, blocks.y: %d,  threads.x: %d, threads.y: %d, smem size: %d\n", nBlocchi.x, nBlocchi.y, nThreadPerBlocco.x, nThreadPerBlocco.y, sizeSM);
  gpuErrchk(cudaMemcpy(in_device, in_host, size, cudaMemcpyHostToDevice));  
  cudaEventCreate(&startGPU);
  cudaEventCreate(&stopGPU);
  cudaEventRecord(startGPU,0);
  flipMatriceGPU<<<nBlocchi, nThreadPerBlocco, sizeSM>>>(in_device, out_device, N,M);
  // BUG FIX (and the answer to the question above): a kernel launch returns
  // no error directly. With an invalid configuration — e.g. more threads per
  // block than the device supports — the launch fails SILENTLY and the kernel
  // never runs; the program "works" only because nobody checked. Query the
  // launch status explicitly so invalid configs are reported.
  gpuErrchk( cudaPeekAtLastError() );
  cudaEventRecord(stopGPU,0);
  cudaEventSynchronize(stopGPU);
  cudaEventElapsedTime(&timeGPU,startGPU,stopGPU);
  printf("GPU time: %f \n",timeGPU/1000);
  cudaEventDestroy(startGPU);
  cudaEventDestroy(stopGPU);  
  gpuErrchk( cudaMemcpy(out_DeToHo, out_device, size, cudaMemcpyDeviceToHost) );
// cudaProfilerStop();
// Optionally print the matrices
  if (flag==1){
    printf("Matrice di input:\n");
    stampaMatrice(in_host, M, N);
    printf("Matrice di output host CPU:\n");
    stampaMatrice(out_host, M, N);
    printf("Matrice di output device GPU:\n");
    stampaMatrice(out_DeToHo, M, N);
  }
  confrontaMatrici(out_host,out_DeToHo,M,N);
  printf("\n\n********************************************************************************\n\n");
  free(in_host);
  free(out_host);
  free(out_DeToHo);
  cudaFree(in_device);
  cudaFree(out_device);
  exit(0);
}
/* Fills an M x N row-major matrix with sequential values:
 * element (r, c) receives its linear index r*N + c. */
void inizializzaMatrice(int* matrice, int M, int N) {
  int r, c;
  for (r = 0; r < M; r++) {
    for (c = 0; c < N; c++) {
      matrice[r * N + c] = r * N + c;
    }
  }
}
/* Prints an M x N row-major matrix to stdout: tab-separated values,
 * one matrix row per output line. */
void stampaMatrice(int*matrice, int M, int N) {
  int r;
  for (r = 0; r < M; r++) {
    int c;
    for (c = 0; c < N; c++)
      printf("%d\t", matrice[r * N + c]);
    printf("\n");
  }
}
/* Reverses each row of a row-major matrix in place (horizontal mirror).
 * `row` x `col` elements; the middle column of an odd-width row stays put. */
void flipMatriceCPU(int *matrice, int row, int col){
  int r;
  for (r = 0; r < row; r++) {
    int *left = matrice + (size_t)r * col;     // first element of row r
    int *right = left + col - 1;               // last element of row r
    while (left < right) {
      int tmp = *left;
      *left = *right;
      *right = tmp;
      ++left;
      --right;
    }
  }
}
/* Compares two M x N row-major matrices element by element and prints
 * whether they match. Returns at the first mismatch.
 * BUG FIX: the original guarded the success message with `if(i==M && j==N)`,
 * which reads `j` uninitialized when M == 0 (the loops never run). The
 * restructured version prints "OK" unconditionally after a clean full pass,
 * which is equivalent for M,N > 0 and well-defined for empty matrices. */
void confrontaMatrici(int* m1, int*m2, int M, int N) {
  int i, j;
  for (i = 0; i < M; i++) {
    for (j = 0; j < N; j++) {
      if (m1[i*N+j] != m2[i*N+j]) {
        printf("Host and Device Outputs: ERROR!\n");
        return;
      }
    }
  }
  printf("Host and Device Outputs OK.\n");
}
/* Horizontal flip (mirror each row) of a `row` x `col` matrix, staged
 * through dynamic shared memory.
 * Launch layout: threadIdx.x/blockIdx.x index ROWS, threadIdx.y/blockIdx.y
 * index COLUMNS (note: unusual — x normally maps to the contiguous
 * dimension for coalescing; left as-is to preserve behavior).
 * Requires blockDim.x * blockDim.y * sizeof(int) bytes of dynamic shared
 * memory (third launch parameter). */
__global__ void flipMatriceGPU(int *in, int *out, int col, int row) {
  extern __shared__ int s_data[];
  int indexRow = threadIdx.x + blockIdx.x*blockDim.x;
  int indexCol = threadIdx.y + blockIdx.y*blockDim.y;
  int index = indexRow*col + indexCol;
  bool inBounds = (indexCol < col && indexRow < row);

  // Stage this block's tile into shared memory with its columns reversed.
  if (inBounds) {
    int index_data = blockDim.y-1-threadIdx.y + threadIdx.x*blockDim.y;
    s_data[index_data] = in[index];
  }
  // BUG FIX: __syncthreads() must be reached by EVERY thread of the block.
  // The original placed it inside the bounds check, so blocks containing
  // out-of-range threads executed the barrier with only part of the block —
  // undefined behavior (possible hang or reads of stale shared memory).
  __syncthreads();

  if (inBounds) {
    // Mirror the block's column position across the grid, then write the
    // reversed tile to global memory.
    int outOffset = blockDim.y*(gridDim.y-1-blockIdx.y);
    int outIndex = outOffset + threadIdx.y - (gridDim.y*blockDim.y - col) + indexRow*col;
    if (blockIdx.y == gridDim.y-1) {
      // Last (possibly partial) column-block: compensate for the amount by
      // which the grid overhangs the matrix width.
      outIndex += gridDim.y*blockDim.y - col;
      out[outIndex] = s_data[(gridDim.y*blockDim.y - col) + (threadIdx.y + threadIdx.x*blockDim.y)];
    }
    else
      out[outIndex] = s_data[threadIdx.y + threadIdx.x*blockDim.y];
  }
}
 
     
    