I was learning using shared memory to optimize cuda code. I followed most of the implementations from Nvidia materials. But I found that my device code is never executed. Anyone could help me figure out why? Did I miss something? Thanks.
#include <stdio.h>
#include <cuda_runtime.h>
#include <chrono>
#define BLOCKSIZE 16
typedef struct {
    int height;
    int width;
    int stride;
    float *element;
} Matrix;
void initData(float *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (float)(rand()&0xffff)/1000.0f;
    }
}
__device__ float getElement(Matrix a, int row, int col)
{
    return a.element[row*a.stride+col];
}
__device__ Matrix getSubM(Matrix a, int row, int col)
{
    Matrix res;
    res.height = BLOCKSIZE;
    res.width = BLOCKSIZE;
    res.stride = a.width;
    res.element = &a.element[row*BLOCKSIZE*a.stride+col*BLOCKSIZE];
    return res;
}
__device__ void setElement(Matrix a, int row, int col, float val)
{
    a.element[row*a.stride+col] = val;
}
__global__ void shmMM(Matrix a, Matrix b, Matrix c)
{
    int blockRow = blockDim.y;
    int blockCol = blockDim.x;
    Matrix Csub = getSubM(c, blockRow, blockCol);
    int row = threadIdx.y;
    int col = threadIdx.x;
    float tmp = 0;
    for (int i=0; i < a.width/BLOCKSIZE; i++)
    {   
        Matrix a_sub = getSubM(a, blockRow, i);
        Matrix b_sub = getSubM(b, i, blockCol);
        __shared__ float A[BLOCKSIZE][BLOCKSIZE];
        __shared__ float B[BLOCKSIZE][BLOCKSIZE];
        A[row][col] = getElement(a, row, col);
        B[row][col] = getElement(b, row, col); 
        __syncthreads();
        for (int e = 0; e < BLOCKSIZE; e++)
        {
            tmp += A[row][e]*B[e][col];
        }
        __syncthreads();
    }
    //printf("debug: %f.\n", tmp);
    setElement(Csub, row, col, tmp);
}
int main()
{
    Matrix a, b, c;
    int size = 1<<12;
    a.height = a.width = size;
    b.height = b.width = size;
    c.height = c.width = size;
    a.stride = a.width;
    b.stride = b.width;
    c.stride = c.width;
    float *a_h, *b_h, *c_h;
    cudaMallocHost((float**)&a_h, a.height*a.width*sizeof(float));
    cudaMallocHost((float**)&b_h, b.height*b.width*sizeof(float));
    initData(a_h, a.height*a.width);
    initData(b_h, b.height*b.width);
    c_h = (float*)malloc(c.height*c.width*sizeof(float));
    float *a_d, *b_d, *c_d;
    cudaMalloc((float**)&a.element, a.height*a.width*sizeof(float));
    cudaMalloc((float**)&b.element, b.height*b.width*sizeof(float));
    cudaMalloc((float**)&c.element, c.height*c.width*sizeof(float));
    cudaMemcpy(a.element, a_h, a.height*a.width*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(b.element, b_h, b.height*b.width*sizeof(float), cudaMemcpyHostToDevice);
    dim3 block(BLOCKSIZE, BLOCKSIZE);
    dim3 grid((b.width-1)/block.x+1, (a.height-1)/block.y+1);
    //naiveMM<<<block, grid>>>(a, b, c);
    shmMM<<<block, grid>>>(a, b, c);
    cudaMemcpy(c_h, c.element, c.height*c.width*sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaFree(a_h);
    cudaFree(b_h);
    free(c_h);
    cudaFree(a.element);
    cudaFree(b.element);
    cudaFree(c.element);
    return 0;
}
I couldn't figure it out since there is no reported compiling error and runtime error.
 
    