I've tried to implement Big matrix multiplication using Shared Memory with Tiled Algorithm. And When I execute this code, I get correct results. 
But When I try to calculate more or equal than 4000 x 4000 sized matrix, It returns a matrix only filled with 0.
I couldn't find what's wrong with this code..
the code below is Kernel function for matrix multiplication.
typedef float element;
#define TILE_WIDTH 32
#define WIDTH 4010
#define GRID_WIDTH ( (WIDTH) / TILE_WIDTH) + 1
__global__ void MatrixMulKernel(element* d_P, element* d_N, element* d_M, int Width) {
        __shared__ element ds_M[TILE_WIDTH][TILE_WIDTH];
        __shared__ element ds_N[TILE_WIDTH][TILE_WIDTH];
        int bx = blockIdx.x; int by = blockIdx.y;
        int tx = threadIdx.x; int ty = threadIdx.y;
        int Row = by*TILE_WIDTH + ty;
        int Col = bx*TILE_WIDTH + tx;
        element pValue = 0;
        int length = (Width + TILE_WIDTH - 1)/TILE_WIDTH;
        for(int m = 0; m < length; m++) {
                if( m*TILE_WIDTH + tx < WIDTH && Row < WIDTH) ds_N[ty][tx] = d_N[Row*Width + m*TILE_WIDTH+tx];
                else ds_N[ty][tx] = 0.0;
                if( m*TILE_WIDTH + ty < WIDTH && Col < WIDTH) ds_M[ty][tx] = d_M[Col + (m*TILE_WIDTH+ty)*Width];
                else ds_M[ty][tx] = 0.0;
                __syncthreads();
                for(int k = 0; k < TILE_WIDTH; ++k) {
                        pValue += ds_N[ty][k]*ds_M[k][tx];
                }
                __syncthreads();
        }
        if( Row < Width && Col < Width)
                d_P[Row*Width+Col] = pValue;
}
the code below is full code.
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <chrono>
using namespace std;
using namespace chrono;
typedef float element;
#define TILE_WIDTH 32
#define WIDTH 4010
#define GRID_WIDTH ( (WIDTH) / TILE_WIDTH) + 1
__global__ void MatrixMulKernel(element* d_P, element* d_N, element* d_M, int Width) {
        __shared__ element ds_M[TILE_WIDTH][TILE_WIDTH];
        __shared__ element ds_N[TILE_WIDTH][TILE_WIDTH];
        int bx = blockIdx.x; int by = blockIdx.y;
        int tx = threadIdx.x; int ty = threadIdx.y;
        int Row = by*TILE_WIDTH + ty;
        int Col = bx*TILE_WIDTH + tx;
        element pValue = 0;
        int length = (Width + TILE_WIDTH - 1)/TILE_WIDTH;
        for(int m = 0; m < length; m++) {
                if( m*TILE_WIDTH + tx < WIDTH && R
ow < WIDTH) ds_N[ty][tx] = d_N[Row*Width + m*TILE_WIDTH+tx];
                else ds_N[ty][tx] = 0.0;
                if( m*TILE_WIDTH + ty < WIDTH && Col < WIDTH) ds_M[ty][tx] = d_M[Col + (m*TILE_WIDTH+ty)*Width];
                else ds_M[ty][tx] = 0.0;
                __syncthreads();
                for(int k = 0; k < TILE_WIDTH; ++k) {
                        pValue += ds_N[ty][k]*ds_M[k][tx];
                }
                __syncthreads();
        }
        if( Row < Width && Col < Width)
                d_P[Row*Width+Col] = pValue;
}
void matmul(float* c, float* a, float* b, int width) {
        // c[y][x] = sum_k a[y][k] * b[k][x]
        // c[y * WIDTH + x] = sum_k a[y*WIDTH + k] * b[k*WIDTH + x]
        for (register int y = 0; y < width; ++y) {
                for (register int x = 0; x < width; ++x) {
                        register float sum = 0.0F;
                        for (register int k = 0; k < width; ++k) {
                                sum += a[y * width + k] * b[k * width + x];
                        }
     c[y * width + x] = sum;
                }
        }
}
void genData(element* ptr, unsigned int size) {
        while(size--) {
                ptr[size] = (rand() % 10) + 1;
        }
}
int main() {
        element *pA = NULL;
        element *pB = NULL;
        element *pC = NULL;
        system_clock::time_point start;
        system_clock::time_point  end;
        srand(time(NULL));
        start = system_clock::now();
        // malloc meroies on the host-side
        pA = (element*)malloc(WIDTH * WIDTH  * sizeof(element));
        pB = (element*)malloc(WIDTH * WIDTH  * sizeof(element));
        pC = (element*)malloc(WIDTH * WIDTH  * sizeof(element));
        genData(pA, WIDTH*WIDTH);
        genData(pB, WIDTH*WIDTH);
//device side
        element *pAdev = NULL;
        element *pBdev = NULL;
        element *pCdev = NULL;
        cudaMalloc((void**)&pAdev, WIDTH*WIDTH*sizeof(element));
        cudaMalloc((void**)&pBdev, WIDTH*WIDTH*sizeof(element));
        cudaMalloc((void**)&pCdev, WIDTH*WIDTH*sizeof(element));
        cudaMemcpy(pAdev, pA, WIDTH * WIDTH * sizeof(element), cudaMemcpyHostToDevice);
        cudaMemcpy(pBdev, pB, WIDTH * WIDTH * sizeof(element), cudaMemcpyHostToDevice);
        dim3 dimGrid(GRID_WIDTH, GRID_WIDTH, 1);
        dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(pCdev, pAdev, pBdev, WIDTH);
        cudaMemcpy(pC, pCdev, WIDTH * WIDTH * sizeof(element), cudaMemcpyDeviceToHost);
        element *test;
        test = (element*)malloc(WIDTH * WIDTH * sizeof(element));
     matmul(test, pA, pB, WIDTH);
        bool flag = true;
        for(int i = 0; i<WIDTH; i++) {
                for(int j = 0; j<WIDTH; j++) {
                        if(pC[i*WIDTH+j] != test[i*WIDTH+j]) {
                                printf("%f, %f\n", pC[i*WIDTH+j], test[i*WIDTH+j]);
                                flag = false;
                        }
                 }
        }
        if(flag) printf("TRUE\n");
        else printf("FALSE\n");
        end = system_clock::now();
        cout << "Elapsed time : " << duration_cast<nanoseconds>(end - start).count() << "ns" << '\n';
        free(pA);
        free(pB);
        free(pC);
        cudaFree(pAdev);
        cudaFree(pBdev);
        cudaFree(pCdev);
        return 0;
}
I am waiting for your answer! Thank you!
