I am new to CUDA. I have tried to add two vectors and it works fine. Now I want to add two matrix. I want to add two matrix using two dimension threads(threadIdx.x and threadIdx.y). I have found this code in Internet, and I have made some changes to display the results. It compiles. But displays unexpected results, it looks like memory addresses. Please help me, Thank you in advance.
#include <stdio.h>
#include <stdlib.h>
#define N 5
#define BLOCK_DIM 10
__global__ void matrixAdd (int *a, int *b, int *c) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int index = col + row * N;
    if (col < N && row < N) {
        c[index] = a[index] + b[index];
    }
}
int main() {
    int a[N][N], b[N][N], c[N][N];
    int *dev_a, *dev_b, *dev_c;
    int size = N * N;
    for(int i=0; i<N; i++)
        for (int j=0; j<N; j++){
            a[i][j] = 1;
            b[i][j] = 2;
        }
    cudaMalloc((void**)&dev_a, size);
    cudaMalloc((void**)&dev_b, size);
    cudaMalloc((void**)&dev_c, size);
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));
    matrixAdd<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c);
    cudaDeviceSynchronize();
    for(int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            printf("%d\t", c[i][j] );
        }
        printf("\n");
    }
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
    cudaFree(dev_a); 
    cudaFree(dev_b); 
    cudaFree(dev_c);
}
and the output is
0   0   -780197879  32659   1   
0   452489360   32764   6303208 0   
4198328 0   452489376   32764   4198181 
0   2   0   4198557 0   
4196864 0   0   0   4198480 
my expected output is a 5x5 matrix of element 3. Please help me.
 
     
     
    