I've written a CUDA C program to parallelize matrix multiplication. I've stored my matrices as 1D arrays in row-major form. I can't seem to find anywhere why my program shouldn't be working, be it issues with pointers or the kernel code. Help will be appreciated, thanks! (I know I have already asked this question - however, I have taken a step further in parallelization this time and I have written my code such that each thread calculates a single element of the result matrix instead of a single row - also, I have checked that the nvidia_uvm module responsible for CUDA device memory is loaded on my OS currently right now - so that isn't the issue, the issue must lie with my source code)
Code:
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <stdlib.h>
__global__ void multiplyMatricesKernel(float* d_x, float* d_y, float* d_z, int m, int n, int p)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.x + threadIdx.y;
    if(i < p && j < m)
    {
        for(int k = 0; k < n; ++k)
        {
            d_z[j * p + i] += d_x[j * n + k] * d_y[k * p + i];
        }
    }
}
void multiplyMatrices(float* x, float* y, float* z, int m, int n, int p)
{
    dim3 dimOfGrid(ceil(m * p) / 64.0);
    dim3 dimOfBlock(64, 64);
    size_t elements_x = m * n * sizeof(float);
    size_t elements_y = n * p * sizeof(float);
    size_t elements_z = m * p * sizeof(float);
    float* d_x;
    float* d_y;
    float* d_z;
    cudaMalloc((void**) &d_x, elements_x);
    cudaMalloc((void**) &d_y, elements_y);
    cudaMalloc((void**) &d_z, elements_z);
    cudaMemcpy(d_x, x, elements_x, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_y, y, elements_y, cudaMemcpyHostToDevice);
    multiplyMatricesKernel<<<dimOfGrid, dimOfBlock>>>(d_x, d_y, d_z, m, n, p);
    cudaMemcpy(z, d_z, elements_z, cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
}
int main()
{
    srand(time(NULL));
    size_t m = rand() % 8 + 1;
    size_t n = rand() % 8 + 1;
    size_t p = rand() % 8 + 1;
    float x[m * n] = {0};
    float y[n * p] = {0};
    float z[m * p] = {0};
    printf("X =\n[");
    for(int i = 0; i < sizeof(x) / sizeof(float); ++i)
    {
        x[i] = rand() % 129 - 64;
        printf("%.1f ", x[i]);
        if((i + 1) % n == 0 && i != (sizeof(x) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(x) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    
    printf("Y = \n[");
    for(int i = 0; i < sizeof(y) / sizeof(float); ++i)
    {
        y[i] = rand() % 129 - 64;
        printf("%.1f ", y[i]);
        if((i + 1) % p == 0 && i != (sizeof(y) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(y) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    multiplyMatrices(x, y, z, m, n, p);
    printf("Z = \n[");
    for(int i = 0; i < sizeof(z) / sizeof(float); ++i)
    {   
        printf("%.1f ", z[i]);
        if((i + 1) % p == 0 && i != (sizeof(z) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(z) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    return 0;
}
