I've written a CUDA C program to parallelize matrix multiplication by having each thread calculate a row of the result matrix. I've stored my matrices as 1D arrays in row-major form. I can't seem to find anywhere why my program shouldn't be working, be it issues with pointers or the kernel code. Help will be appreciated, thanks!
Code:
#include <cuda.h>
#include <time.h>
#include <stdlib.h>
__global__ void multiplyMatricesKernel(float* d_x, float* d_y, float* d_z, int m, int n, int p)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if(i < m)
    {
        for(int j = 0; j < p; ++j)
        {
            d_z[i * p + j] = 0;
            for(int k = 0; k < m; ++k)
            {
                d_z[i * p + j] += d_x[i * n + k] * d_y[k * p + j];
            }
        }
    }
}
void multiplyMatrices(float* x, float* y, float* z, int m, int n, int p)
{
    int elements_x = m * n * sizeof(float);
    int elements_y = n * p * sizeof(float);
    int elements_z = m * p * sizeof(float);
    float* d_x;
    float* d_y;
    float* d_z;
    cudaMalloc((void**) &d_x, elements_x);
    cudaMalloc((void**) &d_y, elements_y);
    cudaMalloc((void**) &d_z, elements_z);
    cudaMemcpy(d_x, x, elements_x, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_y, y, elements_y, cudaMemcpyHostToDevice);
    multiplyMatricesKernel<<<ceil(m / 64.0), 64>>>(d_x, d_y, d_z, m, n, p);
    cudaMemcpy(z, d_z, elements_z, cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
}
int main()
{
    srand(time(NULL));
    int m = rand() % 8 + 1;
    int n = rand() % 8 + 1;
    int p = rand() % 8 + 1;
    float x[m * n] = {0};
    float y[n * p] = {0};
    float z[m * p] = {0};
    printf("X =\n[");
    for(int i = 0; i < sizeof(x) / sizeof(float); ++i)
    {
        x[i] = rand() % 129 - 64;
        printf("%.1f ", x[i]);
        if((i + 1) % n == 0 && i != (sizeof(x) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(x) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    
    printf("Y = \n[");
    for(int i = 0; i < sizeof(y) / sizeof(float); ++i)
    {
        y[i] = rand() % 129 - 64;
        printf("%.1f ", y[i]);
        if((i + 1) % p == 0 && i != (sizeof(y) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(y) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    multiplyMatrices(x, y, z, m, n, p);
    printf("Z = \n[");
    for(int i = 0; i < sizeof(z) / sizeof(float); ++i)
    {   
        printf("%.1f ", z[i]);
        if((i + 1) % p == 0 && i != (sizeof(z) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(z) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    return 0;
}
 
     
    