I have just begun playing around with my vectorising code. My matrix-vector multiplication code is not being autovectorised by gcc, I’d like to know why. This pastebin contains the output from -fopt-info-vec-missed.
I’m having trouble understanding what the output is telling me and seeing how it matches up to what I’ve written in code.
For instance, I see a number of lines saying not enough data-refs in basic block, I can’t find much detail online with a google search about this. I also see that there’s issues relating to memory alignment e.g. Unknown misalignment, naturally aligned and vector alignment may not be reachable. All of my memory allocation was for double types using malloc, which I believed was guaranteed to be aligned for that type.
Environment: compiling with gcc on WSL2
gcc -v: gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define N 4000 // Matrix size will be N x N
#define T 1
//gcc -fopenmp -g vectorisation.c -o main -O3 -march=native -fopt-info-vec-missed=missed.txt
void doParallelComputation(double *A, double *V, double *results, unsigned long matrixSize, int numThreads)
{
omp_set_num_threads(numThreads);
unsigned long i, j;
#pragma omp parallel for simd private(j)
for (i = 0; i < matrixSize; i++)
{
// double *AHead = &A[i * matrixSize];
// double tmp = 0;
for (j = 0; j < matrixSize; j++)
{
results[i] += A[i * matrixSize + j] * V[j];
// also tried tmp += A[i * matrixSize + j] * V[j];
}
// results[i] = tmp;
}
}
void genRandVector(double *S, unsigned long size)
{
srand(time(0));
unsigned long i;
for (i = 0; i < size; i++)
{
double n = rand() % 5;
S[i] = n;
}
}
void genRandMatrix(double *A, unsigned long size)
{
srand(time(0));
unsigned long i, j;
for (i = 0; i < size; i++)
{
for (j = 0; j < size; j++)
{
double n = rand() % 5;
A[i*size + j] = n;
}
}
}
int main(int argc, char *argv[])
{
double *V = (double *)malloc(N * sizeof(double)); // v in our A*v = parV computation
double *parV = (double *)malloc(N * sizeof(double)); // Parallel computed vector
double *A = (double *)malloc(N * N * sizeof(double)); // NxN Matrix to multiply by V
genRandVector(V, N);
doParallelComputation(A, V, parV, N, T);
free(parV);
free(A);
free(V);
return 0;
}