I have some simple SIMD code that multiplies two double arrays using Intel intrinsics (compiled with the /arch:AVX2 flag), and I compare it to a standard loop without SIMD:
#include <immintrin.h>   // AVX intrinsics (_mm256_*)
#include <chrono>
#include <iostream>
using namespace std;

int const N = 67108864;
__declspec(align(32)) double* ar1 = new double[N];
__declspec(align(32)) double* ar2 = new double[N];
__declspec(align(32)) double* ar3 = new double[N];
for (size_t i = 0; i < N; i++)
{
    ar1[i] = 3.0;
    ar2[i] = 2.0;
}
for (int s = 0; s < 20; s++)
{
    auto begin = chrono::steady_clock::now();
    for (size_t i = 0; i < N; i++)
    {
        ar3[i] = ar1[i] * ar2[i];
    }
    cout << "n: " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << endl;
    begin = chrono::steady_clock::now();
    for (size_t i = 0; i < N; i+=4)
    {
        __m256d in1 = _mm256_load_pd(&ar1[i]);  // aligned load of 4 doubles (requires 32-byte alignment)
        __m256d in2 = _mm256_load_pd(&ar2[i]);
        _mm256_store_pd(&ar3[i], _mm256_mul_pd(in1, in2));  // multiply and aligned store
    }
    cout << "s: " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << endl;
}
However, I can't get any performance improvement out of the SIMD version. I looked at the assembly, and my guess is that it's because of the vmovupd instruction, where it should be vmovapd. Why does the compiler use the unaligned packed move when I am using __declspec(align(32))?
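To double-check whether the arrays really are 32-byte aligned at runtime, I assume I could print the low bits of each pointer, something like this (a quick, untested diagnostic reusing the arrays from the snippet above; needs <cstdint>):

#include <cstdint>   // uintptr_t

// A 32-byte-aligned address has its five lowest bits equal to zero.
cout << "ar1 mod 32: " << (reinterpret_cast<uintptr_t>(ar1) & 31) << endl;
cout << "ar2 mod 32: " << (reinterpret_cast<uintptr_t>(ar2) & 31) << endl;
cout << "ar3 mod 32: " << (reinterpret_cast<uintptr_t>(ar3) & 31) << endl;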
Here is the disassembly of the entire SIMD loop:
61:         for (size_t i = 0; i < N; i+=4)
62:         {
63:             __m256d in1 = _mm256_load_pd(&ar1[i]);
64:             __m256d in2 = _mm256_load_pd(&ar2[i]);
00007FF62ED612A0  vmovupd     ymm1,ymmword ptr [rax]  
65: 
66:             _mm256_store_pd(&ar3[i], _mm256_mul_pd(in1, in2));
00007FF62ED612A4  vmulpd      ymm1,ymm1,ymmword ptr [rax+r13]  
00007FF62ED612AA  vmovupd     ymmword ptr [rdx+rax],ymm1  
00007FF62ED612AF  lea         rax,[rax+20h]  
00007FF62ED612B3  sub         rcx,1  
00007FF62ED612B7  vzeroupper  
00007FF62ED612BA  jne         main+2A0h (07FF62ED612A0h)  
67:         }
I am new to code vectorisation, so I would be happy to get pointers about any common mistakes I am making.
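One idea I want to try (not verified yet): if __declspec(align(32)) only aligns the pointer variable itself and not the memory returned by new, then I suppose I would need an explicitly aligned allocation instead. A sketch of what I mean, using MSVC's _aligned_malloc:

#include <malloc.h>   // _aligned_malloc / _aligned_free (MSVC-specific)

// Allocate the buffers with guaranteed 32-byte alignment.
double* ar1 = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));
double* ar2 = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));
double* ar3 = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));

// ... same benchmark loops as above ...

_aligned_free(ar3);
_aligned_free(ar2);
_aligned_free(ar1);

Would that make the compiler emit vmovapd, and is the unaligned move actually what is costing me the performance here?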