I wrote some code to experiment with comparing regular C++ code against SSE intrinsics. What I noticed is that both sections of the code shown below take a similar amount of time to run, usually differing by only 5-10%. Naïvely, I'd expect a more noticeable difference.
The code is posted below. After it, you will find some interesting sections of the disassembled code, which only added to my confusion.
// main.cpp
#include <chrono>
#include <vector>
#include <complex>
#include <iostream>
#include <immintrin.h>
// A complex double held in one 128-bit SSE register: lane 0 is the real
// part and lane 1 the imaginary part (this is how main() unpacks the result).
using cdouble_sse = __m128d;
// The standard-library complex type used by the scalar reference version.
using cdouble = std::complex<double>;
// SSE product kernel; defined in prods.cpp.
cdouble_sse prod_sse(cdouble_sse val, cdouble_sse other);
// Scalar product kernel; defined in prods.cpp.
cdouble prod(cdouble a, cdouble b);
int main() {
    int constexpr N = 10000;
    int constexpr N2 = 1000000;
    cdouble *v1 = (cdouble*)_mm_malloc(N*sizeof(cdouble), alignof(cdouble_sse));
    cdouble *v2 = (cdouble*)_mm_malloc(N*sizeof(cdouble), alignof(cdouble_sse));
    for (int i = 0; i < N; ++i) {
        v1[i] = cdouble{drand48(), drand48()};
        v2[i] = cdouble{drand48(), drand48()};
    }
    {
        double const div = 1.0 / N;
        cdouble out = 0.0;
        auto start = std::chrono::steady_clock::now();
        for (int t = 0; t < N2; ++t) {
            cdouble out_tmp = 0.0;
            cdouble *p1 = v1;
            cdouble *p2 = v2;
            cdouble *end = p1 + N;
            for (; p1 != end; ++p1, ++p2)
                out_tmp += prod(*p1, *p2);
            out += out_tmp * div;
        }
        auto end   = std::chrono::steady_clock::now();
        std::cout << out.real() << '\t' << out.imag() << '\n';
        std::chrono::duration<double> elapsed = end - start;
        std::cout << elapsed.count();
        std::cout << '\n';
    }
    {
        cdouble_sse div{1.0 / N, 1.0 / N};
        cdouble_sse out{0.0, 0.0};
        auto start = std::chrono::steady_clock::now();
        for (int t = 0; t < N2; ++t) {
            cdouble_sse out_tmp{0.0, 0.0};
            cdouble_sse *p1 = reinterpret_cast<cdouble_sse*>(v1);
            cdouble_sse *p2 = reinterpret_cast<cdouble_sse*>(v2);
            cdouble_sse *end = p1 + N;
            for (; p1 != end; ++p1, ++p2) {
                cdouble_sse tmp = prod_sse(*p1, *p2);
                out_tmp = _mm_add_pd(out_tmp, tmp);
            }
            out = _mm_add_pd(out, _mm_mul_pd(out_tmp, div));
        }
        auto end   = std::chrono::steady_clock::now();
        cdouble res = cdouble{out[0], out[1]};
        std::cout << res.real() << '\t' << res.imag() << '\n';
        std::chrono::duration<double> elapsed = end - start;
        std::cout << elapsed.count();
        std::cout << '\n';
    }
    _mm_free(v1);
    _mm_free(v2);
    return 0;
}
// prods.cpp
// Multiplies two complex numbers packed as {real, imag} in SSE registers,
// conjugating the first operand: the result is conj(val) * other.
cdouble_sse prod_sse(cdouble_sse val, cdouble_sse other) {
    // Broadcast each half of val across both lanes.
    cdouble_sse const re_dup = _mm_shuffle_pd(val, val, 0);  // {val.re, val.re}
    cdouble_sse const im_dup = _mm_shuffle_pd(val, val, 3);  // {val.im, val.im}

    // Swap the lanes of other: {other.im, other.re}.
    cdouble_sse const other_swapped = _mm_shuffle_pd(other, other, 1);

    // {val.im * other.im, val.im * other.re}
    cdouble_sse const cross = _mm_mul_pd(im_dup, other_swapped);

    // fmsubadd adds `cross` in lane 0 and subtracts it in lane 1:
    //   {val.re*other.re + val.im*other.im, val.re*other.im - val.im*other.re}
    return _mm_fmsubadd_pd(re_dup, other, cross);
}
// Scalar reference kernel: returns conj(a) * b using std::complex arithmetic.
cdouble prod(cdouble a, cdouble b) {
    cdouble const a_conjugate = std::conj(a);
    return a_conjugate * b;
}
# Flags shared by every compile and link step.
ARGS=--std=c++20 -march=native -O2

# `all` does not name a file, so declare it phony.
.PHONY: all
all: main

# NOTE: every recipe line below starts with a TAB character, as make requires.

# The product kernels live in their own translation unit.
prods.o: prods.cpp Makefile
	g++ $(ARGS) prods.cpp -c

# Compile main.cpp, link it with prods.o, then dump the demangled disassembly
# of the final binary to dump_main for inspection.
main: main.cpp prods.o Makefile
	g++ $(ARGS) main.cpp -c
	g++ $(ARGS) main.o prods.o -o main
	objdump -Cd main > dump_main
Here are the sections from the disassembled code that call the prod and prod_sse functions:
    11c0:   c4 c1 7b 10 07          vmovsd (%r15),%xmm0
    11c5:   c4 c1 7b 10 4f 08       vmovsd 0x8(%r15),%xmm1
    11cb:   c5 fb 10 13             vmovsd (%rbx),%xmm2
    11cf:   c5 fb 10 5b 08          vmovsd 0x8(%rbx),%xmm3
    11d4:   48 83 c3 10             add    $0x10,%rbx
    11d8:   e8 63 03 00 00          call   1540 <prod(std::complex<double>, std::complex<double>)>
    11dd:   c4 e1 f9 6e ed          vmovq  %rbp,%xmm5
    11e2:   c5 d3 58 e0             vaddsd %xmm0,%xmm5,%xmm4
    11e6:   c5 f3 58 34 24          vaddsd (%rsp),%xmm1,%xmm6
    11eb:   49 83 c7 10             add    $0x10,%r15
    11ef:   c4 e1 f9 7e e5          vmovq  %xmm4,%rbp
    11f4:   c5 fb 11 34 24          vmovsd %xmm6,(%rsp)
    11f9:   4c 39 f3                cmp    %r14,%rbx
    11fc:   75 c2                   jne    11c0 <main+0xf0>
...
    12d8:   c4 c1 79 28 0c 1c       vmovapd (%r12,%rbx,1),%xmm1
    12de:   c4 c1 79 28 44 1d 00    vmovapd 0x0(%r13,%rbx,1),%xmm0
    12e5:   48 83 c3 10             add    $0x10,%rbx
    12e9:   e8 32 02 00 00          call   1520 <prod_sse(double __vector(2), double __vector(2))>
    12ee:   c5 f9 58 14 24          vaddpd (%rsp),%xmm0,%xmm2
    12f3:   c5 f9 29 14 24          vmovapd %xmm2,(%rsp)
    12f8:   48 81 fb 00 71 02 00    cmp    $0x27100,%rbx
    12ff:   75 d7                   jne    12d8 <main+0x208>
The non-SSE loop uses twice as many loads as the SSE loop (four scalar `vmovsd` versus two packed `vmovapd`), which makes sense, since it loads the real and imaginary parts separately.
You can also see below the disassembled code for both product functions:
0000000000001520 <prod_sse(double __vector(2), double __vector(2))>:
    1520:   c4 e3 79 05 d0 00       vpermilpd $0x0,%xmm0,%xmm2
    1526:   c4 e3 79 05 d9 01       vpermilpd $0x1,%xmm1,%xmm3
    152c:   c4 e3 79 05 c0 03       vpermilpd $0x3,%xmm0,%xmm0
    1532:   c5 f9 59 c3             vmulpd %xmm3,%xmm0,%xmm0
    1536:   c4 e2 e9 b7 c1          vfmsubadd231pd %xmm1,%xmm2,%xmm0
    153b:   c3                      ret    
    153c:   0f 1f 40 00             nopl   0x0(%rax)
0000000000001540 <prod(std::complex<double>, std::complex<double>)>:
    1540:   c5 e3 10 eb             vmovsd %xmm3,%xmm3,%xmm5
    1544:   c5 f1 57 1d d4 0a 00    vxorpd 0xad4(%rip),%xmm1,%xmm3        # 2020 <_IO_stdin_used+0x20>
    154b:   00 
    154c:   c5 eb 10 e2             vmovsd %xmm2,%xmm2,%xmm4
    1550:   c5 fb 10 f0             vmovsd %xmm0,%xmm0,%xmm6
    1554:   c5 fb 59 d5             vmulsd %xmm5,%xmm0,%xmm2
    1558:   c5 e3 59 c5             vmulsd %xmm5,%xmm3,%xmm0
    155c:   c4 e2 e9 9d cc          vfnmadd132sd %xmm4,%xmm2,%xmm1
    1561:   c4 e2 c9 bb c4          vfmsub231sd %xmm4,%xmm6,%xmm0
    1566:   c5 f9 2e c1             vucomisd %xmm1,%xmm0
    156a:   7a 01                   jp     156d <prod(std::complex<double>, std::complex<double>)+0x2d>
    156c:   c3                      ret    
    156d:   50                      push   %rax
    156e:   c5 d3 10 cd             vmovsd %xmm5,%xmm5,%xmm1
    1572:   c5 db 10 c4             vmovsd %xmm4,%xmm4,%xmm0
    1576:   c5 cb 10 d6             vmovsd %xmm6,%xmm6,%xmm2
    157a:   e8 b1 fa ff ff          call   1030 <__muldc3@plt>
    157f:   5a                      pop    %rdx
    1580:   c3                      ret
The code for prod performs more operations than that of prod_sse, since it deals with real and imaginary parts separately.
Can anyone explain to me why the SSE version is only 5-10% faster than the "regular" version?
Edit: even if I compile with the -flto flag, the time difference between the SSE and non-SSE versions is still around 5-10%.
