Difference in performance between MSVC and GCC for highly optimized matrix multiplication code

Question

I'm seeing a big difference in performance between code compiled in MSVC (on Windows) and GCC (on Linux) for an Ivy Bridge system. The code does dense matrix multiplication. I'm getting 70% of the peak flops with GCC and only 50% with MSVC. I think I may have isolated the difference to how they both convert the following three intrinsics.

__m256 breg0 = _mm256_loadu_ps(&b[8*i])
_mm256_add_ps(_mm256_mul_ps(areg0,breg0), tmp0)

GCC does this

vmovups ymm9, YMMWORD PTR [rax-256]
vmulps  ymm9, ymm0, ymm9
vaddps  ymm8, ymm8, ymm9

MSVC does this

vmulps   ymm1, ymm2, YMMWORD PTR [rax-256]
vaddps   ymm3, ymm1, ymm3

Could somebody please explain to me if and why these two solutions could give such a big difference in performance?

Despite MSVC using one less instruction it ties the load to the mult and maybe that makes it more dependent (maybe the load can't be done out of order)? I mean Ivy Bridge can do one AVX load, one AVX mult, and one AVX add in one clock cycle but this requires each operation to be independent.

Maybe the problem lies elsewhere? You can see the full assembly code for GCC and MSVC for the innermost loop below. You can see the C++ code for the loop here Loop unrolling to achieve maximum throughput with Ivy Bridge and Haswell

g++ -S -masm=intel matrix.cpp -O3 -mavx -fopenmp

.L4:
    vbroadcastss    ymm0, DWORD PTR [rcx+rdx*4]
    add rdx, 1
    add rax, 256
    vmovups ymm9, YMMWORD PTR [rax-256]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm8, ymm8, ymm9
    vmovups ymm9, YMMWORD PTR [rax-224]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm7, ymm7, ymm9
    vmovups ymm9, YMMWORD PTR [rax-192]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm6, ymm6, ymm9
    vmovups ymm9, YMMWORD PTR [rax-160]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm5, ymm5, ymm9
    vmovups ymm9, YMMWORD PTR [rax-128]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm4, ymm4, ymm9
    vmovups ymm9, YMMWORD PTR [rax-96]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm3, ymm3, ymm9
    vmovups ymm9, YMMWORD PTR [rax-64]
    vmulps  ymm9, ymm0, ymm9
    vaddps  ymm2, ymm2, ymm9
    vmovups ymm9, YMMWORD PTR [rax-32]
    cmp esi, edx
    vmulps  ymm0, ymm0, ymm9
    vaddps  ymm1, ymm1, ymm0
    jg  .L4

MSVC /FAc /O2 /openmp /arch:AVX ...

vbroadcastss ymm2, DWORD PTR [r10]    
lea  rax, QWORD PTR [rax+256]
lea  r10, QWORD PTR [r10+4] 
vmulps   ymm1, ymm2, YMMWORD PTR [rax-320]
vaddps   ymm3, ymm1, ymm3    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-288]
vaddps   ymm4, ymm1, ymm4    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-256]
vaddps   ymm5, ymm1, ymm5    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-224]
vaddps   ymm6, ymm1, ymm6    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-192]
vaddps   ymm7, ymm1, ymm7    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-160]
vaddps   ymm8, ymm1, ymm8    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-128]
vaddps   ymm9, ymm1, ymm9    
vmulps   ymm1, ymm2, YMMWORD PTR [rax-96]
vaddps   ymm10, ymm1, ymm10    
dec  rdx
jne  SHORT $LL3@AddDot4x4_

EDIT:

I benchmark the code by calculating the total floating point operations as 2.0*n^3 where n is the width of the square matrix and dividing by the time measured with omp_get_wtime(). I repeat the loop several times. In the output below I repeated it 100 times.

Output from MSVC2012 on an Intel Xeon E5 1620 (Ivy Bridge) turbo for all cores is 3.7 GHz

maximum GFLOPS = 236.8 = (8-wide SIMD) * (1 AVX mult + 1 AVX add) * (4 cores) * 3.7 GHz

n   64,     0.02 ms, GFLOPs   0.001, GFLOPs/s   23.88, error 0.000e+000, efficiency/core   40.34%, efficiency  10.08%, mem 0.05 MB
n  128,     0.05 ms, GFLOPs   0.004, GFLOPs/s   84.54, error 0.000e+000, efficiency/core  142.81%, efficiency  35.70%, mem 0.19 MB
n  192,     0.17 ms, GFLOPs   0.014, GFLOPs/s   85.45, error 0.000e+000, efficiency/core  144.34%, efficiency  36.09%, mem 0.42 MB
n  256,     0.29 ms, GFLOPs   0.034, GFLOPs/s  114.48, error 0.000e+000, efficiency/core  193.37%, efficiency  48.34%, mem 0.75 MB
n  320,     0.59 ms, GFLOPs   0.066, GFLOPs/s  110.50, error 0.000e+000, efficiency/core  186.66%, efficiency  46.67%, mem 1.17 MB
n  384,     1.39 ms, GFLOPs   0.113, GFLOPs/s   81.39, error 0.000e+000, efficiency/core  137.48%, efficiency  34.37%, mem 1.69 MB
n  448,     3.27 ms, GFLOPs   0.180, GFLOPs/s   55.01, error 0.000e+000, efficiency/core   92.92%, efficiency  23.23%, mem 2.30 MB
n  512,     3.60 ms, GFLOPs   0.268, GFLOPs/s   74.63, error 0.000e+000, efficiency/core  126.07%, efficiency  31.52%, mem 3.00 MB
n  576,     3.93 ms, GFLOPs   0.382, GFLOPs/s   97.24, error 0.000e+000, efficiency/core  164.26%, efficiency  41.07%, mem 3.80 MB
n  640,     5.21 ms, GFLOPs   0.524, GFLOPs/s  100.60, error 0.000e+000, efficiency/core  169.93%, efficiency  42.48%, mem 4.69 MB
n  704,     6.73 ms, GFLOPs   0.698, GFLOPs/s  103.63, error 0.000e+000, efficiency/core  175.04%, efficiency  43.76%, mem 5.67 MB
n  768,     8.55 ms, GFLOPs   0.906, GFLOPs/s  105.95, error 0.000e+000, efficiency/core  178.98%, efficiency  44.74%, mem 6.75 MB
n  832,    10.89 ms, GFLOPs   1.152, GFLOPs/s  105.76, error 0.000e+000, efficiency/core  178.65%, efficiency  44.66%, mem 7.92 MB
n  896,    13.26 ms, GFLOPs   1.439, GFLOPs/s  108.48, error 0.000e+000, efficiency/core  183.25%, efficiency  45.81%, mem 9.19 MB
n  960,    16.36 ms, GFLOPs   1.769, GFLOPs/s  108.16, error 0.000e+000, efficiency/core  182.70%, efficiency  45.67%, mem 10.55 MB
n 1024,    17.74 ms, GFLOPs   2.147, GFLOPs/s  121.05, error 0.000e+000, efficiency/core  204.47%, efficiency  51.12%, mem 12.00 MB

@selbie, sorry, I don't have ICC installed anymore. But it actually had the worst performance last I used it. I think it was 45% of the peak flops. — Z boson, Jan 15 '14 at 10:07
@MitchWheat, I'm using MSVC2012 express. I don't have access to GCC right now but I think it was GCC 4.8. I can check later today. — Z boson, Jan 15 '14 at 10:08
@PlasmaHH, I added some text at the end on how I benchmark my code. — Z boson, Jan 15 '14 at 10:45
@Zboson, while interesting, I think the question needs to be rephrased, otherwise it's not really fit for SO. You are showing that two compilers have different behaviors, then ask "what do you say?" — Shahbaz, Jan 15 '14 at 11:47
@Shahbaz, okay, can you suggest a different wording? I think the problem is in the two different ways of interpreting `_mm256_add_ps(_mm256_mul_ps))` and somebody knows why. I suspect that the MSVC makes the load dependent so it can't be done out of order. But that's just a guess. Somebody knows this much better than me and that's why the question is on SO. — Z boson, Jan 15 '14 at 12:11
@Zboson: First rule out alignment problems. Most compilers align things to 16-byte boundaries (to get best performance from SSE) but 256-bit AVX requires 32-byte alignment for best performance. — Brendan, Jan 15 '14 at 13:29
@Brendan, all the arrays are 64 byte aligned. But even if they were only 16 byte aligned the penalty is quite small with AVX. See ScottD's code in the comments here http://stackoverflow.com/questions/18653452/successful-compilation-of-sse-instruction-with-qmake-but-sse2-is-not-recognized/18660135#18660135 I got only a 5% loss for unaligned loads with his code. I assume it's in cases where the 256-bit word crosses a cache line. — Z boson, Jan 15 '14 at 13:34
@PlasmaHH, I updated my code at https://stackoverflow.com/questions/21090873/loop-unrolling-to-achieve-maximum-throughput-with-ivy-bridge-and-haswell. You can drop this into http://gcc.godbolt.org/ and see the Clang, ICC, and GCC output. GCC is the only one I think doing what makes the most sense (for maximum throughput) to me. I never tested Clang but I know ICC was the slowest I tested. — Z boson, Jan 18 '14 at 15:11
@neagoegab, how do I do that? I need the 64-bit assembly code. Windows and Linux don't have the same 64-bit call functions. I got the 32-bit code working but it only has 8 AVX registers which is not enough for maximum throughput. I'm still working on the 64-bit code. It's the first time I have tried something like this (editing x86 code). — Z boson, Jan 18 '14 at 15:12
@JayeshBadwaik, worry about what? What are you referring to? — Z boson, Jun 09 '15 at 07:08
@Zboson Sorry for such a vague question. My question was, which field do you work in (which requires you to worry about optimizations in so much detail, especially at cross-platform)? — , Jun 09 '15 at 09:43
Just a sidenote, if you're doing flops on gcc, you should try using -Ofast which is an even faster mode than -O3, but can be a slight bit lossy. Recommend it. — Íhor Mé, Aug 11 '16 at 19:08

iwolf · Accepted Answer · 2016-07-14T17:57:14.593

24

Since we've covered the alignment issue, I would guess it's this: http://en.wikipedia.org/wiki/Out-of-order_execution

Since g++ issues a standalone load instruction, your processor can reorder the instructions to be pre-fetching the next data that will be needed while also adding and multiplying. MSVC throwing a pointer at mul makes the load and mul tied to the same instruction, so changing the execution order of the instructions doesn't help anything.

EDIT: Intel's server(s) with all the docs are less angry today, so here's more research on why out of order execution is (part of) the answer.

First of all, it looks like your comment is completely right about it being possible for the MSVC version of the multiplication instruction to decode to separate µ-ops that can be optimized by a CPU's out of order engine. The fun part here is that modern microcode sequencers are programmable, so the actual behavior is both hardware and firmware dependent. The differences in the generated assembly seem to be from GCC and MSVC each trying to fight different potential bottlenecks. The GCC version tries to give leeway to the out of order engine (as we've already covered). However, the MSVC version ends up taking advantage of a feature called "micro-op fusion". This is because of the µ-op retirement limitations. The end of the pipeline can only retire 3 µ-ops per tick. Micro-op fusion, in specific cases, takes two µ-ops that must be done on two different execution units (i.e. memory read and arithmetic) and ties them to a single µ-op for most of the pipeline. The fused µ-op is only split into the two real µ-ops right before execution unit assignment. After the execution, the ops are fused again, allowing them to be retired as one.

The out of order engine only sees the fused µ-op, so it can't pull the load op away from the multiplication. This causes the pipeline to hang while waiting for the next operand to finish its bus ride.

ALL THE LINKS!!!: http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf

http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf

http://www.agner.org/optimize/microarchitecture.pdf

http://www.agner.org/optimize/optimizing_assembly.pdf

http://www.agner.org/optimize/instruction_tables.ods (NOTE: Excel complains that this spreadsheet is partially corrupted or otherwise sketchy, so open at your own risk. It doesn't seem to be malicious, though, and according to the rest of my research, Agner Fog is awesome. After I opted-in to the Excel recovery step, I found it full of tons of great data)

http://cs.nyu.edu/courses/fall13/CSCI-GA.3033-008/Microprocessor-Report-Sandy-Bridge-Spans-Generations-243901.pdf

http://www.syncfusion.com/Content/downloads/ebook/Assembly_Language_Succinctly.pdf

MUCH LATER EDIT: Wow, there have been some interesting updates to the discussion here. I guess I was mistaken about how much of the pipeline is actually affected by micro op fusion. Maybe there is more perf gain than I expected from the differences in the loop condition check, where the unfused instructions allow GCC to interleave the compare and jump with the last vector load and arithmetic steps?

vmovups ymm9, YMMWORD PTR [rax-32]
cmp esi, edx
vmulps  ymm0, ymm0, ymm9
vaddps  ymm1, ymm1, ymm0
jg  .L4

edited Jul 14 '16 at 17:57

answered Jan 16 '14 at 01:28

iwolf

1,080
1
7
10

You basically repeated the same guess I had in the question. That's not proof. I need at least a source, e.g. from an Intel manual, or some performance tests. How do you know that the processor does not break the combined mult load into two independent µ-ops and that the problem lies elsewhere? – Z boson Jan 17 '14 at 07:16
1

Agner's instruction tables list what µops instructions get broken down into. – Cory Nelson Jan 17 '14 at 23:10
Thanks I forgot to include that in my note. They are also empirical, not the vendor claims – iwolf Jan 17 '14 at 23:13
@iwolf, that's a much better answer! I managed to get an assembly version of my module working in 32 bit MSVC (it's my first time editing x86 assembly). The problem is that there are only 8 AVX registers in 32 bit and I end up having to push two onto the stack (even with register renaming) so the performance drops. I'm still working on getting the 64 bit version working. Maybe i'll be able to test this in 64-bit mode soon. – Z boson Jan 18 '14 at 07:27
@iwolf, I was able to show that GCC gives a better result in Visual Studio (see my answer to my question). I'm giving you the accepted answer. Even if the experimentalist do all the hard work (Higgs Boson) it's the theorists that get the rewards. You got the theory right (I think). – Z boson Jan 21 '14 at 12:19
@Zboson and iwolf: micro-fused uops are split in the scheduler (aka Reservation Station). I experimentally confirmed that micro-fused loads can start before the other input dependency is ready, so e.g. a chain of `or eax, [rsi]` runs at 1 per clock. The scheduler / RS is only ~32 uops, so it's pretty small compared to a cache-miss, but it only has to hold uops that haven't executed yet. After executing, they leave the RS and remain only in the ROB until they retire. uops can leave the RS out of order. Anyway, this answer isn't quite right: The OOO engine does see the unfused uops. – Peter Cordes Jul 06 '16 at 14:03
See [near the end of this answer](http://stackoverflow.com/questions/26046634/micro-fusion-and-addressing-modes/31027695#31027695) for experimental proof that micro-fusion doesn't stop a load from starting before the ALU uop it's fused with. – Peter Cordes Jul 06 '16 at 14:05
@PeterCordes, I wish you had responded to this sooner. I was never totally convinced by this answer which is why I wrote "you got the theory right (**I think**)". Can you explain my observation in my question? Why don't you answer the question or maybe I should ask a new question? I think I'll ask a new question. – Z boson Jul 07 '16 at 06:56
@Zboson: I'd seen this earlier, but either hadn't read that part of the answer as carefully, or wasn't confident enough in my understanding to say anything specific. Your experimental results are very surprising to me :/. I *think* an adjacent unfused pair of load + alu uops (from different instructions) should perform essentially the same as a micro-fused pair, except that the unfused pair take separate ROB entries. So either my understanding is wrong and/or there's another subtle effect, or else having separate instructions causes another separate effect (code size/frontend?) – Peter Cordes Jul 07 '16 at 14:37
1

@PeterCordes, another possibility is that my observation was due to something else which is why I wrote "**Maybe the problem lies elsewhere?**" in the question. It could be related to code alignment: see [here](https://stackoverflow.com/questions/25958649/code-alignment-in-one-object-file-is-affecting-the-performance-of-a-function-in) and [here](https://stackoverflow.com/questions/33438592/the-effect-of-code-alignment-in-timing-main-loops-in-assembly). Then again maybe those are also related to something else. I was getting +-10% uncertainty in timing just by adding nops if I recall. Strange. – Z boson Jul 08 '16 at 06:14
@Zboson: The gcc output looks like it has 29 uops, which is one too many to fit in the loop buffer in SnB/IvB, or in Haswell with two hyperthreads sharing a core. If you'd used `-march=ivybridge` instead of `-mavx`, it would have put the cmp/jg together for macro-fusion, and it would be 28 uops. This might or might not be relevant. Since gcc's code already runs faster, I guess the uop cache isn't too big a problem for it. – Peter Cordes Jul 08 '16 at 15:07
@PeterCordes, I added some info to [my answer](http://stackoverflow.com/a/38225036/2542702) to reflect that this issue is still being debated. – Z boson Jul 11 '16 at 10:32
@iwolf: macro-fusion makes separating the compare and branch strictly a loss. That doesn't explain anything, unfortunately. :/ Even without that, OOOE hides short latencies. Instruction-ordering can have subtle indirect effects, like maybe more or less resource conflicts on the critical path. BTW, I removed my upvote since I cast it before reading this answer carefully enough, not because of the content of your edit. (Although like I said, the hypothesis in your edit is also almost certainly wrong :( That said, I can't explain the perf difference myself, so IDK. – Peter Cordes Jul 14 '16 at 18:14

score 6 · Answer 2 · edited May 23 '17 at 12:08

I can confirm that using the GCC code in Visual Studio does indeed improve the performance. I did this by converting the GCC object file in Linux to work in Visual Studio. The efficiency went from 50% to 60% using all four cores (and 60% to 70% for a single core).

Microsoft has removed inline assembly from 64-bit code and also broken their 64-bit disassembler so that code can't be reassembled without modification (but the 32-bit version still works). They evidently thought intrinsics would be sufficient but as this case shows they are wrong.

Maybe fused instructions should be separate intrinsics?

But Microsoft is not the only one that produces less optimal intrinsic code. If you put the code below into http://gcc.godbolt.org/ you can see what Clang, ICC, and GCC do. ICC gave even worse performance than MSVC. It is using vinsertf128 but I don't know why. I'm not sure what Clang is doing but it looks to be closer to GCC just in a different order (and more code).

This explains why Agner Fog wrote in his manual "Optimizing subroutines in assembly language" in regards to "disadvantages of using intrinsic functions":

The compiler can modify the code or implement it in a less efficient way than the programmer intended. It may be necessary to look at the code generated by the compiler to see if it is optimized in the way the programmer intended.

This is disappointing for the case for using intrinsics. This means one either has to still write 64-bit assembly code sometimes or find a compiler which implements the intrinsics the way the programmer intended. In this case only GCC appears to do that (and perhaps Clang).

#include <immintrin.h>
// Multiply-accumulate kernel written with 256-bit AVX intrinsics (8 floats per vector).
//
// Loads eight vectors (64 floats) from c[0..63] as accumulators. For each i in
// [0, n) it broadcasts a[i] and adds a[i] * b[64*i + 8*k .. 64*i + 8*k + 7]
// into accumulator k (k = 0..7). The eight accumulators are then written back
// to c[0..63].
//
//   n        - loop trip count; a[0..n-1] and b[0..64*n-1] are read
//   a        - scalar operands, one is broadcast per iteration
//   b        - vector operands, 64 consecutive floats are consumed per iteration
//   c        - 64 floats, read on entry and overwritten on exit
//   stridea, strideb, stridec - not referenced anywhere in this body
//
// Every load and store uses the unaligned (loadu/storeu) intrinsic form.
extern "C" void AddDot4x4_vec_block_8wide(const int n, const float *a, const float *b, float *c, const int stridea, const int strideb, const int stridec) {     
    const int vec_size = 8;  // floats per __m256
    // Eight independent accumulators, seeded from the current contents of c.
    __m256 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    tmp0 = _mm256_loadu_ps(&c[0*vec_size]);
    tmp1 = _mm256_loadu_ps(&c[1*vec_size]);
    tmp2 = _mm256_loadu_ps(&c[2*vec_size]);
    tmp3 = _mm256_loadu_ps(&c[3*vec_size]);
    tmp4 = _mm256_loadu_ps(&c[4*vec_size]);
    tmp5 = _mm256_loadu_ps(&c[5*vec_size]);
    tmp6 = _mm256_loadu_ps(&c[6*vec_size]);
    tmp7 = _mm256_loadu_ps(&c[7*vec_size]);

    for(int i=0; i<n; i++) {
        // Broadcast the scalar a[i] into all eight lanes.
        __m256 areg0 = _mm256_set1_ps(a[i]);

        // Each pair below is: load one 8-float vector of b, multiply it by the
        // broadcast a[i], and add the product into its own accumulator. The
        // eight accumulators do not depend on one another within an iteration.
        __m256 breg0 = _mm256_loadu_ps(&b[vec_size*(8*i + 0)]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(areg0,breg0), tmp0);    
        __m256 breg1 = _mm256_loadu_ps(&b[vec_size*(8*i + 1)]);
        tmp1 = _mm256_add_ps(_mm256_mul_ps(areg0,breg1), tmp1);
        __m256 breg2 = _mm256_loadu_ps(&b[vec_size*(8*i + 2)]);
        tmp2 = _mm256_add_ps(_mm256_mul_ps(areg0,breg2), tmp2);    
        __m256 breg3 = _mm256_loadu_ps(&b[vec_size*(8*i + 3)]);
        tmp3 = _mm256_add_ps(_mm256_mul_ps(areg0,breg3), tmp3);   
        __m256 breg4 = _mm256_loadu_ps(&b[vec_size*(8*i + 4)]);
        tmp4 = _mm256_add_ps(_mm256_mul_ps(areg0,breg4), tmp4);    
        __m256 breg5 = _mm256_loadu_ps(&b[vec_size*(8*i + 5)]);
        tmp5 = _mm256_add_ps(_mm256_mul_ps(areg0,breg5), tmp5);    
        __m256 breg6 = _mm256_loadu_ps(&b[vec_size*(8*i + 6)]);
        tmp6 = _mm256_add_ps(_mm256_mul_ps(areg0,breg6), tmp6);    
        __m256 breg7 = _mm256_loadu_ps(&b[vec_size*(8*i + 7)]);
        tmp7 = _mm256_add_ps(_mm256_mul_ps(areg0,breg7), tmp7);    
    }
    // Write the eight accumulators back over the 64 floats they were loaded from.
    _mm256_storeu_ps(&c[0*vec_size], tmp0);
    _mm256_storeu_ps(&c[1*vec_size], tmp1);
    _mm256_storeu_ps(&c[2*vec_size], tmp2);
    _mm256_storeu_ps(&c[3*vec_size], tmp3);
    _mm256_storeu_ps(&c[4*vec_size], tmp4);
    _mm256_storeu_ps(&c[5*vec_size], tmp5);
    _mm256_storeu_ps(&c[6*vec_size], tmp6);
    _mm256_storeu_ps(&c[7*vec_size], tmp7);
}

score 3 · Answer 3 · answered Jan 18 '14 at 01:25

3

MSVC did exactly what you asked it to. If you want a vmovups instruction emitted, use the _mm256_loadu_ps intrinsic.

answered Jan 18 '14 at 01:25

Ben Voigt

277,958
43
419
720

Sorry, I should have corrected that but I discovered what community wiki is (the hard way) recently so I have been limiting my edits. I'm issuing `_mm256_loadu_ps` as well. – Z boson Jan 18 '14 at 07:19

Difference in performance between MSVC and GCC for highly optimized matrix multiplication code

3 Answers

Linked