I have the following snippet, which sums all the elements of an array (the size is hard-coded as 32):
// Returns the sum of the 32 ints starting at `a`: four 256-bit loads cover
// elements 0..31, vector adds fold them into a single 8-lane vector, and a
// scalar loop then adds up those 8 lanes.
static unsafe int F(int* a) 
{
    // Each Vector256<int> holds 8 ints, so the four loads read a[0..31].
    Vector256<int> ymm0 = Avx2.LoadVector256(a + 0);
    Vector256<int> ymm1 = Avx2.LoadVector256(a + 8);
    Vector256<int> ymm2 = Avx2.LoadVector256(a + 16);
    Vector256<int> ymm3 = Avx2.LoadVector256(a + 24);
    // Lane-wise adds: (ymm0 + ymm1) + (ymm2 + ymm3) ends up in ymm0.
    ymm0 = Avx2.Add(ymm0, ymm1);
    ymm2 = Avx2.Add(ymm2, ymm3);
    ymm0 = Avx2.Add(ymm0, ymm2);
    // s = 8: the number of 32-bit lanes in a 256-bit vector.
    const int s = 256 / 32;
    // Stack buffer that receives the vector so its lanes can be read as ints.
    int*      t = stackalloc int[s];
    Avx2.Store(t, ymm0);
    // Horizontal sum of the 8 stored lanes.
    int r = 0;
    for (int i = 0; i < s; ++i)
        r += t[i];
    return r;
}
This generates the following ASM:
Program.F(Int32*)
    ; Reserve 0x28 bytes of stack.
    L0000: sub rsp, 0x28
    L0004: vzeroupper                       ; Question #1
    ; Zero xmm4, then store it twice: 32 zero bytes at [rsp, rsp+0x20).
    L0007: vxorps xmm4, xmm4, xmm4
    L000b: vmovdqa [rsp], xmm4              ; Question #2
    L0010: vmovdqa [rsp+0x10], xmm4         ; Question #2
    ; Zero the 8-byte slot at [rsp+0x20], then overwrite it with a constant.
    ; NOTE(review): the same constant is re-checked at L0067-L0076, so this is
    ; presumably a guard value placed after the stackalloc buffer -- confirm
    ; against the .NET JIT documentation.
    L0016: xor eax, eax                     ; Question #3
    L0018: mov [rsp+0x20], rax
    L001d: mov rax, 0x7d847bd1f9ce          ; Question #4
    L0027: mov [rsp+0x20], rax
    ; Four 32-byte loads from the argument pointer (rcx), then three vector adds.
    L002c: vmovdqu ymm0, [rcx]
    L0030: vmovdqu ymm1, [rcx+0x20]
    L0035: vmovdqu ymm2, [rcx+0x40]
    L003a: vmovdqu ymm3, [rcx+0x60]
    L003f: vpaddd ymm0, ymm0, ymm1
    L0043: vpaddd ymm2, ymm2, ymm3
    L0047: vpaddd ymm0, ymm0, ymm2
    ; rax = address of the 32-byte stack buffer; store the vector sum there.
    L004b: lea rax, [rsp]                   ; Question #5
    L004f: vmovdqu [rax], ymm0
    ; edx = running sum, ecx = loop index; both start at 0.
    L0053: xor edx, edx                     ; Question #5
    L0055: xor ecx, ecx                     ; Question #5
    ; Loop body: edx += dword at rax[ecx]; repeat while ecx < 8.
    L0057: movsxd r8, ecx
    L005a: add edx, [rax+r8*4]
    L005e: inc ecx
    L0060: cmp ecx, 8
    L0063: jl short L0057
    ; Return value goes in eax.
    L0065: mov eax, edx
    ; Compare the slot at [rsp+0x20] with the constant stored at L0027;
    ; the call at L0078 is only reached if they differ.
    L0067: mov rcx, 0x7d847bd1f9ce          ; Question #4
    L0071: cmp [rsp+0x20], rcx
    L0076: je short L007d
    L0078: call 0x00007ffc9de2d430          ; Question #6
    L007d: nop
    ; Epilogue: release the 0x28 bytes reserved at L0000 and return.
    L007e: vzeroupper
    L0081: add rsp, 0x28
    L0085: ret
Questions
- Why do we need VZEROUPPER at the beginning? Wouldn't it be perfectly fine without it?
- What do the VMOVDQAs at the beginning do? Or rather, why are they there?
- Zeroing out the EAX register? Why? It is probably related to the next line, MOV [RSP+0x20], RAX, but I still can't understand it.
- What does this mysterious value (0x7d847bd1f9ce) do?
- There are also lines in between whose purpose I cannot understand (see the "Question #5" comments in the code).
- I'm assuming this line (L0078: call 0x00007ffc9de2d430) throws an exception. Is there a function or something in my code that can throw an exception?
I know there are a lot of questions, but I can't separate them because I think they are related to each other. TO BE CRYSTAL CLEAR: I'm just trying to understand the generated ASM here. I'm not a professional in this area.
Note
- In case you're wondering what GCC (-O2) generates, here is the result:
/**
 * Sum the 32 int32_t values starting at a.
 *
 * The four 256-bit loads cover a[0..31]; three lane-wise adds fold them into
 * one 8-lane vector, which is stored to a stack buffer and summed lane by lane.
 *
 * @param a  pointer to 32 readable int32_t values. It must be 32-byte aligned,
 *           because the loads use the aligned form _mm256_load_si256.
 * @return   the sum of a[0] through a[31].
 */
int32_t
f(int32_t *a) {
        __m256i ymm0;
        __m256i ymm1;
        __m256i ymm2;
        __m256i ymm3;
        ymm0 = _mm256_load_si256((__m256i*)(a + 0));
        ymm1 = _mm256_load_si256((__m256i*)(a + 8));
        ymm2 = _mm256_load_si256((__m256i*)(a + 16));
        ymm3 = _mm256_load_si256((__m256i*)(a + 24));

        /* Lane-wise adds: (ymm0 + ymm1) + (ymm2 + ymm3) ends up in ymm0. */
        ymm0 = _mm256_add_epi32(ymm0, ymm1);
        ymm2 = _mm256_add_epi32(ymm2, ymm3);
        ymm0 = _mm256_add_epi32(ymm0, ymm2);
        /*
         * _mm256_store_si256 requires a 32-byte aligned destination, while a
         * plain int32_t[8] only guarantees int32_t alignment, so request the
         * alignment explicitly.
         */
        _Alignas(32) int32_t t[8];
        _mm256_store_si256((__m256i*)t, ymm0);
        /* Horizontal sum of the 8 stored lanes. */
        int32_t r;
        r = 0;
        for (int i = 0; i < 8; ++i)
                r += t[i];
        return r;
}
And the generated ASM:
f:
  # Set up a frame and round rsp down to a multiple of 32.
  push rbp
  # r8d is the scalar accumulator; start it at 0.
  xor r8d, r8d
  mov rbp, rsp
  and rsp, -32
  # rax = start of the 32-byte buffer at [rsp-32]; rdx = its end (rsp).
  lea rax, [rsp-32]
  mov rdx, rsp
  # Load the block at [rdi+96], then add the blocks at +64, +32 and +0.
  vmovdqa ymm1, YMMWORD PTR [rdi+96]
  vpaddd ymm0, ymm1, YMMWORD PTR [rdi+64]
  vpaddd ymm0, ymm0, YMMWORD PTR [rdi+32]
  vpaddd ymm0, ymm0, YMMWORD PTR [rdi]
  # Store the 8-lane vector sum to the buffer.
  vmovdqa YMMWORD PTR [rsp-32], ymm0
  # Loop: add each dword of the buffer to r8d until rax reaches rdx.
.L2:
  add r8d, DWORD PTR [rax]
  add rax, 4
  cmp rax, rdx
  jne .L2
  # Return the accumulator in eax and tear down the frame.
  mov eax, r8d
  vzeroupper
  leave
  ret
I think it optimized (maybe heavily) my code here, but whatever.
 
     
    