I wrote this snippet in a recent argument over the supposed speed of array[i++] vs array[i]; i++. 
int array[10];
int main(){
    int i=0;
    while(i < 10){
        array[i] = 0;
        i++;
    }
    return 0;
}
Snippet at the compiler explorer: https://godbolt.org/g/de7TY2
As expected, the compiler output identical asm for array[i++] and array[i]; i++ with at least -O1. However what surprised me was the placement of the xor eax, eax seemingly randomly in the function at higher optimization levels.
GCC
At -O2, GCC places the xor before the ret as expected
    mov     DWORD PTR [rax], 0
    add     rax, 4
    cmp     rax, OFFSET FLAT:array+40
    jne     .L2
    xor     eax, eax
    ret
However it places the xor after the second mov at -O3
    mov     QWORD PTR array[rip], 0
    mov     QWORD PTR array[rip+8], 0
    xor     eax, eax
    mov     QWORD PTR array[rip+16], 0
    mov     QWORD PTR array[rip+24], 0
    mov     QWORD PTR array[rip+32], 0
    ret
icc
icc places it normally at -O1:
    push      rsi
    xor       esi, esi
    push      3
    pop       rdi
    call      __intel_new_feature_proc_init
    stmxcsr   DWORD PTR [rsp]
    xor       eax, eax
    or        DWORD PTR [rsp], 32832
    ldmxcsr   DWORD PTR [rsp]
..B1.2:
    mov       DWORD PTR [array+rax*4], 0
    inc       rax
    cmp       rax, 10
    jl        ..B1.2
    xor       eax, eax
    pop       rcx
    ret
but in a strange place at -O2
    push      rbp
    mov       rbp, rsp
    and       rsp, -128
    sub       rsp, 128
    xor       esi, esi
    mov       edi, 3
    call      __intel_new_feature_proc_init
    stmxcsr   DWORD PTR [rsp]
    pxor      xmm0, xmm0
    xor       eax, eax
    or        DWORD PTR [rsp], 32832
    ldmxcsr   DWORD PTR [rsp]
    movdqu    XMMWORD PTR array[rip], xmm0
    movdqu    XMMWORD PTR 16+array[rip], xmm0
    mov       DWORD PTR 32+array[rip], eax
    mov       DWORD PTR 36+array[rip], eax
    mov       rsp, rbp
    pop       rbp
    ret       
and -O3
    and       rsp, -128
    sub       rsp, 128
    mov       edi, 3
    call      __intel_new_proc_init
    stmxcsr   DWORD PTR [rsp]
    xor       eax, eax
    or        DWORD PTR [rsp], 32832
    ldmxcsr   DWORD PTR [rsp]
    mov       rsp, rbp
    pop       rbp
    ret   
Clang
only clang places the xor directly in front of the ret at all optimization levels:
    xorps   xmm0, xmm0
    movaps  xmmword ptr [rip + array+16], xmm0
    movaps  xmmword ptr [rip + array], xmm0
    mov     qword ptr [rip + array+32], 0
    xor     eax, eax
    ret
Since GCC and ICC both do this at higher optimisation levels, I presume there must be some kind of good reason.
Why do some compilers do this?
The code is semantically identical of course and the compiler can reorder it as it wishes, but since this only changes at higher optimization levels this must be caused by some kind of optimization.
 
     
     
     
     
    