For the following loop GCC will only vectorize the loop if I tell it to use associative math e.g. with -Ofast.
float sumf(float *x)
{
  x = (float*)__builtin_assume_aligned(x, 64);
  float sum = 0;
  for(int i=0; i<2048; i++) sum += x[i];
  return sum;
}
Here is the assembly with -Ofast -mavx
sumf(float*):
    vxorps  %xmm0, %xmm0, %xmm0
    leaq    8192(%rdi), %rax
.L2:
    vaddps  (%rdi), %ymm0, %ymm0
    addq    $32, %rdi
    cmpq    %rdi, %rax
    jne .L2
    vhaddps %ymm0, %ymm0, %ymm0
    vhaddps %ymm0, %ymm0, %ymm1
    vperm2f128  $1, %ymm1, %ymm1, %ymm0
    vaddps  %ymm1, %ymm0, %ymm0
    vzeroupper
    ret
This clearly shows the loop has been vectorized.
But this loop also has a dependency chain. In order to overcome the latency of the addition I need to unroll and do partial sums at least three times on x86_64 (excluding Skylake which needs to unroll eight times and doing the addition with FMA instructions which need to unroll 10 times on Haswell and Broadwell). As far as I understand I can unroll the loop with -funroll-loops.
Here is the assembly with -Ofast -mavx -funroll-loops.
sumf(float*):
    vxorps  %xmm7, %xmm7, %xmm7
    leaq    8192(%rdi), %rax
.L2:
    vaddps  (%rdi), %ymm7, %ymm0
    addq    $256, %rdi
    vaddps  -224(%rdi), %ymm0, %ymm1
    vaddps  -192(%rdi), %ymm1, %ymm2
    vaddps  -160(%rdi), %ymm2, %ymm3
    vaddps  -128(%rdi), %ymm3, %ymm4
    vaddps  -96(%rdi), %ymm4, %ymm5
    vaddps  -64(%rdi), %ymm5, %ymm6
    vaddps  -32(%rdi), %ymm6, %ymm7
    cmpq    %rdi, %rax
    jne .L2
    vhaddps %ymm7, %ymm7, %ymm8
    vhaddps %ymm8, %ymm8, %ymm9
    vperm2f128  $1, %ymm9, %ymm9, %ymm10
    vaddps  %ymm9, %ymm10, %ymm0
    vzeroupper
    ret
GCC does unroll the loop eight times. However, it does not do independent sums. It does eight dependent sums. That's pointless and no better than without unrolling.
How can I get GCC to unroll the loop and do independent partial sums?
Edit:
Clang unrolls to four independent partial sums even without -funroll-loops for SSE but I am not sure its AVX code is as efficient. The compiler should not need -funroll-loops with -Ofast anyway so it's good to see Clang is doing this right at least for SSE.
Clang 3.5.1 with -Ofast.
sumf(float*):                              # @sumf(float*)
    xorps   %xmm0, %xmm0
    xorl    %eax, %eax
    xorps   %xmm1, %xmm1
.LBB0_1:                                # %vector.body
    movups  (%rdi,%rax,4), %xmm2
    movups  16(%rdi,%rax,4), %xmm3
    addps   %xmm0, %xmm2
    addps   %xmm1, %xmm3
    movups  32(%rdi,%rax,4), %xmm0
    movups  48(%rdi,%rax,4), %xmm1
    addps   %xmm2, %xmm0
    addps   %xmm3, %xmm1
    addq    $16, %rax
    cmpq    $2048, %rax             # imm = 0x800
    jne .LBB0_1
    addps   %xmm0, %xmm1
    movaps  %xmm1, %xmm2
    movhlps %xmm2, %xmm2            # xmm2 = xmm2[1,1]
    addps   %xmm1, %xmm2
    pshufd  $1, %xmm2, %xmm0        # xmm0 = xmm2[1,0,0,0]
    addps   %xmm2, %xmm0
    retq
ICC 13.0.1 with -O3 unrolls to two independent partial sums.  ICC apparently assumes associative math with only -O3.
.B1.8: 
    vaddps    (%rdi,%rdx,4), %ymm1, %ymm1                   #5.29
    vaddps    32(%rdi,%rdx,4), %ymm0, %ymm0                 #5.29
    vaddps    64(%rdi,%rdx,4), %ymm1, %ymm1                 #5.29
    vaddps    96(%rdi,%rdx,4), %ymm0, %ymm0                 #5.29
    addq      $32, %rdx                                     #5.3
    cmpq      %rax, %rdx                                    #5.3
    jb        ..B1.8        # Prob 99%                      #5.3
 
    