#include <immintrin.h>
// Evaluates 278*num / (num + 1400) independently in every float lane of num.
// Uses the compiler's built-in arithmetic operators on the vector type
// (scalar operands are applied to each lane) rather than _mm256_* intrinsics.
__m256 mult(__m256 num) {
    const __m256 numerator = 278 * num;
    const __m256 denominator = num + 1400;
    return numerator / denominator;
}
# Literal pool for mult: two scalar single-precision constants, each read
# RIP-relative by a vbroadcastss in the function body and splatted to all lanes.
.LCPI0_0:
.long 0x438b0000 # float 278 -- the factor multiplied into num
.LCPI0_1:
.long 0x44af0000 # float 1400 -- the constant added to num to form the divisor
# mult: per-lane 278*num / (num + 1400) over eight packed floats.
#   In:      ymm0 = num
#   Out:     ymm0 = result
#   Scratch: ymm1, ymm2, ymm3
# No vdivps is issued. With a = 278*num and b = num + 1400 the quotient is
# formed as a * rcp(b), then corrected by one Newton-Raphson-style refinement
# step implemented by the two FMAs before ret:
#   q1 = q0 - r * (q0*b - a)      where r ~= 1/b and q0 = a*r
mult(float __vector(8)): # @mult(float __vector(8))
vbroadcastss ymm1, dword ptr [rip + .LCPI0_0] # ymm1 = [2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2]
vmulps ymm1, ymm0, ymm1 # ymm1 = a = 278 * num (numerator)
vbroadcastss ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3]
vaddps ymm0, ymm0, ymm2 # ymm0 = b = num + 1400 (denominator)
vrcpps ymm2, ymm0 # ymm2 = r ~= 1/b; approximation only (Intel documents relative error <= 1.5 * 2^-12)
vmulps ymm3, ymm1, ymm2 # ymm3 = q0 = a * r, first estimate of a/b (carries r's error)
vfmsub213ps ymm0, ymm3, ymm1 # ymm0 = (ymm3 * ymm0) - ymm1 = q0*b - a, residual of the estimate
vfnmadd213ps ymm0, ymm2, ymm3 # ymm0 = -(ymm2 * ymm0) + ymm3 = q0 - r*(q0*b - a), refined quotient
ret
Why does Clang add the two extra FMA instructions (`vfmsub213ps` and `vfnmadd213ps`) to the code? The result should already have been computed by `vmulps ymm3, ymm1, ymm2`. Don't the extra instructions increase the latency beyond that of simply using `vdivps`, as is done with -O3?