I'm using Intel SSE/AVX/FMA intrinsics to get perfectly inlined SSE/AVX instructions for some math functions.
Given the following code:
#include <cmath>
#include <immintrin.h>
// Reference version: forwards the three floats straight to std::fma from
// <cmath>. The return type is deduced from that call.
auto std_fma(float x, float y, float z)
{
    return std::fma(x, y, z);
}
// Scalar fused multiply-add written with <immintrin.h> intrinsics.
// Each by-value argument is loaded into an XMM vector with _mm_load_ss
// (value in the low lane, upper lanes zeroed), the three vectors are combined
// with _mm_fmadd_ss, and the low lane of the result is written back with
// _mm_store_ss.
// NOTE(review): a leading-underscore name at global scope is reserved for the
// implementation in C++ - consider renaming if this leaves the example.
float _fma(float x, float y, float z)
{
    // The parameter x doubles as the output slot: the result's low lane
    // overwrites it, and it is then returned by value.
    _mm_store_ss(&x,
        _mm_fmadd_ss(_mm_load_ss(&x), _mm_load_ss(&y), _mm_load_ss(&z))
    );
    return x;
}
// Scalar square root written with <immintrin.h> intrinsics.
// x is loaded into an XMM vector with _mm_load_ss, passed through
// _mm_sqrt_ss, and the low lane of the result is written back with
// _mm_store_ss.
// NOTE(review): a leading-underscore name at global scope is reserved for the
// implementation in C++ - consider renaming if this leaves the example.
float _sqrt(float x)
{
    // The parameter x is reused as the output slot and returned by value.
    _mm_store_ss(&x,
        _mm_sqrt_ss(_mm_load_ss(&x))
    );
    return x;
}
Clang 3.9 generates the following assembly with -march=x86-64 -mfma -O3:
std_fma(float, float, float):                          # @std_fma(float, float, float)
        vfmadd213ss     xmm0, xmm1, xmm2
        ret
_fma(float, float, float):                             # @_fma(float, float, float)
        vxorps  xmm3, xmm3, xmm3
        vmovss  xmm0, xmm3, xmm0        # xmm0 = xmm0[0],xmm3[1,2,3]
        vmovss  xmm1, xmm3, xmm1        # xmm1 = xmm1[0],xmm3[1,2,3]
        vmovss  xmm2, xmm3, xmm2        # xmm2 = xmm2[0],xmm3[1,2,3]
        vfmadd213ss     xmm0, xmm1, xmm2
        ret
_sqrt(float):                              # @_sqrt(float)
        vsqrtss xmm0, xmm0, xmm0
        ret
While the generated code for _sqrt is fine, _fma contains an unnecessary vxorps (which zeroes the otherwise unused xmm3 register) and unnecessary vmovss instructions compared to std_fma (which relies on the compiler intrinsic std::fma).
GCC 6.2 generates the following assembly with -march=x86-64 -mfma -O3:
std_fma(float, float, float):
        vfmadd132ss     xmm0, xmm2, xmm1
        ret
_fma(float, float, float):
        vinsertps       xmm1, xmm1, xmm1, 0xe
        vinsertps       xmm2, xmm2, xmm2, 0xe
        vinsertps       xmm0, xmm0, xmm0, 0xe
        vfmadd132ss     xmm0, xmm2, xmm1
        ret
_sqrt(float):
        vinsertps       xmm0, xmm0, xmm0, 0xe
        vsqrtss xmm0, xmm0, xmm0
        ret
Here there are a lot of unnecessary vinsertps instructions.
Working example: https://godbolt.org/g/q1BQym
The default x64 calling convention passes floating-point function arguments in XMM registers, so those vmovss and vinsertps instructions should be eliminated. Why do the mentioned compilers still emit them? Is it possible to get rid of them without inline assembly?
I also tried to use _mm_cvtss_f32 instead of _mm_store_ss and multiple calling conventions, but nothing changed.
