I have playing around with basic math function implementations in C++ for academic purposes. Today, I benchmarked the following code for Square Root:
inline float sqrt_new(float n)
{
    __asm {
        fld n
            fsqrt
    }
}
I was surprised to see that it is consistently faster than the standard sqrt function (it takes around 85% of the execution time of the standard function).
I don't quite get why and would love to better understand it. Below I show the full code I am using to profile (in Visual Studio 2015, compiling in Release mode and with all optimizations turned on):
#include <iostream>
#include <random>
#include <chrono>
#define M 1000000
float ranfloats[M];
using namespace std;
inline float sqrt_new(float n)
{
    __asm {
        fld n
            fsqrt
    }
}
int main()
{
    default_random_engine randomGenerator(time(0));
    uniform_real_distribution<float> diceroll(0.0f , 1.0f);
chrono::high_resolution_clock::time_point start1, start2;
chrono::high_resolution_clock::time_point end1, end2;
float sqrt1 = 0;
float sqrt2 = 0;
for (int i = 0; i<M; i++) ranfloats[i] = diceroll(randomGenerator);
start1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i<M; i++) sqrt1 += sqrt(ranfloats[i]);
end1 = std::chrono::high_resolution_clock::now();
start2 = std::chrono::high_resolution_clock::now();
for (int i = 0; i<M; i++) sqrt2 += sqrt_new(ranfloats[i]);
end2 = std::chrono::high_resolution_clock::now();
auto time1 = std::chrono::duration_cast<std::chrono::milliseconds>(end1 - start1).count();
auto time2  = std::chrono::duration_cast<std::chrono::milliseconds>(end2 - start2).count();
cout << "Time elapsed for SQRT1: " << time1 << " seconds" << endl;
cout << "Time elapsed for SQRT2: " << time2 << " seconds" << endl;
cout << "Average of Time for SQRT2 / Time for SQRT1: " << time2 / time1  << endl;
cout << "Equal to standard sqrt? " << (sqrt1 == sqrt2) << endl;
system("pause");
return 0;
}
EDIT: I am editing the question to include disassembly codes of both loops that calculate square roots as they came at Visual Studio 2015.
First, the disassembly for for (int i = 0; i<M; i++) sqrt1 += sqrt(ranfloats[i]);:
00091194 0F 5A C0             cvtps2pd    xmm0,xmm0  
00091197 E8 F2 18 00 00       call        __libm_sse2_sqrt_precise (092A8Eh)  
0009119C F2 0F 5A C0          cvtsd2ss    xmm0,xmm0  
000911A0 83 C6 04             add         esi,4  
000911A3 F3 0F 58 44 24 4C    addss       xmm0,dword ptr [esp+4Ch]  
000911A9 F3 0F 11 44 24 4C    movss       dword ptr [esp+4Ch],xmm0  
000911AF 81 FE 90 5C 46 00    cmp         esi,offset __dyn_tls_dtor_callback (0465C90h)  
000911B5 7C D9                jl          main+190h (091190h) 
Next, the disassembly for for (int i = 0; i<M; i++) sqrt2 += sqrt_new(ranfloats[i]);:
00091290 F3 0F 10 00          movss       xmm0,dword ptr [eax]  
00091294 F3 0F 11 44 24 6C    movss       dword ptr [esp+6Ch],xmm0  
0009129A D9 44 24 6C          fld         dword ptr [esp+6Ch]  
0009129E D9 FA                fsqrt  
000912A0 D9 5C 24 6C          fstp        dword ptr [esp+6Ch]  
000912A4 F3 0F 10 44 24 6C    movss       xmm0,dword ptr [esp+6Ch]  
000912AA 83 C0 04             add         eax,4  
000912AD F3 0F 58 44 24 54    addss       xmm0,dword ptr [esp+54h]  
000912B3 F3 0F 11 44 24 54    movss       dword ptr [esp+54h],xmm0  
000912B9 ??                   ?? ?? 
000912BA ??                   ?? ?? 
000912BB ??                   ?? ?? 
000912BC ??                   ?? ?? 
000912BD ??                   ?? ?? 
000912BE ??                   ?? ?? 
000912BF ??                   ?? ?? 
000912C0 ??                   ?? ?? 
000912C1 ??                   ?? ?? 
000912C2 ??                   ?? ?? 
000912C3 ??                   ?? ?? 
000912C4 ??                   ?? ?? 
000912C5 ??                   ?? ?? 
000912C6 ??                   ?? ?? 
000912C7 ??                   ?? ?? 
000912C8 ??                   ?? ?? 
000912C9 ??                   ?? ?? 
000912CA ??                   ?? ?? 
000912CB ??                   ?? ?? 
000912CC ??                   ?? ?? 
000912CD ??                   ?? ?? 
000912CE ??                   ?? ?? 
000912CF ??                   ?? ?? 
000912D0 ??                   ?? ?? 
000912D1 ??                   ?? ?? 
000912D2 ??                   ?? ?? 
000912D3 ??                   ?? ?? 
000912D4 ??                   ?? ?? 
000912D5 ??                   ?? ?? 
000912D6 ??                   ?? ?? 
000912D7 ??                   ?? ?? 
000912D8 ??                   ?? ?? 
000912D9 ??                   ?? ?? 
000912DA ??                   ?? ?? 
000912DB ??                   ?? ?? 
000912DC ??                   ?? ?? 
000912DD ??                   ?? ?? 
000912DE ??                   ?? ?? 
 
     
     
    