In this code, I'm just looping over the same set of instructions a number of times. Regardless of the iteration count (100, 1000, 1000000), the RDTSC timing reports an average of 6 clock cycles per loop iteration. I'm on a Coffee Lake i9-9900K.
There are 13 instructions in the loop, so I would have expected the minimum RDTSC delta per iteration to be 13 cycles.
Could someone explain how this appears to run twice as fast as I expected? I'm clearly misunderstanding something basic, or I've made a ridiculous mistake.
Thank you!
    rng.SetFloatScale(2.0f / 8.0f);
00C010AE  vmovups     ymm4,ymmword ptr [__ymm@3e0000003e0000003e0000003e0000003e0000003e0000003e0000003e000000 (0C02160h)]  
    Vec8f sum = 0;
    const size_t loopLen = 1000;
    auto start = __rdtsc();
00C010BB  rdtsc  
00C010BD  mov         esi,eax  
        sum += rng.NextScaledFloats();
00C010F0  vpslld      ymm0,ymm2,xmm5  
00C010F4  vpxor       ymm1,ymm0,ymm2  
00C010F8  vpsrld      ymm0,ymm1,xmm6  
00C010FC  vpxor       ymm1,ymm0,ymm1  
00C01100  vpslld      ymm0,ymm1,xmm7  
00C01104  vpxor       ymm2,ymm0,ymm1  
00C01108  vpand       ymm0,ymm2,ymmword ptr [__ymm@007fffff007fffff007fffff007fffff007fffff007fffff007fffff007fffff (0C02140h)]  
00C01110  vpor        ymm0,ymm0,ymmword ptr [__ymm@4000000040000000400000004000000040000000400000004000000040000000 (0C021A0h)]  
00C01118  vmovups     ymm1,ymm4  
00C0111C  vfmsub213ps ymm1,ymm0,ymmword ptr [__ymm@3e8000003e8000003e8000003e8000003e8000003e8000003e8000003e800000 (0C02180h)]  
00C01125  vaddps      ymm3,ymm1,ymm3  
    for (size_t i = 0; i < loopLen; i++)
00C01129  sub         eax,1  
00C0112C  jne         main+80h (0C010F0h)  
    auto end = __rdtsc();
00C0112E  rdtsc  
00C01130  mov         edi,eax  
00C01132  mov         ecx,edx  
    printf("\n\nAverage: %f\nAverage RDTSC: %ld\n", fsum, (end - start) / loopLen);
 
    