My problem
I have 2 adjacent buffers of bytes of identical size (around 20 MB each). I just want to count the differences between them.
My question
How much time this loop should take to run on a 4.8GHz Intel I7 9700K with 3600MT RAM ?
How do we compute max theoretical speed ?
What I tried
uint64_t compareFunction(const char *const __restrict buffer, const uint64_t commonSize)
{
    uint64_t diffFound = 0;
    for(uint64_t byte = 0; byte < commonSize; ++byte)
        diffFound += static_cast<uint64_t>(buffer[byte] != buffer[byte + commonSize]);
    return diffFound;
}
It takes 11ms on my PC (9700K 4.8Ghz RAM 3600 Windows 10 Clang 14.0.6 -O3 MinGW ) and I feel it is too slow and that I am missing something.
40MB should take less than 2ms to be read on the CPU (my RAM bandwidth is between 20 and 30GB/s)
I don't know how to count cycles required to execute one iteration (especially because CPUs are superscalar nowadays). If I assume 1 cycle per operation and if I don't mess up my counting, it should be 10 ops per iteration -> 200 million ops -> at 4.8 Ghz with only one execution unit -> 40ms. Obviously I am wrong on how to compute the number of cycles per loop.
Fun fact: I tried on Linux PopOS GCC 11.2 -O3 and it ran at 4.5ms. Why such a difference?
Here are the dissassemblies vectorised and scalar produced by clang:
compareFunction(char const*, unsigned long): # @compareFunction(char const*, unsigned long)
        test    rsi, rsi
        je      .LBB0_1
        lea     r8, [rdi + rsi]
        neg     rsi
        xor     edx, edx
        xor     eax, eax
.LBB0_4:                                # =>This Inner Loop Header: Depth=1
        movzx   r9d, byte ptr [rdi + rdx]
        xor     ecx, ecx
        cmp     r9b, byte ptr [r8 + rdx]
        setne   cl
        add     rax, rcx
        add     rdx, 1
        mov     rcx, rsi
        add     rcx, rdx
        jne     .LBB0_4
        ret
.LBB0_1:
        xor     eax, eax
        ret
Clang14 O3:
.LCPI0_0:
        .quad   1                               # 0x1
        .quad   1                               # 0x1
compareFunction(char const*, unsigned long):                # @compareFunction(char const*, unsigned long)
        test    rsi, rsi
        je      .LBB0_1
        cmp     rsi, 4
        jae     .LBB0_4
        xor     r9d, r9d
        xor     eax, eax
        jmp     .LBB0_11
.LBB0_1:
        xor     eax, eax
        ret
.LBB0_4:
        mov     r9, rsi
        and     r9, -4
        lea     rax, [r9 - 4]
        mov     r8, rax
        shr     r8, 2
        add     r8, 1
        test    rax, rax
        je      .LBB0_5
        mov     rdx, r8
        and     rdx, -2
        lea     r10, [rdi + 6]
        lea     r11, [rdi + rsi]
        add     r11, 6
        pxor    xmm0, xmm0
        xor     eax, eax
        pcmpeqd xmm2, xmm2
        movdqa  xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [1,1]
        pxor    xmm1, xmm1
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
        movzx   ecx, word ptr [r10 + rax - 6]
        movd    xmm4, ecx
        movzx   ecx, word ptr [r10 + rax - 4]
        movd    xmm5, ecx
        movzx   ecx, word ptr [r11 + rax - 6]
        movd    xmm6, ecx
        pcmpeqb xmm6, xmm4
        movzx   ecx, word ptr [r11 + rax - 4]
        movd    xmm7, ecx
        pcmpeqb xmm7, xmm5
        pxor    xmm6, xmm2
        punpcklbw       xmm6, xmm6              # xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm4, xmm6, 212                 # xmm4 = xmm6[0,1,1,3,4,5,6,7]
        pshufd  xmm4, xmm4, 212                 # xmm4 = xmm4[0,1,1,3]
        pand    xmm4, xmm3
        paddq   xmm4, xmm0
        pxor    xmm7, xmm2
        punpcklbw       xmm7, xmm7              # xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm0, xmm7, 212                 # xmm0 = xmm7[0,1,1,3,4,5,6,7]
        pshufd  xmm5, xmm0, 212                 # xmm5 = xmm0[0,1,1,3]
        pand    xmm5, xmm3
        paddq   xmm5, xmm1
        movzx   ecx, word ptr [r10 + rax - 2]
        movd    xmm0, ecx
        movzx   ecx, word ptr [r10 + rax]
        movd    xmm1, ecx
        movzx   ecx, word ptr [r11 + rax - 2]
        movd    xmm6, ecx
        pcmpeqb xmm6, xmm0
        movzx   ecx, word ptr [r11 + rax]
        movd    xmm7, ecx
        pcmpeqb xmm7, xmm1
        pxor    xmm6, xmm2
        punpcklbw       xmm6, xmm6              # xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm0, xmm6, 212                 # xmm0 = xmm6[0,1,1,3,4,5,6,7]
        pshufd  xmm0, xmm0, 212                 # xmm0 = xmm0[0,1,1,3]
        pand    xmm0, xmm3
        paddq   xmm0, xmm4
        pxor    xmm7, xmm2
        punpcklbw       xmm7, xmm7              # xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm1, xmm7, 212                 # xmm1 = xmm7[0,1,1,3,4,5,6,7]
        pshufd  xmm1, xmm1, 212                 # xmm1 = xmm1[0,1,1,3]
        pand    xmm1, xmm3
        paddq   xmm1, xmm5
        add     rax, 8
        add     rdx, -2
        jne     .LBB0_7
        test    r8b, 1
        je      .LBB0_10
.LBB0_9:
        movzx   ecx, word ptr [rdi + rax]
        movd    xmm2, ecx
        movzx   ecx, word ptr [rdi + rax + 2]
        movd    xmm3, ecx
        add     rax, rsi
        movzx   ecx, word ptr [rdi + rax]
        movd    xmm4, ecx
        pcmpeqb xmm4, xmm2
        movzx   eax, word ptr [rdi + rax + 2]
        movd    xmm2, eax
        pcmpeqb xmm2, xmm3
        pcmpeqd xmm3, xmm3
        pxor    xmm4, xmm3
        punpcklbw       xmm4, xmm4              # xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm4, xmm4, 212                 # xmm4 = xmm4[0,1,1,3,4,5,6,7]
        pshufd  xmm4, xmm4, 212                 # xmm4 = xmm4[0,1,1,3]
        movdqa  xmm5, xmmword ptr [rip + .LCPI0_0] # xmm5 = [1,1]
        pand    xmm4, xmm5
        paddq   xmm0, xmm4
        pxor    xmm2, xmm3
        punpcklbw       xmm2, xmm2              # xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm2, xmm2, 212                 # xmm2 = xmm2[0,1,1,3,4,5,6,7]
        pshufd  xmm2, xmm2, 212                 # xmm2 = xmm2[0,1,1,3]
        pand    xmm2, xmm5
        paddq   xmm1, xmm2
.LBB0_10:
        paddq   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddq   xmm1, xmm0
        movq    rax, xmm1
        cmp     r9, rsi
        je      .LBB0_13
.LBB0_11:
        lea     r8, [r9 + rsi]
        sub     rsi, r9
        add     r8, rdi
        add     rdi, r9
        xor     edx, edx
.LBB0_12:                               # =>This Inner Loop Header: Depth=1
        movzx   r9d, byte ptr [rdi + rdx]
        xor     ecx, ecx
        cmp     r9b, byte ptr [r8 + rdx]
        setne   cl
        add     rax, rcx
        add     rdx, 1
        cmp     rsi, rdx
        jne     .LBB0_12
.LBB0_13:
        ret
.LBB0_5:
        pxor    xmm0, xmm0
        xor     eax, eax
        pxor    xmm1, xmm1
        test    r8b, 1
        jne     .LBB0_9
        jmp     .LBB0_10
 
     
     
     
    