I decided to write a string-length function in assembly (using FASM).
My function takes a string (whether or not it is aligned to 8 bytes) and checks whether its address is 8-byte aligned. If it is, the main loop begins immediately. Otherwise, the first 8 characters are checked one by one, then the pointer is rounded up to the next 8-byte boundary and the main loop continues from there.
There is no "end of the memory page" problem, because the main loop only ever reads from 8-byte-aligned addresses, and an aligned 8-byte read never crosses a page boundary.
The problem is that I also implemented a C version and compiled it, so now I have two assembly listings: the one I wrote by hand and the one GCC generated from the C code. The C version is up to 1.5x faster than my handwritten assembly! In my code everything looks fine: I even aligned the jump targets to 16 bytes, and no nop is executed except in one place outside the loop (the padding between .align8 and .loop), which should be negligible.
I can't figure out why my pure assembly code is 1.5x slower than the GCC version.
My Assembly source-code :
 align 16
; slen (handwritten): length of a NUL-terminated string.
;   In : rcx = pointer to the string (Windows x64 ABI, first argument)
;   Out: rax = number of bytes before the terminating zero
;   Uses r8 as a copy of the original pointer; rcx becomes the scan pointer.
; If the pointer is not 8-byte aligned, bytes 0..7 are tested one at a time,
; then the scan pointer is rounded up to the next 8-byte boundary. The main
; loop loads one aligned qword per iteration and tests its bytes low to high.
slen:
        mov     r8, rcx                 ; keep the start address for the final subtraction
        test    cl, 7                   ; low three address bits clear => already 8-byte aligned
        jz      .loop
        xor     eax, eax                ; rax = 0: al is the zero comparand and also the length-0 result
        cmp     BYTE [rcx], al
        je      SHORT .ret
        cmp     BYTE [rcx+1], al
        je      SHORT .ret1
        cmp     BYTE [rcx+2], al
        je      SHORT .ret2
        cmp     BYTE [rcx+3], al
        je      SHORT .ret3
        cmp     BYTE [rcx+4], al
        je      SHORT .ret4
        cmp     BYTE [rcx+5], al
        je      SHORT .ret5
        cmp     BYTE [rcx+6], al
        je      SHORT .ret6
        cmp     BYTE [rcx+7], al
        jne     SHORT .align8           ; all eight bytes non-zero: align and enter the qword loop
        mov     al, 7                   ; terminator at offset 7
        ret
; Early exits for a terminator found in the byte-wise prefix. Only al is
; written; the rest of rax is still zero from the xor above.
 align 16
 .ret:  ret
 align 16
 .ret1: mov     al, 1
        ret
 align 16
 .ret2: mov     al, 2
        ret
 align 16
 .ret3: mov     al, 3
        ret
 align 16
 .ret4: mov     al, 4
        ret
 align 16
 .ret5: mov     al, 5
        ret
 align 16
 .ret6: mov     al, 6
        ret
 align 16
 .align8:
        ; Round rcx up to the next multiple of 8. The bytes skipped here were
        ; already found to be non-zero by the byte-wise checks above.
        lea     rcx, [rcx+7]
        and     rcx, (-8)
; Any padding emitted by this align is executed once when falling through
; from .align8; it is not part of the loop body.
 align 16
 .loop: mov     rax, QWORD [rcx]        ; load eight string bytes at once
        test    al, al                  ; byte 0
        jz      SHORT .end
        test    ah, ah                  ; byte 1
        jz      SHORT .end.1
        test    eax, 0x00ff0000         ; byte 2
        jz      SHORT .end.2
        test    eax, 0xff000000         ; byte 3
        jz      SHORT .end.3
        shr     rax, 32                 ; bring bytes 4..7 down into eax
        test    al, al                  ; byte 4
        jz      SHORT .end.4
        test    ah, ah                  ; byte 5
        jz      SHORT .end.5
        test    eax, 0x00ff0000         ; byte 6
        jz      SHORT .end.6
        test    eax, 0xff000000         ; byte 7
        jz      SHORT .end.7
        add     rcx, 8                  ; no terminator in this qword: advance to the next one
        jmp     SHORT .loop
; Exits from the qword loop: length = (rcx + byte index) - original pointer.
 align 16
 .end: mov      rax, rcx
        sub     rax, r8
        ret
 align 16
 .end.1:
        lea     rax, [rcx+1]
        sub     rax, r8
        ret
; NOTE(review): unlike .end and .end.1, the labels below carry no align
; directive.
 .end.2:
        lea     rax, [rcx+2]
        sub     rax, r8
        ret
 .end.3:
        lea     rax, [rcx+3]
        sub     rax, r8
        ret
 .end.4:
        lea     rax, [rcx+4]
        sub     rax, r8
        ret
 .end.5:
        lea     rax, [rcx+5]
        sub     rax, r8
        ret
 .end.6:
        lea     rax, [rcx+6]
        sub     rax, r8
        ret
 .end.7:
        lea     rax, [rcx+7]
        sub     rax, r8
        ret       
The GCC version :
 align 16
; slen (GCC output, FASM syntax): length of a NUL-terminated string.
;   In : rcx = pointer to the string (Windows x64 ABI, first argument)
;   Out: eax = length (the result is produced with 32-bit arithmetic)
;   rcx keeps the original pointer throughout; rax is the scan pointer and
;   rdx holds the qword currently being examined.
slen:
        test    cl, 7                   ; low three address bits clear => already 8-byte aligned
        je      .L18
        xor     eax, eax                ; result 0 if the very first byte is the terminator
        cmp     BYTE [rcx], 0
        je      .L1
        ; In each pair below, mov loads the candidate result between the cmp
        ; and the je; mov does not modify flags, so je still acts on the cmp.
        cmp     BYTE [rcx+1], 0
        mov     eax, 1
        je      .L1
        cmp     BYTE [rcx+2], 0
        mov     eax, 2
        je      .L1
        cmp     BYTE [rcx+3], 0
        mov     eax, 3
        je      .L1
        cmp     BYTE [rcx+4], 0
        mov     eax, 4
        je      .L1
        cmp     BYTE [rcx+5], 0
        mov     eax, 5
        je      .L1
        cmp     BYTE [rcx+6], 0
        mov     eax, 6
        je      .L1
        cmp     BYTE [rcx+7], 0
        mov     eax, 7
        je      .L1
        ; All eight bytes non-zero: scan pointer = start rounded up to a multiple of 8.
        lea     rax, [rcx+7]
        and     rax, -8
        jmp     .L47
 align 16
.L18:
        mov     rax, rcx                ; aligned input: scanning starts at the string itself
        jmp     .L47
; Loop body. The loop is entered at .L47 (load + byte-0 test); .L40 handles
; bytes 1..7 of the qword in rdx and then falls through into .L47 again.
 align 16
.L40:
        test    dh, dh                  ; byte 1
        je      .L49
        test    edx, 16711680           ; byte 2 (mask 0x00FF0000)
        je      .L50
        test    edx, 4278190080         ; byte 3 (mask 0xFF000000)
        je      .L51
        shr     rdx, 32                 ; bring bytes 4..7 down into edx
        test    dl, dl                  ; byte 4
        je      .L52
        test    dh, dh                  ; byte 5
        je      .L53
        test    edx, 16711680           ; byte 6
        je      .L54
        test    edx, 4278190080         ; byte 7
        je      .L55
        add     rax, 8                  ; no terminator in this qword: advance to the next one
.L47:
        mov     rdx, QWORD [rax]        ; load eight string bytes at once
        test    dl, dl                  ; byte 0
        jne     .L40                    ; non-zero: examine the remaining bytes
        sub     eax, ecx                ; terminator at byte 0: length = scan pointer - start
.L1:
        ret
; Exits for a terminator at byte N of the current qword:
; length = (scan pointer - start) + N.
 align 16
.L49:
        sub     rax, rcx
        add     eax, 1
        ret
 align 16
.L50:
        sub     rax, rcx
        add     eax, 2
        ret
 align 16
.L51:
        sub     rax, rcx
        add     eax, 3
        ret
 align 16
.L52:
        sub     rax, rcx
        add     eax, 4
        ret
 align 16
.L53:
        sub     rax, rcx
        add     eax, 5
        ret
 align 16
.L54:
        sub     rax, rcx
        add     eax, 6
        ret
 align 16
.L55:
        sub     rax, rcx
        add     eax, 7
        ret   
My function test result :
string length => 336
loop execution times => 10000000
total execution time => 0.772015
GCC function test result :
string length => 336
loop execution times => 10000000
total execution time => 0.522015
What is the problem? Why is my function 1.5x slower when everything looks fine? My test string is aligned to 8 bytes, so you can ignore the initial byte-by-byte checks and the alignment step.
Is there a problem with my label alignment, or does the problem come from somewhere else?
ABI -> x64 (Windows)
CPU (Test) => i7-7800X
My C test application source-code :
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
unsigned int
slen_by_me(const char *);
unsigned int
slen_gcc(const char *);
/*
 * Benchmark driver: calls slen_gcc() 10,000,000 times on a fixed string and
 * prints the elapsed time in seconds, followed by the sum of the returned
 * lengths. Timing uses QueryPerformanceCounter / QueryPerformanceFrequency.
 */
int main() {
    /* 336 'W' characters, written as 42 adjacent 8-character literals. */
    static const char *text =
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW"
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW"
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW"
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW"
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW"
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW"
        "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW" "WWWWWWWW";
    LARGE_INTEGER ticks_per_second;
    LARGE_INTEGER t_begin;
    LARGE_INTEGER t_finish;
    unsigned int total = 0;
    int remaining = 10000000;

    QueryPerformanceFrequency(&ticks_per_second);

    /* Timed region: only the repeated calls sit between the two counter reads. */
    QueryPerformanceCounter(&t_begin);
    while (remaining-- > 0) {
        total += slen_gcc(text);
    }
    QueryPerformanceCounter(&t_finish);

    /* Counter ticks divided by ticks-per-second gives elapsed seconds. */
    double seconds = (double) (t_finish.QuadPart - t_begin.QuadPart) / ticks_per_second.QuadPart;

    printf("%f\n%u\n", seconds, total);
    return 0;
}
The FASM source that builds the object file (containing both slen functions) which is linked with the C test program above:
format MS64 COFF
public slen_gcc
public slen_by_me
section '.text' code readable executable align 64
 align 16
; slen_gcc: GCC-generated string length routine (exported via `public`).
;   In : rcx = pointer to a NUL-terminated string (Windows x64 ABI)
;   Out: eax = length, computed with 32-bit arithmetic
;   rcx is never modified; rax is the scan pointer, rdx the current qword.
slen_gcc:
        test    cl, 7                   ; is the pointer 8-byte aligned?
        je      .L18
        xor     eax, eax                ; candidate result 0
        cmp     BYTE [rcx], 0
        je      .L1
        ; Unaligned prefix: test bytes 1..7 individually. Each mov sets the
        ; candidate result without touching the flags produced by the cmp.
        cmp     BYTE [rcx+1], 0
        mov     eax, 1
        je      .L1
        cmp     BYTE [rcx+2], 0
        mov     eax, 2
        je      .L1
        cmp     BYTE [rcx+3], 0
        mov     eax, 3
        je      .L1
        cmp     BYTE [rcx+4], 0
        mov     eax, 4
        je      .L1
        cmp     BYTE [rcx+5], 0
        mov     eax, 5
        je      .L1
        cmp     BYTE [rcx+6], 0
        mov     eax, 6
        je      .L1
        cmp     BYTE [rcx+7], 0
        mov     eax, 7
        je      .L1
        lea     rax, [rcx+7]            ; rax = (rcx + 7) & -8: next 8-byte boundary
        and     rax, -8
        jmp     .L47
 align 16
.L18:
        mov     rax, rcx                ; aligned case: scan from the start of the string
        jmp     .L47
; Remainder of the loop body: bytes 1..7 of the qword loaded at .L47.
 align 16
.L40:
        test    dh, dh                  ; byte 1
        je      .L49
        test    edx, 16711680           ; byte 2 (0x00FF0000)
        je      .L50
        test    edx, 4278190080         ; byte 3 (0xFF000000)
        je      .L51
        shr     rdx, 32                 ; upper four bytes become the low four
        test    dl, dl                  ; byte 4
        je      .L52
        test    dh, dh                  ; byte 5
        je      .L53
        test    edx, 16711680           ; byte 6
        je      .L54
        test    edx, 4278190080         ; byte 7
        je      .L55
        add     rax, 8                  ; advance to the next qword
; Loop entry point and back edge: load a qword and test its lowest byte.
.L47:
        mov     rdx, QWORD [rax]
        test    dl, dl                  ; byte 0
        jne     .L40                    ; not the terminator: keep checking this qword
        sub     eax, ecx                ; terminator at byte 0: length = rax - rcx
.L1:
        ret
; Exits: terminator found at byte N => length = (rax - rcx) + N.
 align 16
.L49:
        sub     rax, rcx
        add     eax, 1
        ret
 align 16
.L50:
        sub     rax, rcx
        add     eax, 2
        ret
 align 16
.L51:
        sub     rax, rcx
        add     eax, 3
        ret
 align 16
.L52:
        sub     rax, rcx
        add     eax, 4
        ret
 align 16
.L53:
        sub     rax, rcx
        add     eax, 5
        ret
 align 16
.L54:
        sub     rax, rcx
        add     eax, 6
        ret
 align 16
.L55:
        sub     rax, rcx
        add     eax, 7
        ret
 align 16
; slen_by_me: handwritten string length routine (exported via `public`).
;   In : rcx = pointer to a NUL-terminated string (Windows x64 ABI)
;   Out: rax = length
;   r8 holds the original pointer; rcx is advanced as the scan pointer.
slen_by_me:
        mov     r8, rcx                 ; remember where the string starts
        test    cl, 7                   ; is the pointer 8-byte aligned?
        jz      .loop
        xor     eax, eax                ; al = 0 comparand; rax = 0 is also the empty-string result
        ; Unaligned prefix: test bytes 0..7 individually.
        cmp     BYTE [rcx], al
        je      SHORT .ret
        cmp     BYTE [rcx+1], al
        je      SHORT .ret1
        cmp     BYTE [rcx+2], al
        je      SHORT .ret2
        cmp     BYTE [rcx+3], al
        je      SHORT .ret3
        cmp     BYTE [rcx+4], al
        je      SHORT .ret4
        cmp     BYTE [rcx+5], al
        je      SHORT .ret5
        cmp     BYTE [rcx+6], al
        je      SHORT .ret6
        cmp     BYTE [rcx+7], al
        jne     SHORT .align8           ; no terminator in the first eight bytes
        mov     al, 7
        ret
; Prefix exits: only al is set; the upper bits of rax were cleared by the xor.
 align 16
 .ret:  ret
 align 16
 .ret1: mov     al, 1
        ret
 align 16
 .ret2: mov     al, 2
        ret
 align 16
 .ret3: mov     al, 3
        ret
 align 16
 .ret4: mov     al, 4
        ret
 align 16
 .ret5: mov     al, 5
        ret
 align 16
 .ret6: mov     al, 6
        ret
 align 16
 .align8:
        lea     rcx, [rcx+7]            ; rcx = (rcx + 7) & -8: next 8-byte boundary
        and     rcx, (-8)
; Padding from this align lies on the fall-through path from .align8 only.
 align 16
 .loop: mov     rax, QWORD [rcx]        ; fetch the next eight bytes
        test    al, al                  ; byte 0
        jz      SHORT .end
        test    ah, ah                  ; byte 1
        jz      SHORT .end.1
        test    eax, 0x00ff0000         ; byte 2
        jz      SHORT .end.2
        test    eax, 0xff000000         ; byte 3
        jz      SHORT .end.3
        shr     rax, 32                 ; upper four bytes become the low four
        test    al, al                  ; byte 4
        jz      SHORT .end.4
        test    ah, ah                  ; byte 5
        jz      SHORT .end.5
        test    eax, 0x00ff0000         ; byte 6
        jz      SHORT .end.6
        test    eax, 0xff000000         ; byte 7
        jz      SHORT .end.7
        add     rcx, 8                  ; advance to the next qword
        jmp     SHORT .loop
; Loop exits: terminator at byte N => length = (rcx + N) - r8.
 align 16
 .end: mov      rax, rcx
        sub     rax, r8
        ret
 align 16
 .end.1:
        lea     rax, [rcx+1]
        sub     rax, r8
        ret
; NOTE(review): the remaining exit labels have no align directive, unlike
; .end and .end.1.
 .end.2:
        lea     rax, [rcx+2]
        sub     rax, r8
        ret
 .end.3:
        lea     rax, [rcx+3]
        sub     rax, r8
        ret
 .end.4:
        lea     rax, [rcx+4]
        sub     rax, r8
        ret
 .end.5:
        lea     rax, [rcx+5]
        sub     rax, r8
        ret
 .end.6:
        lea     rax, [rcx+6]
        sub     rax, r8
        ret
 .end.7:
        lea     rax, [rcx+7]
        sub     rax, r8
        ret
Also the C version of slen
/*
 * slen - length of a NUL-terminated string, scanned eight bytes at a time.
 *
 * When str is not 8-byte aligned, its first eight bytes are tested one by
 * one and the scan position is then rounded up to the next 8-byte boundary.
 * After that, each iteration loads one 64-bit word and tests its bytes from
 * the least significant upwards (this ordering matches string order on a
 * little-endian target).
 *
 * Returns the number of characters before the terminator, converted to int.
 *
 * NOTE(review): the word load can read up to seven bytes beyond the
 * terminator and accesses char data through an unsigned long long lvalue;
 * confirm both are acceptable for the intended compiler and platform.
 */
int
slen(const char *str) {
    const char *cursor = str;

    if (((unsigned long long)cursor & 7) != 0) {
        /* Unaligned start: byte-wise test of the first eight characters. */
        for (int i = 0; i < 8; i++) {
            if (cursor[i] == '\0') {
                return i;
            }
        }
        /* Round up to the next multiple of 8. */
        cursor = (const char *)(((unsigned long long)cursor + 7) & (-8));
    }

    for (;;) {
        const unsigned long long word = *(unsigned long long *)cursor;
        const unsigned long long high = word >> 32;

        /* Bytes 0..3 live in the low half of the word. */
        if ((word & 0xffULL) == 0) {
            return (int)(cursor - str);
        }
        if ((word & (0xffULL << 8)) == 0) {
            return (int)(cursor - str + 1);
        }
        if ((word & (0xffULL << 16)) == 0) {
            return (int)(cursor - str + 2);
        }
        if ((word & (0xffULL << 24)) == 0) {
            return (int)(cursor - str + 3);
        }

        /* Bytes 4..7 live in the high half. */
        if ((high & 0xffULL) == 0) {
            return (int)(cursor - str + 4);
        }
        if ((high & (0xffULL << 8)) == 0) {
            return (int)(cursor - str + 5);
        }
        if ((high & (0xffULL << 16)) == 0) {
            return (int)(cursor - str + 6);
        }
        if ((high & (0xffULL << 24)) == 0) {
            return (int)(cursor - str + 7);
        }

        cursor += 8;
    }
}
                
 
     
    