# ----------------------------------------------------------------------
# strlen -- length of a NUL-terminated byte string.
# C equivalent: size_t strlen(const char *s)
#
# In:    rdi = address of the string
# Out:   rax = number of bytes before the first zero byte
# Clobb: rcx, rdi, r8, r9, r10, flags (rax is the result)
# Stack: not used.
#
# Syntax: GAS with Intel operand order; assumes `.intel_syntax noprefix`
#         is in effect earlier in the file -- TODO confirm.
#
# Strategy:
#   1. .Lalignlong: step one byte at a time until rdi is a multiple of
#      16, returning early if the terminator is met on the way.
#   2. .LfindNull / .Lfindloop: from the aligned address on, examine
#      8 bytes per iteration with the "word has a zero byte" bit trick.
#      An aligned 8-byte load never straddles a page boundary, so reading
#      a few bytes past the terminator inside the same word cannot fault.
#
# The prefetch hints use SSE `prefetcht0`, which exists on every x86-64
# CPU, rather than the AMD-specific 3DNow! `prefetch` opcode.
# ----------------------------------------------------------------------
strlen:
    xor     r8d, r8d                    # r8 = bytes counted so far (32-bit write clears all 64 bits)

.Lalignlong:
    test    rdi, 0xf                    # flags = rdi AND 15: ZF=1 iff the low 4 address bits are 0,
    je      .LfindNull                  #   i.e. rdi is 16-byte aligned -> switch to the 8-byte scan
    prefetcht0 [rdi + 8]                # hint only; has no architectural effect
    cmp     Byte PTR [rdi], 0           # terminator inside the unaligned prefix?
    je      .LansNoAdd                  #   yes: r8 already holds the length
    inc     r8                          # count this byte ...
    inc     rdi                         # ... and step to the next address
    jmp     .Lalignlong

# do-while is faster than while because of fewer jumps (Agner):
# the first aligned word is tested here, outside the loop, so the loop
# itself needs only the single backward branch at its bottom.
.LfindNull:
    mov     r9,  0xFEFEFEFEFEFEFEFF     # == -0x0101010101010101 (subtracts 1 from every byte)
    mov     r10, 0x8080808080808080     # top bit of every byte
                                        # citation: Bit Twiddling Hacks, Sean Eron Anderson
    prefetcht0 [rdi + 192]              # hint only
    mov     rcx, [rdi]                  # rcx = v, the first aligned 8 bytes
    lea     rax, [rcx + r9]             # rax = v - 0x0101...01
    not     rcx                         # rcx = ~v
    and     rcx, rax                    # rcx = (v - 0x01..01) & ~v
    and     rcx, r10                    #     & 0x80..80 ; non-zero iff v contains a zero byte
    jne     .Lanswer
    nop                                 # original author's note: makes the loop 2 cycles faster
                                        # (findloop changes from 4a -> 4b); presumably a
                                        # code-alignment effect -- unverified

.Lfindloop:
    prefetcht0 [rdi + 420]              # hint only; distance kept from the original tuning
    mov     rcx, [rdi + 8]              # rcx = next 8 bytes
    add     rdi, 8                      # advance the pointer ...
    add     r8, 8                       # ... and the running count by one word
    lea     rax, [rcx + r9]             # same zero-byte test as above
    not     rcx
    and     rcx, rax
    and     rcx, r10
    je      .Lfindloop                  # no zero byte in this word -> keep scanning

# Here rcx != 0 and r8 = number of bytes preceding the current word.
# Borrow propagation can only create false marks above the first zero
# byte, so the lowest set bit always identifies the real terminator.
.Lanswer:
    bsf     rcx, rcx                    # bit index of the lowest mark = 8*i + 7 for byte i
    shr     rcx, 3                      # -> i, offset of the terminator within the word
    lea     rax, [rcx + r8]             # length = bytes before the word + i
    ret

# Terminator was found during the byte-wise alignment prologue.
.LansNoAdd:
    mov     rax, r8
    ret
This is supposed to be x86-64 assembly code that computes the length of a null-terminated char string whose address is passed in RDI.
I don't understand the first part, .Lalignlong: does it perform the data alignment?
If so, how is it supposed to work? The line test rdi, 0xf in particular confuses me.
 
    