I'm having trouble understanding a certain piece of assembly code. The task is to find the dot product of two vectors using SSE arithmetic and the XMM registers. The approach is to read the vectors four floats at a time (meaning one XMM register holds four floats per iteration). The end result is an XMM register in which each of the four 32-bit float lanes holds a partial sum of products (x1*y1 + ...) of the given vectors.
What I don't get is the part that comes afterwards. All that is needed is to sum these four partial results together, i.e. add up the four float lanes that make up the final register. I tried finding something on this, but to no avail. What I'm given is beyond my understanding; I even tried writing out every computation on paper, but nothing made much sense. In the highlighted part, the actual sum is computed and stored in the lowest 32-bit lane of xmm0. Any insight on this is welcome.
# GNU assembler, Intel operand order (op dst, src), registers written without '%'.
.intel_syntax noprefix
.data
# 32-bit integer constant 2. No instruction in the visible part of this file
# references it -- NOTE(review): confirm it is unused before removing.
two: .int 2
.text
# Export dot_product so it can be linked against from other objects.
.global dot_product
############################################################################
##
## Function:
##
## void dot_product(float *x, float *y, int n, float *r);
##
## Calculates the dot product of x and y (n floats each) and stores the
## result in *r.  A count n <= 0 is treated as an empty vector, so 0.0 is
## stored in *r.
##
## ABI: System V AMD64, GAS Intel syntax.
##
## -- float * x -- rdi --
## -- float * y -- rsi --
## -- int n     -- edx -- (32-bit; bits 63:32 of rdx are not defined)
## -- float * r -- rcx --
##
## Clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, r10, flags.
## rbx is preserved.  The x87/SSE register state is saved with fxsave and
## restored with fxrstor, so the caller's XMM registers survive the call.
## If CPUID reports no SSE support, the exit system call is issued with
## status 1 and the function does not return.
##
############################################################################
dot_product:
        push    rbp
        mov     rbp, rsp                # frame pointer: 'leave' undoes the realignment below
        mov     r8, rcx                 # r8  = r          (cpuid overwrites rcx)
        movsxd  r9, edx                 # r9  = (long) n   (cpuid overwrites rdx; n is a 32-bit int)
        mov     r10, rbx                # cpuid overwrites rbx, which is callee-saved
        mov     eax, 1
        cpuid
        mov     rbx, r10                # mov leaves the flags and edx untouched
        test    edx, 0x2000000          # CPUID.01H:EDX bit 25 = SSE
        jz      not_supported

        and     rsp, -16                # fxsave needs a 16-byte aligned save area
        sub     rsp, 512
        fxsave  [rsp]

        xorps   xmm0, xmm0              # xmm0 = four running partial sums, all 0.0
        mov     rcx, r9                 # rcx = elements remaining
        test    rcx, rcx
        jle     finish                  # n <= 0: nothing to accumulate, store 0.0

next_four:
        # Invariant: rcx > 0 elements remain; lane i of xmm0 accumulates the
        # products of elements i, i+4, i+8, ...
        cmp     rcx, 4
        jb      next_one
        movups  xmm1, [rsi]             # four floats of y (no alignment assumed)
        movups  xmm2, [rdi]             # four floats of x
        mulps   xmm1, xmm2
        addps   xmm0, xmm1
        add     rsi, 16
        add     rdi, 16
        sub     rcx, 4
        jmp     next_four

next_one:
        # Fewer than four elements left: fold them into lane 0 one at a time.
        jrcxz   finish
        movss   xmm1, [rsi]
        movss   xmm2, [rdi]
        mulss   xmm1, xmm2
        addss   xmm0, xmm1
        add     rsi, 4
        add     rdi, 4
        dec     rcx
        jmp     next_one

finish:
        # Horizontal sum of the four float lanes xmm0 = [s0, s1, s2, s3]
        # (lane 0 listed first).
        movaps  xmm1, xmm0              # start from a copy so no stale lanes enter the addps
        movhlps xmm1, xmm0              # xmm1 = [s2, s3, s2, s3]
        addps   xmm0, xmm1              # xmm0 = [s0+s2, s1+s3, .., ..]
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0b01010101  # broadcast lane 1: xmm1 = [s1+s3, s1+s3, ...]
        addss   xmm0, xmm1              # lane 0 = (s0+s2) + (s1+s3)
        movss   [r8], xmm0              # *r = dot product
        fxrstor [rsp]                   # give the caller back its x87/SSE state
done:
        leave                           # rsp = rbp (drops the save area), pop rbp
        ret

not_supported:
        mov     eax, 60                 # x86-64 Linux system call number for exit
        mov     edi, 1                  # exit status 1
        syscall
        ud2                             # exit does not return; trap if it somehow does