I have two functions that multiply 2D arrays (matrices): one uses SSE, the other has no optimization. Both functions work, but their results differ slightly, for example 20.333334 versus 20.333332.
Can you explain why the results are different? And what can I change in the functions so that they produce the same result?
Function with SSE:
/*
 * Multiplies two float matrices using SSE inline assembly and writes the
 * product into arraycheck.
 *
 * For every (i, j) the function computes the dot product of array1[i] and
 * array2[j], each read as columns1 contiguous floats, and stores it in
 * arraycheck[i][j] (any previous content is overwritten).
 *
 * Parameters:
 *   array1     - left operand; rows1 row pointers, columns1 floats per row.
 *   array2     - right operand; indexed by the OUTPUT column j and read as a
 *                contiguous run of columns1 floats, i.e. it appears to be
 *                expected in transposed layout -- confirm against the caller.
 *   arraycheck - destination; rows1 row pointers, columns2 floats per row.
 *
 * Returns arraycheck.
 *
 * rows1, columns1 and columns2 are read from outside this function (their
 * declarations are not part of this block).
 *
 * Preconditions (not checked here):
 *   - columns1 must be a non-zero multiple of 8: the loop consumes 8 floats
 *     per iteration and terminates only when the read pointer is exactly
 *     equal to the end-of-row pointer.
 *
 * Numerical note: the sum is built as four independent per-lane partial sums
 * that are combined at the end. Floating-point addition is not associative,
 * so the result may differ in the last bits from a strictly left-to-right
 * scalar summation of the same products.
 */
float** sse_multiplication(float** array1, float** array2, float** arraycheck)
{
    int i, j;
    float *ms1, *ms2, result;
    float *end_loop;
    for( i = 0; i < rows1; i++)
    {
        for( j = 0; j < columns2; j++)
        {
            result = 0;
            ms1 = array1[i];
            ms2 = array2[j];
            /* One-past-the-end of row i; used as the loop sentinel. */
            end_loop = &array1[i][columns1];
            /*
             * Register roles inside the asm block:
             *   rax  - read cursor into array1[i]
             *   rbx  - read cursor into array2[j]
             *   rdx  - end_loop sentinel for rax
             *   xmm2 - four-lane running sum of products
             */
            __asm{
                     mov rax, ms1
                     mov rbx, ms2
                     mov rdx, end_loop
                     xorps xmm2, xmm2
                dot_loop:
                     movups xmm0, [rax]
                     movups xmm1, [rbx]
                     movups xmm3, [rax+16]
                     movups xmm4, [rbx+16]
                     mulps xmm0, xmm1
                     mulps xmm3, xmm4
                     addps xmm2, xmm0
                     // Fix: the second group of four products was computed
                     // but never accumulated.
                     addps xmm2, xmm3
                     add rax, 32
                     add rbx, 32
                     cmp rdx, rax
                     jne dot_loop
                     // Two horizontal adds leave the sum of all four lanes
                     // in every lane of xmm2.
                     haddps xmm2, xmm2
                     haddps xmm2, xmm2
                     // Fix: store only the low float; a 16-byte movups would
                     // write past the 4-byte 'result' variable.
                     movss result, xmm2
               }
             arraycheck[i][j] = result;
        }
    }
    return arraycheck;
}
Function without any optimization:
float** multiplication(float** array1, float** array2, float** arraycheck)
{
    for (int i = 0; i < rows1; i++)
        for (int j = 0; j < columns2; j++)
            for (int k = 0; k < rows1; k++)
                arraycheck[i][j] += array1[i][k] * array2[k][j];
    return arraycheck;
}
 
     
     
    