I'm learning writing some SSE code. Here is one test program that compare the SSE code and normal c++ code in both accuracy and performance. The program take two vectors of a given size, and output the sum value of all the elements. For example, when size=3, v1={1, 2, 3}, v2={2, 3, 4}, the result will be (1+2) + (2+3) + (3+4) = 15. The vector's values is generated by generator with fixed seed. And the size of the vectors is divisible by 4. Here is the code:
test.cpp:
#include <random>
#include <iostream>
#if defined USESSE
    #include <pmmintrin.h>
#endif
#include "timer.h"
using namespace std;
float* init_vector(int size) {
    default_random_engine generator(100);  // seed
    uniform_real_distribution<float> distribution(0.0, 1.0);
    float* vec = (float*)malloc(size*sizeof(float));
    for (int i=0; i<size; i++) {
        vec[i] = distribution(generator);
    }
    return vec;
}
#if defined USESSE
    float vec_sum(float* v1, float* v2, int size) {
        __m128 vec_sumed = _mm_setzero_ps();  // {0, 0, 0, 0}
        // vertical sum of two vectors
        for (int i=0; i<size; i+=4) {
            // sum vertically and then horizentally
            vec_sumed = _mm_add_ps(vec_sumed, _mm_add_ps(_mm_load_ps(v1+i), _mm_load_ps(v2+i)));
        }
        float* v = (float*)malloc(4*sizeof(float));
        _mm_store_ps(v, vec_sumed);
        // make horizental sum over the final vec
        float result = 0;
        for (int i=0; i<4; i++) {
            result += v[i];
        }
        return result;
    }
#else
    float vec_sum(float* v1, float* v2, int size) {
        float result = 0;
        for (int i=0; i<size; i++) {
            result += v1[i]+v2[i];
        }
        return result;
    }
#endif
void make_test(int size) {
    float* vec1 = init_vector(size);
    float* vec2 = init_vector(size);
    Timer timer;
    timer.tic();
    cout << vec_sum(vec1, vec2, size)/size << endl;
    timer.toc();
    cout << "Run time: " << timer.get() << endl;
}
int main() {
#if defined USESSE
    cout << "with SSE" << endl;
#else
    cout << "without SSE" << endl;
#endif
    make_test(40000000);
}
timer.h (Only for timing the functions).
#pragma once
#include <chrono>
#include <string>
class Timer
{
public:
    Timer(){reset();};
    void reset() {
        begin = std::chrono::high_resolution_clock::now();
        duration = std::chrono::duration_cast<std::chrono::milliseconds>(begin-begin);
    };
    void tic() {
        begin = std::chrono::high_resolution_clock::now();
    };
    float toc() {
        duration += std::chrono::duration_cast<std::chrono::milliseconds>
                        (std::chrono::high_resolution_clock::now()-begin);
        return get();
    };
    float get() {
        return (float)duration.count() / 1000;
    }
private:
    std::chrono::high_resolution_clock::time_point begin;
    std::chrono::milliseconds duration;
};
I use the compile and run command g++ -std=c++14 test.cpp -DUSESSE -o test && ./test for using SSE and g++ -std=c++14 test.cpp -o test && ./test for not using SSE. The result is:
with SSE
0.999982
Run time: 0.071
------------------------
without SSE
0.838861
Run time: 0.124
If I change the size into a smaller number, let's say 400, the results is the same:
with SSE
1.01521
Run time: 0
-----------------------
without SSE
1.01521
Run time: 0
But it is hard to compare the performance using a small number. My question is, with a large vector size, why the SSE version function produces a different result from the normal c++ function?
