How to calculate approximate reciprocal square root of an array faster on a cpu with popcnt and SSE4.2?
The input is positive integers (ranges from 0 to about 200,000) stored in an array of floats.
The output is an array of floats.
Both arrays have correct memory alignment for sse.
The code below only use 1 xmm register, runs on linux, and can be compiled by gcc -O3 code.cpp -lrt -msse4.2
Thank you.
#include <iostream>
#include <emmintrin.h>
#include <time.h>
using namespace std;
void print_xmm(__m128 xmm){
    float out[4];
    _mm_storeu_ps(out,xmm);
    int i;
    for (i = 0; i < 4; ++i) std::cout << out[i] << " ";
    std::cout << std::endl;
}
void print_arr(float* ptr, size_t size){
    size_t i;
    for(i = 0; i < size; ++i){
        cout << ptr[i] << " ";
    }
    cout << endl;
}
int main(void){
    size_t size = 25000 * 4;
        // this has to be multiple of 4
    size_t repeat = 10000;
        // test 10000 cycles of the code
    float* ar_in = (float*)aligned_alloc(16, size*sizeof(float));
    float* ar_out = (float*)aligned_alloc(16, size*sizeof(float));
    //fill test data into the input array
    //the data is an array of positive numbers.
    size_t i;
    for (i = 0; i < size; ++i){
        ar_in[i] = (i+1) * (i+1);
    }
    //prepare for recipical square root.
    __m128 xmm0;
    size_t size_fix = size*sizeof(float)/sizeof(__m128);
    float* ar_in_end  = ar_in + size_fix;
    float* ar_out_now;
    float* ar_in_now;
    //timing
    struct timespec tp_start, tp_end;
    i = repeat;
    clock_gettime(CLOCK_MONOTONIC, &tp_start);
    //start timing
    while(--i){
        ar_out_now = ar_out;
        for(ar_in_now = ar_in;
            ar_in_now != ar_in_end;
            ar_in_now += 4, ar_out_now+=4){
            //4 = sizeof(__m128)/sizeof(float);
            xmm0 = _mm_load_ps(ar_in_now);
            //cout << "load xmm: ";
            //print_xmm(xmm0);
            xmm0 = _mm_rsqrt_ps(xmm0);
            //cout << "rsqrt xmm: ";
            //print_xmm(xmm0);
            _mm_store_ps(ar_out_now,xmm0);
        }
    }
    //end timing
    clock_gettime(CLOCK_MONOTONIC, &tp_end);
    double timing;
    const double nano = 0.000000001;
    timing = ((double)(tp_end.tv_sec  - tp_start.tv_sec )
           + (tp_end.tv_nsec - tp_start.tv_nsec) * nano)/repeat;
    cout << "    timing per cycle: " << timing << endl;
    /* 
    cout << "input array: ";
    print_arr(ar_in, size);
    cout << "output array: ";
    print_arr(ar_out,size);
    */
    //free mem
    free(ar_in);
    free(ar_out);
    return 0;
}
 
     
     
     
    