I am trying to rewrite C++ source code that uses SSE intrinsics into plain C++ code. I know I will lose performance, but it is an experiment I am trying to perform. I was wondering if there is a plain C++ equivalent of `_mm_unpackhi_pd` and `_mm_unpacklo_pd`. I have zero knowledge of SSE.
Here is a snippet of the code I am trying to convert, for reference. Any knowledge or tips would be helpful. Thank you.
for (unsigned chunk = 0; chunk < chunks; chunk++)
{
  unsigned start = chunk * chunksize;
  unsigned end =
    std::min((chunk + 1) * chunksize, (unsigned)2 * w);
  __m128d a2b2 =
    _mm_load_pd(d_origx +
                ((2 * init_G_offset + start) & n2_m_1));
  unsigned i2_mod_B = 0;
  for (unsigned i = start; i < end; i += 2)
    {
      __m128d ab = a2b2;
      a2b2 =
        _mm_load_pd(d_origx +
                    ((origx_offset + i) & n2_m_1));
      __m128d cd = _mm_load_pd(d_filter + i);
      __m128d cc = _mm_unpacklo_pd(cd, cd);
      __m128d dd = _mm_unpackhi_pd(cd, cd);
      __m128d a0a1 = _mm_unpacklo_pd(ab, a2b2);
      __m128d b0b1 = _mm_unpackhi_pd(ab, a2b2);
      __m128d ac = _mm_mul_pd(cc, a0a1);
      __m128d ad = _mm_mul_pd(dd, a0a1);
      __m128d bc = _mm_mul_pd(cc, b0b1);
      __m128d bd = _mm_mul_pd(dd, b0b1);
      __m128d ac_m_bd = _mm_sub_pd(ac, bd);
      __m128d ad_p_bc = _mm_add_pd(ad, bc);
      __m128d ab_times_cd = _mm_unpacklo_pd(ac_m_bd, ad_p_bc);
      __m128d a2b2_times_cd =
        _mm_unpackhi_pd(ac_m_bd, ad_p_bc);
      __m128d xy = _mm_load_pd(d_x_sampt + i2_mod_B);
      __m128d x2y2 = _mm_load_pd(d_x_sampt + i2_mod_B + 2);
      __m128d st = _mm_add_pd(xy, ab_times_cd);
      __m128d s2t2 = _mm_add_pd(x2y2, a2b2_times_cd);
      _mm_store_pd(d_x_sampt + i2_mod_B, st);
      _mm_store_pd(d_x_sampt + i2_mod_B + 2, s2t2);
      i2_mod_B += 4;
    }
}
 
     
    