I'm currently trying to implement an XOR_SHIFT Random Number Generator using AVX2, it's actually quite easy and very fast. However I need to be able to specify a range. This usually requires modulo.
This is a major problem for me for 2 reasons:
- Adding the _mm256_rem_epu32() / _mm256_rem_epi32() SVML function to my code takes the run time of my loop from around 270ms to 1.8 seconds. Ouch!
- SVML is only available on MSVC and Intel Compilers
Are the any significantly faster ways to do modulo using AVX2?
Non Vector Code:
 std::srand(std::time(nullptr));
 std::mt19937_64 e(std::rand());
 uint32_t seed = static_cast<uint32_t>(e());
 for (; i != end; ++i)
 {
      seed ^= (seed << 13u);
      seed ^= (seed >> 7u);
      seed ^= (seed << 17u);
      arr[i] = static_cast<T>(low + (seed % ((up + 1u) - low)));
 }//End for
Vectorized:
  constexpr uint32_t thirteen = 13u;
  constexpr uint32_t seven = 7u;
  constexpr uint32_t seventeen = 17u;
  const __m256i _one = _mm256_set1_epi32(1);
  const __m256i _lower = _mm256_set1_epi32(static_cast<uint32_t>(low));
  const __m256i _upper = _mm256_set1_epi32(static_cast<uint32_t>(up));
                           
  __m256i _temp = _mm256_setzero_si256();
  __m256i _res = _mm256_setzero_si256();
                                                            
  __m256i _seed = _mm256_set_epi32(
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e()),
       static_cast<uint32_t>(e())
  );
  for (; (i + 8uz) < end; ++i)
  {
       //Generate Random Numbers
       _temp = _mm256_slli_epi32(_seed, thirteen);
       _seed = _mm256_xor_si256(_seed, _temp);
       _temp = _mm256_srai_epi32(_seed, seven);
       _seed = _mm256_xor_si256(_seed, _temp);
       _temp = _mm256_slli_epi32(_seed, seventeen);
       _seed = _mm256_xor_si256(_seed, _temp);
       //Narrow
       _temp = _mm256_add_epi32(_upper, _one);
       _temp = _mm256_sub_epi32(_temp, _lower);
       _temp = _mm256_rem_epu32(_seed, _temp); //Comment this line out for a massive speed up but incorrect results
       _res = _mm256_add_epi32(_lower, _temp);                                        
       _mm256_store_si256((__m256i*) &arr[i], _res);
  }//End for
 
    