I have the following code which compiles with GCC using the flag -msse4 but the problem is that the pop count only gets the last four 8-bits of the converted __m128i type. Basically what I want is to count all 16 numbers inside the __m128i type but I'm not sure what intrinsic function call to make after creating the variable popA. Somehow popA has to be converted into an integer that contains all the 128-bits of information? I suppose theres _mm_cvtsi128_si64 and using a few shuffle few operations but my OS is 32-bit. Is there only the shuffle method and using _mm_cvtsi128_si32?
EDIT: If the shuffle method is the only option I need help implementing it for my 32-bit OS, please.
Heres the code.
#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>
int main(void)
{
int A = 1;
__m128i popA = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
unsigned int integer = _mm_cvtsi128_si32(popA);
//long long LONG = _mm_cvtsi128_si64(popA);//my OS is 32-bits so no luck here
printf("integer = %d\n", integer);
int pop = _mm_popcnt_u32(integer);
//int popLONG = _mm_popcnt_u64(LONG);
printf("popcount = %d\n", pop);
//printf("popcount LONG = %d\n", popLONG);
return 0;
}
EDIT 2: This one finally runs (with GCC compiler flags -msse -msse2 -msse3 -msse4) although I'm not sure if the output for pop_count1() is correct.
Output:
pop_count1(): 1799 1799 1799 1799 1799 1799 1799 1799
pop_count2():population count for each byte: 1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7
#include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>
void print128_num(__m128i var)
{
uint16_t *val = (uint16_t*) &var;
printf("pop_count1(): %i %i %i %i %i %i %i %i \n",
val[0], val[1], val[2], val[3], val[4], val[5],
val[6], val[7]);
}
static __m128i parallelPopcnt16bytes (__m128i xmm)//for pop_count2
{
const __m128i mask4 = _mm_set1_epi8 (0x0F);
const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m128i low, high, count;
low = _mm_and_si128 (mask4, xmm);
high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
return count;
}
void pop_count1()
{
int A = 1;
__m128i in = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
__m128i bit0 = _mm_set1_epi8( 0x80 );
__m128i mask0 = _mm_and_si128( in, bit0 );
__m128i sum = _mm_cmpeq_epi8( mask0, _mm_setzero_si128() );
/* general pattern */
__m128i bit1 = _mm_set1_epi8( 0x40 );
__m128i mask1 = _mm_and_si128( in, bit1 );
mask1 = _mm_cmpeq_epi8( mask1, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask1 );
/* next bit */
__m128i bit2 = _mm_set1_epi8( 0x20 );
__m128i mask2 = _mm_and_si128( in, bit2 );
mask2 = _mm_cmpeq_epi8( mask2, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask2 );
__m128i bit3 = _mm_set1_epi8( 0x10 );
__m128i mask3 = _mm_and_si128( in, bit3 );
mask3 = _mm_cmpeq_epi8( mask3, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask3 );
__m128i bit4 = _mm_set1_epi8( 0x08 );
__m128i mask4 = _mm_and_si128( in, bit4 );
mask4 = _mm_cmpeq_epi8( mask4, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask4 );
__m128i bit5 = _mm_set1_epi8( 0x04 );
__m128i mask5 = _mm_and_si128( in, bit5 );
mask5 = _mm_cmpeq_epi8( mask5, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask5 );
__m128i bit6 = _mm_set1_epi8( 0x02 );
__m128i mask6 = _mm_and_si128( in, bit6 );
mask6 = _mm_cmpeq_epi8( mask6, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask6 );
__m128i bit7 = _mm_set1_epi8( 0x01 );
__m128i mask7 = _mm_and_si128( in, bit7 );
mask7 = _mm_cmpeq_epi8( mask7, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask7 );
/* finish up */
sum = _mm_sub_epi8( _mm_setzero_si128(), sum );
print128_num(sum);
}
void pop_count2()
{
int index;
__m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
__m128i counts = parallelPopcnt16bytes (testVector);
printf ("pop_count2():population count for each byte:");
for (index = 15; index >= 0; index--)
{
uint8_t *bytes = (void *) &counts;
printf (" %d", bytes [index]);
}
printf ("\n");
}
int main(void)
{
pop_count1();
pop_count2();
return 0;
}