I am trying to write a simple code using SSE and SSE3 to calculate the sum of all the elements of an array. The difference is that in one of the codes I do the sum "vertically" using PADDD and in the other I do the sum horizontally, using HADPPS. Since the only value I am interested in is the total sum, the way I do the sum should not matter. However, the horizontal addition is outputting the wrong results. Any idea why?
This is the code for the regular add:
int sumelems_sse(int *a, int size)
{
  int tmp[4];
  tmp[0] = 0;
  tmp[1] = 0;
  tmp[2] = 0;
  tmp[3] = 0;
  int total;
  __asm__ volatile (
                   "\n\t movdqa %0,%%xmm0 \t#"       // moves tmp[0] to xmm0
                   : /* no output */
                   : "m" (tmp[0])   //%0
                   );
  for (int i=0;i<size;i+=4) {
    __asm__ volatile
        ( // instruction         comment          
        "\n\t movdqa     %0,%%xmm1     \t#"           // moves a[i] to xmm1
        "\n\t paddd    %%xmm1,%%xmm0  \t#"            // xmm0 = xmm0+xmm1 in 4 blocks of 32 bits
        : /* no output */
        : "m"  (a[i])       // %0 
        );
  }
   __asm__ volatile(
                   "\n\t movdqa %%xmm0,%0 \t#"         // moves xmm0 to tmp[0]
                   : "=m" (tmp[0])
                   );
   total = tmp[0] + tmp[1] + tmp[2] + tmp[3];
   return total;
}
And this is the code for the horizontal add:
int sumelems_sse3(int *a, int size)
{
  int tmp[4];
  tmp[0] = 0;
  tmp[1] = 0;
  tmp[2] = 0;
  tmp[3] = 0;
  int total;
  __asm__ volatile (
                   "\n\t movdqa %0,%%xmm0 \t#"       // moves tmp[0] to xmm0
                   : /* no output */
                   : "m" (tmp[0])   //%0
                   );
  for (int i=0;i<size;i+=4) {
    __asm__ volatile
        ( // instruction         comment          
        "\n\t movdqa     %0,%%xmm1     \t#"             // moves a[i] to xmm1
        "\n\t haddps      %%xmm1,%%xmm0   \t#"           // xmm0 = xmm0+xmm2 in 4 blocks of 32 bits
        : /* no output */
        : "m"  (a[i])       // %0 
        );
  }
   __asm__ volatile(
                   "\n\t movdqa %%xmm0,%0 \t#"         // moves xmm0 to tmp[0]
                   : "=m" (tmp[0])
                   );
   total = tmp[0] + tmp[1] + tmp[2] + tmp[3];
   return total;
}
I think only the adding instruction should change, or not?
 
    