I'm trying to learn more about how to analyze the performance of my more frequently used methods.
I've tried using rand() and timing large numbers of calls to my methods as a method of performance measurement but I also want to learn more about how to measure performance by understanding what the assembly code is doing.
For example, I have read about people trying to optimize the sgn function (Is there a standard sign function (signum, sgn) in C/C++?) so i figured this would be a good place to start.  I went to http://gcc.godbolt.org and generated asm for the following code (ICC with -march=core-avx2 -fverbose-asm -Ofast -std=c++11):
int sgn_v1(float val)
{
    return (float(0) < val) - (val < float(0));
}
and
int sgn_v2(float val)
{
  if (float(0) < val)      return  1;
  else if (val < float(0)) return -1;
  else                     return  0;
}
This generated the following assembly
L__routine_start__Z6sgn_v1f_0:
sgn_v1(float):
        vxorps    %xmm2, %xmm2, %xmm2                           #3.38
        vcmpgtss  %xmm2, %xmm0, %xmm1                           #3.38
        vcmpgtss  %xmm0, %xmm2, %xmm3                           #3.38
        vmovd     %xmm1, %eax                                   #3.38
        vmovd     %xmm3, %edx                                   #3.38
        negl      %eax                                          #3.38
        negl      %edx                                          #3.38
        subl      %edx, %eax                                    #3.38
        ret                                                     #3.38
and
L__routine_start__Z6sgn_v2f_1:
sgn_v2(float):
        vxorps    %xmm1, %xmm1, %xmm1                           #8.3
        vcomiss   %xmm1, %xmm0                                  #8.18
        ja        ..B2.3        # Prob 28%                      #8.18
        vcmpgtss  %xmm0, %xmm1, %xmm0                           #
        vmovd     %xmm0, %eax                                   #
        ret                                                     #
..B2.3:                         # Preds ..B2.1
        movl      $1, %eax                                      #9.12
        ret                                                     #9.12
My analysis starts off with the fact that sgn_v1 has 9 instructions and sgn_v2 has 6 or 5 instructions depending on the results of the jump.  The previous post talks about how sgn_v1 is branchless and it seems that this is a good thing, I assume this means that multiple instructions in sgn_v1 can be executed at the same time.  I went to http://www.agner.org/optimize/instruction_tables.pdf and I couldn't fund most of these operations in the haswell section (p187-p202).
How can I analyze this?
Edit:
Responding to @Raxvan's comments, I ran the following test program
extern "C" int sgn_v1(float);
__asm__(
"sgn_v1:\n"
"  vxorps    %xmm2, %xmm2, %xmm2\n"
"  vcmpgtss  %xmm2, %xmm0, %xmm1\n"
"  vcmpgtss  %xmm0, %xmm2, %xmm3\n"
"  vmovd     %xmm1, %eax\n"
"  vmovd     %xmm3, %edx\n"
"  negl      %eax\n"
"  negl      %edx\n"
"  subl      %edx, %eax\n"
"  ret\n"
);
extern "C" int sgn_v2(float);
__asm__(
"sgn_v2:\n"
"  vxorps    %xmm1, %xmm1, %xmm1\n"
"  vcomiss   %xmm1, %xmm0\n"
"  ja        ..B2.3\n"
"  vcmpgtss  %xmm0, %xmm1, %xmm0\n"
"  vmovd     %xmm0, %eax\n"
"  ret\n"
"  ..B2.3:\n"
"  movl      $1, %eax\n"
"  ret\n"
);
#include <cstdlib>
#include <ctime>
#include <iostream>
int main()
{
  size_t N = 50000000;
  std::clock_t start = std::clock();
  for (size_t i = 0; i < N; ++i)
  {
    sgn_v1(float(std::rand() % 3) - 1.0);
  }
  std::cout << "v1 Time: " << (std::clock() - start) / (double)(CLOCKS_PER_SEC / 1000) << " ms " << std::endl;
  start = std::clock();
  for (size_t i = 0; i < N; ++i)
  {
    sgn_v2(float(std::rand() % 3) - 1.0);
  }
  std::cout << "v2 Time: " << (std::clock() - start) / (double)(CLOCKS_PER_SEC / 1000) << " ms " << std::endl;
  start = std::clock();
  for (size_t i = 0; i < N; ++i)
  {
    sgn_v2(float(std::rand() % 3) - 1.0);
  }
  std::cout << "v2 Time: " << (std::clock() - start) / (double)(CLOCKS_PER_SEC / 1000) << " ms " << std::endl;
  start = std::clock();
  for (size_t i = 0; i < N; ++i)
  {
    sgn_v1(float(std::rand() % 3) - 1.0);
  }
  std::cout << "v1 Time: " << (std::clock() - start) / (double)(CLOCKS_PER_SEC / 1000) << " ms " << std::endl;
}
And I got the following result:
g++-4.8 -std=c++11 test.cpp && ./a.out
v1 Time: 423.81 ms
v2 Time: 657.226 ms
v2 Time: 666.233 ms
v1 Time: 436.545 ms
So the branchless result is clearly better; @Jim's suggested I look into how branch predictors work but I still can't find a way of computing how "full" the pipeline is...
 
    