SSE newbie here...
I'm testing two implementations of a routine that has nested logic: a naive implementation, and one where I've tried to be clever and remove some of the branching. I'm using 'gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3' on an x86 Merom with the gcc options '-ffast-math -fomit-frame-pointer -msseregparm -mfpmath=sse -msse2'. The code follows:
#define math_sign(a) ( (a) < .0f ? -1.f : +1.f )
inline float math_interp_clamp(float a, float slope, float target)
{
#if 0
    // 5 instr, 1 branch
    float b = a + slope;
    return slope > 0.f ? (b > target ? target : b) : (b < target ? target : b);
#else
    // 19 instr
    float b = a + slope;
    return ( b - target ) *  math_sign( slope ) > 0.f ? target : b;
#endif
}
With the `#if` enabled (i.e. `#if 1`, selecting the naive version) I get:
# float math_interp_clamp(float a, float slope, float target) -- branchy variant
# GAS / AT&T syntax (source operand first, destination last).
# Register roles as they appear from the arithmetic below (TODO confirm
# against the ABI in use): xmm0 = a, xmm1 = slope, xmm2 = target;
# the result is left in xmm0.
math_interp_clamp:
.LFB505:
    .cfi_startproc
    comiss  .LC7, %xmm1         # flags <- compare slope with .LC7 (constant not shown; presumably 0.0f)
    addss   %xmm1, %xmm0        # xmm0 = b = a + slope (addss leaves the comiss flags intact)
    jbe .L44                    # slope <= .LC7 (or unordered): take the max path
    minss   %xmm0, %xmm2        # slope > .LC7: xmm2 = min(target, b)
    movaps  %xmm2, %xmm0        # move the result into xmm0
    ret
.L44:
    maxss   %xmm0, %xmm2        # xmm2 = max(target, b)
    movaps  %xmm2, %xmm0        # move the result into xmm0
    ret
    .cfi_endproc
With the `#if` disabled (i.e. `#if 0` as shown above, selecting the branch-free version) I get:
# float math_interp_clamp(float a, float slope, float target) -- branch-free variant
# GAS / AT&T syntax (source operand first, destination last).
# Register roles as they appear from the arithmetic below (TODO confirm
# against the ABI in use): xmm0 = a, xmm1 = slope, xmm2 = target;
# the result is left in xmm0.
# Both selects are done as compare-to-mask followed by and / andn / or blends.
math_interp_clamp:
.LFB505:
    .cfi_startproc
    xorps   %xmm5, %xmm5        # xmm5 = 0.0
    addss   %xmm1, %xmm0        # xmm0 = b = a + slope
    movss   .LC3, %xmm4         # xmm4 = .LC3 (constant not shown; presumably -1.0f)
    cmpltss %xmm5, %xmm1        # xmm1 = mask: all-ones if slope < 0.0, else zero
    movss   .LC2, %xmm6         # xmm6 = .LC2 (constant not shown; presumably +1.0f)
    movaps  %xmm0, %xmm3        # xmm3 = b
    andps   %xmm1, %xmm4        # xmm4 = mask & .LC3
    andnps  %xmm6, %xmm1        # xmm1 = ~mask & .LC2
    subss   %xmm2, %xmm3        # xmm3 = b - target
    orps    %xmm4, %xmm1        # xmm1 = sign = (slope < 0) ? .LC3 : .LC2
    mulss   %xmm1, %xmm3        # xmm3 = (b - target) * sign
    movaps  %xmm5, %xmm1        # xmm1 = 0.0
    cmpltss %xmm3, %xmm1        # xmm1 = mask2: all-ones if 0.0 < (b - target) * sign
    movaps  %xmm2, %xmm3        # xmm3 = target
    movaps  %xmm1, %xmm2        # xmm2 = mask2
    andps   %xmm1, %xmm3        # xmm3 = mask2 & target
    andnps  %xmm0, %xmm2        # xmm2 = ~mask2 & b
    orps    %xmm3, %xmm2        # xmm2 = mask2 ? target : b
    movaps  %xmm2, %xmm0        # move the result into xmm0
    ret
    .cfi_endproc
I have not actually timed the generated code, but judging by cycle counts alone I can't imagine those 19 instructions being faster than a single branch... How ruthless should I be in avoiding branches, or am I using gcc wrong?
Links to a good timing how-to or SSE tutorial gratefully accepted.
