Is there a gcc pragma or something I can use to force gcc to generate branch-free instructions on a specific section of code?
I have a piece of code that I want gcc to compile to branch-free code using cmov instructions:
int foo(int *a, int n, int x) {
    int i = 0, j = n;
    while (i < n) {
#ifdef PREFETCH
        __builtin_prefetch(a+16*i + 15);
#endif /* PREFETCH */
        j = (x <= a[i]) ? i : j;
        i = (x <= a[i]) ? 2*i + 1 : 2*i + 2;
    }
    return j;
}
and, indeed, it does so:
morin@soprano$ gcc -O4 -S -c test.c -o -    
    .file   "test.c"
    .text
    .p2align 4,,15
    .globl  foo
    .type   foo, @function
foo:
.LFB0:
    .cfi_startproc
    testl   %esi, %esi
    movl    %esi, %eax
    jle .L2
    xorl    %r8d, %r8d
    jmp .L3
    .p2align 4,,10
    .p2align 3
.L6:
    movl    %ecx, %r8d
.L3:
    movslq  %r8d, %rcx
    movl    (%rdi,%rcx,4), %r9d
    leal    (%r8,%r8), %ecx      # put 2*i in ecx
    leal    1(%rcx), %r10d       # put 2*i+1 in r10d
    addl    $2, %ecx             # put 2*i+2 in ecx
    cmpl    %edx, %r9d
    cmovge  %r10d, %ecx          # put 2*i+1 in ecx if appropriate
    cmovge  %r8d, %eax           # set j = i if appropriate
    cmpl    %esi, %ecx
    jl  .L6
.L2:
    rep ret
    .cfi_endproc
.LFE0:
    .size   foo, .-foo
    .ident  "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
    .section    .note.GNU-stack,"",@progbits
(Yes, I realize the loop is a branch, but I'm talking about the choice operators inside the loop.)
Unfortunately, when I enable the __builtin_prefetch call, gcc generates branchy code:
morin@soprano$ gcc -DPREFETCH -O4 -S -c test.c -o -
    .file   "test.c"
    .text
    .p2align 4,,15
    .globl  foo
    .type   foo, @function
foo:
.LFB0:
    .cfi_startproc
    testl   %esi, %esi
    movl    %esi, %eax
    jle .L7
    xorl    %ecx, %ecx
    jmp .L5
    .p2align 4,,10
    .p2align 3
.L3:
    movl    %ecx, %eax           # this is the x <= a[i] branch
    leal    1(%rcx,%rcx), %ecx
    cmpl    %esi, %ecx
    jge .L11
.L5:
    movl    %ecx, %r8d           # this is the main branch
    sall    $4, %r8d             # setup the prefetch
    movslq  %r8d, %r8            # setup the prefetch
    prefetcht0  60(%rdi,%r8,4)   # do the prefetch
    movslq  %ecx, %r8
    cmpl    %edx, (%rdi,%r8,4)   # compare x with a[i]
    jge .L3
    leal    2(%rcx,%rcx), %ecx   # this is the x > a[i] branch
    cmpl    %esi, %ecx
    jl  .L5
.L11:
    rep ret
.L7:
    .p2align 4,,5
    rep ret
    .cfi_endproc
.LFE0:
    .size   foo, .-foo
    .ident  "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
    .section    .note.GNU-stack,"",@progbits
I've tried using __attribute__((optimize("if-conversion2"))) on this function, but that has no effect.
The reason I care so much is that I haved hand-edited compiler-generated branch-free code (from the first example) to include the prefetcht0 instructions and it runs considerably faster than both of the versions gcc produces.
 
     
     
    