I've got an AVX kernel I wrote to do complex conjugate multiplies:
__attribute__((noinline))
static __attribute__((target("avx"))) void asm_vcmulcc(
    cfloat* __restrict__ cc, const cfloat* __restrict__ aa, const cfloat* __restrict__ bb, ssize_t size) {
    ssize_t iters = size/4;
    ssize_t rem   = size-iters*4;
    __asm__(
        ".section .rodata # constant section\n\t"
        ".align 32        # 32 byte alignment\n\t"
        "LC%=:\n\t" 
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        "     .long 0x80000000\n\t"
        ""
        ".text\n\t"
        "     vmovaps   LC%=(%%rip), %%ymm4\n\t"
        "     xorl      %%eax,  %%eax\n\t"
        ""
        ".p2align 4\n\t"
        "LOOP%=:\n\t"
        "     vmovups   (%[bb],%%rax,1), %%ymm3\n\t"
        "     vmovups   (%[aa],%%rax,1), %%ymm1\n\t"
        "     vpermilps $0xa0,  %%ymm1,  %%ymm2\n\t"
        "     vpermilps $0xf5,  %%ymm1,  %%ymm0\n\t"               
        "     vmulps    %%ymm3, %%ymm2,  %%ymm2\n\t"
        "     vxorps    %%ymm4, %%ymm0,  %%ymm0\n\t"
        "     vpermilps $0xb1,  %%ymm3,  %%ymm3\n\t"
        "     vmulps    %%ymm3, %%ymm0,  %%ymm0\n\t"
        "     vaddsubps %%ymm0, %%ymm2,  %%ymm0\n\t"
        "     vmovups   %%ymm0, (%[cc],%%rax,1)\n\t"
        "     addq      $32,      %%rax\n\t"
        "     cmpq      %[bytes], %%rax\n\t"
        "     jl        LOOP%=\n\t"
        :
        : [aa] "r" (aa), [bb] "r" (bb), [cc] "r" (cc), [bytes] "r" (iters*4*sizeof(cfloat))
        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "rax", "memory"
    );
    if (rem > 0) {
        aa += iters*4;
        bb += iters*4;
        cc += iters*4;
        for (ssize_t ii=0; ii < rem; ii++) {
            cc[ii] = conj(aa[ii])*bb[ii];
        }
    }
}
Which works great with Intel compilers, and gcc >= 5, but gcc < 5 errors out (this is g++ 4.8.5):
> g++ -std=c++0x -I. -c -mavx lib.cc -O3 -o lib.o
lib.cc: In function ‘void avx_vcmulcc(prelude::{anonymous}::cfloat*, const cfloat*, const cfloat*, int)’:
lib.cc:80:6: error: unknown register name ‘ymm4’ in ‘asm’
     );
      ^
lib.cc:80:6: error: unknown register name ‘ymm3’ in ‘asm’
lib.cc:80:6: error: unknown register name ‘ymm2’ in ‘asm’
lib.cc:80:6: error: unknown register name ‘ymm1’ in ‘asm’
lib.cc:80:6: error: unknown register name ‘ymm0’ in ‘asm’
With or without the -mavx option. Apparently the compiler is allowed to emit AVX, but won't let it pass through unmolested? Is there a hidden option somewhere to suppress this?
 
     
    