For example, with this function,
/* Example under discussion: for each of eight bytes, rotate the constant
 * 0xFE (written as -2) left by b[i] bits and AND the result into a[i].
 * __rolb is the x86 8-bit rotate-left intrinsic from <x86intrin.h>.
 * Deliberately unrolled so the per-byte codegen (shown below) is visible. */
void mask_rol(unsigned char *a, unsigned char *b) {
    a[0] &= __rolb(-2, b[0]);  /* -2 truncates to 0xFE in an unsigned char */
    a[1] &= __rolb(-2, b[1]);
    a[2] &= __rolb(-2, b[2]);
    a[3] &= __rolb(-2, b[3]);
    a[4] &= __rolb(-2, b[4]);
    a[5] &= __rolb(-2, b[5]);
    a[6] &= __rolb(-2, b[6]);
    a[7] &= __rolb(-2, b[7]);
}
gcc produces,
mov     edx, -2
mov     rax, rdi
movzx   ecx, BYTE PTR [rsi]
mov     edi, edx
rol     dil, cl
and     BYTE PTR [rax], dil
...
While I don't understand why gcc stages the values through edx and rax, here is what clang produces:
mov     cl, byte ptr [rsi]
mov     al, -2
rol     al, cl
and     byte ptr [rdi], al
...
It doesn't do seemingly unnecessary movs like gcc, but it also doesn't care about clearing the upper bits using movzx.
As far as I know, the reason gcc emits the movzx is to break the false dependency caused by the dirty upper bits of the register. But maybe clang also has a reason not to do it, so I ran a simple benchmark; this is the result:
$ time ./rol_gcc
 2161860550
real    0m0.895s
user    0m0.877s
sys     0m0.002s
$ time ./rol_clang
 3205979094
real    0m1.328s
user    0m1.311s
sys     0m0.001s
At least in this case, clang's approach seems to be wrong.
Is this clearly clang's bug, or are there some cases in which clang's approach could produce more efficient code?
Benchmark code:
#include <stdio.h>
#include <x86intrin.h>
/* Benchmark kernel: identical to the example above. noinline keeps the
 * call out of main's timing loop so the measured code is exactly the
 * per-byte rol/and sequence being compared between gcc and clang. */
__attribute__((noinline))
static void mask_rol(unsigned char *a, unsigned char *b) {
    a[0] &= __rolb(-2, b[0]);  /* -2 truncates to 0xFE in an unsigned char */
    a[1] &= __rolb(-2, b[1]);
    a[2] &= __rolb(-2, b[2]);
    a[3] &= __rolb(-2, b[3]);
    a[4] &= __rolb(-2, b[4]);
    a[5] &= __rolb(-2, b[5]);
    a[6] &= __rolb(-2, b[6]);
    a[7] &= __rolb(-2, b[7]);
}
/* Read the CPU's time-stamp counter with the RDTSCP instruction.
 * The processor-ID output of the instruction is not needed here,
 * so it is written into a throwaway local and discarded. */
static unsigned long long rdtscp() {
    unsigned aux;
    unsigned long long tsc = __rdtscp(&aux);
    return tsc;
}
/* Drive the benchmark: call mask_rol 300 million times on fixed 8-byte
 * buffers and print the elapsed time-stamp-counter delta. */
int main() {
    unsigned char a[8] = {0}, b[8] = {7, 0, 6, 1, 5, 2, 4, 3};
    unsigned long long start = rdtscp();
    int iter = 0;
    while (iter < 300000000) {
        mask_rol(a, b);
        ++iter;
    }
    /* %11llu: right-align the cycle count in an 11-character field. */
    printf("%11llu\n", rdtscp() - start);
    return 0;
}
 
    