I was writing some code to clear the screen to a particular color. C++ code:
void clear_screen(unsigned int color, void *memory, int height, int width) {
  unsigned int *pixel = (unsigned int *)memory;
  for (auto y = 0; y < height; y++)
    for (auto x = 0; x < width; x++)
      *pixel++ = color;
}
I used g++ and objconv to generate the corresponding assembly. This is what I got, and I've commented what I think some of the lines do too.
renderer_clear_screen:
        push    r13                                     
        push    r12                                     
        push    rbp                                     
        push    rdi                                     
        push    rsi                                     
        push    rbx                                     
        mov     r11d, ecx            ; move the color into r11d
        mov     ebx, r8d             ; move the height into ebx
        mov     rcx, rdx             ; 000E _ 48: 89. D1st
        test    r8d, r8d             ; 
        jle     _cls_return          ; basically, return if width or height is 0
        test    r9d, r9d             ; ( window minimized )
        jle     _cls_return          ;
        mov     r8d, r9d             ; height = width
        mov     esi, r9d             ; esi = width
        mov     edi, r9d             ; edi = width
        xor     r10d, r10d           ; r10d = 0
        shr     esi, 2               ; esi = width / 2
        movd    xmm1, r11d           ; move the lower 32-bits of the color into xmm1
        lea     r12d, [r9-1]         ; r12d = width - 1
        shl     rsi, 4               ; 003F _ 48: C1. E6, 04
        mov     ebp, r8d             ; 0043 _ 44: 89. C5
        shl     rdi, 2               ; 0046 _ 48: C1. E7, 02
        pshufd  xmm0, xmm1, 0        ; 004A _ 66: 0F 70. C1, 00
        shl     rbp, 2               ; 004F _ 48: C1. E5, 02
ALIGN   8
?_001:  cmp     r12d, 2                                
        jbe     ?_006                ; if (width - 1 <= 2) { ?_006 }
        mov     rax, rcx             ; 005E _ 48: 89. C8
        lea     rdx, [rcx+rsi]       ; 0061 _ 48: 8D. 14 31
ALIGN   8
?_002:  movups  oword [rax], xmm0    ; 0068 _ 0F 11. 00
        add     rax, 16              ; 006B _ 48: 83. C0, 10
        cmp     rdx, rax             ; 006F _ 48: 39. C2
        jnz     ?_002                ; 0072 _ 75, F4
        lea     rdx, [rcx+rbp]       ; 0074 _ 48: 8D. 14 29
        mov     eax, r8d             ; 0078 _ 44: 89. C0
        cmp     r9d, r8d             ; 007B _ 45: 39. C1
        jz      ?_004                ; 007E _ 74, 1C
?_003:  lea     r13d, [rax+1H]       ; 0080 _ 44: 8D. 68, 01
        mov     dword [rdx], r11d    ; 0084 _ 44: 89. 1A
        cmp     r13d, r9d            ; 0087 _ 45: 39. CD
        jge     ?_004                ; 008A _ 7D, 10
        add     eax, 2               ; 008C _ 83. C0, 02
        mov     dword [rdx+4H], r11d ; 008F _ 44: 89. 5A, 04
        cmp     r9d, eax             ; 0093 _ 41: 39. C1
        jle     ?_004                ; 0096 _ 7E, 04
        mov     dword [rdx+8H], r11d ; 0098 _ 44: 89. 5A, 08
?_004:  add     r10d, 1              ; 009C _ 41: 83. C2, 01
        add     rcx, rdi             ; 00A0 _ 48: 01. F9
        cmp     ebx, r10d            ; 00A3 _ 44: 39. D3
        jnz     ?_001                ; 00A6 _ 75, B0
_cls_return: 
        pop     rbx                  ;
        pop     rsi                  ;
        pop     rdi                  ;
        pop     rbp                  ;
        pop     r12                  ;
        pop     r13                  ; pop all the saved registers
        ret                          ; 
?_006:  ; Local function
        mov     rdx, rcx             ; 00B1 _ 48: 89. CA
        xor     eax, eax             ; 00B4 _ 31. C0
        jmp     ?_003                ; 00B6 _ EB, C8
Now, in ?_001, the compiler compares width - 1 to 2, which is the same thing as comparing the width to 3. My question is, with -O3, why did the compiler choose two instead of three, and waste a lea (to move width - 1 into r12d).
The only thing which makes sense to me is that powers of two are somehow faster to compare. Or maybe it's a compiler quirk?
 
    