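On x86-64 you can use a single shr for both the oddness test and the zero test: it shifts the lowest bit out into CF (set if the value was odd) and sets ZF when the shifted result is zero.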
For example, see the source below.
To build it and test it in a debugger I used these commands (on 64-bit Linux):
nasm -f elf64 m.asm -l m.lst -w+all; ld -b elf64-x86-64 -o m m.o
edb --run ./m
source:
    segment .text
mulByBits:
;-----------------------------------------------------------------------
; uint64_t mulByBits(uint32_t A, uint32_t B)
; Shift-and-add multiplication: for every set bit k of B, adds (A << k)
; to the running sum C.
;
; input: 2x uint32_t in edi, esi (System V AMD64 ABI calling convention)
;        (upper 32 bits of rdi/rsi are ignored, they may contain garbage)
; output: uint64_t in rax
; modifies also: rcx (scratch), rdi (left holding the shifted A),
;                rsi (ends as 0), flags
; stack: not used
;-----------------------------------------------------------------------
    xor     eax, eax            ; rax = 0 "C"
    mov     edi, edi            ; clear upper 32b of input "A" (extend to 64b)
.mulLoop:
    ; Instruction order matters here: lea must read A before it is doubled,
    ; and shr must be the last flag-writing instruction before cmovc/jnz.
    lea     rcx, [rax + rdi]    ; rcx = C + A (new C in case B is odd), lea keeps flags intact
    add     rdi, rdi            ; A *= 2 (for next loop)
    shr     esi, 1              ; B >>= 1 (sets ZF and CF), CF = bit shifted out; 32b write also zeroes upper rsi
    cmovc   rax, rcx            ; if B was odd, update the sum to new C (cmov does not change flags)
    jnz     .mulLoop            ; repeat until B is zero (ZF still comes from the shr)
    ret                         ; B == 0 on entry: the loop body runs once and rax stays 0
global _start
; Linux x86-64 syscall number of exit
%define SYS_EXIT 60
_start:
;-----------------------------------------------------------------------
; Self-checking test driver for mulByBits (Linux x86-64 process entry).
; Each case loads the two 32-bit factors into edi/esi, calls mulByBits,
; loads the expected 64-bit product into rbx and compares it with rax.
;
; exit status: 0 when every case matched, 1 on the first mismatch.
; In a debugger, break on .fail: rax = actual result, rbx = expected one.
;-----------------------------------------------------------------------
    mov     edi, 143254         ; "normal" values test
    mov     esi, 43526
    call    mulByBits
    mov     rbx, 6235273604     ; expected result
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; 0 * 0
    xor     esi, esi
    call    mulByBits
    xor     ebx, ebx            ; expected result = 0
    cmp     rax, rbx
    jne     .fail

    mov     edi, 43257432       ; A * 0
    xor     esi, esi
    call    mulByBits
    xor     ebx, ebx            ; expected result = 0
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; 0 * B
    mov     esi, 432543
    call    mulByBits
    xor     ebx, ebx            ; expected result = 0
    cmp     rax, rbx
    jne     .fail

    mov     edi, 3276547234     ; A * 1, A has bit 31 set
    mov     esi, 1
    call    mulByBits
    mov     ebx, 3276547234     ; expected result (32b write zero-extends into rbx)
    cmp     rax, rbx
    jne     .fail

    mov     edi, 1              ; 1 * B, B has bit 31 set
    mov     esi, 3276547234
    call    mulByBits
    mov     ebx, 3276547234     ; expected result (32b write zero-extends into rbx)
    cmp     rax, rbx
    jne     .fail

    mov     edi, 0xFFFFFFFF     ; UINT_MAX * UINT_MAX
    mov     esi, 0xFFFFFFFF
    call    mulByBits
    mov     rbx, 0xFFFFFFFE00000001
    cmp     rax, rbx
    jne     .fail

    mov     rdi, 0xE00000004    ; garbage in upper 32 bits of inputs
    mov     rsi, 0xE00000004    ; should be multiplied as 4*4
    call    mulByBits
    mov     ebx, 0x10           ; expected result
    cmp     rax, rbx
    jne     .fail

    ; exit back to linux
    xor     edi, edi            ; status 0: all tests passed
.exit:
    mov     eax, SYS_EXIT
    syscall                     ; exit does not return

.fail:
    mov     edi, 1              ; status 1: rax did not match rbx
    jmp     .exit
Adding a register to itself is the most efficient way to left-shift by 1 on most CPUs: add same,same can run on more execution ports than shl rdi,1 (see https://agner.org/optimize), which allows more instruction-level parallelism and potentially better throughput of the loop.