I have read this as well as wiki I understand the following code should result in 12 instructions in asm.
 i = i - ((i >> 1) & 0x55555555);        // add pairs of bits
 i = (i & 0x33333333) + ((i >> 2) & 0x33333333);  // quads
 i = (i + (i >> 4)) & 0x0F0F0F0F;        // groups of 8
 return (i * 0x01010101) >> 24;          // horizontal sum of bytes
I am currently solving this problem, and my best correct solution so far executes 190 instructions total in 10 tests (19 per test).
// A test case to test your function with
.global _start
_start:
    mov r0, #5
    bl popcount
    1: b 1b    // Done
// Only your function (starting at popcount) is judged. The test code above is not executed.
    
popcount:
    PUSH    {r1,r2, r3, r4, r5, r6, r7, r8, r9, r10}
    
    ldr r2, =0x55555555 // 2bits
    ldr r3, =0x33333333 // 4bits
    ldr r4, =0x0F0F0F0F // 8bits
    ldr r6, =0x01010101 // 8bits
    
    
    //x -= (x >> 1) & m1;             //put count of each 2 bits into those 2 bits
    lsr r1, r0, #0x1 // one shift
    and r1, r1, r2
    sub r0, r0, r1
    //x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits 
    lsr r1, r0, #0x2 // two shift
    and r1, r1, r3
    and r5, r0, r3  
    add r0, r5, r1
    
    //x = (x + (x >> 4)) & m4;        //put count of each 8 bits into those 8 bits 
    lsr r1, r0, #0x4 // two shift
    add r1, r1, r0
    and r0, r1, r4
    //return (i * 0x01010101) >> 24;
    mul r0, r0, r6
    lsr r0, r0, #0x18
    POP    {r1,r2, r3, r4, r5, r6, r7, r8, r9, r10}
    
    bx lr
    
The best running score so far is 120 instructions. But here the problems are:
- I need to push/pop not to clobber registers (2 instructions)
- I need the LDR instructions because constants are too big for immediate (4 instructions), also no rotation would work.
- I need to return from function (1 instruction)
- Neon SIMD instructions are not available (I tried)
These are precisely the 7 extra instructions per test.
How would you proceed to get only 12 instructions executed?
 
     
     
    