I have a few place in my code that could really use some speed up, when I try to use CM4 SIMD instructions the result is always slower than scalar version, for example, this is an alpha blending function I'm using a lot, it's not really slow but it serves as an example:
for (int y=0; y<h; y++) {
    i=y*w;
    for (int x=0; x<w; x++) {
        uint spix = *srcp++;
        uint dpix = dstp[i+x];
        uint r=(alpha*R565(spix)+(256-alpha)*R565(dpix))>>8;
        uint g=(alpha*G565(spix)+(256-alpha)*G565(dpix))>>8;
        uint b=(alpha*B565(spix)+(256-alpha)*B565(dpix))>>8;
        dstp[i+x]= RGB565(r, g, b);
    }
}
R565, G565, B565 and RGB565 are macros that extract and pack RGB565 respectively, please ignore
Now I tried using __SMUAD and see if anything changes, the result was slower (or same speed as the original code) even tried loop unrolling, with no luck:
uint v0, vr, vg, vb;
v0 = (alpha<<16)|(256-alpha);
for (int y=0; y<h; y++) {
    i=y*w;
    for (int x=0; x<w; x++) {
        spix = *srcp++;
        dpix = dstp[i+x];
        uint vr = R565(spix)<<16 | R565(dpix);
        uint vg = G565(spix)<<16 | G565(dpix);
        uint vb = B565(spix)<<16 | B565(dpix);
        uint r = __SMUAD(v0, vr)>>8;
        uint g = __SMUAD(v0, vg)>>8;
        uint b = __SMUAD(v0, vb)>>8;
        dstp[i+x]= RGB565(r, g, b);
    }
}
I know this has been asked before, but given the architectural differences, and the fact that none of the answers really solve my problem, I'm asking again. Thanks!
Update
Scalar disassembly:
    Disassembly of section .text.blend:
00000000 <blend>:
   0:   e92d 0ff0   stmdb   sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
   4:   6846        ldr r6, [r0, #4]
   6:   68c4        ldr r4, [r0, #12]
   8:   b086        sub sp, #24
   a:   199e        adds    r6, r3, r6
   c:   9601        str r6, [sp, #4]
   e:   9200        str r2, [sp, #0]
  10:   68ca        ldr r2, [r1, #12]
  12:   f89d 5038   ldrb.w  r5, [sp, #56]   ; 0x38
  16:   9204        str r2, [sp, #16]
  18:   9a01        ldr r2, [sp, #4]
  1a:   426e        negs    r6, r5
  1c:   4293        cmp r3, r2
  1e:   b2f6        uxtb    r6, r6
  20:   da5b        bge.n   da <blend+0xda>
  22:   8809        ldrh    r1, [r1, #0]
  24:   6802        ldr r2, [r0, #0]
  26:   9102        str r1, [sp, #8]
  28:   fb03 fb01   mul.w   fp, r3, r1
  2c:   9900        ldr r1, [sp, #0]
  2e:   4411        add r1, r2
  30:   9103        str r1, [sp, #12]
  32:   0052        lsls    r2, r2, #1
  34:   9205        str r2, [sp, #20]
  36:   9903        ldr r1, [sp, #12]
  38:   9a00        ldr r2, [sp, #0]
  3a:   428a        cmp r2, r1
  3c:   fa1f fb8b   uxth.w  fp, fp
  40:   da49        bge.n   d6 <blend+0xd6>
  42:   4610        mov r0, r2
  44:   4458        add r0, fp
  46:   f100 4000   add.w   r0, r0, #2147483648 ; 0x80000000
  4a:   9a04        ldr r2, [sp, #16]
  4c:   f8dd a014   ldr.w   sl, [sp, #20]
  50:   3801        subs    r0, #1
  52:   eb02 0040   add.w   r0, r2, r0, lsl #1
  56:   44a2        add sl, r4
  58:   f834 1b02   ldrh.w  r1, [r4], #2
  5c:   8842        ldrh    r2, [r0, #2]
  5e:   f3c1 07c4   ubfx    r7, r1, #3, #5
  62:   f3c2 09c4   ubfx    r9, r2, #3, #5
  66:   f001 0c07   and.w   ip, r1, #7
  6a:   f3c1 2804   ubfx    r8, r1, #8, #5
  6e:   fb07 f705   mul.w   r7, r7, r5
  72:   0b49        lsrs    r1, r1, #13
  74:   fb06 7709   mla r7, r6, r9, r7
  78:   ea41 01cc   orr.w   r1, r1, ip, lsl #3
  7c:   f3c2 2904   ubfx    r9, r2, #8, #5
  80:   f002 0c07   and.w   ip, r2, #7
  84:   fb08 f805   mul.w   r8, r8, r5
  88:   0b52        lsrs    r2, r2, #13
  8a:   fb01 f105   mul.w   r1, r1, r5
  8e:   097f        lsrs    r7, r7, #5
  90:   fb06 8809   mla r8, r6, r9, r8
  94:   ea42 02cc   orr.w   r2, r2, ip, lsl #3
  98:   fb06 1202   mla r2, r6, r2, r1
  9c:   f007 07f8   and.w   r7, r7, #248    ; 0xf8
  a0:   f408 58f8   and.w   r8, r8, #7936   ; 0x1f00
  a4:   0a12        lsrs    r2, r2, #8
  a6:   ea48 0107   orr.w   r1, r8, r7
  aa:   ea41 3142   orr.w   r1, r1, r2, lsl #13
  ae:   f3c2 02c2   ubfx    r2, r2, #3, #3
  b2:   430a        orrs    r2, r1
  b4:   4554        cmp r4, sl
  b6:   f820 2f02   strh.w  r2, [r0, #2]!
  ba:   d1cd        bne.n   58 <blend+0x58>
  bc:   9902        ldr r1, [sp, #8]
  be:   448b        add fp, r1
  c0:   9901        ldr r1, [sp, #4]
  c2:   3301        adds    r3, #1
  c4:   428b        cmp r3, r1
  c6:   fa1f fb8b   uxth.w  fp, fp
  ca:   d006        beq.n   da <blend+0xda>
  cc:   9a00        ldr r2, [sp, #0]
  ce:   9903        ldr r1, [sp, #12]
  d0:   428a        cmp r2, r1
  d2:   4654        mov r4, sl
  d4:   dbb5        blt.n   42 <blend+0x42>
  d6:   46a2        mov sl, r4
  d8:   e7f0        b.n bc <blend+0xbc>
  da:   b006        add sp, #24
  dc:   e8bd 0ff0   ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
  e0:   4770        bx  lr
  e2:   bf00        nop
SIMD disassembly:
    sassembly of section .text.blend:
00000000 <blend>:
   0:   e92d 0ff0   stmdb   sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
   4:   6846        ldr r6, [r0, #4]
   6:   68c4        ldr r4, [r0, #12]
   8:   b086        sub sp, #24
   a:   199e        adds    r6, r3, r6
   c:   9601        str r6, [sp, #4]
   e:   9200        str r2, [sp, #0]
  10:   68ca        ldr r2, [r1, #12]
  12:   f89d 5038   ldrb.w  r5, [sp, #56]   ; 0x38
  16:   9204        str r2, [sp, #16]
  18:   9a01        ldr r2, [sp, #4]
  1a:   f5c5 7680   rsb r6, r5, #256    ; 0x100
  1e:   4293        cmp r3, r2
  20:   ea46 4505   orr.w   r5, r6, r5, lsl #16
  24:   da5d        bge.n   e2 <blend+0xe2>
  26:   8809        ldrh    r1, [r1, #0]
  28:   6802        ldr r2, [r0, #0]
  2a:   9102        str r1, [sp, #8]
  2c:   fb03 fb01   mul.w   fp, r3, r1
  30:   9900        ldr r1, [sp, #0]
  32:   4411        add r1, r2
  34:   9103        str r1, [sp, #12]
  36:   0052        lsls    r2, r2, #1
  38:   9205        str r2, [sp, #20]
  3a:   9903        ldr r1, [sp, #12]
  3c:   9a00        ldr r2, [sp, #0]
  3e:   428a        cmp r2, r1
  40:   fa1f fb8b   uxth.w  fp, fp
  44:   da4b        bge.n   de <blend+0xde>
  46:   4610        mov r0, r2
  48:   4458        add r0, fp
  4a:   f100 4000   add.w   r0, r0, #2147483648 ; 0x80000000
  4e:   9a04        ldr r2, [sp, #16]
  50:   f8dd a014   ldr.w   sl, [sp, #20]
  54:   3801        subs    r0, #1
  56:   eb02 0040   add.w   r0, r2, r0, lsl #1
  5a:   44a2        add sl, r4
  5c:   f834 2b02   ldrh.w  r2, [r4], #2
  60:   8841        ldrh    r1, [r0, #2]
  62:   f3c2 07c4   ubfx    r7, r2, #3, #5
  66:   f3c1 06c4   ubfx    r6, r1, #3, #5
  6a:   ea46 4707   orr.w   r7, r6, r7, lsl #16
  6e:   fb25 f707   smuad   r7, r5, r7
  72:   f001 0907   and.w   r9, r1, #7
  76:   ea4f 3c51   mov.w   ip, r1, lsr #13
  7a:   f002 0607   and.w   r6, r2, #7
  7e:   ea4f 3852   mov.w   r8, r2, lsr #13
  82:   ea4c 0cc9   orr.w   ip, ip, r9, lsl #3
  86:   ea48 06c6   orr.w   r6, r8, r6, lsl #3
  8a:   ea4c 4606   orr.w   r6, ip, r6, lsl #16
  8e:   fb25 f606   smuad   r6, r5, r6
  92:   f3c1 2104   ubfx    r1, r1, #8, #5
  96:   f3c2 2204   ubfx    r2, r2, #8, #5
  9a:   ea41 4202   orr.w   r2, r1, r2, lsl #16
  9e:   fb25 f202   smuad   r2, r5, r2
  a2:   f3c6 260f   ubfx    r6, r6, #8, #16
  a6:   097f        lsrs    r7, r7, #5
  a8:   f3c6 01c2   ubfx    r1, r6, #3, #3
  ac:   f007 07f8   and.w   r7, r7, #248    ; 0xf8
  b0:   430f        orrs    r7, r1
  b2:   f402 52f8   and.w   r2, r2, #7936   ; 0x1f00
  b6:   ea47 3646   orr.w   r6, r7, r6, lsl #13
  ba:   4316        orrs    r6, r2
  bc:   4554        cmp r4, sl
  be:   f820 6f02   strh.w  r6, [r0, #2]!
  c2:   d1cb        bne.n   5c <blend+0x5c>
  c4:   9902        ldr r1, [sp, #8]
  c6:   448b        add fp, r1
  c8:   9901        ldr r1, [sp, #4]
  ca:   3301        adds    r3, #1
  cc:   428b        cmp r3, r1
  ce:   fa1f fb8b   uxth.w  fp, fp
  d2:   d006        beq.n   e2 <blend+0xe2>
  d4:   9a00        ldr r2, [sp, #0]
  d6:   9903        ldr r1, [sp, #12]
  d8:   428a        cmp r2, r1
  da:   4654        mov r4, sl
  dc:   dbb3        blt.n   46 <blend+0x46>
  de:   46a2        mov sl, r4
  e0:   e7f0        b.n c4 <blend+0xc4>
  e2:   b006        add sp, #24
  e4:   e8bd 0ff0   ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
  e8:   4770        bx  lr
  ea:   bf00        nop
 
     
    