Try to setup ALU architecture similar to real HW implementation.
Most algorithms are written for such environment. It will clean and speed up your code. In some cases I use asm for this but if you want to be not CPU dependent you can use this class of mine
ALU source in C++:
//---------------------------------------------------------------------------
//--- ALU32 class 2.01 ------------------------------------------------------
//---------------------------------------------------------------------------
#ifndef _ALU32_h
#define _ALU32_h
//---------------------------------------------------------------------------
//#define _ALU32_no_asm
//---------------------------------------------------------------------------
class ALU32
    {
public:
    BYTE cy;
    ALU32() { cy=0; }
    void sar(DWORD &c); // msb -> [msb...lsb] -> cy     shift arithmetic right
    void shl(DWORD &c); // cy  <- [msb...lsb] <- 0      shift left
    void shr(DWORD &c); // 0   -> [msb...lsb] -> cy     shift right
    void rcl(DWORD &c); // cy  <- [msb...lsb] <- cy     shift through carry left
    void rcr(DWORD &c); // cy  -> [msb...lsb] -> cy     shift through carry lright
    void inc(DWORD &c);                                     
    void dec(DWORD &c);                                     
    void add(DWORD &c,DWORD a,DWORD b);                     
    void sub(DWORD &c,DWORD a,DWORD b);                     
    void adc(DWORD &c,DWORD a,DWORD b);                     
    void sbc(DWORD &c,DWORD a,DWORD b);                     
    void mul(DWORD &ch,DWORD &cl,DWORD a,DWORD b);          // (ch,cl) = a*b
    void div(DWORD &c,DWORD &d,DWORD ah,DWORD al,DWORD b);  // c = a/b d =a%b
    };
//---------------------------------------------------------------------------
void ALU32::inc(DWORD &c) { if (c==0xFFFFFFFF) cy=1; else cy=0; c++; }
void ALU32::dec(DWORD &c) { if (c==0x00000000) cy=1; else cy=0; c--; }
//---------------------------------------------------------------------------
void ALU32::sar(DWORD &c)
    {
    cy=c&1;
    c=((c>>1)&0x7FFFFFFF)|(c&0x80000000);
    }
//---------------------------------------------------------------------------
void ALU32::shl(DWORD &c)
    {
    cy=c>>31;
    c=(c<<1)&0xFFFFFFFE;
    }
//---------------------------------------------------------------------------
void ALU32::shr(DWORD &c)
    {
    cy=c&1;
    c=(c>>1)&0x7FFFFFFF;
    }
//---------------------------------------------------------------------------
void ALU32::rcl(DWORD &c)
    {
    DWORD cy0=cy;
    cy=c>>31;
    c=((c<<1)&0xFFFFFFFE)|cy0;
    }
//---------------------------------------------------------------------------
void ALU32::rcr(DWORD &c)
    {
    DWORD cy0=cy;
    cy=c&1;
    c=((c>>1)&0x7FFFFFFF)|(cy0<<31);
    }
//---------------------------------------------------------------------------
void ALU32::add(DWORD &c,DWORD a,DWORD b)
    {
    c=a+b;
    cy=DWORD(((a &1)+(b &1)   )>> 1);
    cy=DWORD(((a>>1)+(b>>1)+cy)>>31);
    }
//---------------------------------------------------------------------------
void ALU32::sub(DWORD &c,DWORD a,DWORD b)
    {
    c=a-b;
    if (a<b) cy=1; else cy=0;
    }
//---------------------------------------------------------------------------
void ALU32::adc(DWORD &c,DWORD a,DWORD b)
    {
    c=a+b+cy;
    cy=DWORD(((a &1)+(b &1)+cy)>> 1);
    cy=DWORD(((a>>1)+(b>>1)+cy)>>31);
    }
//---------------------------------------------------------------------------
void ALU32::sbc(DWORD &c,DWORD a,DWORD b)
    {
    c=a-b-cy;
    if (cy) { if (a<=b) cy=1; else cy=0; }
    else    { if (a< b) cy=1; else cy=0; }
    }
//---------------------------------------------------------------------------
void ALU32::mul(DWORD &ch,DWORD &cl,DWORD a,DWORD b)
    {
    #ifdef _ALU32_no_asm
    const int _h=1; // this is MSW,LSW order platform dependent So swap 0,1 if your platform is different
    const int _l=0;
    union _u
        {
        DWORD u32;
        WORD u16[2];
        } u;
    DWORD al,ah,bl,bh;
    DWORD c0,c1,c2;
    // separate 2^16 base digits
    u.u32=a; al=u.u16[_l]; ah=u.u16[_h];
    u.u32=b; bl=u.u16[_l]; bh=u.u16[_h];
    // multiplication (al+ah<<16)*(bl+bh<<16) = al*bl + al*bh<<16 + ah*bl<<16 + ah*bh<<32
    c0=(al*bl);
    add(c1,al*bh,ah*bl);
    c2=(ah*bh)+(cy<<16);
    // add subresults
    add(c0,c0,(c1<<16)&0xFFFF0000); c1=((c1>>16)&0x0000FFFF)+cy;
    add(c1,c1,c2);
    // construct result from (c3,c2,c1,c0)
    ch=c1;
    cl=c0;
    #else
    DWORD _a,_b,_cl,_ch;
    _a=a;
    _b=b;
    asm {
        mov eax,_a
        mov ebx,_b
        mul ebx     // H(edx),L(eax) = eax * ebx
        mov _cl,eax
        mov _ch,edx
        }
    cl=_cl;
    ch=_ch;
    #endif
    }
//---------------------------------------------------------------------------
void ALU32::div(DWORD &c,DWORD &d,DWORD ah,DWORD al,DWORD b)
    {
    #ifdef _ALU32_no_asm
    DWORD ch,cl,bh,bl,h,l,mh,ml;
    int e;
    // edge cases
    if (!b ){ c=0xFFFFFFFF; d=0xFFFFFFFF; cy=1; return; }
    if (!ah){ c=al/b;       d=al%b;       cy=0; return; }
    // align a,b for binary long division m is the shifted mask of b lsb
    for (bl=b,bh=0,mh=0,ml=1;bh<0x80000000;)
        {
        e=0; if (ah>bh) e=+1;   // e = cmp a,b {-1,0,+1}
        else if (ah<bh) e=-1;
        else if (al>bl) e=+1;
        else if (al<bl) e=-1;
        if (e<=0) break;        // a<=b ?
        shl(bl); rcl(bh);       // b<<=1
        shl(ml); rcl(mh);       // m<<=1
        }
    // binary long division
    for (ch=0,cl=0;;)
        {
        sub(l,al,bl);           // a-b
        sbc(h,ah,bh);
        if (cy)                 // a<b ?
            {
            if (ml==1) break;
            shr(mh); rcr(ml);   // m>>=1
            shr(bh); rcr(bl);   // b>>=1
            continue;
            }
        al=l; ah=h;             // a>=b ?
        add(cl,cl,ml);          // c+=m
        adc(ch,ch,mh);
        }
    cy=0; c=cl; d=al;
    if ((ch)||(ah)) cy=1;       // overflow
    #else
    DWORD _al,_ah,_b,_c,_d;
    _al=al;
    _ah=ah;
    _b=b;
    asm {
        mov eax,_al
        mov edx,_ah
        mov ebx,_b
        div ebx
        mov _c,eax  // eax = H(edx),L(eax) / ebx
        mov _d,edx  // edx = H(edx),L(eax) % ebx
        }
    c=_c;
    d=_d;
    #endif
    }
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
mul and div are switchable between fast CPU assembly and slower C++ implementation with the #define _ALU32_no_asm 
DWORD is 32 bit unsigned int and can be defined like  typedef unsigned __int32 DWORD;