If you want 128bit multiplication then this should work this is in AT&T format.
__uint128_t FASTMUL128(const __uint128_t TA,const __uint128_t TB)
{
    union
    {
        __uint128_t WHOLE;
        struct
        {
            unsigned long long int LWORDS[2];
        } SPLIT;
    } KEY;
    register unsigned long long int __RAX,__RDX,__RSI,__RDI;
    __uint128_t RESULT;
KEY.WHOLE=TA;
__RAX=KEY.SPLIT.LWORDS[0];
__RDX=KEY.SPLIT.LWORDS[1];
KEY.WHOLE=TB;
__RSI=KEY.SPLIT.LWORDS[0];
__RDI=KEY.SPLIT.LWORDS[1];
__asm__ __volatile__(
    "movq           %0,                             %%rax                   \n\t"
    "movq           %1,                             %%rdx                   \n\t"
    "movq           %2,                             %%rsi                   \n\t"
    "movq           %3,                             %%rdi                   \n\t"
    "movq           %%rsi,                          %%rbx                   \n\t"
    "movq           %%rdi,                          %%rcx                   \n\t"
    "movq           %%rax,                          %%rsi                   \n\t"
    "movq           %%rdx,                          %%rdi                   \n\t"
    "xorq           %%rax,                          %%rax                   \n\t"
    "xorq           %%rdx,                          %%rdx                   \n\t"
    "movq           %%rdi,                          %%rax                   \n\t"
    "mulq           %%rbx                                                   \n\t"
    "xchgq          %%rbx,                          %%rax                   \n\t"
    "mulq           %%rsi                                                   \n\t"
    "xchgq          %%rax,                          %%rsi                   \n\t"
    "addq           %%rdx,                          %%rbx                   \n\t"
    "mulq           %%rcx                                                   \n\t"
    "addq           %%rax,                          %%rbx                   \n\t"
    "movq           %%rsi,                          %%rax                   \n\t"
    "movq           %%rbx,                          %%rdx                   \n\t"
    "movq           %%rax,                          %0                      \n\t"
    "movq           %%rdx,                          %1                      \n\t"
    "movq           %%rsi,                          %2                      \n\t"
    "movq           %%rdi,                          %3                      \n\t"
    : "=m"(__RAX),"=m"(__RDX),"=m"(__RSI),"=m"(__RDI)
    :  "m"(__RAX), "m"(__RDX), "m"(__RSI), "m"(__RDI)
    : "rax","rbx","ecx","rdx","rsi","rdi"
);
KEY.SPLIT.LWORDS[0]=__RAX;
KEY.SPLIT.LWORDS[1]=__RDX;
RESULT=KEY.WHOLE;
return RESULT;
}