I have following dispatch code for my user level thread library.
The code can pass GCC and runs correctly without optimization, but if I choose -O1 optimization (also higher levels), when run the code, program generates segmentation fault.
Basically the function does save context and jump to next context.
void __attribute__ ((noinline)) __lwt_dispatch(lwt_context *curr, lwt_context *next)
{
__asm__ __volatile
    (
    "mov 0xc(%ebp),%eax\n\t"
    "mov 0x4(%eax),%ecx\n\t"
    "mov (%eax),%edx\n\t"
    "mov 0x8(%ebp),%eax\n\t"
    "add $0x4,%eax\n\t"
    "mov 0x8(%ebp),%ebx\n\t"
    "push %ebp\n\t"
    "push %ebx\n\t"
    "mov %esp,(%eax)\n\t"
    "movl $return,(%ebx)\n\t"
    "mov %ecx,%esp\n\t"
    "jmp *%edx\n\t"
    "return: pop %ebx\n\t"
    "pop %ebp\n\t"
    );
}
 
    