Here we have three similar functions. They all always return 0.
This first function is optimized well. The compiler sees that x is always 1.
// Builds a local atomic holding 1, reads it back, and reports whether the
// value read equals 1: returns 0 on a match and 1 otherwise. Because x is
// initialised to 1 and nothing else touches it, the result is always 0.
int f1()
{
    std::atomic<int> x = 1;
    const int observed = x.load();
    return (observed == 1) ? 0 : 1;
}
f1():                                 # @f1()
        xor     eax, eax
        ret
This second function is more interesting. The compiler realizes that subtracting 0 from x doesn't change anything, so the subtraction is omitted. However, it doesn't notice that x is always 1 and still performs a comparison. Also, despite the use of std::memory_order_relaxed, an mfence instruction is emitted.
// Builds a local atomic holding 1 and performs a relaxed fetch_sub of 0 on
// it. fetch_sub yields the value held before the subtraction, so the result
// is compared against 1: returns 0 on a match and 1 otherwise. With x
// initialised to 1 and untouched elsewhere, the result is always 0.
int f2()
{
    std::atomic<int> x = 1;
    const int previous = x.fetch_sub(0, std::memory_order_relaxed);
    return (previous == 1) ? 0 : 1;
}
f2():                                 # @f2()
        mov     dword ptr [rsp - 4], 1
        mfence
        mov     ecx, dword ptr [rsp - 4]
        xor     eax, eax
        cmp     ecx, 1
        setne   al
        ret
Finally, this is the real example of what I'm trying to optimize. What I'm really doing is implementing a simple shared_pointer and x represents the reference counter. I'd like the compiler to optimize the case when a temporary shared_pointer object is created and destroyed and avoid the needless atomic operation.
// Builds a local atomic holding 1 and decrements it with a relaxed
// fetch_sub. fetch_sub yields the value held before the decrement, so a
// result of 1 means the counter has just dropped to zero: returns 0 in that
// case and 1 otherwise. With x initialised to 1 and untouched elsewhere, the
// result is always 0.
int f3()
{
    std::atomic<int> x = 1;
    const int previous = x.fetch_sub(1, std::memory_order_relaxed);
    return (previous == 1) ? 0 : 1;
}
f3():                                 # @f3()
        mov     dword ptr [rsp - 4], 1
        xor     eax, eax
        lock            dec     dword ptr [rsp - 4]
        setne   al
        ret
I am using clang. How can I make it optimize f3 like it did with f1?
