I'm playing with lock-free algorithms in C and C++ and recently stumbled upon a behavior I don't quite understand. If you have the following code, running it will give you something like
reader started writer started iters=79895047, less=401131, eq=48996928, more=30496988
Aren't std::atomics are expected to be sequentially-consistent? If so, why does the reader sometimes see b being updated before a? I also tried to do various tricks involving memory fences with no success. The full compilable code can be seen at https://github.com/akamaus/fence_test 
What's wrong with the example?
std::atomic<uint> a(0);
std::atomic<uint> b(0);
volatile bool stop = false;
void *reader(void *p) {
    uint64_t iter_counter = 0;
    uint cnt_less = 0,
         cnt_eq = 0,
         cnt_more = 0;
    uint aa, bb;
    printf("reader started\n");
    while(!stop) {
        iter_counter++;
        aa = a.load(std::memory_order_seq_cst);
        bb = b.load(std::memory_order_seq_cst);
        if (aa < bb) {
            cnt_less++;
        } else if (aa > bb) {
            cnt_more++;
        } else {
            cnt_eq++;
        }
    }
        printf("iters=%lu, less=%u, eq=%u, more=%u\n", iter_counter, cnt_less, cnt_eq, cnt_more);
    return NULL;
}
void *writer(void *p) {
    printf("writer started\n");
    uint counter = 0;
    while(!stop) {
        a.store(counter, std::memory_order_seq_cst);
        b.store(counter, std::memory_order_seq_cst);
        counter++;
    }
}