Example of using the membarrier() function, from the Linux manual: https://man7.org/linux/man-pages/man2/membarrier.2.html
       #include <stdlib.h>
       /*
        * Flags shared by fast_path() and slow_path(): each path stores 1 to
        * one of them and then loads the other. As static ints they start
        * at 0. `volatile` makes the compiler perform every access as
        * written; the ordering between the two accesses comes from the
        * explicit fence in each path.
        */
       static volatile int a, b;
       /*
        * fast_path() - store 1 to `a`, then load `b` into *read_b.
        *
        * The "mfence" instruction between the store and the load is an x86
        * full memory fence, and the "memory" clobber additionally keeps the
        * compiler from moving memory accesses across the asm statement.
        * The fence is executed on every call to this function.
        */
       static void
       fast_path(int *read_b)
       {
           a = 1;
           asm volatile ("mfence" : : : "memory");
           *read_b = b;
       }
       /*
        * slow_path() - store 1 to `b`, then load `a` into *read_a.
        *
        * Mirror image of fast_path(): the same "mfence" instruction (with a
        * "memory" clobber acting as a compiler barrier) separates the store
        * from the load, so in this version both sides pay the same
        * per-call fence cost.
        */
       static void
       slow_path(int *read_a)
       {
           b = 1;
           asm volatile ("mfence" : : : "memory");
           *read_a = a;
       }
       /*
        * Run both paths once and verify that at least one of them observed
        * the other's store. Exits with EXIT_SUCCESS when the check holds and
        * calls abort() when both reads returned 0.
        */
       int
       main(int argc, char **argv)
       {
           int seen_a;
           int seen_b;

           /*
            * A real program would run slow_path() and fast_path() on
            * separate threads; they are invoked back to back here only
            * to keep the example compact.
            */
           slow_path(&seen_a);
           fast_path(&seen_b);

           /*
            * The two reads must not both be 0: if the fast path missed
            * the store to b, the slow path must have seen the store to a,
            * and vice versa.
            */
           if (seen_a != 0 || seen_b != 0)
               exit(EXIT_SUCCESS);
           abort();
       }
The code above, transformed to use membarrier(), becomes:
       #define _GNU_SOURCE
       #include <stdlib.h>
       #include <stdio.h>
       #include <unistd.h>
       #include <sys/syscall.h>
       #include <linux/membarrier.h>
       /*
        * Flags shared by fast_path() and slow_path(): each path stores 1 to
        * one of them and then loads the other. As static ints they start
        * at 0. `volatile` makes the compiler perform every access as
        * written; it is not what orders the accesses between CPUs.
        */
       static volatile int a, b;
       /*
        * membarrier() - issue the membarrier system call through syscall().
        *
        * cmd, flags and cpu_id are forwarded unchanged as the system call
        * arguments. Returns the result of syscall() narrowed to int.
        */
       static int
       membarrier(int cmd, unsigned int flags, int cpu_id)
       {
           long rc = syscall(__NR_membarrier, cmd, flags, cpu_id);

           return (int) rc;
       }
       /*
        * init_membarrier() - verify that MEMBARRIER_CMD_GLOBAL can be used.
        *
        * Issues MEMBARRIER_CMD_QUERY and inspects the returned bit mask.
        * Returns 0 when the query succeeds and the mask contains
        * MEMBARRIER_CMD_GLOBAL; otherwise prints a diagnostic to stderr and
        * returns -1.
        */
       static int
       init_membarrier(void)
       {
           const int supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
           int status = -1;

           if (supported < 0)
               /* The query itself failed; perror() reports errno. */
               perror("membarrier");
           else if ((supported & MEMBARRIER_CMD_GLOBAL) == 0)
               fputs("membarrier does not support MEMBARRIER_CMD_GLOBAL\n",
                     stderr);
           else
               status = 0;

           return status;
       }
       /*
        * fast_path() - store 1 to `a`, then load `b` into *read_b.
        *
        * The asm statement has an empty instruction string, so it emits no
        * machine instruction; its "memory" clobber only keeps the compiler
        * from moving memory accesses across it. No CPU fence instruction is
        * executed here, which is where the fast side sheds its per-call
        * fence cost.
        * NOTE(review): CPU-level ordering is presumably supplied by the
        * membarrier() call in slow_path() -- confirm against membarrier(2).
        */
       static void
       fast_path(int *read_b)
       {
           a = 1;
           asm volatile ("" : : : "memory");
           *read_b = b;
       }
       /*
        * slow_path() - store 1 to `b`, then load `a` into *read_a.
        *
        * A membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0) system call sits between
        * the store and the load instead of a fence instruction, so the slow
        * side pays for a system call on every invocation.
        *
        * The return value of membarrier() is not checked here; main() calls
        * init_membarrier() beforehand, which confirms that the kernel
        * reports MEMBARRIER_CMD_GLOBAL as supported.
        */
       static void
       slow_path(int *read_a)
       {
           b = 1;
           membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0);
           *read_a = a;
       }
       /*
        * Check membarrier() support, run both paths once, and verify that at
        * least one of them observed the other's store. Exits with
        * EXIT_FAILURE when init_membarrier() fails, with EXIT_SUCCESS when
        * the check holds, and calls abort() when both reads returned 0.
        */
       int
       main(int argc, char **argv)
       {
           int seen_a;
           int seen_b;

           if (init_membarrier() != 0)
               exit(EXIT_FAILURE);

           /*
            * A real program would run slow_path() and fast_path() on
            * separate threads; they are invoked back to back here only
            * to keep the example compact.
            */
           slow_path(&seen_a);
           fast_path(&seen_b);

           /*
            * The two reads must not both be 0: if the fast path missed
            * the store to b, the slow path must have seen the store to a,
            * and vice versa.
            */
           if (seen_a != 0 || seen_b != 0)
               exit(EXIT_SUCCESS);
           abort();
       }
This "membarrier" description is taken from the Linux manual. I am still confused about how the "membarrier" function adds overhead to the slow side and removes overhead from the fast side, thus resulting in an overall performance increase as long as the slow side is infrequent enough that the overhead of the membarrier() calls does not outweigh the performance gain on the fast side.
Could you please explain it in more detail?
Thanks!
 
    