I am running this code on an Intel Xeon Gold processor to measure the latency caused by shared-memory access. I have created 5 threads, each pinned to a different core, and they all use one shared memory region for inter-core communication.
#define _GNU_SOURCE
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <linux/mman.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <sched.h>
/* Timestamp value; this is the type returned by time_tcv(). */
typedef uint64_t time_tst;
/* Number of worker threads that main() creates and joins. */
#define NUM_THREADS   5
/* Argument record handed to thread_func() through pthread_create(). */
struct thread_info {
    int core_id; /* CPU index the thread pins itself to */
    int *addr;   /* destination buffer thread_func() copies its message into */
};
/**
 * Read the x86 time-stamp counter.
 *
 * RDTSC is not ordered with respect to surrounding instructions, so the
 * read is bracketed with LFENCE: the first fence keeps the counter read
 * from starting before earlier instructions have completed, the second
 * keeps later instructions from starting before the counter has been read.
 * The "memory" clobber additionally stops the compiler from moving memory
 * accesses (such as the code being timed) across the measurement point.
 *
 * LFENCE requires SSE2, which every x86-64 CPU provides.
 *
 * @return the 64-bit counter value (EDX:EAX combined).
 */
time_tst time_tcv(void)
{
   /* RDTSC writes the low half to EAX and the high half to EDX. */
   uint32_t low, high;
   __asm__ __volatile__("lfence\n\t"
                        "rdtsc\n\t"
                        "lfence"
                        : "=a" (low), "=d" (high)
                        :
                        : "memory");
   return (((uint64_t)high << 32) | low);
}
/**
 * Create (or open) the POSIX shared memory object "/carmv2shm", resize it
 * and map it read/write into the calling process.
 *
 * @param size  number of 0x1000-byte units to map; must be non-zero.
 * @return pointer to the mapping, or NULL on any failure (invalid size,
 *         shm_open, ftruncate or mmap error).
 *
 * The mapping stays valid after the descriptor is closed. The caller owns
 * the mapping and may release it with munmap(ptr, 0x1000 * size).
 */
void* create_shared_memory(size_t size) 
{
   const size_t unit_bytes = 0x1000;

   /* Reject empty requests and sizes whose byte count would overflow. */
   if (size == 0 || size > SIZE_MAX / unit_bytes) {
      printf("invalid shared memory size \n");
      return NULL;
   }
   const size_t length = unit_bytes * size;

   /* POSIX asks for a leading '/' in the object name for portability. */
   int fd = shm_open("/carmv2shm", O_CREAT | O_RDWR,
                     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
   /* shm_open reports failure with -1; 0 is a valid descriptor. */
   if (fd < 0) {
      printf("shm open error \n");
      return NULL;
   }

   if (ftruncate(fd, (off_t)length) != 0) {
      perror("ftruncate");
      close(fd);
      return NULL;
   }

   void *mapping = mmap(NULL, length, PROT_READ | PROT_WRITE,
                        MAP_LOCKED | MAP_SHARED_VALIDATE, fd, 0);
   if (mapping == MAP_FAILED) {
      perror("mmap");
      close(fd);
      return NULL;
   }

   /* The descriptor is no longer needed once the mapping exists. */
   close(fd);
   return mapping;
}
/**
 * Thread entry point: pin the calling thread to the requested CPU, then
 * time a single memcpy of a short message into the shared buffer and
 * print the elapsed time-stamp-counter ticks.
 *
 * @param args  pointer to a struct thread_info (core_id and addr are read).
 * @return always 0 (NULL).
 */
void* thread_func(void *args)
{
  struct thread_info *thread_info = args;

  /* Without a destination buffer there is nothing to measure. */
  if (thread_info == NULL || thread_info->addr == NULL) {
     printf("Error: no shared memory address given \n");
     return 0;
  }

  pthread_t self = pthread_self();
  const unsigned int core_id = thread_info->core_id;
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(core_id, &set);

  /* pthread_setaffinity_np returns a positive error number on failure
     (it never returns a negative value), so compare against zero. */
  if (pthread_setaffinity_np(self, sizeof(set), &set) != 0) {
     printf("Error setting affinity \n");
  }

  time_tst t1 = 0;
  time_tst t2 = 0;

  char message[] = "hello message";

  t1 = time_tcv();
  memcpy(thread_info->addr, message, sizeof(message));
  t2 = time_tcv();

  /* The difference is printed in decimal, so no "0x" prefix is emitted. */
  printf("thread id %u core id %u time diff %" PRIu64 "\n", (unsigned int)self, core_id, (t2-t1));

  return 0;
}
/**
 * Map the shared memory region, start NUM_THREADS worker threads (pinned
 * to cores 1..NUM_THREADS by thread_func) and wait for all of them.
 *
 * @return EXIT_SUCCESS when every thread was created and joined,
 *         EXIT_FAILURE otherwise.
 */
int main(void)
{
  void *shmem = create_shared_memory(128);
  /* Cover both failure conventions: a null pointer and mmap's MAP_FAILED. */
  if (shmem == NULL || shmem == MAP_FAILED) {
    fprintf(stderr, "failed to create shared memory\n");
    return EXIT_FAILURE;
  }

  struct thread_info *thread_info = calloc(NUM_THREADS, sizeof *thread_info);
  if (thread_info == NULL) {
    fprintf(stderr, "failed to allocate thread info\n");
    return EXIT_FAILURE;
  }

  pthread_t tid[NUM_THREADS];
  int created = 0;
  int status = EXIT_SUCCESS;

  for (int i = 0; i < NUM_THREADS; i++)
  {
    /* Each thread gets its own record, so core_id is never rewritten
       while another thread may still be reading it; no sleep is needed
       between thread creations. */
    thread_info[i].core_id = i + 1;
    thread_info[i].addr = shmem;

    int rc = pthread_create(&tid[i], NULL, thread_func, &thread_info[i]);
    if (rc != 0) {
      fprintf(stderr, "pthread_create failed: %s\n", strerror(rc));
      status = EXIT_FAILURE;
      break;
    }
    created++;
  }

  /* Join only the threads that were actually started. */
  for (int i = 0; i < created; i++)
  {
    pthread_join(tid[i], NULL);
  }

  free(thread_info);
  return status;
}
The output is:
thread id 2912491264 core id 1 time diff 0x6312
thread id 2904098560 core id 2 time diff 0x486
thread id 2895705856 core id 3 time diff 0x498
thread id 2753095424 core id 4 time diff 0x522
thread id 2818569984 core id 5 time diff 0x230
This time difference looks quite high to me. Could anyone suggest how to
reduce it?
Thanks
