I need to read a binary file made of many basic types such as int, double, UTF-8 strings, etc. For instance, think of a file containing n pairs of (int, double), one after the other, with no alignment or padding, where n is on the order of tens of millions. I need very fast access to that file. I read it using fread calls into my own buffer, which is about 16 kB long.
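To make the setup concrete, here is roughly the shape of my reading loop (an illustrative sketch: the function and variable names and the exact refill logic are made up, not my real code):
#include <cstddef>
#include <cstdio>

// Each record is an int (4 bytes) followed by a double (8 bytes),
// packed without padding, so a record is 12 bytes.
void read_records(std::FILE* file, int* ints, double* doubles, std::size_t n) {
  const std::size_t record_size = 12;
  const std::size_t buffer_size = 16 * 1024;  // my ~16 kB read buffer
  const std::size_t records_per_chunk = buffer_size / record_size;
  unsigned char buffer[buffer_size];
  std::size_t i = 0;
  while (i < n) {
    const std::size_t to_read =
        (n - i < records_per_chunk) ? (n - i) : records_per_chunk;
    if (std::fread(buffer, record_size, to_read, file) != to_read) break;
    for (std::size_t k = 0; k < to_read; ++k) {
      const unsigned char* p = buffer + k * record_size;
      // Byte-by-byte copies, as in f0 below.
      unsigned char* qi = reinterpret_cast<unsigned char*>(ints + i + k);
      for (int b = 0; b < 4; ++b) qi[b] = p[b];
      unsigned char* qd = reinterpret_cast<unsigned char*>(doubles + i + k);
      for (int b = 0; b < 8; ++b) qd[b] = p[4 + b];
    }
    i += to_read;
  }
}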
A profiler shows that my main bottleneck is copying from the memory buffer to its final destination. The most obvious way to write a function that copies from the buffer to a double would be:
// x: a pointer to the final destination of the data
// p: a pointer to the buffer used to read the file
//
void f0(double* x, const unsigned char* p) {
  unsigned char* q = reinterpret_cast<unsigned char*>(x);
  for (int i = 0; i < 8; ++i) {
    q[i] = p[i];
  }
}
If I use the following code instead, I get a huge speedup on x86-64:
void f1(double* x, const unsigned char* p) {
  const double* r = reinterpret_cast<const double*>(p);
  *x = *r;
}
But, as I understand it, this program could crash on ARM if p is not 8-byte aligned (a portable memcpy-based alternative is sketched after the questions below).
Here are my questions:
- Is the second program guaranteed to work on both x86 and x86-64?
- How would you write such a function on ARM if you need it to be as fast as possible?
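For reference, the portable alternative I know of is std::memcpy: a fixed-size memcpy has no alignment requirement, and modern compilers typically optimize it into a single (possibly unaligned) load rather than a library call. Whether this is as fast as it gets on ARM is part of what I am asking.
#include <cstring>

// Portable version: memcpy may read from any address regardless of
// alignment, and compilers usually lower this fixed-size call to a
// single load instruction rather than an actual function call.
void copy_double_2(double* x, const unsigned char* p) {
  std::memcpy(x, p, sizeof(double));
}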
 
Here is a small benchmark to test on your machine:
#include <chrono>
#include <cstddef>
#include <iostream>
void copy_int_0(int* x, const unsigned char* p) {
  unsigned char* q = reinterpret_cast<unsigned char*>(x);
  for (std::size_t i = 0; i < 4; ++i) {
    q[i] = p[i];
  }
}
void copy_double_0(double* x, const unsigned char* p) {
  unsigned char* q = reinterpret_cast<unsigned char*>(x);
  for (std::size_t i = 0; i < 8; ++i) {
    q[i] = p[i];
  }
}
void copy_int_1(int* x, const unsigned char* p) {
  *x = *reinterpret_cast<const int*>(p);
}
void copy_double_1(double* x, const unsigned char* p) {
  *x = *reinterpret_cast<const double*>(p);
}
int main() {
  const std::size_t n = 10000000;
  const std::size_t nb_times = 200;
  unsigned char* p = new unsigned char[12 * n];
  for (std::size_t i = 0; i < 12 * n; ++i) {
    p[i] = 0;
  }
  int* q0 = new int[n];
  for (std::size_t i = 0; i < n; ++i) {
    q0[i] = 0;
  }
  double* q1 = new double[n];
  for (std::size_t i = 0; i < n; ++i) {
    q1[i] = 0.0;
  }
  const auto begin_0 = std::chrono::high_resolution_clock::now();
  for (std::size_t k = 0; k < nb_times; ++k) {
    for (std::size_t i = 0; i < n; ++i) {
      copy_int_0(q0 + i, p + 12 * i);
      copy_double_0(q1 + i, p + 4 + 12 * i);
    }
  }
  const auto end_0 = std::chrono::high_resolution_clock::now();
  const double time_0 =
      1.0e-9 *
      std::chrono::duration_cast<std::chrono::nanoseconds>(end_0 - begin_0)
          .count();
  std::cout << "Time 0: " << time_0 << " s" << std::endl;
  const auto begin_1 = std::chrono::high_resolution_clock::now();
  for (std::size_t k = 0; k < nb_times; ++k) {
    for (std::size_t i = 0; i < n; ++i) {
      copy_int_1(q0 + i, p + 12 * i);
      copy_double_1(q1 + i, p + 4 + 12 * i);
    }
  }
  const auto end_1 = std::chrono::high_resolution_clock::now();
  const double time_1 =
      1.0e-9 *
      std::chrono::duration_cast<std::chrono::nanoseconds>(end_1 - begin_1)
          .count();
  std::cout << "Time 1: " << time_1 << " s" << std::endl;
  std::cout << "Prevent optimization: " << q0[0] << " " << q1[0] << std::endl;
  delete[] q1;
  delete[] q0;
  delete[] p;
  return 0;
}
The results I get are:
clang++ -std=c++11 -O3 -march=native copy.cpp -o copy
./copy
Time 0: 8.49403 s
Time 1: 4.01617 s
g++ -std=c++11 -O3 -march=native copy.cpp -o copy
./copy
Time 0: 8.65762 s
Time 1: 3.89979 s
icpc -std=c++11 -O3 -xHost copy.cpp -o copy
./copy
Time 0: 8.46155 s
Time 1: 0.0278496 s
I have not checked the assembly yet, but I guess the Intel compiler is defeating my benchmark here.
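If so, one way to harden the benchmark would be to feed every copied value into a checksum that is printed at the end, so the copies cannot be discarded as dead code. A sketch, meant as a drop-in replacement for the second timed loop above (it reuses p, n, nb_times and copy_int_1/copy_double_1 from the benchmark):
// Every copied value feeds a checksum that is printed afterwards,
// so the compiler cannot remove the copies as dead code.
long long sum_i = 0;
double sum_d = 0.0;
for (std::size_t k = 0; k < nb_times; ++k) {
  for (std::size_t i = 0; i < n; ++i) {
    int vi;
    double vd;
    copy_int_1(&vi, p + 12 * i);
    copy_double_1(&vd, p + 4 + 12 * i);
    sum_i += vi;
    sum_d += vd;
  }
}
std::cout << "Checksums: " << sum_i << " " << sum_d << std::endl;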