#include <random>
#include <vector>
// Micro-benchmark: draws one million samples from a normal distribution
// driven by a 64-bit Mersenne Twister and stores them in a vector.
// The program prints nothing; it is meant to be timed or profiled externally.
int main() {
  // Single source of truth for the sample count, shared by reserve() and the
  // loop bound so the two can never drift apart.
  constexpr int kSampleCount = 1000000;

  std::vector<double> norms;
  // Reserve the full capacity up front so the push_back calls below never
  // reallocate.
  norms.reserve(kSampleCount);

  // Fixed seed: every run feeds the distribution the same engine sequence.
  std::mt19937_64 mtEngine(42);
  // Default template and constructor arguments: double results, mean 0,
  // standard deviation 1.
  std::normal_distribution<> nd;

  for (int i = 0; i != kSampleCount; ++i) {
    norms.push_back(nd(mtEngine));
  }
}
Compiling this program with g++ -std=c++17 -O3 (version 10.2.0) and with clang++ -std=c++17 -O3 (version 11.0.0) produces binaries that differ significantly in performance.
$ time ./random_clang
./random_clang  0.11s user 0.00s system 99% cpu 0.113 total
$ time ./random_gcc
./random_gcc  0.03s user 0.00s system 99% cpu 0.032 total
Here are the results from Compiler Explorer and valgrind --tool=callgrind.
./random_clang
--------------------------------------------------------------------------------
Ir          
--------------------------------------------------------------------------------
278,231,181  PROGRAM TOTALS
--------------------------------------------------------------------------------
Ir           file:function
--------------------------------------------------------------------------------
135,606,558  ???:double std::generate_canonical<double, 53ul, std::mersenne_twister_engine<unsigned long, 64ul, 312ul, 156ul, 31ul, 13043109905998158313ul, 29ul, 6148914691236517205ul, 17ul, 8202884508482404352ul, 37ul, 18444473444759240704ul, 43ul, 6364136223846793005ul> >(std::mersenne_twister_engine<unsigned long, 64ul, 312ul, 156ul, 31ul, 13043109905998158313ul, 29ul, 6148914691236517205ul, 17ul, 8202884508482404352ul, 37ul, 18444473444759240704ul, 43ul, 6364136223846793005ul>&) [/home/xxx/EffectiveCpp/test/random_clang]
 53,449,536  /build/glibc-eX1tMB/glibc-2.31/math/../sysdeps/x86_64/fpu/e_logl.S:__ieee754_logl [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
 32,096,514  ???:main [/home/xxx/EffectiveCpp/test/random_clang]
 27,997,376  /build/glibc-eX1tMB/glibc-2.31/math/w_logl_compat.c:logl [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
 22,905,902  /build/glibc-eX1tMB/glibc-2.31/math/../sysdeps/ieee754/dbl-64/e_log.c:__ieee754_log_fma [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
  2,500,000  /build/glibc-eX1tMB/glibc-2.31/math/./w_log_template.c:log@@GLIBC_2.29 [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
  1,000,000  ???:0x0000000004a322f0 [???]
./random_gcc
--------------------------------------------------------------------------------
Ir          
--------------------------------------------------------------------------------
125,607,194  PROGRAM TOTALS
--------------------------------------------------------------------------------
Ir          file:function
--------------------------------------------------------------------------------
75,746,682  ???:main [/home/xxx/EffectiveCpp/test/random_gcc]
22,905,902  /build/glibc-eX1tMB/glibc-2.31/math/../sysdeps/ieee754/dbl-64/e_log.c:__ieee754_log_fma [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
19,769,747  ???:std::mersenne_twister_engine<unsigned long, 64ul, 312ul, 156ul, 31ul, 13043109905998158313ul, 29ul, 6148914691236517205ul, 17ul, 8202884508482404352ul, 37ul, 18444473444759240704ul, 43ul, 6364136223846793005ul>::_M_gen_rand() [/home/xxx/EffectiveCpp/test/random_gcc]
 2,500,000  /build/glibc-eX1tMB/glibc-2.31/math/./w_log_template.c:log@@GLIBC_2.29 [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
 1,000,000  ???:0x00000000001090f0 [???]
 1,000,000  ???:0x0000000004a322f0 [???]
   916,425  /build/glibc-eX1tMB/glibc-2.31/elf/dl-lookup.c:_dl_lookup_symbol_x [/usr/lib/x86_64-linux-gnu/ld-2.31.so]
   544,815  /build/glibc-eX1tMB/glibc-2.31/elf/dl-lookup.c:do_lookup_x [/usr/lib/x86_64-linux-gnu/ld-2.31.so]
Why does the clang++ version spend so much time calling std::generate_canonical? I have seen people claim that g++ inlines more aggressively, but raising clang++'s inlining threshold (-mllvm -inline-threshold=10000) does not really help in my case.
Is this a bug or am I missing some other important compiler options? I know there are other ways to generate normally distributed random variables faster but I do not think this kind of speed inconsistency on a commonly used standard library function is normal.
UPDATE: It seems that after I built the clang++ version against libc++ (with -stdlib=libc++ -lc++abi), its performance became on par with the original g++ version.
$ time ./random_perf
./random_perf  0.03s user 0.00s system 98% cpu 0.027 total
./random_perf
--------------------------------------------------------------------------------
Ir          
--------------------------------------------------------------------------------
147,608,621  PROGRAM TOTALS
--------------------------------------------------------------------------------
Ir           file:function
--------------------------------------------------------------------------------
106,311,924  /usr/lib/llvm-10/bin/../include/c++/v1/random:double std::__1::normal_distribution<double>::operator()<std::__1::mersenne_twister_engine<unsigned long, 64ul, 312ul, 156ul, 31ul, 13043109905998158313ul, 29ul, 6148914691236517205ul, 17ul, 8202884508482404352ul, 37ul, 18444473444759240704ul, 43ul, 6364136223846793005ul> >(std::__1::mersenne_twister_engine<unsigned long, 64ul, 312ul, 156ul, 31ul, 13043109905998158313ul, 29ul, 6148914691236517205ul, 17ul, 8202884508482404352ul, 37ul, 18444473444759240704ul, 43ul, 6364136223846793005ul>&, std::__1::normal_distribution<double>::param_type const&) [/home/xxx/EffectiveCpp/bin/random_perf]
 22,905,902  /build/glibc-eX1tMB/glibc-2.31/math/../sysdeps/ieee754/dbl-64/e_log.c:__ieee754_log_fma [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
  6,000,007  /usr/lib/llvm-10/bin/../include/c++/v1/vector:main
  3,003,122  /usr/lib/llvm-10/bin/../include/c++/v1/random:main
  3,000,016  /home/xxx/EffectiveCpp/src/random_perf.cpp:main [/home/xxx/EffectiveCpp/bin/random_perf]
  2,500,000  /build/glibc-eX1tMB/glibc-2.31/math/./w_log_template.c:log@@GLIBC_2.29 [/usr/lib/x86_64-linux-gnu/libm-2.31.so]
  1,000,002  /usr/lib/llvm-10/bin/../include/c++/v1/memory:main
  1,000,000  ???:0x000000000494b2f0 [???]
    507,749  /build/glibc-eX1tMB/glibc-2.31/elf/dl-lookup.c:_dl_lookup_symbol_x [/usr/lib/x86_64-linux-gnu/ld-2.31.so]