In the following example, the C++11 threads take about 50 seconds to execute, but the OpenMP threads take only 5 seconds. Any ideas why? (I can assure you it still holds true if you do real work instead of calling doNothing, or if you run the two cases in a different order, etc.) I'm on a 16-core machine, too.
#include <iostream>
#include <omp.h>
#include <chrono>
#include <vector>
#include <thread>
using namespace std;
// Empty placeholder task: performs no work and returns immediately.
void doNothing()
{
}
/// @brief Times repeated dispatch of 16 no-op tasks using one of two threading mechanisms.
///
/// Each of the 99,999 rounds runs doNothing() sixteen times, either on freshly
/// created std::thread objects that are all joined before the next round
/// (algorithmToRun == 1) or inside an OpenMP `parallel for` region
/// (algorithmToRun == 2). Any other value only spins the empty outer loop.
///
/// @param algorithmToRun 1 selects std::thread, 2 selects OpenMP.
/// @return Elapsed time in whole seconds; the fractional part is truncated.
int run(int algorithmToRun)
{
    constexpr int kRoundLimit = 100000;  // outer loop starts at 1, so 99,999 rounds execute
    constexpr int kTaskCount = 16;       // tasks launched per round by either mechanism

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments the way a system_clock measurement can.
    const auto startTime = std::chrono::steady_clock::now();

    // Reuse one container across rounds: clear() keeps the capacity, so the
    // bookkeeping does not allocate inside the timed loop.
    std::vector<std::thread> workers;
    workers.reserve(kTaskCount);

    for (int round = 1; round < kRoundLimit; ++round)
    {
        if (algorithmToRun == 1)
        {
            for (int i = 0; i < kTaskCount; ++i)
            {
                // Construct the thread in place instead of moving a temporary in.
                workers.emplace_back(doNothing);
            }
            for (auto& worker : workers)
            {
                worker.join();
            }
            workers.clear();
        }
        else if (algorithmToRun == 2)
        {
            #pragma omp parallel for num_threads(kTaskCount)
            for (int i = 0; i < kTaskCount; ++i)
            {
                doNothing();
            }
        }
    }

    const std::chrono::duration<double> elapsedSeconds =
        std::chrono::steady_clock::now() - startTime;

    // The interface returns int, so make the truncation to whole seconds explicit.
    return static_cast<int>(elapsedSeconds.count());
}
/// Runs the std::thread variant (run(1)) followed by the OpenMP variant
/// (run(2)) and prints each returned value on its own line.
int main()
{
    const int threadSeconds = run(1);
    const int openMpSeconds = run(2);

    std::cout << threadSeconds << std::endl
              << openMpSeconds << std::endl;
    return 0;
}
 
     
     
    