TL;DR - it seems that it's possible to use C++11 static variable initialization in a thread safe manner which has the same performance characteristics as dispatch_once.
Following Stephan Lechner's answer, I wrote the simplest code that exercises the C++ static initialization flow:
// Empty placeholder type; it exists only so there is something to allocate
// lazily behind a function-local static.
class Object {};
// Returns a pointer to a lazily created Object shared by every caller.
// The pointer is a function-local static, so its initializer (the `new`
// expression) is evaluated the first time control reaches the declaration and
// is not re-evaluated on later calls. The Object is never deleted.
static Object *GetObjectCppStatic() {
  // The local's name is kept as `object` on purpose: it is part of the mangled
  // symbol names (`...E6object`) that appear in the generated assembly.
  static Object *object = new Object();
  return object;
}
// Entry point of the minimal example: makes a single call to
// GetObjectCppStatic() and ignores the returned pointer.
int main() {
  (void)GetObjectCppStatic();
  return 0;
}
Compiling this to assembly via clang++ test.cpp -O0 -fno-exceptions -S (-O0 to avoid inlining, same general code is produced for -Os, -fno-exceptions to simplify generated code), shows that GetObjectCppStatic compiles to:
## GetObjectCppStatic() as emitted at -O0 (x86-64, AT&T syntax).
## Two symbols are involved:
##   __ZZL18GetObjectCppStaticvE6object   - the function-local static pointer `object`
##   __ZGVZL18GetObjectCppStaticvE6object - its guard variable (Itanium C++ ABI `_ZGV` prefix)
## The function returns the static pointer in %rax, running the initializer
## first if the guard says it has not completed yet.
__ZL18GetObjectCppStaticv:        ## @_ZL18GetObjectCppStaticv
  .cfi_startproc
## BB#0:
  ## Frame setup, then the inline fast-path check.
  pushq   %rbp
Lcfi6:
  .cfi_def_cfa_offset 16
Lcfi7:
  .cfi_offset %rbp, -16
  movq  %rsp, %rbp
Lcfi8:
  .cfi_def_cfa_register %rbp
  ## Fast path: if the first byte of the guard variable is already non-zero,
  ## jump straight to the load at LBB2_3 without calling into the runtime.
  cmpb  $0, __ZGVZL18GetObjectCppStaticvE6object(%rip)
  jne LBB2_3
## BB#1:
  ## Slow path: pass the guard's address to __cxa_guard_acquire. A zero return
  ## value skips the initializer and goes to the load at LBB2_3.
  leaq  __ZGVZL18GetObjectCppStaticvE6object(%rip), %rdi
  callq   ___cxa_guard_acquire
  cmpl  $0, %eax
  je  LBB2_3
## BB#2:
  ## Initializer: call __Znwm (operator new(unsigned long)) with size 1, store
  ## the returned pointer into the static, then hand the guard's address to
  ## __cxa_guard_release.
  movl  $1, %eax
  movl  %eax, %edi
  callq   __Znwm
  leaq  __ZGVZL18GetObjectCppStaticvE6object(%rip), %rdi
  movq  %rax, __ZZL18GetObjectCppStaticvE6object(%rip)
  callq   ___cxa_guard_release
LBB2_3:
  ## Common exit for all three paths: load the static pointer as the return value.
  movq  __ZZL18GetObjectCppStaticvE6object(%rip), %rax
  popq  %rbp
  retq
  .cfi_endproc
We can definitely see the calls to ___cxa_guard_acquire and ___cxa_guard_release, implemented by the libc++ ABI here. Note that we didn't even have to tell clang that we are using C++11, as apparently this was supported by default even prior to that.
So we know that both forms ensure thread-safe initialization of local statics. But what about performance? The following test code checks both methods with no contention (single-threaded) and with heavy contention (multi-threaded):
#include <cstdio>
#include <dispatch/dispatch.h>
#include <mach/mach_time.h>
// Empty placeholder type; the benchmark only needs something to allocate once
// behind each lazy-initialization scheme.
class Object {};
// Times `times` back-to-back invocations of `executionBlock`, followed by one
// invocation of `finallyBlock`, and returns the elapsed wall time in seconds.
//
// `finallyBlock` runs inside the timed region, so a caller can use it to wait
// for asynchronous work that `executionBlock` merely enqueued. A non-positive
// `times` runs `executionBlock` zero times.
static double Measure(int times, void(^executionBlock)(), void(^finallyBlock)()) {
  // Fetch the tick -> nanosecond ratio before the clock starts.
  struct mach_timebase_info timebase;
  mach_timebase_info(&timebase);

  const uint64_t startTicks = mach_absolute_time();
  for (int remaining = times; remaining > 0; --remaining) {
    executionBlock();
  }
  finallyBlock();
  const uint64_t endTicks = mach_absolute_time();

  // Scale ticks to nanoseconds via numer/denom, then convert to seconds.
  const uint64_t elapsedTicks = endTicks - startTicks;
  const double elapsedNanos =
      (double)elapsedTicks * timebase.numer / timebase.denom;
  return elapsedNanos / NSEC_PER_SEC;
}
// Returns a pointer to a lazily created Object shared by every caller, using
// dispatch_once to run the allocation. The Object is never deleted.
static Object *GetObjectDispatchOnce() {
  static dispatch_once_t once;
  static Object *instance;
  // The block assigns the static directly; dispatch_once decides whether to run it.
  dispatch_once(&once, ^{ instance = new Object(); });
  return instance;
}
// Returns a pointer to a lazily created Object shared by every caller, relying
// on the initializer of a function-local static to perform the allocation the
// first time control reaches the declaration. The Object is never deleted.
static Object *GetObjectCppStatic() {
  static Object *instance = new Object();
  return instance;
}
int main() {
  printf("Single thread statistics:\n");
  printf("DispatchOnce took %g\n", Measure(10000000, ^{
    GetObjectDispatchOnce();
  }, ^{}));
  printf("CppStatic took %g\n", Measure(10000000, ^{
    GetObjectCppStatic();
  }, ^{}));
  printf("\n");
  dispatch_queue_t queue = dispatch_queue_create("queue", 
      DISPATCH_QUEUE_CONCURRENT);
  dispatch_group_t group = dispatch_group_create();
  printf("Multi thread statistics:\n");
  printf("DispatchOnce took %g\n", Measure(1000000, ^{
    dispatch_group_async(group, queue, ^{
      GetObjectDispatchOnce();
    });
  }, ^{
    dispatch_group_wait(group, DISPATCH_TIME_FOREVER);
  }));
  printf("CppStatic took %g\n", Measure(1000000, ^{
    dispatch_group_async(group, queue, ^{
      GetObjectCppStatic();
    });
  }, ^{
    dispatch_group_wait(group, DISPATCH_TIME_FOREVER);
  }));
}
Which yields the following results on x64 (times are in seconds):
Single thread statistics:
DispatchOnce took 0.025486
CppStatic took 0.0232348
Multi thread statistics:
DispatchOnce took 0.285058
CppStatic took 0.32596
So, up to measurement error, the performance characteristics of both methods appear to be similar, mostly because both of them perform double-checked locking. For dispatch_once, this happens in the _dispatch_once function:
// Excerpt of _dispatch_once with both branch bodies elided. The only logic
// shown is the up-front comparison of *predicate against ~0l (all bits set),
// which selects one of the two branches before anything else happens.
// NOTE(review): ~0l is presumably the "initialization already done" value,
// which would make the else branch the cheap path; the elided bodies are
// needed to confirm this.
void
_dispatch_once(dispatch_once_t *predicate,
    DISPATCH_NOESCAPE dispatch_block_t block)
{
  // NOTE(review): DISPATCH_EXPECT looks like a branch-prediction hint that
  // evaluates to its first argument - confirm against its definition.
  if (DISPATCH_EXPECT(*predicate, ~0l) != ~0l) {
    // ...
  } else {
    // ...
  }
}
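Whereas in the C++ static initialization flow, it happens right before the call to ___cxa_guard_acquire: the cmpb $0 / jne pair in the assembly above tests the guard variable inline and skips the runtime call entirely once initialization is done.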