I found this very recent paper https://arxiv.org/pdf/1901.01926.pdf, where all is nicely typeset an explained.
Apparently there is always a compromise between time and space complexity and there is no algorithm (yet?) that is O(n)-in-time and in-place.
(The paper claims to be the first subquadratic deterministic in-place algorithm.)
The algorithm you posted is not listed in the paper and would be O(n) in time and O(n) in space (i.e. out-of-place).
I post it here for reference, also to check the correctness of other implementations.
I used zero-base indexing for simplicity.
// O(n) out-of-place algorithm
template<class It, class Size, class ItOut>
void inverse_permutation_n(It first, Size n, ItOut d_first){
    for(Size i = 0; i != n; ++i){
        d_first[first[i]] = i;
    }
}
Then there is the algorithm that is listed as "folklore" in the table translated from the pseudocode in the paper (Listing 3 in the paper).
#include<algorithm> // for std::min
template<class It, class Index>
void reverse_cycle(It t, Index start){
    auto cur = t[start];
    auto prev = start;
    while( cur != start ){
        auto next = t[cur];
        t[cur] = prev;
        prev = cur;
        cur = next;
    }
    t[start] = prev;
}
template<class It, class Index>
auto cycle_leader(It t, Index start){
    auto cur = t[start];
    auto smallest = start;
    while(cur != start){
        smallest = std::min(smallest, cur);
        cur = t[cur];
    }
    return smallest;
}
// O(n²) in-place algorithm
template<class It, class Size>
void inverse_permutation_n(It first, Size n){
    for(Size i = 0; i != n; ++i){
        if( cycle_leader(first, i) == i ) reverse_cycle(first, i);
    }
}
The above algorithm is O(n²)-time in the average case.
The reason is that for each point you have to follow a cycle of length O(n) to find the leader.
Then you can build on top of this by putting a shortcut in the cycle leader search, once the cycle leader is less than the starting point the search returns false.
template<class It, class Index>
auto cycle_leader_shortcut(It t, Index start){
    auto cur = t[start];
    while(cur != start){
        if(cur < start) return false;
        cur = t[cur];
    }
    return true;
}
// O(n log n) in-place algorithm
template<class It, class Size>
void inverse_permutation_shortcut_n(It first, Size n){
    for(Size i = 0; i != n; ++i){
        if( cycle_leader_shortcut(first, i) ) reverse_cycle(first, i);
    }
}
This algorithm is fortunately O(N log N) (in average, I think).
The reason is that the cycles iteration become shorter as the point in the sequence increases because it is more likely that a point will have a low value that was already reversed.
This is the benchmark and the result:
#include<numeric> // for iota
#include<random>
// initialize rng
std::random_device rd;
std::mt19937 g(rd());
static void BM_inverse_permutation(benchmark::State& state){
    // allocate memory and initialize test buffer and reference solution
    auto permutation = std::vector<std::size_t>(state.range(0)); 
    std::iota(permutation.begin(), permutation.end(), 0);
    auto reference_inverse_permutation = std::vector<std::size_t>(permutation.size());
    for(auto _ : state){
        state.PauseTiming(); // to random shuffle and calculate reference solution
        std::shuffle(permutation.begin(), permutation.end(), g);
    //    inverse_permutation_n(permutation.cbegin(), permutation.size(), reference_inverse_permutation.begin());
        benchmark::DoNotOptimize(permutation.data());
        benchmark::ClobberMemory();
        state.ResumeTiming();
        inverse_permutation_n(permutation.begin(), permutation.size());
        benchmark::DoNotOptimize(permutation.data());
        benchmark::ClobberMemory();
    //  state.PauseTiming(); // to check that the solution is correct
    //  if(reference_inverse_permutation != permutation) throw std::runtime_error{""};
    //  state.ResumeTiming();
    }
    state.SetItemsProcessed(state.iterations()*permutation.size()                    );
    state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_inverse_permutation)->RangeMultiplier(2)->Range(8, 8<<10)->Complexity(benchmark::oNSquared);
static void BM_inverse_permutation_shortcut(benchmark::State& state){
    // allocate memory and initialize test buffer and reference solution
    auto permutation = std::vector<std::size_t>(state.range(0)); 
    std::iota(permutation.begin(), permutation.end(), 0);
    auto reference_inverse_permutation = std::vector<std::size_t>(permutation.size());
    for(auto _ : state){
        state.PauseTiming(); // to random shuffle and calculate reference solution
        std::shuffle(permutation.begin(), permutation.end(), g);
    //    inverse_permutation_n(permutation.cbegin(), permutation.size(), reference_inverse_permutation.begin());
        benchmark::DoNotOptimize(permutation.data());
        benchmark::ClobberMemory();
        state.ResumeTiming();
        inverse_permutation_shortcut_n(permutation.begin(), permutation.size());
        benchmark::DoNotOptimize(permutation.data());
        benchmark::ClobberMemory();
    //    state.PauseTiming(); // to check that the solution is correct
    //    if(reference_inverse_permutation != permutation) throw std::runtime_error{""};
    //    state.ResumeTiming();
    }
    state.SetItemsProcessed(state.iterations()*permutation.size()                    );
    state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_inverse_permutation_shortcut)->RangeMultiplier(2)->Range(8, 8<<10)->Complexity(benchmark::oNLogN);
BENCHMARK_MAIN();
$ c++ a.cpp -O3 -DNDEBUG -lbenchmark && ./a.out
2021-03-30 21:16:55
Running ./a.out
Run on (12 X 4600 MHz CPU s)
CPU Caches:
  L1 Data 32K (x6)
  L1 Instruction 32K (x6)
  L2 Unified 256K (x6)
  L3 Unified 12288K (x1)
Load Average: 1.26, 1.80, 1.76
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-----------------------------------------------------------------------------------------------
Benchmark                                     Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------------------------
BM_inverse_permutation/8                    476 ns          478 ns      1352259 items_per_second=16.7276M/s
BM_inverse_permutation/16                   614 ns          616 ns      1124905 items_per_second=25.9688M/s
BM_inverse_permutation/32                  1106 ns         1107 ns       630398 items_per_second=28.9115M/s
BM_inverse_permutation/64                  2929 ns         2931 ns       238236 items_per_second=21.835M/s
BM_inverse_permutation/128                10748 ns        10758 ns        64708 items_per_second=11.898M/s
BM_inverse_permutation/256                41556 ns        41582 ns        16600 items_per_second=6.15656M/s
BM_inverse_permutation/512               164006 ns       164023 ns         4245 items_per_second=3.12151M/s
BM_inverse_permutation/1024              621341 ns       620840 ns         1056 items_per_second=1.64938M/s
BM_inverse_permutation/2048             2468060 ns      2466060 ns          293 items_per_second=830.474k/s
BM_inverse_permutation/4096            10248540 ns     10244982 ns           93 items_per_second=399.805k/s
BM_inverse_permutation/8192            55926122 ns     55926230 ns           10 items_per_second=146.479k/s
BM_inverse_permutation_BigO                0.82 N^2        0.82 N^2  
BM_inverse_permutation_RMS                   18 %            18 %    
BM_inverse_permutation_shortcut/8           499 ns          501 ns      1193871 items_per_second=15.9827M/s
BM_inverse_permutation_shortcut/16          565 ns          567 ns      1225056 items_per_second=28.2403M/s
BM_inverse_permutation_shortcut/32          740 ns          742 ns       937909 items_per_second=43.1509M/s
BM_inverse_permutation_shortcut/64         1121 ns         1121 ns       619016 items_per_second=57.0729M/s
BM_inverse_permutation_shortcut/128        1976 ns         1977 ns       355982 items_per_second=64.745M/s
BM_inverse_permutation_shortcut/256        3644 ns         3645 ns       191387 items_per_second=70.2375M/s
BM_inverse_permutation_shortcut/512        7282 ns         7288 ns        95434 items_per_second=70.2481M/s
BM_inverse_permutation_shortcut/1024      14732 ns        14752 ns        47417 items_per_second=69.4165M/s
BM_inverse_permutation_shortcut/2048      30590 ns        30398 ns        23079 items_per_second=67.3728M/s
BM_inverse_permutation_shortcut/4096      64374 ns        64039 ns        10766 items_per_second=63.9613M/s
BM_inverse_permutation_shortcut/8192     196961 ns       195786 ns         3646 items_per_second=41.8416M/s
BM_inverse_permutation_shortcut_BigO       1.74 NlgN       1.73 NlgN 
BM_inverse_permutation_shortcut_RMS          27 %            27 %