I have a code that uses FFT heavily, and a quick profiling shows indeed that the computer spends a significant portion of wall-clock time in my FFT functions. I have a simple attempt of using openMP here, however the code shows no speed up really compared to the series code. I am providing a simple example code here:
Main.cpp
static const int nx = 128;
static const int ny = 128; 
static const int nz = 128;
Eigen::initParallel();
Eigen::setNbThreads(5); 
Eigen::Tensor<double, 3> Re(nx,ny,nz); 
for(int i = 0; i< nx; i++){
        for(int j = 0; j< ny; j++){
            for(int k = 0; k< nz; k++){ 
                Re(k,i,j) = //Initialized w/ some function
            }
      }
}
Eigen::Tensor<std::complex<double>, 3> Im(nx,ny,nz); 
double start_time, run_time;
start_time = omp_get_wtime();
r2cfft3d(Re, Im); //function take real input and take forward fft to output complex result
run_time = omp_get_wtime() - start_time;
cout << "Time in s: " <<  run_time << "s\n";
The function looks like:
void r2cfft3d(Eigen::Tensor<double, 3>& rArr, Eigen::Tensor<std::complex<double>, 3>& cArr){    
    #pragma omp parallel 
    {
    fftw_complex *input_array;
    fftw_complex *output_array;
    input_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
    output_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
    #pragma omp parallel for
    for (int i = 0; i < nx; ++i) {
        for (int j = 0; j < ny; ++j) {
            for (int k = 0; k < nz; ++k) {
                {
                    input_array[k + nz * (j + ny * i)][REAL] = rArr(k,i,j);
                    input_array[k + nz * (j + ny * i)][IMAG] = 0;
                }
            }
        }
    }
    #pragma omp critical (make_plan)
    {
        fftw_plan forward = fftw_plan_dft_3d(nx, ny, nz, input_array, output_array, FFTW_FORWARD, FFTW_ESTIMATE);
        fftw_execute(forward);
        fftw_destroy_plan(forward);
        fftw_cleanup();
    }
    #pragma omp for nowait
     for(int i=0; i < nx; ++i){
        for(int j=0; j < ny; ++j) {
            for(int k=0; k < nz; ++k) {
                cArr(k,i,j).real(output_array[k + nz * (j + ny * i)][REAL]);
                cArr(k,i,j).imag(output_array[k + nz * (j + ny * i)][IMAG]);
                
            }
        }
    }
    
    fftw_free(input_array);
    fftw_free(output_array);
    }
}
The serial code returns an wall time of : Time in s: 0.026634s while the parallel code returns Time in s: 0.287648s
I am fairly new to OpenMP, and thought this would be more straightforward. Is there a better way to rewrite this function? Thanks!
EDIT: So, this is my second attempt, where the runtime has gotten better but not SIGNICANTLY better
void r2cfft3d(Eigen::Tensor<double, 3>& rArr, Eigen::Tensor<std::complex<double>, 3>& cArr){    
    #pragma omp parallel 
    {
    fftw_complex *input_array;
    fftw_complex *output_array;
    input_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
    output_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
    
    #pragma omp for
    for (int i = 0; i < nx; ++i) {
        for (int j = 0; j < ny; ++j) {
            for (int k = 0; k < nz; ++k) {
                {
                    input_array[k + nz * (j + ny * i)][REAL] = rArr(k,i,j);
                    input_array[k + nz * (j + ny * i)][IMAG] = 0;
                }
            }
        }
    }
    #pragma omp single //#pragma omp critical (make_plan)
    {
        fftw_plan forward = fftw_plan_dft_3d(nx, ny, nz, input_array, output_array, FFTW_FORWARD, FFTW_ESTIMATE);
        fftw_execute(forward);
        fftw_destroy_plan(forward);
        fftw_cleanup();
    }
    
    #pragma omp for nowait
     for(int i=0; i < nx; ++i){
        for(int j=0; j < ny; ++j) {
            for(int k=0; k < nz; ++k) {
                cArr(k,i,j).real(output_array[k + nz * (j + ny * i)][REAL]);
                cArr(k,i,j).imag(output_array[k + nz * (j + ny * i)][IMAG]);
                
            }
        }
    }
    
    fftw_free(input_array);
    fftw_free(output_array);
    
    }
  
    
}
As Jérôme Richard noted that the use nowait here might be a bad idea. I notice it reduces the runtime by few seconds which may has to do with the freeing of memory. Thanks all!
Edit 2:
I rewrote the whole function and trying to make things more streamlined and avoid use of for loops. Now, for the serial code both of the attempts return same/similar serial time: Serial Time in SEC: 0.039189s
However, by adding openmp parallelization attemp the time again is SLOWED down: Parallel Time in SEC: 0.0934717s
#pragma omp parallel
    {   
        fftw_complex *input_array;
        input_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
        memcpy(input_array, rArr.data(), nx*ny*nz * sizeof(fftw_complex));
        fftw_complex *output_array;
        output_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
        #pragma omp single
        {
            fftw_plan forward = fftw_plan_dft_3d(nx, ny, nz, input_array, output_array, FFTW_FORWARD, FFTW_ESTIMATE);
            fftw_execute(forward);
            fftw_destroy_plan(forward);
            fftw_cleanup();
        }
        
        memcpy(cArr.data(),output_array, nx*ny*nz * sizeof(fftw_complex));
        
        
        
        fftw_free(input_array);
        fftw_free(output_array);
    }
}
Edit 3: I finally used fftw parallel combined with OpenMP following the instructions from: https://www.fftw.org/fftw3_doc/Usage-of-Multi_002dthreaded-FFTW.html
Running with the flags:
-lfftw3 -lfftw3_threads -fopenmp
Example
fftw_complex *input_array;
        input_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
        
        memcpy(input_array, rArr.data(), nx*ny*nz * sizeof(fftw_complex));
        fftw_complex *output_array;
        output_array = (fftw_complex*) fftw_malloc(nx*ny*nz * sizeof(fftw_complex));
        fftw_init_threads();
        fftw_plan_with_nthreads(omp_get_max_threads());
        
        fftw_plan forward = fftw_plan_dft_3d(nx, ny, nz, input_array, output_array, FFTW_FORWARD, FFTW_ESTIMATE);
            
fftw_execute(forward);
fftw_destroy_plan(forward);
fftw_cleanup();
        
memcpy(cArr.data(),output_array, nx*ny*nz * sizeof(fftw_complex));
        
        
        
fftw_free(input_array);
fftw_free(output_array);
This returns a Parallel Time in SEC: 0.0281432s
But now I am not sure if this is correct cause it seems too straightforward?