I'm trying to parallelize a simulation of an Ising 2D model to get some expected values as a function of the temperature of the system. For L=48, the one-threaded version takes about 240 seconds to run 20 temperatures and 1 seed each, but the parallelized version takes about 268 seconds, which is similar.
If you take the time per seed per temperature, it results in 12 seconds for the one-threaded version and 13.4 seconds for the parallelized version. I'm looking for help with my code because I don't understand these durations. I thought that the parallelized version would split one temperature among all threads and therefore should take about 30 seconds to complete.
I need to run the simulation for 50 temperatures and 200 seeds each, for 5 values of L. It would be helpful to reduce the compute time, because otherwise it could take 20 hours for L=48 and some days for L=72.
I'm using an i7-10700KF (8 cores, 16 logical threads).
program Ising
    use omp_lib
    implicit none
    integer L, seed, i, j, seed0, nseed,k
    parameter (L=48)
    integer s(1:L, 1:L)
    integer*4 pbc(0:L+1), mctot, N, mcd, mcini, difE
    real*8 genrand_real2, magne, energ, energia, temp, temp1, DE
    real*8 mag, w(-8:8)
    real*8 start, finish
    real*8 sum, sume, sume2, summ, summ2, sumam, vare, varm, maxcv, maxx
    real*8 cv, x, Tmaxcv, Tmaxx
    integer irand, jrand
11  format(10(f20.6))
! Initialize variables
    mctot = 80000
    mcd = 20
    mcini = 8000
    N = L*L
    seed0 = 20347880
    nseed = 20
    maxcv=0.d0
    maxx=0.d0
! Initialize vector pbc
    pbc(0) = L
    pbc(L+1) = 1
    do i = 1, L
        pbc(i) = i
    end do
! Initialize matrix s with random values
    do i = 1, L
        do j = 1, L
            if (genrand_real2() < 0.5) then
                s(i,j) = 1
            else
                s(i,j) = -1
            endif
        end do
    end do
! Metropolis algorithm
    open(1, file='Expectation values.dat')
    start = omp_get_wtime()
    write(1,*) '#Temp,       ','E,       ','E2,       ','M,       ','M2,       ','|M|,       ','VarE,       ','VarM,       ',&
    'Cv,       ','X,       '
!Start loop to calculate for different temperatures
!$OMP PARALLEL PRIVATE(s,seed,w,energia,difE,irand,jrand,temp,mag,sum,sume,sume2,summ,summ2,sumam,vare,varm,cv,x) 
        temp1 = 1.59d0
!$OMP DO ordered schedule(dynamic)
    do k = 1, 10
        temp = temp1 + (0.01d0*k)
!Define the matrix w, which contains the values of the Boltzmann function for each temperature, so as not to have to calculate them each iteration
        do i = -8, 8
            w(i) = dexp(-i/temp)
        end do
        write(*,*) "Temperature: ", temp, "Thread", omp_get_thread_num()
        sum = 0.d0
        sume = 0.d0
        sume2 = 0.d0
        summ = 0.d0
        summ2 = 0.d0
        sumam = 0.d0
        do seed = seed0, seed0 + nseed-1, 1
            call init_genrand(seed)
            call reinicia(s,l)
            energia = energ(s,l,pbc)
            do i = 1, mctot
                do j = 1, N
                    irand = int(genrand_real2()*L) + 1
                    jrand = int(genrand_real2()*L) + 1
                    difE = int(DE(s,l,irand,jrand,pbc))
                    if (difE < 0) then
                        s(irand,jrand) = -s(irand,jrand)
                        energia = energia + difE
                    else if (genrand_real2() < w(int(difE))) then
                        s(irand,jrand) = -s(irand,jrand)
                        energia = energia + difE
                    endif
                end do
                if ((i > mcini).and.(mcd*(i/mcd)==i)) then
                    mag= magne(s,l)
                    sum = sum + 1.d0
                    sume = sume + energia
                    sume2 = sume2 + energia**2
                    summ = summ + mag
                    summ2 = summ2 + mag**2
                    sumam = sumam + abs(mag)
                endif
            end do
        end  do
!Energy
        sume=sume/(sum*N)
        sume2=sume2/(sum*N*N)
!Magnetitzation
        summ = summ/(sum*N)
        sumam=sumam/(sum*N)
        summ2=summ2/(sum*N*N)
!Variances
        vare = dsqrt(sume2-sume*sume)/dsqrt(sum)
        varm = dsqrt(summ2-summ*summ)/dsqrt(sum)
!Cv
        cv = (N*(sume2-sume*sume))/temp**2
        if (cv.gt.maxcv) then
            maxcv=cv
            Tmaxcv=temp
        endif
!X
        x = (N*(summ2-summ*summ))/temp
        if (x.gt.maxx) then
            maxx=x
            Tmaxx=temp
        endif
        write(1,11) temp,sume,sume2,summ,summ2,sumam,vare,varm,cv,x
    end do
!$OMP END DO
!$OMP END PARALLEL
    finish = omp_get_wtime()
    close(1)
    print*, "Time: ",(finish-start),"Seconds"
end program Ising
! Functions
!Function that calculates the energy of the matrix s
real*8 function energ(S,L, pbc)
    implicit none
    integer s(1:L, 1:L), i, j, L
    integer*4 pbc(0:L+1)
    real*8 ene
    ene = 0.0d0
    do i = 1, L
        do j = 1, L
            ene = ene - s(i,j) * s(pbc(i+1),j) - s(i,j) * s(i,pbc(j+1))
        end do
    end do
    energ = ene
    return
end function energ
!Function that calculates the difference in energy that occurs when the spin of position (i, j) is changed
real*8 function DE(S,L,i,j,pbc)
    implicit none
    integer s(1:L, 1:L), i, j, L, difE
    integer*4 pbc(0:L+1)
    real*8 suma
    difE = 0
    suma = 0.0d0
        suma = suma + s(pbc(i-1),j) + s(pbc(i+1),j) + s(i,pbc(j-1)) + s(i,pbc(j+1))
        difE = difE + int(2 * s(i,j) * suma)
    DE = difE
    return
end function DE
!Function that calculates the magnetization of the matrix s
real*8 function magne(S,L)
    implicit none
    integer s(1:L, 1:L),L
    magne = sum(s)
    return
end function magne
! SUBRUTINES
!Subroutine that resets the matrix s with random values
subroutine reinicia(S,L)
    implicit none
    integer s(1:L, 1:L), i,j,L
    real*8 genrand_real2
    do i = 1, L
        do j = 1, L
            if (genrand_real2() < 0.5) then
                s(i,j) = 1
            else
                s(i,j) = -1
            endif
        end do
    end do
    return
end subroutine
I have tried parallelizing the seeds loop instead of the temperatures, but it lasts almost the same, so i think i'm not parallelizing it correctly, because it looks a nice code to parallelize.
The other option I thought of is to manually parallelize the simulation. I could do this by compiling 16 programs, each of which handles a different range of temperatures. Then I could run all of the programs concurrently, so each program would get its own thread. However, this approach would require a lot of extra RAM.