I have some Fortran code that performs a matrix row exchange. In the legacy code, it is written as
do J = 1,N
   save_val  = B(IROW,J)
   B(IROW,J) = B(JCOL,J)
   B(JCOL,J) = save_val
end do
This exchanges row IROW with row JCOL (IROW and JCOL are integers). However, the function of that block of code is not intuitive. In my opinion, it would be more intuitive, or at least aid readability to write it as:
save_row  = B(IROW,:)
B(IROW,:) = B(JCOL,:)
B(JCOL,:) = save_row
(it is more clear that rows are being moved).
From the image attached, it is clear that the looping methods provide better performance relative to the array operations. Why is this? Is it because this becomes a memory limited process when the number of elements in the array gets large? (i.e. the array would get "chunked") Or is it something else?
Compiling as gfortran -O3 test.f95. Adding the flag fstack-arrays did not make significant difference.
program test
  implicit none
  integer  :: N
  integer  :: M
  integer  :: loop_max = 1e7
  integer  :: i                    ! loop index
  real     :: t1, t2
  real     :: time_loop, time_array, time_sub_loop, time_sub_array
  real, dimension(:, :), allocatable   :: B
  real, dimension(:)   , allocatable   :: save_row
  real :: save_val
  integer :: IROW, J, JCOL
  character(*), parameter :: format_header = '(A5, 1X, 4(A12,1X))'
  character(*), parameter :: format_data = '(I5, 1X, 4(ES12.5, 1X))'
  open(1, file = 'TimingRowExchange.txt', status = 'unknown')
  write(1, format_header) 'N', 't_loop', 't_array', 't_sub_loop', 't_sub_array'
  do N = 1, 100
    M = N + 1
    allocate(B(N,N), save_row(M))
    call random_number(B)
    JCOL = 1
    IROW = 3
    call CPU_time(t1)
    do i = 1, loop_max
      do J = 1,N
        save_val  = B(IROW,J)
        B(IROW,J) = B(JCOL,J)
        B(JCOL,J) = save_val
      end do
    end do
    call CPU_time(t2)
    time_loop = t2 - t1
    ! write ( *, * ) 'Using Loop =', t2 - t1
    call CPU_time(t1)
    do i = 1, loop_max
        save_row(1:N) = B(IROW,:)
        B(IROW,:)     = B(JCOL,:)
        B(JCOL,:)     = save_row(1:N)
    end do
    call CPU_time(t2)
    time_array = t2 - t1
    ! write ( *, * ) 'Using Array =', t2 - t1
    call CPU_time(t1)
    do i = 1, loop_max
      call exchange_rows_loop(B, JCOL, IROW)
    end do
    call CPU_time(t2)
    time_sub_loop = t2 - t1
    ! write ( *, * ) 'Loop Subrout =', t2 - t1
    call CPU_time(t1)
    do i = 1, loop_max
      call exchange_rows_array(B, JCOL, IROW)
    end do
    call CPU_time(t2)
    time_sub_array = t2 - t1
    ! write ( *, * ) 'Array Subrout =', t2 - t1
    deallocate(B, save_row)
    write(1, format_data) N, time_loop, time_array, time_sub_loop, time_sub_array
  end do
contains
  subroutine print_mat(A)
    implicit none
    real, dimension(:,:), intent(in) :: A
    integer :: n
    n = size(A,1) ! # of rows
    do i = 1,n
      print*, A(i,:)
    end do
    print*,
  end subroutine print_mat
  subroutine exchange_rows_loop(A, row1, row2)
    implicit none
    real, dimension(:,:), intent(in out) :: A
    integer,              intent(in)     :: row1, row2
    integer :: J
    real :: save_val
    do J = 1, size(A,1)
      save_val  = A(row1,J)
      A(row1,J) = A(row2,J)
      A(row2,J) = save_val
    end do
  end subroutine exchange_rows_loop
  subroutine exchange_rows_array(A, row1, row2)
    implicit none
    real, dimension(:,:), intent(in out) :: A
    integer,              intent(in)     :: row1, row2
    real, dimension(size(A,1))           :: save_row
    save_row  = A(row1,:)
    A(row1,:) = A(row2,:)
    A(row2,:) = save_row
  end subroutine exchange_rows_array
end program test

 
    
 
    