The code I wrote in C for matrix multiplication with MPI reports a global time of approximately 5 seconds, but when I run the same computation in Python with mpi4py it takes only a few milliseconds. What is the problem with MPI in C? It doesn't feel like 5 seconds when I run it in the Linux shell: the output appears almost immediately, yet the global time is still reported as 5 seconds. The C code is below.
#define N 4
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "mpi.h"
void print_results(char *prompt, int a[N][N]);
int main(int argc, char *argv[])
{
    int i, j, k, rank, size, tag = 99, sum = 0;
    int a[N][N];
    int b[N][N];
    int c[N][N];
    int aa[N],cc[N];
    int row,col;
    int dest = 0;
    int source;
    double time1, time2, duration, global;
    MPI_Status status;
    MPI_Init(&argc, &argv);
    time1 = MPI_Wtime();
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if(rank == 0){
        
        printf("enter the number of row =");    
        scanf("%d",&row);    
        printf("enter the number of column =");    
        scanf("%d",&col);    
        srand(time(NULL));
        for(i=0;i<row;i++) {
            for(j=0;j<col;j++){
                a[i][j] = rand() % 10;
            }
        }
        srand(time(NULL));
        for(i=0;i<row;i++){
            for(j=0;j<col;j++){
                b[i][j] = rand() % 10;
            }
        }
    }
    MPI_Scatter(a, N*N/size, MPI_INT, aa, N*N/size, MPI_INT,0,MPI_COMM_WORLD);
    MPI_Bcast(b, N*N, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
          for (i = 0; i < N; i++)
            {
                    for (j = 0; j < N; j++)
                    {
                            sum = sum + aa[j] * b[j][i];               
                    }
                    cc[i] = sum;
                    sum = 0;
            }
    MPI_Gather(cc, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    time2 = MPI_Wtime();
    duration = time2 - time1;
    MPI_Reduce(&duration,&global,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    if(rank == 0) {
        printf("Global runtime is %f\n",global);
    }
    printf("Runtime at %d is %f \n", rank,duration);       
    MPI_Finalize();
    if (rank == 0)                      
      print_results("C = ", c);
}
/*
 * Write a labelled N x N integer matrix to stdout.
 *
 * prompt  label printed on its own line, preceded by two blank lines
 * a       matrix to print; each row goes on one line, every value prefixed
 *         by a single space
 *
 * Two newlines are emitted after the last row.
 */
void print_results(char *prompt, int a[N][N])
{
    printf("\n\n%s\n", prompt);

    for (int r = 0; r < N; ++r) {
        const int *cell = a[r];
        const int *row_end = cell + N;

        while (cell != row_end) {
            printf(" %d", *cell++);
        }
        putchar('\n');
    }

    fputs("\n\n", stdout);
}
The output it produces is:
4
4
enter the number of row =enter the number of column =Global runtime is 5.975327
Runtime at 0 is 1.493793 
Runtime at 1 is 1.493793 
Runtime at 2 is 1.493877 
Runtime at 3 is 1.493865 
C = 
 78 83 142 116
 128 138 236 194
 39 49 112 71
 96 109 204 156
Please let me know if there is a problem with the code.
 
    