Following is a program which aims to sum two matrices of size 128 * 128 splitting the tasks into 8 process, hence each process sums 16 rows of matrices. 
int main(int argc, char** argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int **matrixA;
    int **matrixB;
    int **resultMatrix = malloc(ROWS * sizeof(int *));
    if (rank == 0) {
       matrixA = generateMatrix();
       matrixB = generateMatrix();
       for (i = 0; i < COLS; i++)
          resultMatrix[i] = malloc(COLS * sizeof(int));
    }
    int **auxA = malloc(16 * sizeof(int *));
    int **auxB = malloc(16 * sizeof(int *));
    int **auxC = malloc(16 * sizeof(int *));
    int i;
    int row, col;
    for (i = 0; i < 16; i++)
    {
       auxA[i] = malloc(COLS * sizeof(int));
       auxB[i] = malloc(COLS * sizeof(int));
       auxC[i] = malloc(COLS * sizeof(int));
    }
    MPI_Scatter(&(matrixA[0][0]), 16*COLS, MPI_INT, &(auxA[0][0]), 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(&(matrixB[0][0]), 16*COLS, MPI_INT, &(auxB[0][0]), 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    char hostname[HOST_NAME_MAX];
    if (! gethostname(hostname, sizeof hostname) == 0)
        perror("gethostname");
    for (row = 0; row < 16; row++)
    {
       for (col = 0; col < COLS; col++)
       {
           auxC[row][col] = auxA[row][col] + auxB[row][col];
       }
    }
    printf("Process Node %s %d\n done",hostname, rank);
    MPI_Gather(&auxC[0][0], 16*COLS, MPI_INT, &(resultMatrix[0][0]), 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
       printMatrix(resultMatrix);
    }
    MPI_Finalize();
}   
Getting the following error message
[xxx115:12933] *** Process received signal ***
[xxx115:12933] Signal: Segmentation fault (11)
[xxx115:12933] Signal code: Address not mapped (1)
[xxx115:12933] Failing at address: 0x2
[xxx115:12934] *** Process received signal ***
[xxx115:12934] Signal: Segmentation fault (11)
[xxx115:12934] Signal code: Address not mapped (1)
[xxx115:12934] Failing at address: 0x2
[xxx115:12936] *** Process received signal ***
[xxx115:12936] Signal: Segmentation fault (11)
[xxx115:12936] Signal code: Address not mapped (1)
[xxx115:12936] Failing at address: 0x2
[xxx115:12933] [ 0] /lib64/libpthread.so.0[0x32ff00f7e0]
[xxx115:12933] [ 1] ./hw5[0x400bd4]
[xxx115:12933] [ 2] /lib64/libc.so.6(__libc_start_main+0xfd)[0x32fe41ed1d]
[xxx115:12933] [ 3] ./hw5[0x400879]
[xxx115:12936] [ 0] /lib64/libpthread.so.0[0x32ff00f7e0]
[xxx115:12936] [ 1] ./hw5[0x400bd4]
[xxx115:12936] [ 2] [wsu115:12933] *** End of error message ***
As i observe from other questions like this and this, segmentation fault arises when array makes an attempt to access memory out of its bounds.
However, i am not what part of program is causing this error. What can i do to fix the problem.
EDIT
After going through the comments, I realized that previous program was cluttered with lot of pointers. Here is a simplified version that is working without any error, but still not giving the expected output.
int **generateMatrix() 
{
    int **matrix = (int **)malloc(ROWS * sizeof(int *));
    int i;
    for (i = 0; i < ROWS; i++)
         matrix[i] = (int *)malloc(COLS * sizeof(int));
    int row, col;
    for (row = 0; row < ROWS; row++) 
    {
        for (col = 0; col < COLS; col++) 
        {
            matrix[row][col] = col; 
        }
    }
    return matrix;
}
int main(int argc, char** argv)
{
    int rank, world_size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    int **matrixA;
    int **matrixB;
    int resultMatrix[128][128];
    int auxA[16][128];
    int auxB[16][128];
    int auxC[16][128];
    if (rank == 0) {
        matrixA = generateMatrix();
        matrixB = generateMatrix();
    }
    MPI_Scatter(matrixA, 16*COLS, MPI_INT, auxA, 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(matrixB, 16*COLS, MPI_INT, auxB, 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    char hostname[HOST_NAME_MAX];
    if (! gethostname(hostname, sizeof hostname) == 0)
      perror("gethostname");
    int row, col; 
    for (row = 0; row < 16; row++)
    {
        for (col = 0; col < COLS; col++)
        {
            auxC[row][col] = auxA[row][col] + auxB[row][col];
        }
    }
    printf("Process Node %s %d done\n",hostname, rank);
    MPI_Gather(auxC, 16*COLS, MPI_INT, resultMatrix, 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        printMatrix(resultMatrix);
    }
    MPI_Finalize();
    return 0;
}   
