I am writing a multi-threaded program that traverses an n x n matrix diagonal by diagonal, where the elements of each diagonal slice are processed in parallel (one thread per element), as shown in the code below:
int main(int argc, char * argv[] )
{   
  /* VARIABLES INITIALIZATION HERE */
  gettimeofday(&start_t, NULL); //start timing
  for (int slice = 0; slice < 2 * n - 1; ++slice)
  {  
    z = slice < n ? 0 : slice - n + 1;
    int L = 0;
    pthread_t threads[slice-z-z+1];
    struct thread_data td[slice-z-z+1];
    for (int j=z; j<=slice-z; ++j)
    {
      td[L].index= L;
      printf("create:%d\n", L );
      pthread_create(&threads[L],NULL,mult_thread,(void *)&td[L]);
      L++;
    }
    for (int j=0; j<L; j++) 
    {
      pthread_join(threads[j],NULL);
    }
  }     
  gettimeofday(&end_t, NULL); 
  printf("Total time taken by CPU: %ld \n", ( (end_t.tv_sec - start_t.tv_sec)*1000000 + end_t.tv_usec - start_t.tv_usec));
  return (0);
}
/*
 * Thread entry point passed to pthread_create.
 *
 * t: pointer to the struct thread_data prepared by the creating thread.
 *
 * Prints the index stored in that structure and always returns NULL.
 */
void *mult_thread(void *t)
{
  struct thread_data *args = t;

  /* SOME ADDITIONAL CODE LINES HERE */ 

  printf("ThreadFunction:%d\n", args->index);
  return NULL;
}
The problem is that this multithreaded implementation performs much worse than the serial (naive) implementation.
Are there any adjustments I could make to improve the performance of the multithreaded version?
 
     
    