I am basically writing code to count if a pair sum is even(among all pairs from 1 to 100000). I wrote a code using pthreads and without pthreads. But the code with pthreads is taking more time than the serial one. Here is my serial code
#include<bits/stdc++.h>
using namespace std;
int main()
{
  long long sum = 0, count = 0, n = 100000;
  auto start = chrono::high_resolution_clock::now();
  for(int i = 1; i <= n; i++)
    for(int j = i-1; j >= 0; j--)
    {
        sum = i + j;
        if(sum%2 == 0)
            count++;
    }
  cout<<"count is "<<count<<endl;
  auto end = chrono::high_resolution_clock::now();
  double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
  time_taken *= 1e-9;
  cout << "Time taken by program is : " << fixed << time_taken << setprecision(9)<<" secs"<<endl;
  return 0;
}
and here is my parallel code
#include<bits/stdc++.h>
using namespace std;
#define MAX_THREAD 3
long long cnt[5] = {0};
long long n = 100000;
int work_per_thread;
int start[] = {1, 60001, 83001, 100001};
void *count_array(void* arg)
{
   int t = *((int*)arg);
   long long sum = 0;
   for(int i = start[t]; i < start[t+1]; i++)
     for(int j = i-1; j >=0; j--)
     {
        sum = i + j;
            if(sum%2 == 0)
                cnt[t]++;
     }
   cout<<"thread"<<t<<" finished work "<<cnt[t]<<endl;
   return NULL;
}
int main()
{
    pthread_t threads[MAX_THREAD];
    int arr[] = {0,1,2};
    long long total_count = 0;
    work_per_thread = n/MAX_THREAD;
   auto start = chrono::high_resolution_clock::now();
   for(int i = 0; i < MAX_THREAD; i++)
       pthread_create(&threads[i], NULL, count_array, &arr[i]);
   for(int i = 0; i < MAX_THREAD; i++)
       pthread_join(threads[i], NULL);
   for(int i = 0; i < MAX_THREAD; i++)
       total_count += cnt[i];
   cout << "count is " << total_count << endl;
   auto end = chrono::high_resolution_clock::now();
   double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
   time_taken *= 1e-9;
   cout << "Time taken by program is : " << fixed << time_taken << setprecision(9)<<" secs"<<endl;
   return 0;
}  
In the parallel code I am creating three threads and 1st thread will be doing its computation from 1 to 60000, 2nd thread from 60001 to 83000 and so on. I have chosen these numbers so that each thread gets to do approximately similar number of computations. The parallel execution takes 10.3 secs whereas serial one takes 7.7 secs. I have 6 cores and 2 threads per core. I also used htop command to check if the required number of threads are running or not and it seems to be working fine. I don't understand where the problem is.
 
     
    