On my laptop with an Intel Pentium dual-core T2370 processor (Acer Extensa), running Linux, I ran a simple multithreading speedup test using pthreads. The code is pasted below. While I was expecting a speedup of 2-3 times, I was surprised to see a slowdown by a factor of 2. I tried gcc optimization levels -O0 through -O3, but every time I got the same result. I also tried the same with only two threads (instead of the three threads in the code), but the performance was similar.
What could be the reason? The faster version took reasonably long - about 20 seconds - so it does not seem to be an issue of startup overhead.
NOTE: This code is quite buggy (indeed, it does not make much sense, as the outputs of the serial and parallel versions would be different). The intention was just to get a speedup comparison for the same number of instructions.
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
/**
 * Minimal object-oriented wrapper around a POSIX thread.
 *
 * Derive from this class and override run(); start() launches run() on a new
 * thread and wait() blocks until that thread has finished.
 */
class Thread{
    private:
            pthread_t thread;
            /**
             * Trampoline handed to pthread_create: forwards to the object's
             * run() and returns NULL as the (unused) thread result.
             * The explicit return is required; flowing off the end of a
             * non-void function is undefined behaviour in C++.
             */
            static void *thread_func(void *d){
                    static_cast<Thread *>(d)->run();
                    return NULL;
            }
    public:
            Thread(){}
            virtual ~Thread(){}
            /** Work executed on the new thread; the default does nothing. */
            virtual void run(){}
            /** Creates the thread. Returns pthread_create's result (0 on success). */
            int start(){return pthread_create(&thread, NULL, Thread::thread_func, static_cast<void*>(this));}
            /** Joins the thread. Returns pthread_join's result (0 on success). */
            int wait(){return pthread_join(thread, NULL);}
};
#include <iostream>
// Number of int elements in the benchmark array.
const int ARR_SIZE = 100000000;
// Number of full passes each timed loop makes over its part of the array.
const int N = 20;
// Benchmark data shared by the threaded and the serial run. As a namespace-scope
// object it has static storage duration and therefore starts out zero-initialized.
int arr[ARR_SIZE];
int main(void)
{
    class Thread_a:public Thread{
            public:
                    Thread_a(int* a): arr_(a) {}
                    void run()
                    {
                            for(int n = 0; n<N; n++)
                            for(int i=0; i<ARR_SIZE/3; i++){ arr_[i] += arr_[i-1];}
                    }
            private:
                    int* arr_;
    };
    class Thread_b:public Thread{
            public:
                    Thread_b(int* a): arr_(a) {}
                    void run()
                    {
                            for(int n = 0; n<N; n++)
                            for(int i=ARR_SIZE/3; i<2*ARR_SIZE/3; i++){ arr_[i] += arr_[i-1];}
                    }
            private:
                    int* arr_;
    };
    class Thread_c:public Thread{
            public:
                    Thread_c(int* a): arr_(a) {}
                    void run()
                    {
                            for(int n = 0; n<N; n++)
                            for(int i=2*ARR_SIZE/3; i<ARR_SIZE; i++){ arr_[i] += arr_[i-1];}
                    }
            private:
                    int* arr_;
    };
    {
            Thread *a=new Thread_a(arr);
            Thread *b=new Thread_b(arr);
            Thread *c=new Thread_c(arr);
            clock_t start = clock();
            if (a->start() != 0) {
                    return 1;
            }
            if (b->start() != 0) {
                    return 1;
            }
            if (c->start() != 0) {
                    return 1;
            }
            if (a->wait() != 0) {
                    return 1;
            }
            if (b->wait() != 0) {
                    return 1;
            }
            if (c->wait() != 0) {
                    return 1;
            }
            clock_t end = clock();
            double duration = (double)(end - start) / CLOCKS_PER_SEC;
            std::cout << duration << "seconds\n";
            delete a;
            delete b;
    }
    {
            clock_t start = clock();
            for(int n = 0; n<N; n++)
            for(int i=0; i<ARR_SIZE; i++){ arr[i] += arr[i-1];}
            clock_t end = clock();
            double duration = (double)(end - start) / CLOCKS_PER_SEC;
            std::cout << "serial: " << duration << "seconds\n";
    }
    return 0;
  }
See also: What can make a program run slower when using more threads?