I tested parallel_for_  in OpenCV by comparing with the normal operation for just simple array summation and multiplication.
I have array of 100 integers and split into 10 each and run using parallel_for_. 
Then I also have normal 0 to 99 operation for summation and multiuplication.
Then I measured the elapsed time and normal operation is faster than parallel_for_  operation.
My CPU is Intel(R) Core(TM) i7-2600 Quard Core CPU.
parallel_for_ operation took 0.002sec (took 2 clock cycles) for summation and 0.003sec (took 3 clock cycles) for multiplication.
But normal operation took 0.0000sec (less than one click cycle) for both summation and multiplication. What am I missing? My code is as follow.
TEST Class
#include <opencv2\core\internal.hpp>
#include <opencv2\core\core.hpp>
#include <tbb\tbb.h>
using namespace tbb;
using namespace cv;
template <class type>
class Parallel_clipBufferValues:public cv::ParallelLoopBody
{
   private:
       type *buffertoClip;
       type maxSegment;
       char typeOperation;//m = mul, s = summation
       static double total;
   public:
       Parallel_clipBufferValues(){ParallelLoopBody::ParallelLoopBody();};
       Parallel_clipBufferValues(type *buffertoprocess, const type max, const char op): buffertoClip(buffertoprocess), maxSegment(max), typeOperation(op){ 
           if(typeOperation == 's')
                total = 0; 
           else if(typeOperation == 'm')
                total = 1; 
       }
       ~Parallel_clipBufferValues(){ParallelLoopBody::~ParallelLoopBody();};
       virtual void operator()(const cv::Range &r) const{
           double tot = 0;        
           type *inputOutputBufferPTR = buffertoClip+(r.start*maxSegment);
           for(int i = 0; i < 10; ++i)
           {
               if(typeOperation == 's')
                  total += *(inputOutputBufferPTR+i);
               else if(typeOperation == 'm')
                  total *= *(inputOutputBufferPTR+i);
           }
       }
       static double getTotal(){return total;}
       void normalOperation(){
           //int iteration = sizeof(buffertoClip)/sizeof(type);
           if(typeOperation == 'm')
           {
               for(int i = 0; i < 100; ++i)
               {
                  total *= buffertoClip[i];
               }
           }
           else if(typeOperation == 's')
           {
               for(int i = 0; i < 100; ++i)
               {
                  total += buffertoClip[i];
               }
           }
       }
};
MAIN
    #include "stdafx.h"
    #include "TestClass.h"
    #include <ctime>
    double Parallel_clipBufferValues<int>::total;
    int _tmain(int argc, _TCHAR* argv[])
    {
        const int SIZE=100;
        int myTab[SIZE];
        double totalSum_by_parallel;
        double totalSun_by_normaloperation;
        double elapsed_secs_parallel;
        double elapsed_secs_normal;
        for(int i = 1; i <= SIZE; i++)
        {
            myTab[i-1] = i;
        }
        int maxSeg =10;
        clock_t begin_parallel = clock();
        cv::parallel_for_(cv::Range(0,maxSeg), Parallel_clipBufferValues<int>(myTab, maxSeg, 'm'));
        totalSum_by_parallel = Parallel_clipBufferValues<int>::getTotal();
        clock_t end_parallel = clock();
        elapsed_secs_parallel = double(end_parallel - begin_parallel) / CLOCKS_PER_SEC;
        clock_t begin_normal = clock();
        Parallel_clipBufferValues<int> norm_op(myTab, maxSeg, 'm');
        norm_op.normalOperation();
        totalSun_by_normaloperation = norm_op.getTotal();
        clock_t end_normal = clock();
        elapsed_secs_normal = double(end_normal - begin_normal) / CLOCKS_PER_SEC;
        return 0;
    }
 
     
    