Consider the following function, which duplicates every row of an image:
/**
 * Doubles an image vertically by writing every input row twice.
 *
 * Input row r is copied to output rows 2*r and 2*r + 1.
 *
 * @param image_in   source pixels, `height` rows of `width` bytes each
 * @param image_out  destination buffer; must hold 2 * height * width bytes
 *                   and must not overlap image_in (memcpy is used)
 * @param width      bytes per row
 * @param height     number of rows in image_in
 */
void DuplicateRows(char* image_in, char *image_out, int width, int height)
{
    // Nothing to copy; also keeps a negative width from turning into a huge
    // size_t when it is handed to memcpy.
    if (width <= 0 || height <= 0)
        return;

    // Do the offset arithmetic in size_t so large images cannot overflow int.
    const std::size_t row_bytes = static_cast<std::size_t>(width);
    for(int row = 0; row < height; ++row)
    {
        const char* src = image_in + static_cast<std::size_t>(row) * row_bytes;
        char* dst = image_out + 2 * static_cast<std::size_t>(row) * row_bytes;
        std::memcpy(dst, src, row_bytes);
        std::memcpy(dst + row_bytes, src, row_bytes);
    }
}
When I split the image into several slices and assign each slice to a separate thread (say, rows 0-539 to thread 1 and rows 540-1079 to thread 2), the running time gets worse as the number of threads grows. Is there an explanation for this? (I suspect the bottleneck is memory access, which is serialized.)
In more detail:
The test I ran was the following (it does not have the two memcpy calls, but that does not matter; the example above was only meant to show why this is useful):
#include <chrono>
#include <condition_variable>
#include <cstring>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
// Size of the full test image: number of rows, and bytes per row.
const int height = 1080;
const int width = 3840;
// Notified by execute() each time a worker finishes its copy.
std::condition_variable cv;
// Held while execute() increments `finished` and while check2threads() waits.
std::mutex mu;
// Count of workers that have completed; reset at the start of each check.
int finished = 0;
void execute(vector<unsigned char>&vec_in, vector<unsigned char>& vec_out, int factor)
{
    auto src_row_ptr = &vec_in[0];
    auto dst_row_ptr = &vec_out[0];
    for(int i = 0; i<height/factor; i++)
    {
        memcpy(dst_row_ptr, src_row_ptr, width);
        src_row_ptr+= width;
        dst_row_ptr+= width;
    }
    unique_lock<mutex> lock(mu);
    finished++;
    lock.unlock();
    cv.notify_one();
}   
/// Times a row-by-row copy of a height x width byte image on the calling
/// thread, prints the elapsed microseconds, then blocks until a character
/// can be read from stdin.
void check1thread()
{
    using namespace std::chrono;
    finished = 0;
    std::cout<<"Checking 1 thread ... \n";
    // Buffers are allocated and filled before the clock starts.
    std::vector<unsigned char> vec1(height * width, 1);
    std::vector<unsigned char> vec1_res(height * width ,0);
    const auto tm0 = high_resolution_clock::now();
    const unsigned char* src_row_ptr = vec1.data();
    unsigned char* dst_row_ptr = vec1_res.data();
    for(int i = 0; i < height; ++i)
    {
        std::memcpy(dst_row_ptr, src_row_ptr, width);
        src_row_ptr += width;
        dst_row_ptr += width;
    }
    const auto tm1 = high_resolution_clock::now();
    std::cout<<"work done\n";
    std::cout<<duration_cast<microseconds>(tm1-tm0).count() << " microseconds passed \n";
    std::cin.get();
}
void check2threads()
{
    using namespace std::chrono;
    finished =0;
    cout<<"Checking 2 thread ... \n";
    vector<unsigned char> vec1(height/2 * width, 1);
    vector<unsigned char> vec1_res(height/2 * width ,0);
    vector<unsigned char> vec2(height/2 * width, 1);
    vector<unsigned char> vec2_res(height/2 * width, 0);
    auto tm0 = high_resolution_clock::now();
    thread t1(execute, std::ref(vec1), std::ref(vec1_res) ,2 );
    thread t2(execute, std::ref(vec2), std::ref(vec2_res) ,2 );
    unique_lock<mutex> ul(mu);
    cv.wait(ul, [](){return finished == 2;} );
    auto tm1 = high_resolution_clock::now();
    cout<<"work done\n";
    cout<<duration_cast<microseconds>(tm1-tm0).count() << " microseconds passed \n";
    t1.join();
    t2.join();
}
/// Runs the single-threaded benchmark followed by the two-thread benchmark,
/// then blocks on one more read from stdin before exiting.
int main()
{
    check1thread();
    check2threads();
    std::cin.get();
}
