I am doing an optimization task for memcpy function, I found this link here. How to increase performance of memcpy
Since I'm not familiar with multithread programming, I don't know how to insert the codes below to the original main function? How to modify the codes in the original question into a multithread memcpy project? I mean, how to create a complete project for this multithread memcpy project. Where are the places for inserting the functions like startCopyThreads or stopCopyThreads or mt_memcpy functions in the original main function?
#define NUM_CPY_THREADS 4
HANDLE hCopyThreads[NUM_CPY_THREADS] = {0};
HANDLE hCopyStartSemaphores[NUM_CPY_THREADS] = {0};
HANDLE hCopyStopSemaphores[NUM_CPY_THREADS] = {0};
typedef struct
{
    int ct;
    void * src, * dest;
    size_t size;
} mt_cpy_t;
mt_cpy_t mtParamters[NUM_CPY_THREADS] = {0};
DWORD WINAPI thread_copy_proc(LPVOID param)
{
    mt_cpy_t * p = (mt_cpy_t * ) param;
    while(1)
    {
        WaitForSingleObject(hCopyStartSemaphores[p->ct], INFINITE);
        memcpy(p->dest, p->src, p->size);
        ReleaseSemaphore(hCopyStopSemaphores[p->ct], 1, NULL);
    }
    return 0;
}
int startCopyThreads()
{
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        hCopyStartSemaphores[ctr] = CreateSemaphore(NULL, 0, 1, NULL);
        hCopyStopSemaphores[ctr] = CreateSemaphore(NULL, 0, 1, NULL);
        mtParamters[ctr].ct = ctr;
        hCopyThreads[ctr] = CreateThread(0, 0, thread_copy_proc, &mtParamters[ctr], 0,     NULL); 
}
    return 0;
}
void * mt_memcpy(void * dest, void * src, size_t bytes)
{
    //set up parameters
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        mtParamters[ctr].dest = (char *) dest + ctr * bytes / NUM_CPY_THREADS;
        mtParamters[ctr].src = (char *) src + ctr * bytes / NUM_CPY_THREADS;
        mtParamters[ctr].size = (ctr + 1) * bytes / NUM_CPY_THREADS - ctr * bytes /     NUM_CPY_THREADS;
    }
    //release semaphores to start computation
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
        ReleaseSemaphore(hCopyStartSemaphores[ctr], 1, NULL);
    //wait for all threads to finish
    WaitForMultipleObjects(NUM_CPY_THREADS, hCopyStopSemaphores, TRUE, INFINITE);
    return dest;
}
int stopCopyThreads()
{
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        TerminateThread(hCopyThreads[ctr], 0);
        CloseHandle(hCopyStartSemaphores[ctr]);
        CloseHandle(hCopyStopSemaphores[ctr]);
    }
    return 0;
}
 
     
    