I am trying to implement multithreading to a CPU intensive simulation that normally takes around 10hrs to run. I wrote a dummy code which works fine for multithreading and uses the specified number of CPU threads. But when I implement the same thing in my main program, it uses only one thread. What could be the reason for this?
My code: photongen()
void photongen() {
    // Iterating over Muon list
    for (auto m = muonList.begin(); m != muonList.end(); ++m) {
        
        nP = 0; // Initialise the number of photons to 0
        vector<Photon> tempP;   // Temporary vector to store generated photons
        
        // Muon propagation
        bool stopLoop = false;
        while (!stopLoop) {
            
            // Photon generation
            if (nPhotons()) {
                for (int i = 0; i < nPhotons(); i++){
                    // Create photon instance
                    Photon *P = new Photon(m->x, m->y, m->z, m->run, m->event, i);
                    tempP.push_back(*P);
                    delete P;   // Free memory by photon object
                }
            }
            // Check if muon has left the detector
            if (m->x > S_LENGTH/2. - PATH_STEP*sin(m->theta)*cos(m->phi) || m->x < -S_LENGTH/2. - PATH_STEP*sin(m->theta)*cos(m->phi)) {
                stopLoop = true;
            } else if (m->y > S_WIDTH/2. - PATH_STEP*sin(m->theta)*sin(m->phi) || m->y < -S_WIDTH/2. - PATH_STEP*sin(m->theta)*sin(m->phi)) {
                stopLoop = true;
            } else if (m->z > S_HEIGHT - PATH_STEP*cos(m->theta)) {
                stopLoop = true;
            } else {
                // Update Muon path
                m->x += PATH_STEP*sin(m->theta)*cos(m->phi);
                m->y += PATH_STEP*sin(m->theta)*sin(m->phi);
                m->z += PATH_STEP*cos(m->theta);
            }
        }
        multiprocess(8, tempP, &propagation);
        tempP.clear();
    }
}
propagation()
void propagation(Photon *P) {
    if (photonProp(P)) {
        pLock.lock();
        photonList.push_back(*P);
        nP++;
        pLock.unlock();
    }
}
photonProp()
bool photonProp(Photon*& P) {
    float dist[3];
    for (int i = 0; i < MAX_REFL; i++) {
        
        // Precalculation for optimisation
        float theta = P->theta;
        float phi = P->phi;
        float sintheta = sin(theta);
        float costheta = cos(theta);
        float sinphi = sin(phi);
        float cosphi = cos(phi);
        float P_x = P->x;
        float P_y = P->y;
        float P_z = P->z;
        // Find plane of intersection of particle
        dist[0] = (sintheta*cosphi > 0) ? (S_LENGTH/2. - P_x)/(sintheta*cosphi) : (-S_LENGTH/2. - P_x)/(sintheta*cosphi);
        dist[1] = (sintheta*sinphi > 0) ? (S_WIDTH/2. - P_y)/(sintheta*sinphi) : (-S_WIDTH/2. - P_y)/(sintheta*sinphi);
        dist[2] = (costheta > 0) ? (S_HEIGHT - P_z)/costheta : -P_z/costheta;
        // Calculate the plane of intersection
        int min_index = distance(dist,min_element(dist, dist + 3));
        float d = dist[min_index]; // distance from original point to plane
        // Check for attenuation
        float lambda_sc = 100.;
        float x_surv = -lambda_sc*log(r->Uniform());
        if (d > x_surv) return false;
        
        // Update particle coords
        P->x = P_x + d*sintheta*cosphi;
        P->y = P_y + d*sintheta*sinphi;
        P->z = P_z + d*costheta;
        // check if particle hit photodetector
        if (min_index  == 0 && sintheta*cosphi > 0 && P->y <= -S_WIDTH/2. + 1) {
            return true;
        }
        // Check for reflective loss
        if (r->Uniform() < 0.05) return false;
        // reflect particle
        if (min_index  == 0) {
            P->phi = M_PI - phi;
        } else if (min_index == 1) {
            P->phi = -phi;
        } else if (min_index == 2) {
            P->theta = -theta;
        }
    }
    
    return false;
}
multiprocess()
template <typename T>
void multiprocess(int nThreads, vector<T> arg, void (*func)(T*)) {
    size_t i;
    for (i=0; i < arg.size() - nThreads; i += nThreads) {
        
        vector<thread> t(nThreads);
        for (int j = 0; j < nThreads; j++){
            t[j] = thread(func, &arg[i+j]);
        }
        for (int j = 0; j < nThreads; j++){
            t[j].join();
        }
    }
    vector<thread> t(arg.size()%nThreads);
    for (std::size_t j = 0; j < arg.size()%nThreads; j++){ 
        t[j] = thread(func, &arg[i+j]);        
    }
    for (std::size_t j = 0; j < arg.size()%nThreads; j++){
        t[j].join();
    } 
}
