I am trying to perform a parallel graph traversal. The entire code has only one shared variable (writable) - i would like to know why the memory consumption is too much for this. As per my understanding the memory should be
  size of local variables in bfs  * no of threads + ~some overheads
But i get the values much more than this.
Am i doing something wrong ?
Compiled using gcc/9.2.0
g++ demo.cpp -I . -o d -std=c++11 -O2 -fopenmp
#include <iostream>
#include <queue>
#include <vector>
#include <set>
#include <algorithm>
#include <climits>
#include <cmath>
#include <iomanip>
#include <map>
#include <map>
#include <tuple>
#include <sstream>
#include <unordered_set>
#include "omp.h"
using namespace std;
static map<int,std::vector<int>> g_adjLst;
static map<int,std::vector<int>> g_pairLst;
static int g_vertex;
void setSize(int nodesLen) ;
void addEdge(const int src, const int dst);
void print();
int findShortestPath(int src,int dst);
string bfs(int src, const std::vector<int>& , int thread) ;
int getSize() ;
int getPairSize() ;
void clearPair () ;
void storePairForPathFinding(int src, int dst) ;
string runBFS();
void setSize(int nodesLen) {
    g_vertex = nodesLen;
}
string bfs( int src, const std::vector<int>& dst ,  int perc) {
    std::stringstream msg;
    if (g_adjLst.find(src) == g_adjLst.end()) {
        return  "";
    }
    std::vector<int>  dist (g_vertex,INT_MAX);
    std::vector<bool> visited(g_vertex,false);
    std::queue <int> q;
    q.push(src);
    visited.at(src) = true;
    dist.at(src)= 0;
    while (!q.empty()) {
        int size = q.size();
        while (size--) {
            int curr = q.front();
            q.pop();
            if (g_adjLst.find(curr) == g_adjLst.end()) {
                return "";
            }
            for (vector<int> ::const_iterator it = g_adjLst[curr].begin();  it != g_adjLst[curr].end(); ++it) {
                if (visited.at(*it)) {continue;}
                if (dist.at(*it)> dist.at(curr) +1) {
                    dist.at(*it)= dist.at(curr) + 1;
                    q.push(*it);
                }
                visited.at(*it) = 1;
            }
        }
    }
    stringstream s;
    for (std::vector<int> ::const_iterator it =  dst.begin() ; it != dst.end(); ++it) {
        s << " {" << src << "," << *it << "} " << dist[*it] ;
    }
    return s.str();
}
//
void storePairForPathFinding (int src, int dst) {
    g_pairLst[src].push_back(dst);
}
//
//
string runBFS() {
    int  i,tid=0,nthreads;
    vector<int> ::iterator ip ;
    for (std::map<int,std::vector<int>> ::iterator it = g_pairLst.begin(); it != g_pairLst.end() ;++it) {
        ip = std::unique(it->second.begin() , it->second.end());
        it->second.resize(std::distance(it->second.begin(),ip));
    }
    std::string totalStr  = "" ,partialStr = "";
    int total = g_pairLst.size();
    #pragma omp parallel for private(tid,nthreads,partialStr) shared (totalStr)
    for (i = 0 ; i < g_pairLst.size(); i++) {
        auto it = g_pairLst.begin();
        advance(it,i);
        tid = omp_get_thread_num();
        partialStr = bfs(it->first,it->second,int(i*100/total));
        // Create thread safe region.
        #pragma omp critical
        {
                //add each threads partial sum to the total sum
                totalStr += partialStr;
        }
    }
    return totalStr; 
}
//
//
void addEdge(int src, int dst) {
    g_adjLst[src].push_back(dst);
}
int main () {
  setSize(60000);
for(int i = 0; i < 60000;i++) {
    for (int j=0; j< 60000; j++) {
        addEdge(i,j);
        if (i==j) {
            storePairForPathFinding(0,i);
        }
    }
}
    string s = runBFS();
    cout << "Ans1 = " << s << endl;
    string e = runBFS();
    cout << "Ans2 = " << e << endl;  
}
runBFS calls the pair of bfs needs to be done - foreach src and destinationLst (stored in g_pairLst) bfs is done separately/parallely.
Then all the paths are concatenated in a string from each thread (partialStr) and then finally concatenated at the end (totalStr) - like summing up array .

