I try to count word in huge file. I want to use max of CPU resources and i try to split input data and count words in threads. But i have a problem, when i split data it can split the words and in the end i have wrong answer. How can i split data from file to avoid spliting words? Can somebody help me?
#include <iostream>
#include <fstream>
#include <set>
#include <string>
#include <thread>
#include <mutex>
#include <sstream>
#include <vector>
#include <algorithm>
#define BUFER_SIZE  1024
using namespace std;
std::mutex mtx;
void worker(int n, set<std::string> &mySet, std::string path)
{
    mtx.lock();
    ifstream file (path, ios::in);
    if (file.is_open())
    {
        char *memblock = new char [BUFER_SIZE];
        file.seekg (n * (BUFER_SIZE - 1), ios::beg);
        file.read(memblock, BUFER_SIZE - 1);
        std::string blockString(memblock);
        std::string buf;
        stringstream stream(blockString);
        while(stream >> buf) mySet.insert(buf);
        memblock[BUFER_SIZE] = '\0';
        file.close();
        delete[] memblock;
    }
    else 
        cout << "Unable to open file";
    mtx.unlock();
}
int main(int argc, char *argv[])
{
    set<std::string> uniqWords;
    int threadCount = 0;
    ifstream file(argv[1], ios::in);
    if(!file){
        std::cout << "Bad path.\n";
        return 1;
    }
    file.seekg(0, ios::end);
    int fileSize = file.tellg();
    file.close();
    std::cout << "Size of the file is" << " " << fileSize << " " << "bytes\n";
    threadCount = fileSize/BUFER_SIZE + 1;
    std::cout << "Thread count: " << threadCount << std::endl;
    std::vector<std::thread> vec;
    for(int i=0; i < threadCount; i++)
    {
        vec.push_back(std::thread(worker, i, std::ref(uniqWords), argv[1]));
    }
    std::for_each(vec.begin(), vec.end(), [](std::thread& th)
    {
        th.join();
    });
    std::cout << "Count: " << uniqWords.size() << std::endl;
    return 0;
}