I am trying to read several text files from a folder and store the starting position of each word. I am using Boost's tokenizer to strip the punctuation from the text.
I run into a problem when a word contains special characters such as Õ, Ø, or æ: the program aborts with the assertion message "Expression: (unsigned)(c+1)<=256".
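I think I can reproduce it with just a few lines (a minimal sketch, assuming the files are single-byte Latin-1 text; the byte values \xE9 and \xD5 below stand in for é and Õ):

#include <string>
#include <boost/tokenizer.hpp>

int main() {
    // With a signed char, bytes above 127 such as '\xD5' (Õ) are negative.
    // As far as I can tell, the default char_delimiters_separator hands them
    // to isspace()/ispunct(), and the MSVC debug CRT then asserts
    // "(unsigned)(c+1)<=256".
    std::string text = "caf\xE9 \xD5rebro";
    boost::tokenizer<> tok(text);
    for (boost::tokenizer<>::iterator it = tok.begin(); it != tok.end(); ++it)
        ;   // iterating is enough to trigger the assertion
    return 0;
}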
Here is the full code for the application I mentioned:
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <cstring>
#include "/../dirent.h/dirent.h"
#include <boost/tokenizer.hpp>
using namespace std;
using namespace boost;
int main() {
    DIR*     dir;
    dirent*  pdir;
    dir = opendir("D:/../dataset/");
    if (dir == NULL)   // bail out if the directory cannot be opened
        return 1;
    int number_of_words = 0;
    char filename[300];
    while ((pdir = readdir(dir)) != NULL)
    {
        string fileString;
        cout<<"-------------------------------------------"<<endl;
        cout<<"Name of text file: "<<pdir->d_name << endl;
        strcpy(filename, "D:/.../dataset/");
        strcat(filename, pdir->d_name);
        ifstream file(filename);
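        // Counting the words with distance() exhausts this stream; a second
        // stream (files) is opened below to re-read the file.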
        std::istream_iterator<std::string> beg(file), end;
        number_of_words = distance(beg,end);
        //cout<<"Number of words in file: "<<number_of_words<<endl;
        ifstream files(filename);
         //char output[200];
         if (files.is_open())   // check the stream that is actually read
         {
             string output;
             // Test the extraction itself instead of eof(); looping on eof()
             // appends the last word a second time after extraction fails.
             while (files >> output)
             {
                    fileString += " ";
                    fileString += output;
                    //cout<<output<<endl;
             }
             //cout<<fileString<<endl;
             cout<<"Number of characters: "<<fileString.size()<<endl;
             cout<<"-------------------------------------------"<<endl;
            string fileStringTokenized;
            // The default template argument is char_delimiters_separator<char>,
            // which classifies each character to find the token boundaries.
            tokenizer<> tok(fileString);
            int index = 0;
            vector<int> myvector;
            for (tokenizer<>::iterator beg = tok.begin(); beg != tok.end(); ++beg)
            {
                string currentWord = *beg;
                // Remember the starting offset of each word within the
                // concatenation of the stripped tokens.
                myvector.push_back(index);
                index += currentWord.size();
                //cout<<index<<"\t";
                //cout<<*beg<<endl;
                fileStringTokenized += *beg;
            }
         }
         file.close();
    }
    closedir(dir);
    return 0;
}
Why does this problem appear and how can I solve it?
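What I have found so far: if I list the separator characters myself, char_separator apparently compares against that list instead of calling isspace()/ispunct(), so the raw bytes are never classified. A sketch of that workaround (untested against my real dataset; the delimiter list is just a guess at the punctuation in my files):

#include <iostream>
#include <string>
#include <boost/tokenizer.hpp>

int main() {
    std::string text = "caf\xE9, \xD5rebro; test";
    // char_separator with an explicit delimiter list only compares characters
    // against this set and never calls isspace()/ispunct() on them.
    boost::char_separator<char> sep(" \t\r\n.,;:!?\"'()");
    boost::tokenizer<boost::char_separator<char> > tok(text, sep);
    for (boost::tokenizer<boost::char_separator<char> >::iterator it = tok.begin();
         it != tok.end(); ++it)
        std::cout << *it << std::endl;
    return 0;
}

Is that the right approach, or should the fix be to cast each character to unsigned char before any is*() classification?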