I try to use NLTK pagkage in Greek text and I deal with a great problem with the encoding. My code is below
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, string, re, nltk
def find_bigrams(input_list):
   bigram_list = []
   for i in range(len(input_list)-1):
       bigram_list.append((input_list[i], input_list[i+1]))
       return bigram_list
def get_nice_string(list_or_iterator):
   return "[" + " , ".join( str(x) for x in list_or_iterator) + "]"
def stripText(rawText):
   text = rawText
    rules = [
    {r'{[^)]*\}' : ''},             # remove curly brackets
    {r'\([^)]*\)' : ''},            # remove parentheses
    {r'^https?:\/\/.*[\r\n]*' : ''},# remove urls
    {r' +' : ' '},                  # remove multiple whitespaces
    {r'^\s+': ''},                  # remove whitespaces beginning
    {r'\.\.+' : '.'}                # remove multiple fullstops
    ]
for rule in rules:
    for (k, v) in rule.items():
        regex = re.compile(k)
        text = regex.sub(v, text)
sentenceClean = text.translate(string.maketrans('', ''), '{}[]|?"=\'')
return sentenceClean
if __name__ == '__main__':
    f = open('C:\\Users\\Dimitris\\Desktop\\1.txt', 'r').readlines()
    newFile = open('C:\\Users\\Dimitris\\Desktop\\corpus.txt', 'w')
    newFile1 = open('C:\\Users\\Dimitris\\Desktop\\words.txt', 'w')
    words = ['jpg', 'jpeg', 'File', 'Image']
for line in f:
    sentences = stripText(line)
    whitespaces = sentences.count(' ')
    if any(word in sentences for word in words):
        continue
    elif whitespaces < 20:
        continue
    else:
        newFile.write(sentences+'\n')
        b = nltk.word_tokenize(sentences)
        print get_nice_string(b)
        get_nice_string(nltk.bigrams(b))
        print get_nice_string(nltk.bigrams(b))
        newFile1.write(get_nice_string(b))
newFile.close()
newFile1.close()
When I try to print the output from nltk.word_tokenize(sentences), the result is like something that (('\xe5\xe3\xea\xfe\xec\xe9\xe1', '\xe3\xe9')), but if I use the get_nice_string() function and I turn list into a string, then the result is normal greek text. So far so good.
But whether I use find_bigrams() function or nltk.bigrams() I get strings like the above (('\xe5\xe3\xea\xfe\xec\xe9\xe1', '\xe3\xe9')), even if I use get_nice_string() function, in order to turn list into string.
Also, I have tried to open file with the codecs.open() function, like this
f = codecs.open('C:\\Users\\Dimitris\\Desktop\\1.txt', 'r', 'utf-8').readlines()  
but the problem persists.
Any ideas?
 
     
    