I'm trying to write a mapper/reducer set of code for hadoop to count words in tweets, but I'm running into a bit of a problem. The file I input is a JSON file of collected tweet information. I start by setting my default encoding the utf-8, but when running my code I receive the following error:
Traceback (most recent call last): File "./mapperworks2.py", line 211, in my_json_dict = json.loads(line) File "/usr/lib/python2.6/json/init.py", line 307, in loads return _default_decoder.decode(s) File "/usr/lib/python2.6/json/decoder.py", line 319, in decode obj, end = self.raw_decode(s, idx=_w(s, 0).end()) File "/usr/lib/python2.6/json/decoder.py", line 338, in raw_decode raise ValueError("No JSON object could be decoded") ValueError: No JSON object could be decoded
Where the code for the program is
#!/usr/bin/python
import sys
import json
import string
reload(sys)
sys.setdefaultencoding('utf8')
stop_words = ['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 "can't",
 'cannot',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'yourselves']
numbers = ["0","1","2","3","4","5","6","7","8","9"]
def clean_word(word):
    for c in string.punctuation:
        word = word.replace(c,"")
    for c in numbers:
        word = word.replace(c,"")
    return word
def dont_stop(word):
    if word in stop_words or word == "":
        return False
    else:
        return True
# input comes from STDIN (standard input)
for line in sys.stdin:
############
############
############
############
    my_json_dict = json.loads(line)
    line = my_json_dict['text'].lower()
############
############
############
############
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        ##################
        ##################
        word = clean_word(word)
        ##################
        ##################
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        ##################
        ##################
        if dont_stop(word):
            print '%s\t%s' % (word, 1)
When I don't switch the encoding (that is, comment out the reload(sys) and sys.setdefaultencoding() I encounter the following error:
Traceback (most recent call last): File "./mapperworks2.py", line 236, in print '%s\t%s' % (word, 1) UnicodeEncodeError: 'ascii' codec can't encode character u'\u2026' in position >3: ordinal not in range(128)
Not sure how to fix this, any help is appreciated.
 
    