Use lxml. It can strip tags, elements, and more:
import urllib2
from lxml import etree
# Page whose visible text is extracted and tokenized below.
URL = 'http://stackoverflow.com/questions/9230675/python-html-processing'

# Download the raw bytes and remember the charset announced in the HTTP
# Content-Type header; always close the connection, even if read() fails.
response = urllib2.urlopen(URL)
try:
    html = response.read()
    # None when the server does not declare a charset.
    charset = response.info().getparam('charset')
finally:
    response.close()

# Tell the parser which encoding the bytes are in.  Without this, a UTF-8
# page that does not declare its encoding in the markup is mis-decoded and
# non-ASCII text (e.g. Cyrillic) comes out as tokens like u'\xd0\x9c\xd0\xb0'.
# With encoding=None the parser falls back to its own detection.
tree = etree.fromstring(html, parser=etree.HTMLParser(encoding=charset))

# Before stripping: the document still contains <script> and <style> nodes.
tree.xpath('//script')
# [<Element script at 102f831b0>,
#  ...
#  <Element script at 102f83ba8>]
tree.xpath('//style')
# [<Element style at 102f83c58>]

# Remove these elements together with everything inside them, so that
# JavaScript and CSS source do not end up among the text tokens.
tags_to_strip = ['script', 'style']
etree.strip_elements(tree, *tags_to_strip)

# After stripping: both queries come back empty.
tree.xpath('//style')
# []
tree.xpath('//script')
# []

# Take the first <body> element, join all of its text nodes and split the
# result on whitespace to get the tokens.
body = tree.xpath('//body')[0]
text = ' '.join(body.itertext())
tokens = text.split()
# [u'Stack',
#  u'Exchange',
#  u'log',
#  u'in',
#  ...
#  u'Stack',
#  u'Overflow',
#  u'works',
#  u'best',
#  u'with',
#  u'JavaScript',
#  u'enabled']
With Russian text you may get tokens that look like this (UTF-8 bytes mis-decoded as Latin-1, which happens when the parser is not told the page's encoding):
# [u'\xd1\x8d\xd1\x84\xd1\x84\xd0\xb5\xd0\xba\xd1\x82\xd1\x8b\xe2\x80\xa6',
#  u'\xd0\x9c\xd0\xb0\xd1\x80\xd0\xba',
#  ...
#  u'\xd0\x9c\xd0\xb0\xd0\xb9\xd0\xb5\xd1\x80']
Error handling is left as an exercise for the reader.