#!/usr/bin/env python3
import glob
import xml.etree.ElementTree as ET
filenames = glob.glob("C:\\Users\\####\\Desktop\\BNC2\\[A00-ZZZ]*.xml")
out_lines = []
for filename in filenames:
    with open(filename, 'r', encoding="utf-8") as content:
        tree = ET.parse(content)
        root = tree.getroot()
        for w in root.iter('w'):
            lemma = w.get('hw')
            pos = w.get('pos')
            tag = w.get('c5')
            out_lines.append(w.text + "," + lemma + "," + pos + "," + tag)
with open("C:\\Users\\####\\Desktop\\bnc.txt", "w") as out_file:
    for line in out_lines:
        line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
        out_file.write("{}\n".format(line))
Gives the error:
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 0: character maps to undefined
I thought this line would have solved that:
line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
 
     
     
    