I'm trying to parse some relative large xml-files using the standard sax parser in Python and I would prefer to avoid manually save/check on each element to a dictionary because I'm working with multiple xml-schemas and some are quite large.
Obviously the code example below doesn't work, but it's what I got so far. Other low-memory solutions is also welcome.
(Note: the complete xml files contains more than just two levels of nested structures)
from xml import sax
from cStringIO import StringIO
xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
  <n1:product>
    <n1:status>
      <n7:created>2005-02-08T18:18:30.53</n7:created>
      <n7:updated>2008-09-18T10:29:58.26</n7:updated>
    </n1:status>
    <n1:productid>28321503</n1:productid>
    <n1:producttext>
      <n7:text>Some product info</n7:text>
      <n7:type>Info</n7:type>
    </n1:producttext>
    <n1:terms>
      <n7:term>
        <n7:number>1</n7:number>
        <n7:name>Term1</n7:name>
      </n7:term>
      <n7:term>
        <n7:number>2</n7:number>
        <n7:name>Term2</n7:name>
      </n7:term>
    </n1:terms>   
  </n1:product>
</n1:products>
"""
class XML_Handler(sax.ContentHandler):    
    def __init__(self):
        self.data = {}
        self.vbuffer = ''
    def startElementNS(self, name, qname, attrs):
        (ns, localname) = name
        if localname == 'product':
            self.data = {}
            self.fetch = True
    def endElementNS(self, name, qname):
        (ns, localname) = name
        if localname == 'product':
            # Got my data, call some process function..
            print self.data
        elif self.fetch:
            if self.vbuffer != '':
                self.data[localname] = self.vbuffer
            else:
                pass
        self.vbuffer = ''
    def characters (self, ch):
        self.vbuffer += ch.rstrip()
if __name__ == '__main__':
    parser = sax.make_parser()
    parser.setContentHandler(XML_Handler())
    parser.setFeature(sax.handler.feature_namespaces, 1)
    inpsrc = sax.xmlreader.InputSource()
    inpsrc.setByteStream(StringIO(xml_string))
    parser.parse(inpsrc)
What I'm trying to achieve:
result = {
    'status' : {
        'created' : '2005-02-08T18:18:30.53',
        'updated' : '2008-09-18T10:29:58.26',
    },
    'productid' : '28321503',
    'producttext' : {
        'text' : 'Some product',
        'type' : 'Info',
    },
    'terms' : [{'number': '1', 'name': 'Term1'}, {'number': '2', 'name': 'Term2'}]
}
 
     
    