Converting xml to dictionary using ElementTree Converting xml to dictionary using ElementTree xml xml

Converting xml to dictionary using ElementTree


The following XML-to-Python-dict snippet parses entities as well as attributes following this XML-to-JSON "specification":

from collections import defaultdict


def etree_to_dict(t):
    """Convert an ElementTree element into a nested dict.

    The result is a one-item dict ``{t.tag: value}`` where ``value`` is:

    * ``None`` for an element with no children, no attributes and no text;
    * the stripped text (possibly empty) for an element that has text but
      neither children nor attributes;
    * otherwise a dict holding the converted children keyed by tag (values
      of a repeated tag are collected into a list), the attributes keyed as
      ``'@' + name`` and, when non-empty after stripping, the element text
      under ``'#text'``.

    Only ``t.text`` is read; tail text is not looked at.
    """
    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        # Group the converted children by tag so repeated tags end up as a
        # list while a tag that occurs once keeps its bare value.
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v[0] if len(v) == 1 else v
                     for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(('@' + k, v)
                        for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            # The value is already a dict here; whitespace-only text (for
            # example indentation between children) is dropped.
            if text:
                d[t.tag]['#text'] = text
        else:
            d[t.tag] = text
    return d

It is used:

# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module.
from xml.etree import ElementTree as ET
from pprint import pprint

# One <e> per case: empty, text only, attribute only, attribute + text,
# distinct children, repeated children, mixed text and child.
e = ET.XML('''<root>
  <e />
  <e>text</e>
  <e name="value" />
  <e name="value">text</e>
  <e> <a>text</a> <b>text</b> </e>
  <e> <a>text</a> <a>text</a> </e>
  <e> text <a>text</a> </e>
</root>''')

d = etree_to_dict(e)
pprint(d)

The output of this example (as per above-linked "specification") should be:

{'root': {'e': [None,
                'text',
                {'@name': 'value'},
                {'#text': 'text', '@name': 'value'},
                {'a': 'text', 'b': 'text'},
                {'a': ['text', 'text']},
                {'#text': 'text', 'a': 'text'}]}}

Not necessarily pretty, but it is unambiguous, and simpler XML inputs result in simpler JSON. :)


Update

If you want to do the reverse, emit an XML string from a JSON/dict, you can use:

try:
    basestring
except NameError:  # python3
    basestring = str


def dict_to_etree(d):
    """Build an ElementTree element from a single-key dict ``{tag: body}``.

    ``body`` is interpreted as follows:

    * a falsy value leaves the element empty;
    * a string becomes the element text;
    * a dict is walked key by key: ``'#text'`` sets the element text,
      ``'@name'`` sets attribute ``name``, and any other key creates child
      elements with that tag (one child per item when the value is a list,
      otherwise a single child), each filled recursively.

    Returns the root element. The structure is checked with ``assert``
    statements, so a malformed input raises ``AssertionError`` when
    assertions are enabled. Uses the module-level ``ET`` name.
    """
    def _to_etree(d, root):
        if not d:
            pass  # nothing to add; the element stays empty
        elif isinstance(d, basestring):
            root.text = d
        elif isinstance(d, dict):
            for k, v in d.items():
                assert isinstance(k, basestring)
                if k.startswith('#'):
                    assert k == '#text' and isinstance(v, basestring)
                    root.text = v
                elif k.startswith('@'):
                    assert isinstance(v, basestring)
                    root.set(k[1:], v)
                elif isinstance(v, list):
                    for e in v:
                        _to_etree(e, ET.SubElement(root, k))
                else:
                    _to_etree(v, ET.SubElement(root, k))
        else:
            # Reached only for a truthy value that is neither a string nor
            # a dict; the comparison is a way to fail with the offending
            # type and value in the assertion message.
            assert d == 'invalid type', (type(d), d)

    assert isinstance(d, dict) and len(d) == 1
    tag, body = next(iter(d.items()))
    node = ET.Element(tag)
    _to_etree(body, node)
    return node


print(ET.tostring(dict_to_etree(d)))


def etree_to_dict(t):
    """Convert an element into ``{tag: [children...], '@attr': ..., 'text': ...}``.

    The element's tag maps to a list with the converted child elements (an
    empty list when there are none), each attribute is stored under
    ``'@' + name`` and the element's ``text`` (which may be ``None``) is
    stored under the key ``'text'``.

    Because the text always goes under ``'text'``, an element whose own tag
    is ``text`` has its children entry overwritten by its text.
    """
    # A list comprehension over the element instead of map()/iterchildren():
    # on Python 3 map() would store a lazy iterator rather than a list, and
    # plain iteration also works on xml.etree elements.
    d = {t.tag: [etree_to_dict(child) for child in t]}
    # .items() instead of the Python 2 style .iteritems().
    d.update(('@' + k, v) for k, v in t.attrib.items())
    d['text'] = t.text
    return d

Call as

# NOTE(review): `etree` is presumably lxml.etree (the answer says it was
# tested on lxml) -- confirm the import used alongside this snippet.
tree = etree.parse("some_file.xml")
etree_to_dict(tree.getroot())

This works as long as you don't actually have an element whose tag is text (attributes are stored under '@' + name, so an attribute called text does not collide); if you do, then change the third line in the function body to use a different key. Also, you can't handle mixed content with this.

(Tested on LXML.)


Based on @larsmans' answer: if you don't need attributes, this will give you a tighter dictionary:

def etree_to_dict(t):
    """Convert an element into ``{tag: children-or-text}``, ignoring attributes.

    The value is the list of converted child elements when the element has
    children, otherwise the element's ``text`` (``None`` when it has none).
    """
    # Materialise the children first: on Python 3 a map object is always
    # truthy, so ``map(...) or t.text`` would never fall back to the text.
    children = [etree_to_dict(child) for child in t]
    return {t.tag: children or t.text}