1 # A sane XML-to-objects parser
2 # TODO: add error handling and handle malformed XML more gracefully
9 def __init__(self
, name
, attrs
, data
='', parser
=None):
13 if type(self
.attrs
) == type(''):
14 self
.attrs
= splitattrs(self
.attrs
)
16 if a
.startswith('xmlns'):
18 parser
.namespaces
[nsname
] = self
.attrs
[a
]
19 self
.rawname
= self
.name
24 if nsname
in parser
.namespaces
:
25 self
.ns
= parser
.namespaces
[nsname
]
26 self
.name
= self
.rawname
[p
+1:]
29 #print self.rawname, '->', self.name, self.ns
32 # Emulate dictionary d
36 def __getitem__(self
, key
):
39 def __setitem__(self
, key
, value
):
42 def __delitem__(self
, key
):
46 return self
.d
.iterkeys()
48 def __contains__(self
, key
):
51 def prettyPrint (self
, indent
=0):
54 s
+= u
'<%s %s> %s ' % (self
.name
, self
.attrs
, self
.data
)
56 s
+= u
'<%s> %s ' % (self
.name
, self
.data
)
61 if type(self
.d
[k
]) == type(self
):
63 s
+= self
.d
[k
].prettyPrint(indent
+ 1)
66 #s += u'|' + u','.join([x.prettyPrint(indent + 1) for x in self.d[k]]) + u'|\n'
71 """Returns unicode semi human-readable representation of the structure"""
73 s
= u
'<%s %s> %s ' % (self
.name
, self
.attrs
, self
.data
)
75 s
= u
'<%s> %s ' % (self
.name
, self
.data
)
78 if type(self
.d
[k
]) == type(self
):
79 s
+= u
'|%s: %s|' % (k
, str(self
.d
[k
]))
81 s
+= u
'|' + u
','.join([str(x
) for x
in self
.d
[k
]]) + u
'|'
85 def addChild(self
, tag
):
86 """Adds a child to self. tag must be instance of Tag"""
87 if tag
.name
in self
.d
:
88 if type(self
.d
[tag
.name
]) == type(self
): # If there are multiple sibling tags with the same name, form a list :)
89 self
.d
[tag
.name
] = [self
.d
[tag
.name
]]
90 self
.d
[tag
.name
].append(tag
)
92 self
.d
[tag
.name
] = tag
96 def toUnicode(self
, fromencoding
, recurse
=True):
97 """Converts data & attribute data to unicode from specified encoding"""
98 if type(self
.data
) == type(''):
99 self
.data
= self
.data
.decode(fromencoding
, 'replace')
101 if type(self
.attrs
[a
] == type('')):
102 self
.attrs
[a
] = self
.attrs
[a
].decode(fromencoding
, 'replace')
105 if type(self
.d
[k
]) == type(self
):
106 self
.d
[k
].toUnicode(fromencoding
, recurse
)
110 class XMLDict_Parser
:
112 def __init__(self
, xml
):
115 self
.encoding
= sys
.getdefaultencoding()
119 def getnexttag(self
):
120 ptag
= self
.xml
.find('<', self
.p
)
122 return None, None, self
.xml
[self
.p
:].strip()
124 data
= self
.xml
[self
.p
:ptag
].strip()
129 p2
= self
.xml
.find('>', self
.p
+1)
131 raise "Malformed XML - unclosed tag?"
133 tag
= self
.xml
[ptag
+1:p2
]
139 tag
, attrs
= tag
.split(' ', 1)
143 return tag
, attrs
, data
147 """Builds a nested-dictionary-like structure from the xml. This method
148 picks up tags on the main level and calls processTag() for nested tags."""
149 d
= Tag('<root>', '')
151 tag
, attrs
, data
= self
.getnexttag()
152 if data
!= '': # data is actually that between the last tag and this one
153 sys
.stderr
.write("Warning: inline data between tags?!\n")
156 if tag
[-1] == '/': # an 'empty' tag (e.g. <empty/>)
157 d
.addChild(Tag(tag
[:-1], attrs
, parser
=self
))
159 elif tag
[0] == '?': # special tag
160 t
= d
.addChild(Tag(tag
, attrs
, parser
=self
))
161 if tag
== '?xml' and 'encoding' in t
.attrs
:
162 self
.encoding
= t
.attrs
['encoding']
165 self
.processTag(d
.addChild(Tag(tag
, attrs
, parser
=self
)))
167 sys
.stderr
.write("Error processing tag %s\n" % tag
)
168 d
.encoding
= self
.encoding
172 def processTag(self
, dtag
):
173 """Process single tag's data"""
174 until
= '/'+dtag
.rawname
176 tag
, attrs
, data
= self
.getnexttag()
180 sys
.stderr
.write("Unterminated tag '"+dtag
.rawname
+"'?\n")
185 dtag
.addChild(Tag(tag
[:-1], attrs
, parser
=self
))
187 self
.processTag(dtag
.addChild(Tag(tag
, attrs
, parser
=self
)))
191 """Extracts name="value" pairs from string; returns them as dictionary"""
193 for m
in re
.findall('([a-zA-Z_][a-zA-Z_:0-9]*?)="(.+?)"', att
):
199 """Wrapper function for straightforward parsing"""
200 p
= XMLDict_Parser(xml
)
204 if __name__
== '__main__': # functionality test
205 p
= XMLDict_Parser('<tag1>text</tag1>')
208 print "Contents of tag1 is: '%s'" % d
['tag1'].data
209 p
= XMLDict_Parser('<group><user>joe</user><user>nick</user><user>john</user></group>')
213 for u
in d
['group']['user']:
217 p
= XMLDict_Parser('<group><user/><user/><user/></group>')
221 p
= XMLDict_Parser('<users><joe/><nick/><john/></users>')
224 if 'joe' in d
['users']:
225 print 'have no fear, joe is near.'
226 if 'george' in d
['users']:
227 print 'george is evil'