1 # A sane XML-to-objects parser
# TODO: add error reporting and more robust handling of malformed XML
7 from xml
.dom
import minidom
11 def __init__(self
, name
, attrs
, data
='', parser
=None):
15 if type(self
.attrs
) == type(''):
16 self
.attrs
= splitattrs(self
.attrs
)
18 if a
.startswith('xmlns'):
20 parser
.namespaces
[nsname
] = self
.attrs
[a
]
21 self
.rawname
= self
.name
26 if nsname
in parser
.namespaces
:
27 self
.ns
= parser
.namespaces
[nsname
]
28 self
.name
= self
.rawname
[p
+1:]
31 #print self.rawname, '->', self.name, self.ns
34 # Emulate dictionary d
38 def __getitem__(self
, key
):
41 def __setitem__(self
, key
, value
):
44 def __delitem__(self
, key
):
48 return self
.d
.iterkeys()
50 def __contains__(self
, key
):
53 def prettyPrint (self
, indent
=0):
56 s
+= u
'<%s %s> %s ' % (self
.name
, self
.attrs
, self
.data
)
58 s
+= u
'<%s> %s ' % (self
.name
, self
.data
)
62 if type(self
.d
[k
]) == type(self
):
63 s
+= " " + self
.d
[k
].prettyPrint(indent
+ 1)
66 s
+= "-" + e
.prettyPrint(indent
+ 1)
70 """Returns unicode semi human-readable representation of the structure"""
72 s
= u
'<%s %s> %s ' % (self
.name
, self
.attrs
, self
.data
)
74 s
= u
'<%s> %s ' % (self
.name
, self
.data
)
77 if type(self
.d
[k
]) == type(self
):
78 s
+= u
'|%s: %s|' % (k
, str(self
.d
[k
]))
80 s
+= u
'|' + u
','.join([str(x
) for x
in self
.d
[k
]]) + u
'|'
def addChild(self, tag):
    """Adds a child to self. tag must be instance of Tag.

    Children live in ``self.d`` keyed by ``tag.name``. The first child with
    a given name is stored directly; as soon as a second sibling with the
    same name is added, the entry is promoted to a list that keeps all
    siblings in insertion order.
    """
    if tag.name not in self.d:
        # First child with this name: store it directly.
        self.d[tag.name] = tag
        return

    siblings = self.d[tag.name]
    if not isinstance(siblings, list):
        # If there are multiple sibling tags with the same name, form a list.
        # Checking for "not already a list" (instead of comparing the
        # stored child's type with type(self)) keeps this working when the
        # child is a subclass or otherwise of a different class than self.
        siblings = [siblings]
        self.d[tag.name] = siblings
    siblings.append(tag)
def toUnicode(self, fromencoding, recurse=True):
    """Converts data & attribute data to unicode from specified encoding.

    The conversion happens in place. Only byte strings (``str``) are
    decoded; values of any other type are left untouched.

    fromencoding -- name of the codec used to decode byte strings;
                    undecodable bytes are substituted ('replace').
    recurse      -- when true, child tags stored in ``self.d`` are
                    converted as well, including same-named siblings
                    that are stored as a list.
    """
    if isinstance(self.data, str):
        self.data = self.data.decode(fromencoding, 'replace')

    # Snapshot the items so values can be reassigned while looping.
    for attr_name, attr_value in list(self.attrs.items()):
        # The type test has to look at the value itself. Testing
        # type(value == type('')) is always true and would try to decode
        # values that are not byte strings.
        if isinstance(attr_value, str):
            self.attrs[attr_name] = attr_value.decode(fromencoding, 'replace')

    if not recurse:
        return

    for child in self.d.values():
        # A name maps either to one tag or to a list of sibling tags.
        siblings = child if isinstance(child, list) else [child]
        for tag in siblings:
            # self.d can hold arbitrary values set through the dictionary
            # interface; only descend into real tags.
            if isinstance(tag, self.__class__):
                tag.toUnicode(fromencoding, recurse)
109 class XMLDict_Parser
:
111 def __init__(self
, xml
):
114 self
.encoding
= sys
.getdefaultencoding()
117 def parseNode (self
, parent
, node
):
119 assert(type(parent
) == type(Tag("", "", "")))
122 for n
in node
.childNodes
:
123 if n
.nodeType
== n
.TEXT_NODE
:
124 nodeText
= nodeText
+ n
.nodeValue
.strip()
125 elif n
.nodeType
== n
.COMMENT_NODE
:
126 sys
.stderr
.write("Note: ignoring comment\n")
127 elif n
.nodeType
== n
.ELEMENT_NODE
:
129 # ignore tags that are not in DAV: namespace;
130 # but, as exception, allow tags that are in no namespace
131 if n
.namespaceURI
!= None and n
.namespaceURI
!= "DAV:":
132 sys
.stderr
.write("Note: ignoring non-DAV element (%s%s)\n" %
133 (n
.namespaceURI
, n
.localName
) )
136 if len(nodeText
) > 0:
137 sys
.stderr
.write("Warning: mixed content (tags/data) ?!\n")
138 newTag
= Tag(n
.localName
, "", parser
=self
)
139 parent
.addChild( newTag
)
142 for attr
in n
.attributes
.keys():
143 newTag
.attrs
[attr
] = n
.attributes
[attr
].value
146 self
.parseNode(newTag
, n
)
148 parent
.data
= nodeText
151 """Builds a nested-dictionary-like structure from the xml. This method
152 picks up tags on the main level and calls processTag() for nested tags."""
154 xmldoc
= minidom
.parseString(self
.xml
)
155 d
= Tag('<root>', '')
157 self
.parseNode(d
, xmldoc
)
163 """Extracts name="value" pairs from string; returns them as dictionary"""
165 for m
in re
.findall('([a-zA-Z_][a-zA-Z_:0-9]*?)="(.+?)"', att
):
171 """Wrapper function for straightforward parsing"""
172 p
= XMLDict_Parser(xml
)
179 class XMLTest(unittest
.TestCase
):
def testOneTagWithContent1(self):
    """one tag with content"""
    # Parse a single element that carries text and inspect it once.
    parsed = builddict("<tag1>text</tag1>")
    self.assertEqual(len(parsed), 1)

    tag = parsed["tag1"]
    self.assertEqual(tag.name, "tag1")
    self.assertEqual(tag.attrs, {})
    self.assertEqual(tag.data, "text")
def testOneEmptyTag1(self):
    """one tag without content (short notation)"""
    # A self-closing element must yield one tag with no attributes or data.
    parsed = builddict("<tag1/>")
    self.assertEqual(len(parsed), 1)

    tag = parsed["tag1"]
    self.assertEqual(tag.name, "tag1")
    self.assertEqual(tag.attrs, {})
    self.assertEqual(tag.data, "")
def testOneEmptyTag2(self):
    """one tag without content (short notation); with in-tag whitespace"""
    # Whitespace before the closing slash must not change the result.
    parsed = builddict("<tag1 />")
    self.assertEqual(len(parsed), 1)

    tag = parsed["tag1"]
    self.assertEqual(tag.name, "tag1")
    self.assertEqual(tag.attrs, {})
    self.assertEqual(tag.data, "")
def testTwoNestedTagsWithContent1(self):
    """two nested tags, with content"""
    tree = builddict("<group><user>joe</user><user>nick</user><user>john</user></group>")
    self.assertEqual(len(tree), 1)

    group = tree["group"]
    self.assertEqual(group.name, "group")
    self.assertEqual(group.attrs, {})
    self.assertEqual(group.data, "")

    # Same-named siblings are expected to be collected into a list.
    users = group["user"]
    self.assertEqual(type(users), type([]))
    self.assertEqual(len(users), 3)

    # Check every sibling in document order against its expected text.
    for position, expected_text in enumerate(("joe", "nick", "john")):
        user = users[position]
        self.assertEqual(user.name, "user")
        self.assertEqual(user.attrs, {})
        self.assertEqual(user.data, expected_text)
def testTwoNestedEmptyTags1(self):
    """two nested tags, short notation"""
    tree = builddict("<group><user/><user/><user/><user/></group>")
    self.assertEqual(len(tree), 1)

    group = tree["group"]
    self.assertEqual(group.name, "group")
    self.assertEqual(group.attrs, {})
    self.assertEqual(group.data, "")

    # Four identical empty siblings must end up in one list.
    users = group["user"]
    self.assertEqual(type(users), type([]))
    self.assertEqual(len(users), 4)

    for position in range(4):
        user = users[position]
        self.assertEqual(user.name, "user")
        self.assertEqual(user.attrs, {})
        self.assertEqual(user.data, "")

    # All siblings are indistinguishable, so they must render identically.
    for position in range(1, 4):
        self.assertEqual(str(users[0]), str(users[position]))
def testTwoNestedEmptyTags2(self):
    """two nested tags, short notation, different tag names"""
    tree = builddict("<users><joe/><nick/><john/></users>")
    self.assertEqual(len(tree), 1)

    users = tree["users"]
    self.assertEqual(users.name, "users")
    self.assertEqual(users.attrs, {})
    self.assertEqual(users.data, "")

    # Differently named children are reachable individually by name.
    self.assertEqual(len(users), 3)
    for child_name in ("joe", "nick", "john"):
        child = users[child_name]
        self.assertEqual(child.name, child_name)
        self.assertEqual(child.attrs, {})
        self.assertEqual(child.data, "")
def testThreeNestedTags1(self):
    """three nested tags, one with short notation, one with content"""
    tree = builddict("<tag1><tag2/><tag3>cont3</tag3></tag1>")
    self.assertEqual(len(tree), 1)

    outer = tree["tag1"]
    self.assertEqual(outer.name, "tag1")
    self.assertEqual(outer.attrs, {})
    self.assertEqual(outer.data, "")

    # One empty child and one child with text, checked in document order.
    self.assertEqual(len(outer), 2)
    for child_name, expected_text in (("tag2", ""), ("tag3", "cont3")):
        child = outer[child_name]
        self.assertEqual(child.name, child_name)
        self.assertEqual(child.attrs, {})
        self.assertEqual(child.data, expected_text)
def testThreeNestedTags2(self):
    """three nested tags, one with short notation and in-tag whitespace, one with content"""
    tree = builddict("<tag1><tag2 /><tag3>cont3</tag3></tag1>")
    self.assertEqual(len(tree), 1)

    outer = tree["tag1"]
    self.assertEqual(outer.name, "tag1")
    self.assertEqual(outer.attrs, {})
    self.assertEqual(outer.data, "")

    # The whitespace inside <tag2 /> must not affect the parsed children.
    self.assertEqual(len(outer), 2)
    for child_name, expected_text in (("tag2", ""), ("tag3", "cont3")):
        child = outer[child_name]
        self.assertEqual(child.name, child_name)
        self.assertEqual(child.attrs, {})
        self.assertEqual(child.data, expected_text)
def testOneTagWithAttr1(self):
    """tag with attribute and single quotes"""
    # Attribute value delimited by single quotes.
    parsed = builddict("<tag1 someattr='mycontent'>text</tag1>")
    self.assertEqual(len(parsed), 1)

    tag = parsed["tag1"]
    self.assertEqual(tag.name, "tag1")
    self.assertEqual(tag.attrs, {"someattr" : "mycontent"})
    self.assertEqual(tag.data, "text")
def testOneTagWithAttr2(self):
    """tag with attribute and double quotes"""
    # Attribute value delimited by double quotes.
    parsed = builddict('<tag1 someattr="mycontent">text</tag1>')
    self.assertEqual(len(parsed), 1)

    tag = parsed["tag1"]
    self.assertEqual(tag.name, "tag1")
    self.assertEqual(tag.attrs, {"someattr" : "mycontent"})
    self.assertEqual(tag.data, "text")
320 def testRealContent1(self
):
322 d
= builddict("""<propfind xmlns="DAV:"><prop>
323 <getlastmodified xmlns="DAV:" />
324 </prop></propfind>""")
326 self
.assertEqual(len(d
), 1)
327 self
.assertEqual(d
["propfind"].name
, "propfind")
328 self
.assertEqual(d
["propfind"].attrs
, {"xmlns" : "DAV:"})
329 self
.assertEqual(d
["propfind"].data
, "")
331 self
.assertEqual(len(d
["propfind"]), 1)
332 self
.assertEqual(d
["propfind"]["prop"].name
, "prop")
333 self
.assertEqual(d
["propfind"]["prop"].attrs
, {})
334 self
.assertEqual(d
["propfind"]["prop"].data
, "")
336 self
.assertEqual(len(d
["propfind"]["prop"]), 1)
337 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].name
, "getlastmodified")
338 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].attrs
, {"xmlns" : "DAV:"})
339 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].data
, "")
342 def testRealContent2(self
):
344 d
= builddict("""<?xml version="1.0" encoding="utf-8"?>
345 <propfind xmlns="DAV:"><prop>
346 <getlastmodified xmlns="DAV:"/>
347 <creationdate xmlns="DAV:"/>
348 <resourcetype xmlns="DAV:"/>
349 <getcontenttype xmlns="DAV:"/>
350 <getcontentlength xmlns="DAV:"/>
351 </prop></propfind>""")
353 self
.assertEqual(len(d
), 1)
354 self
.assertEqual(d
["propfind"].name
, "propfind")
355 self
.assertEqual(d
["propfind"].attrs
, {"xmlns" : "DAV:"})
356 self
.assertEqual(d
["propfind"].data
, "")
358 self
.assertEqual(len(d
["propfind"]), 1)
359 self
.assertEqual(d
["propfind"]["prop"].name
, "prop")
360 self
.assertEqual(d
["propfind"]["prop"].attrs
, {})
361 self
.assertEqual(d
["propfind"]["prop"].data
, "")
363 self
.assertEqual(len(d
["propfind"]["prop"]), 5)
364 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].name
, "getlastmodified")
365 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].attrs
, {"xmlns" : "DAV:"})
366 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].data
, "")
367 self
.assertEqual(d
["propfind"]["prop"]["creationdate"].name
, "creationdate")
368 self
.assertEqual(d
["propfind"]["prop"]["creationdate"].attrs
, {"xmlns" : "DAV:"})
369 self
.assertEqual(d
["propfind"]["prop"]["creationdate"].data
, "")
370 self
.assertEqual(d
["propfind"]["prop"]["resourcetype"].name
, "resourcetype")
371 self
.assertEqual(d
["propfind"]["prop"]["resourcetype"].attrs
, {"xmlns" : "DAV:"})
372 self
.assertEqual(d
["propfind"]["prop"]["resourcetype"].data
, "")
373 self
.assertEqual(d
["propfind"]["prop"]["getcontenttype"].name
, "getcontenttype")
374 self
.assertEqual(d
["propfind"]["prop"]["getcontenttype"].attrs
, {"xmlns" : "DAV:"})
375 self
.assertEqual(d
["propfind"]["prop"]["getcontenttype"].data
, "")
376 self
.assertEqual(d
["propfind"]["prop"]["getcontentlength"].name
, "getcontentlength")
377 self
.assertEqual(d
["propfind"]["prop"]["getcontentlength"].attrs
, {"xmlns" : "DAV:"})
378 self
.assertEqual(d
["propfind"]["prop"]["getcontentlength"].data
, "")
381 def testRealContent3(self
):
382 "short DAV XML with namespace"
383 d
= builddict("""<?xml version="1.0" encoding="utf-8" ?>
384 <D:propfind xmlns:D="DAV:"><D:prop>
386 </D:prop></D:propfind>""")
388 self
.assertEqual(len(d
), 1)
389 self
.assertEqual(d
["propfind"].name
, "propfind")
390 self
.assertEqual(d
["propfind"].attrs
, {"xmlns:D" : "DAV:"})
391 self
.assertEqual(d
["propfind"].data
, "")
393 self
.assertEqual(len(d
["propfind"]), 1)
394 self
.assertEqual(d
["propfind"]["prop"].name
, "prop")
395 self
.assertEqual(d
["propfind"]["prop"].attrs
, {})
396 self
.assertEqual(d
["propfind"]["prop"].data
, "")
398 self
.assertEqual(len(d
["propfind"]["prop"]), 1)
399 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].name
, "getlastmodified")
400 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].attrs
, {})
401 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].data
, "")
404 def testMixedNamespaces(self
):
405 "tags from two namespaces"
406 d
= builddict("""<MyD:propfind xmlns:MyD="DAV:" xmlns="http://example.com/blah/">
407 <someothertag>aklsj</someothertag>
409 <getlastmodified xmlns="DAV:" />
411 </MyD:prop></MyD:propfind>""")
413 self
.assertEqual(len(d
), 1)
414 self
.assertEqual(d
["propfind"].name
, "propfind")
415 self
.assertEqual(d
["propfind"].attrs
, {"xmlns" : "http://example.com/blah/", "xmlns:MyD" : "DAV:"})
416 self
.assertEqual(d
["propfind"].data
, "")
418 self
.assertEqual(len(d
["propfind"]), 1)
419 self
.assertEqual(d
["propfind"]["prop"].name
, "prop")
420 self
.assertEqual(d
["propfind"]["prop"].attrs
, {})
421 self
.assertEqual(d
["propfind"]["prop"].data
, "")
423 self
.assertEqual(len(d
["propfind"]["prop"]), 1)
424 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].name
, "getlastmodified")
425 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].attrs
, {"xmlns" : "DAV:"})
426 self
.assertEqual(d
["propfind"]["prop"]["getlastmodified"].data
, "")
428 if __name__
== '__main__': # functionality test
430 if len(sys
.argv
) > 1 and sys
.argv
[1] == "unittest":
431 #unittest.main() # strangely, this doesn't work
433 suite
= unittest
.TestLoader().loadTestsFromTestCase(XMLTest
)
434 unittest
.TextTestRunner(verbosity
=2).run(suite
)
437 p
= XMLDict_Parser('<tag1>text</tag1>')
440 print "Contents of tag1 is: '%s'" % d
['tag1'].data
441 p
= XMLDict_Parser('<group><user>joe</user><user>nick</user><user>john</user></group>')
445 for u
in d
['group']['user']:
449 p
= XMLDict_Parser('<group><user/><user/><user/></group>')
453 p
= XMLDict_Parser('<users><joe/><nick/><john/></users>')
456 if 'joe' in d
['users']:
457 print 'have no fear, joe is near.'
458 if 'george' in d
['users']:
459 print 'george is evil'