ignore XML tags that are not in DAV: namespace
[pandav-og.git] / xmldict.py
blobdb1aabc3cd6ff7dda6ff743d54d697f865f72db8
1 # A sane XML-to-objects parser
2 # TODO: error & better malformed xml handling
3 # (c) 2005. Ivan Voras
4 import sys
5 import re
7 from xml.dom import minidom
class Tag:
    """One XML element: name, attributes, text data and child tags.

    Children are kept in the dictionary ``self.d`` keyed by child tag name.
    A name that occurs once maps to a single Tag; a name that occurs several
    times maps to a list of Tags (see addChild). The dictionary protocol
    methods below simply forward to ``self.d``.
    """

    def __init__(self, name, attrs, data='', parser=None):
        """Create a tag.

        name   -- tag name, optionally in "prefix:local" form
        attrs  -- dictionary of attributes, or a raw attribute string which
                  is split into a dictionary with splitattrs()
        data   -- text content of the tag
        parser -- optional object with a ``namespaces`` dictionary
                  (prefix -> URI); namespace declarations found in attrs are
                  recorded there and used to resolve the prefix of ``name``
        """
        self.d = {}
        self.name = name
        self.rawname = name
        self.data = data
        # Always define ns, so callers can read it even when the name has an
        # unknown prefix or no prefix at all.
        self.ns = ''

        if isinstance(attrs, str):
            attrs = splitattrs(attrs)
        self.attrs = attrs

        # Without a parser there is no shared namespace table; use a local
        # one so that declarations on this very tag can still be resolved.
        if parser is not None:
            namespaces = parser.namespaces
        else:
            namespaces = {}

        for a in self.attrs:
            # "xmlns" declares the default namespace (stored under ''),
            # "xmlns:prefix" declares a prefixed one.
            if a == 'xmlns' or a.startswith('xmlns:'):
                namespaces[a[6:]] = self.attrs[a]

        p = name.find(':')
        if p > 0:
            prefix = name[:p]
            if prefix in namespaces:
                self.ns = namespaces[prefix]
                self.name = name[p + 1:]

    # Emulate dictionary d
    def __len__(self):
        return len(self.d)

    def __getitem__(self, key):
        return self.d[key]

    def __setitem__(self, key, value):
        self.d[key] = value

    def __delitem__(self, key):
        del self.d[key]

    def __iter__(self):
        # iter() over a dict yields its keys on both Python 2 and Python 3.
        return iter(self.d)

    def __contains__(self, key):
        return key in self.d

    def _header(self):
        """Returns the '<name attrs> data ' prefix shared by the printers."""
        if self.attrs:
            return u'<%s %s> %s ' % (self.name, self.attrs, self.data)
        return u'<%s> %s ' % (self.name, self.data)

    def prettyPrint(self, indent=0):
        """Returns a multi-line dump of this tag and all of its children.

        Single children are prefixed with a space, members of a list of
        same-named siblings with a dash.
        """
        s = " " * indent + self._header() + "\n"
        for k in self.d:
            child = self.d[k]
            if isinstance(child, Tag):
                s += " " + child.prettyPrint(indent + 1)
            else:
                for e in child:
                    s += "-" + e.prettyPrint(indent + 1)
        return s

    def __str__(self):
        """Returns unicode semi human-readable representation of the structure"""
        s = self._header()
        for k in self.d:
            child = self.d[k]
            if isinstance(child, Tag):
                s += u'|%s: %s|' % (k, str(child))
            else:
                s += u'|' + u','.join([str(x) for x in child]) + u'|'
        return s

    def addChild(self, tag):
        """Adds a child to self. tag must be instance of Tag.

        Returns the tag that was added.
        """
        if tag.name not in self.d:
            self.d[tag.name] = tag
            return tag
        existing = self.d[tag.name]
        if isinstance(existing, Tag):
            # Second sibling with the same name: turn the entry into a list.
            existing = self.d[tag.name] = [existing]
        existing.append(tag)
        return tag

    def toUnicode(self, fromencoding, recurse=True):
        """Converts data & attribute data to unicode from specified encoding.

        Only byte strings are decoded (undecodable bytes are replaced);
        values that are already unicode are left untouched. With recurse
        set, children are converted too, including every member of a list
        of same-named siblings.
        """
        # On Python 2 'bytes' is the plain str type.
        if isinstance(self.data, bytes):
            self.data = self.data.decode(fromencoding, 'replace')
        for a in self.attrs:
            value = self.attrs[a]
            if isinstance(value, bytes):
                self.attrs[a] = value.decode(fromencoding, 'replace')
        if recurse:
            for k in self.d:
                child = self.d[k]
                if isinstance(child, Tag):
                    child.toUnicode(fromencoding, recurse)
                elif isinstance(child, list):
                    for e in child:
                        e.toUnicode(fromencoding, recurse)
class XMLDict_Parser:
    """Converts an XML string into a tree of Tag objects via minidom."""

    def __init__(self, xml):
        """Stores the XML text to parse; nothing is parsed until builddict()."""
        self.xml = xml
        self.p = 0
        self.encoding = sys.getdefaultencoding()
        # prefix -> namespace URI table, handed to every Tag that is created
        self.namespaces = {}

    def parseNode(self, parent, node):
        """Recursively copies the children of DOM node into the Tag parent.

        Text children are stripped and concatenated into parent.data.
        Comments are skipped with a note on stderr. Elements that belong to
        a namespace other than "DAV:" are skipped with a note on stderr;
        elements without any namespace are kept.
        """
        assert isinstance(parent, Tag)

        nodeText = ""
        for n in node.childNodes:
            if n.nodeType == n.TEXT_NODE:
                nodeText = nodeText + n.nodeValue.strip()
            elif n.nodeType == n.COMMENT_NODE:
                sys.stderr.write("Note: ignoring comment\n")
            elif n.nodeType == n.ELEMENT_NODE:

                # ignore tags that are not in DAV: namespace;
                # but, as exception, allow tags that are in no namespace
                if n.namespaceURI is not None and n.namespaceURI != "DAV:":
                    sys.stderr.write("Note: ignoring non-DAV element (%s%s)\n" %
                                     (n.namespaceURI, n.localName))
                    continue

                if len(nodeText) > 0:
                    sys.stderr.write("Warning: mixed content (tags/data) ?!\n")
                # Start from an empty attribute dictionary; the attributes
                # are copied from the DOM node just below.
                newTag = Tag(n.localName, {}, parser=self)
                parent.addChild(newTag)

                # add attributes
                for attr in n.attributes.keys():
                    newTag.attrs[attr] = n.attributes[attr].value

                # add child nodes
                self.parseNode(newTag, n)

        parent.data = nodeText

    def builddict(self):
        """Builds a nested-dictionary-like structure from the xml.

        Parses self.xml with minidom and returns a synthetic '<root>' Tag
        whose children are the top-level elements of the document.
        """
        xmldoc = minidom.parseString(self.xml)
        try:
            d = Tag('<root>', '')
            self.parseNode(d, xmldoc)
        finally:
            # Only plain strings were copied out of the DOM, so it can be
            # released right away.
            xmldoc.unlink()
        return d
def splitattrs(att):
    """Extracts name="value" pairs from string; returns them as dictionary.

    Values may be enclosed in double or single quotes and may be empty.
    Attribute names may contain letters, digits, '_', ':', '.' and '-'.
    Whitespace around the '=' sign is allowed. If a name occurs more than
    once, the last value wins.
    """
    pattern = r'''([a-zA-Z_][a-zA-Z_:0-9.\-]*)\s*=\s*(?:"([^"]*)"|'([^']*)')'''
    d = {}
    for m in re.finditer(pattern, att):
        name, double_quoted, single_quoted = m.groups()
        # Exactly one of the two value groups takes part in a match.
        if double_quoted is not None:
            d[name] = double_quoted
        else:
            d[name] = single_quoted
    return d
def builddict(xml):
    """Convenience wrapper: parses xml and returns the resulting root Tag"""
    return XMLDict_Parser(xml).builddict()
177 import unittest
class XMLTest(unittest.TestCase):
    """Unit tests for builddict(): plain tags, attributes and DAV namespaces."""

    def _assertTag(self, tag, name, attrs=None, data=""):
        """Checks name, attributes and data of one tag, in that order."""
        if attrs is None:
            attrs = {}
        self.assertEqual(tag.name, name)
        self.assertEqual(tag.attrs, attrs)
        self.assertEqual(tag.data, data)

    def testOneTagWithContent1(self):
        """one tag with content"""
        d = builddict("<tag1>text</tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", data="text")

    def testOneEmptyTag1(self):
        """one tag without content (short notation)"""
        d = builddict("<tag1/>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1")

    def testOneEmptyTag2(self):
        """one tag without content (short notation); with in-tag whitespace"""
        d = builddict("<tag1 />")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1")

    def testTwoNestedTagsWithContent1(self):
        """two nested tags, with content"""
        d = builddict("<group><user>joe</user><user>nick</user><user>john</user></group>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["group"], "group")

        users = d["group"]["user"]
        self.assertEqual(type(users), type([]))
        self.assertEqual(len(users), 3)
        for user, expected in zip(users, ("joe", "nick", "john")):
            self._assertTag(user, "user", data=expected)

    def testTwoNestedEmptyTags1(self):
        """two nested tags, short notation"""
        d = builddict("<group><user/><user/><user/><user/></group>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["group"], "group")

        users = d["group"]["user"]
        self.assertEqual(type(users), type([]))
        self.assertEqual(len(users), 4)
        for user in users:
            self._assertTag(user, "user")
        # all siblings must have the same string representation
        for other in users[1:]:
            self.assertEqual(str(users[0]), str(other))

    def testTwoNestedEmptyTags2(self):
        """two nested tags, short notation, different tag names"""
        d = builddict("<users><joe/><nick/><john/></users>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["users"], "users")

        self.assertEqual(len(d["users"]), 3)
        for who in ("joe", "nick", "john"):
            self._assertTag(d["users"][who], who)

    def testThreeNestedTags1(self):
        """three nested tags, one with short notation, one with content"""
        d = builddict("<tag1><tag2/><tag3>cont3</tag3></tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1")

        self.assertEqual(len(d["tag1"]), 2)
        self._assertTag(d["tag1"]["tag2"], "tag2")
        self._assertTag(d["tag1"]["tag3"], "tag3", data="cont3")

    def testThreeNestedTags2(self):
        """three nested tags, one with short notation and in-tag whitespace, one with content"""
        d = builddict("<tag1><tag2 /><tag3>cont3</tag3></tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1")

        self.assertEqual(len(d["tag1"]), 2)
        self._assertTag(d["tag1"]["tag2"], "tag2")
        self._assertTag(d["tag1"]["tag3"], "tag3", data="cont3")

    def testOneTagWithAttr1(self):
        """tag with attribute and single quotes"""
        d = builddict("<tag1 someattr='mycontent'>text</tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {"someattr" : "mycontent"}, "text")

    def testOneTagWithAttr2(self):
        """tag with attribute and double quotes"""
        d = builddict('<tag1 someattr="mycontent">text</tag1>')
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {"someattr" : "mycontent"}, "text")

    def testRealContent1(self):
        """short DAV XML"""
        d = builddict("""<propfind xmlns="DAV:"><prop>
<getlastmodified xmlns="DAV:" />
</prop></propfind>""")

        self.assertEqual(len(d), 1)
        self._assertTag(d["propfind"], "propfind", {"xmlns" : "DAV:"})

        self.assertEqual(len(d["propfind"]), 1)
        prop = d["propfind"]["prop"]
        self._assertTag(prop, "prop")

        self.assertEqual(len(prop), 1)
        self._assertTag(prop["getlastmodified"], "getlastmodified", {"xmlns" : "DAV:"})

    def testRealContent2(self):
        """longer DAV XML"""
        d = builddict("""<?xml version="1.0" encoding="utf-8"?>
<propfind xmlns="DAV:"><prop>
<getlastmodified xmlns="DAV:"/>
<creationdate xmlns="DAV:"/>
<resourcetype xmlns="DAV:"/>
<getcontenttype xmlns="DAV:"/>
<getcontentlength xmlns="DAV:"/>
</prop></propfind>""")

        self.assertEqual(len(d), 1)
        self._assertTag(d["propfind"], "propfind", {"xmlns" : "DAV:"})

        self.assertEqual(len(d["propfind"]), 1)
        prop = d["propfind"]["prop"]
        self._assertTag(prop, "prop")

        self.assertEqual(len(prop), 5)
        expected_names = ("getlastmodified", "creationdate", "resourcetype",
                          "getcontenttype", "getcontentlength")
        for propname in expected_names:
            self._assertTag(prop[propname], propname, {"xmlns" : "DAV:"})

    def testRealContent3(self):
        """short DAV XML with namespace"""
        d = builddict("""<?xml version="1.0" encoding="utf-8" ?>
<D:propfind xmlns:D="DAV:"><D:prop>
<D:getlastmodified/>
</D:prop></D:propfind>""")

        self.assertEqual(len(d), 1)
        self._assertTag(d["propfind"], "propfind", {"xmlns:D" : "DAV:"})

        self.assertEqual(len(d["propfind"]), 1)
        prop = d["propfind"]["prop"]
        self._assertTag(prop, "prop")

        self.assertEqual(len(prop), 1)
        self._assertTag(prop["getlastmodified"], "getlastmodified")

    def testMixedNamespaces(self):
        """tags from two namespaces"""
        d = builddict("""<MyD:propfind xmlns:MyD="DAV:" xmlns="http://example.com/blah/">
<someothertag>aklsj</someothertag>
<MyD:prop>
<getlastmodified xmlns="DAV:" />
<getcolor />
</MyD:prop></MyD:propfind>""")

        self.assertEqual(len(d), 1)
        self._assertTag(d["propfind"], "propfind",
                        {"xmlns" : "http://example.com/blah/", "xmlns:MyD" : "DAV:"})

        self.assertEqual(len(d["propfind"]), 1)
        prop = d["propfind"]["prop"]
        self._assertTag(prop, "prop")

        self.assertEqual(len(prop), 1)
        self._assertTag(prop["getlastmodified"], "getlastmodified", {"xmlns" : "DAV:"})
if __name__ == '__main__':  # functionality test

    if len(sys.argv) > 1 and sys.argv[1] == "unittest":
        # unittest.main() is not used here; presumably it fails because it
        # interprets the "unittest" command-line argument as the name of a
        # test to load. Build and run the suite explicitly instead.
        suite = unittest.TestLoader().loadTestsFromTestCase(XMLTest)
        result = unittest.TextTestRunner(verbosity=2).run(suite)
        # Report failures through the exit status so callers can detect them.
        if result.wasSuccessful():
            sys.exit(0)
        sys.exit(1)

    # Each print() below takes a single argument, so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).
    p = XMLDict_Parser('<tag1>text</tag1>')
    d = p.builddict()
    print(d)
    print("Contents of tag1 is: '%s'" % d['tag1'].data)

    p = XMLDict_Parser('<group><user>joe</user><user>nick</user><user>john</user></group>')
    d = p.builddict()
    print(d)
    print('users are:')
    for u in d['group']['user']:
        print(u)

    p = XMLDict_Parser('<group><user/><user/><user/></group>')
    d = p.builddict()
    print(d)

    p = XMLDict_Parser('<users><joe/><nick/><john/></users>')
    d = p.builddict()
    print(d)
    if 'joe' in d['users']:
        print('have no fear, joe is near.')
    if 'george' in d['users']:
        print('george is evil')
    print('users are:')
    # iterating a Tag yields the names of its children
    for u in d['users']:
        print(u)