xmldict.py

   1 # A sane XML-to-objects parser
   2 # TODO: error & better malformed xml handling
   3 # (c) 2005. Ivan Voras
   4 import sys
   5 import re
   6
   7 class Tag:
   8
   9     def __init__(self, name, attrs, data='', parser=None):
  10         self.d = {}
  11         self.name = name
  12         self.attrs = attrs
  13         if type(self.attrs) == type(''):
  14             self.attrs = splitattrs(self.attrs)
  15         for a in self.attrs:
  16             if a.startswith('xmlns'):
  17                 nsname = a[6:]
  18                 parser.namespaces[nsname] = self.attrs[a]
  19         self.rawname = self.name
  20
  21         p = name.find(':')
  22         if p > 0:
  23             nsname = name[0:p]
  24             if nsname in parser.namespaces:
  25                 self.ns = parser.namespaces[nsname]
  26                 self.name = self.rawname[p+1:]
  27         else:
  28             self.ns = ''
  29         #print self.rawname, '->', self.name, self.ns
  30         self.data = data
  31
  32     # Emulate dictionary d
  33     def __len__(self):
  34         return len(self.d)
  35
  36     def __getitem__(self, key):
  37         return self.d[key]
  38
  39     def __setitem__(self, key, value):
  40         self.d[key] = value
  41
  42     def __delitem__(self, key):
  43         del self.d[key]
  44
  45     def __iter__(self):
  46         return self.d.iterkeys()
  47
  48     def __contains__(self, key):
  49         return key in self.d
  50
  51     def prettyPrint (self, indent=0):
  52         s = "  " * indent
  53         if self.attrs:
  54             s += u'<%s %s> %s ' % (self.name, self.attrs, self.data)
  55         else:
  56             s += u'<%s> %s ' % (self.name, self.data)
  57
  58         s += "\n"
  59         for k in self.d:
  60             s += "  " * indent
  61             if type(self.d[k]) == type(self):
  62                 s += u'%s:\n' % k
  63                 s += self.d[k].prettyPrint(indent + 1)
  64             else:
  65                 raise "NIY"
  66                 #s += u'|' + u','.join([x.prettyPrint(indent + 1) for x in self.d[k]]) + u'|\n'
  67                 pass
  68         return s
  69
  70     def __str__(self):
  71         """Returns unicode semi human-readable representation of the structure"""
  72         if self.attrs:
  73             s = u'<%s %s> %s ' % (self.name, self.attrs, self.data)
  74         else:
  75             s = u'<%s> %s ' % (self.name, self.data)
  76
  77         for k in self.d:
  78             if type(self.d[k]) == type(self):
  79                 s += u'|%s: %s|' % (k, str(self.d[k]))
  80             else:
  81                 s += u'|' + u','.join([str(x) for x in self.d[k]]) + u'|'
  82         return s
  83
  84
  85     def addChild(self, tag):
  86         """Adds a child to self. tag must be instance of Tag"""
  87         if tag.name in self.d:
  88             if type(self.d[tag.name]) == type(self): # If there are multiple sibiling tags with same name, form a list :)
  89                 self.d[tag.name] = [self.d[tag.name]]
  90             self.d[tag.name].append(tag)
  91         else:
  92             self.d[tag.name] = tag
  93         return tag
  94
  95
  96     def toUnicode(self, fromencoding, recurse=True):
  97         """Converts data & attribute data to unicode from specified encoding"""
  98         if type(self.data) == type(''):
  99             self.data = self.data.decode(fromencoding, 'replace')
 100         for a in self.attrs:
 101             if type(self.attrs[a] == type('')):
 102                 self.attrs[a] = self.attrs[a].decode(fromencoding, 'replace')
 103         if recurse:
 104             for k in self.d:
 105                 if type(self.d[k]) == type(self):
 106                     self.d[k].toUnicode(fromencoding, recurse)
 107
 108
 109
 110 class XMLDict_Parser:
 111
 112     def __init__(self, xml):
 113         self.xml = xml
 114         self.p = 0
 115         self.encoding = sys.getdefaultencoding()
 116         self.namespaces = {}
 117
 118
 119     def getnexttag(self):
 120         ptag = self.xml.find('<', self.p)
 121         if ptag < 0:
 122             return None, None, self.xml[self.p:].strip()
 123
 124         data = self.xml[self.p:ptag].strip()
 125
 126         self.p = ptag
 127         self.tagbegin = ptag
 128
 129         p2 = self.xml.find('>', self.p+1)
 130         if p2 < 0:
 131             raise "Malformed XML - unclosed tag?"
 132
 133         tag = self.xml[ptag+1:p2]
 134         self.p = p2+1
 135         self.tagend = p2+1
 136
 137         ps = tag.find(' ')
 138         if ps > 0:
 139             tag, attrs = tag.split(' ', 1)
 140         else:
 141             attrs = ''
 142
 143         return tag, attrs, data
 144
 145
 146     def builddict(self):
 147         """Builds a nested-dictionary-like structure from the xml. This method
 148         picks up tags on the main level and calls processTag() for nested tags."""
 149         d = Tag('<root>', '')
 150         while True:
 151             tag, attrs, data = self.getnexttag()
 152             if data != '': # data is actually that between the last tag and this one
 153                 sys.stderr.write("Warning: inline data between tags?!\n")
 154             if not tag:
 155                 break
 156             if tag[-1] == '/': # an 'empty' tag (e.g. <empty/>)
 157                 d.addChild(Tag(tag[:-1], attrs, parser=self))
 158                 continue
 159             elif tag[0] == '?': # special tag
 160                 t = d.addChild(Tag(tag, attrs, parser=self))
 161                 if tag == '?xml' and 'encoding' in t.attrs:
 162                     self.encoding = t.attrs['encoding']
 163             else:
 164                 try:
 165                     self.processTag(d.addChild(Tag(tag, attrs, parser=self)))
 166                 except:
 167                     sys.stderr.write("Error processing tag %s\n" % tag)
 168         d.encoding = self.encoding
 169         return d
 170
 171
 172     def processTag(self, dtag):
 173         """Process single tag's data"""
 174         until = '/'+dtag.rawname
 175         while True:
 176             tag, attrs, data = self.getnexttag()
 177             if data:
 178                 dtag.data += data
 179             if tag == None:
 180                 sys.stderr.write("Unterminated tag '"+dtag.rawname+"'?\n")
 181                 break
 182             if tag == until:
 183                 break
 184             if tag[-1] == '/':
 185                 dtag.addChild(Tag(tag[:-1], attrs, parser=self))
 186                 continue
 187             self.processTag(dtag.addChild(Tag(tag, attrs, parser=self)))
 188
 189
 190 def splitattrs(att):
 191     """Extracts name="value" pairs from string; returns them as dictionary"""
 192     d = {}
 193     for m in re.findall('([a-zA-Z_][a-zA-Z_:0-9]*?)="(.+?)"', att):
 194         d[m[0]] = m[1]
 195     return d
 196
 197
 198 def builddict(xml):
 199     """Wrapper function for straightforward parsing"""
 200     p = XMLDict_Parser(xml)
 201     return p.builddict()
 202
 203
 204 if __name__ == '__main__': # functionality test
 205     p = XMLDict_Parser('<tag1>text</tag1>')
 206     d = p.builddict()
 207     print d
 208     print "Contents of tag1 is: '%s'" % d['tag1'].data
 209     p = XMLDict_Parser('<group><user>joe</user><user>nick</user><user>john</user></group>')
 210     d = p.builddict()
 211     print d
 212     print 'users are:'
 213     for u in d['group']['user']:
 214         print u
 215 #    print d['group']
 216 #    print d['group'].d
 217     p = XMLDict_Parser('<group><user/><user/><user/></group>')
 218     d = p.builddict()
 219     print d
 220 #    print d['group'].d
 221     p = XMLDict_Parser('<users><joe/><nick/><john/></users>')
 222     d = p.builddict()
 223     print d
 224     if 'joe' in d['users']:
 225         print 'have no fear, joe is near.'
 226     if 'george' in d['users']:
 227         print 'george is evil'
 228     print 'users are:'
 229     for u in d['users']:
 230         print u
 231