Lib/test/test_pyexpat.py

   1 # Very simple test - Parse a file and print what happens
   2
   3 # XXX TypeErrors on calling handlers, or on bad return values from a
   4 # handler, are obscure and unhelpful.
   5
   6 import pyexpat
   7 from xml.parsers import expat
   8
   9 from test.test_support import sortdict, TestFailed
  10
  11 class Outputter:
  12     def StartElementHandler(self, name, attrs):
  13         print 'Start element:\n\t', repr(name), sortdict(attrs)
  14
  15     def EndElementHandler(self, name):
  16         print 'End element:\n\t', repr(name)
  17
  18     def CharacterDataHandler(self, data):
  19         data = data.strip()
  20         if data:
  21             print 'Character data:'
  22             print '\t', repr(data)
  23
  24     def ProcessingInstructionHandler(self, target, data):
  25         print 'PI:\n\t', repr(target), repr(data)
  26
  27     def StartNamespaceDeclHandler(self, prefix, uri):
  28         print 'NS decl:\n\t', repr(prefix), repr(uri)
  29
  30     def EndNamespaceDeclHandler(self, prefix):
  31         print 'End of NS decl:\n\t', repr(prefix)
  32
  33     def StartCdataSectionHandler(self):
  34         print 'Start of CDATA section'
  35
  36     def EndCdataSectionHandler(self):
  37         print 'End of CDATA section'
  38
  39     def CommentHandler(self, text):
  40         print 'Comment:\n\t', repr(text)
  41
  42     def NotationDeclHandler(self, *args):
  43         name, base, sysid, pubid = args
  44         print 'Notation declared:', args
  45
  46     def UnparsedEntityDeclHandler(self, *args):
  47         entityName, base, systemId, publicId, notationName = args
  48         print 'Unparsed entity decl:\n\t', args
  49
  50     def NotStandaloneHandler(self, userData):
  51         print 'Not standalone'
  52         return 1
  53
  54     def ExternalEntityRefHandler(self, *args):
  55         context, base, sysId, pubId = args
  56         print 'External entity ref:', args[1:]
  57         return 1
  58
  59     def DefaultHandler(self, userData):
  60         pass
  61
  62     def DefaultHandlerExpand(self, userData):
  63         pass
  64
  65
  66 def confirm(ok):
  67     if ok:
  68         print "OK."
  69     else:
  70         print "Not OK."
  71
  72 out = Outputter()
  73 parser = expat.ParserCreate(namespace_separator='!')
  74
  75 # Test getting/setting returns_unicode
  76 parser.returns_unicode = 0; confirm(parser.returns_unicode == 0)
  77 parser.returns_unicode = 1; confirm(parser.returns_unicode == 1)
  78 parser.returns_unicode = 2; confirm(parser.returns_unicode == 1)
  79 parser.returns_unicode = 0; confirm(parser.returns_unicode == 0)
  80
  81 # Test getting/setting ordered_attributes
  82 parser.ordered_attributes = 0; confirm(parser.ordered_attributes == 0)
  83 parser.ordered_attributes = 1; confirm(parser.ordered_attributes == 1)
  84 parser.ordered_attributes = 2; confirm(parser.ordered_attributes == 1)
  85 parser.ordered_attributes = 0; confirm(parser.ordered_attributes == 0)
  86
  87 # Test getting/setting specified_attributes
  88 parser.specified_attributes = 0; confirm(parser.specified_attributes == 0)
  89 parser.specified_attributes = 1; confirm(parser.specified_attributes == 1)
  90 parser.specified_attributes = 2; confirm(parser.specified_attributes == 1)
  91 parser.specified_attributes = 0; confirm(parser.specified_attributes == 0)
  92
  93 HANDLER_NAMES = [
  94     'StartElementHandler', 'EndElementHandler',
  95     'CharacterDataHandler', 'ProcessingInstructionHandler',
  96     'UnparsedEntityDeclHandler', 'NotationDeclHandler',
  97     'StartNamespaceDeclHandler', 'EndNamespaceDeclHandler',
  98     'CommentHandler', 'StartCdataSectionHandler',
  99     'EndCdataSectionHandler',
 100     'DefaultHandler', 'DefaultHandlerExpand',
 101     #'NotStandaloneHandler',
 102     'ExternalEntityRefHandler'
 103     ]
 104 for name in HANDLER_NAMES:
 105     setattr(parser, name, getattr(out, name))
 106
 107 data = '''\
 108 <?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
 109 <?xml-stylesheet href="stylesheet.css"?>
 110 <!-- comment data -->
 111 <!DOCTYPE quotations SYSTEM "quotations.dtd" [
 112 <!ELEMENT root ANY>
 113 <!NOTATION notation SYSTEM "notation.jpeg">
 114 <!ENTITY acirc "&#226;">
 115 <!ENTITY external_entity SYSTEM "entity.file">
 116 <!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation>
 117 %unparsed_entity;
 118 ]>
 119
 120 <root attr1="value1" attr2="value2&#8000;">
 121 <myns:subelement xmlns:myns="http://www.python.org/namespace">
 122      Contents of subelements
 123 </myns:subelement>
 124 <sub2><![CDATA[contents of CDATA section]]></sub2>
 125 &external_entity;
 126 </root>
 127 '''
 128
 129 # Produce UTF-8 output
 130 parser.returns_unicode = 0
 131 try:
 132     parser.Parse(data, 1)
 133 except expat.error:
 134     print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode)
 135     print '** Line', parser.ErrorLineNumber
 136     print '** Column', parser.ErrorColumnNumber
 137     print '** Byte', parser.ErrorByteIndex
 138
 139 # Try the parse again, this time producing Unicode output
 140 parser = expat.ParserCreate(namespace_separator='!')
 141 parser.returns_unicode = 1
 142
 143 for name in HANDLER_NAMES:
 144     setattr(parser, name, getattr(out, name))
 145 try:
 146     parser.Parse(data, 1)
 147 except expat.error:
 148     print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode)
 149     print '** Line', parser.ErrorLineNumber
 150     print '** Column', parser.ErrorColumnNumber
 151     print '** Byte', parser.ErrorByteIndex
 152
 153 # Try parsing a file
 154 parser = expat.ParserCreate(namespace_separator='!')
 155 parser.returns_unicode = 1
 156
 157 for name in HANDLER_NAMES:
 158     setattr(parser, name, getattr(out, name))
 159 import StringIO
 160 file = StringIO.StringIO(data)
 161 try:
 162     parser.ParseFile(file)
 163 except expat.error:
 164     print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode)
 165     print '** Line', parser.ErrorLineNumber
 166     print '** Column', parser.ErrorColumnNumber
 167     print '** Byte', parser.ErrorByteIndex
 168
 169
# Tests that make sure we get errors when the namespace_separator value
# is illegal, and that we don't for good values:
print
print "Testing constructor for proper handling of namespace_separator values:"
# Legal cases: separator omitted, None, and a single character.  None of
# these calls is guarded, so an exception here aborts the script.
expat.ParserCreate()
expat.ParserCreate(namespace_separator=None)
expat.ParserCreate(namespace_separator=' ')
print "Legal values tested o.k."
# Illegal case 1: a non-string separator is expected to raise TypeError.
try:
    expat.ParserCreate(namespace_separator=42)
except TypeError, e:
    print "Caught expected TypeError:"
    print e
else:
    print "Failed to catch expected TypeError."

# Illegal case 2: a multi-character separator is expected to raise
# ValueError.
try:
    expat.ParserCreate(namespace_separator='too long')
except ValueError, e:
    print "Caught expected ValueError:"
    print e
else:
    print "Failed to catch expected ValueError."

# ParserCreate() needs to accept a namespace_separator of zero length
# to satisfy the requirements of RDF applications that are required
# to simply glue together the namespace URI and the localname.  Though
# considered a wart of the RDF specifications, it needs to be supported.
#
# See XML-SIG mailing list thread starting with
# http://mail.python.org/pipermail/xml-sig/2001-April/005202.html
#
expat.ParserCreate(namespace_separator='') # zero-length: must be accepted
 203
 204 # Test the interning machinery.
 205 p = expat.ParserCreate()
 206 L = []
 207 def collector(name, *args):
 208     L.append(name)
 209 p.StartElementHandler = collector
 210 p.EndElementHandler = collector
 211 p.Parse("<e> <e/> <e></e> </e>", 1)
 212 tag = L[0]
 213 if len(L) != 6:
 214     print "L should only contain 6 entries; found", len(L)
 215 for entry in L:
 216     if tag is not entry:
 217         print "expected L to contain many references to the same string",
 218         print "(it didn't)"
 219         print "L =", repr(L)
 220         break
 221
 222 # Tests of the buffer_text attribute.
 223 import sys
 224
 225 class TextCollector:
 226     def __init__(self, parser):
 227         self.stuff = []
 228
 229     def check(self, expected, label):
 230         require(self.stuff == expected,
 231                 "%s\nstuff    = %r\nexpected = %r"
 232                 % (label, self.stuff, map(unicode, expected)))
 233
 234     def CharacterDataHandler(self, text):
 235         self.stuff.append(text)
 236
 237     def StartElementHandler(self, name, attrs):
 238         self.stuff.append("<%s>" % name)
 239         bt = attrs.get("buffer-text")
 240         if bt == "yes":
 241             parser.buffer_text = 1
 242         elif bt == "no":
 243             parser.buffer_text = 0
 244
 245     def EndElementHandler(self, name):
 246         self.stuff.append("</%s>" % name)
 247
 248     def CommentHandler(self, data):
 249         self.stuff.append("<!--%s-->" % data)
 250
 251 def require(cond, label):
 252     # similar to confirm(), but no extraneous output
 253     if not cond:
 254         raise TestFailed(label)
 255
 256 def setup(handlers=[]):
 257     parser = expat.ParserCreate()
 258     require(not parser.buffer_text,
 259             "buffer_text not disabled by default")
 260     parser.buffer_text = 1
 261     handler = TextCollector(parser)
 262     parser.CharacterDataHandler = handler.CharacterDataHandler
 263     for name in handlers:
 264         setattr(parser, name, getattr(handler, name))
 265     return parser, handler
 266
 267 parser, handler = setup()
 268 require(parser.buffer_text,
 269         "text buffering either not acknowledged or not enabled")
 270 parser.Parse("<a>1<b/>2<c/>3</a>", 1)
 271 handler.check(["123"],
 272               "buffered text not properly collapsed")
 273
 274 # XXX This test exposes more detail of Expat's text chunking than we
 275 # XXX like, but it tests what we need to concisely.
 276 parser, handler = setup(["StartElementHandler"])
 277 parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1)
 278 handler.check(["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"],
 279               "buffering control not reacting as expected")
 280
 281 parser, handler = setup()
 282 parser.Parse("<a>1<b/>&lt;2&gt;<c/>&#32;\n&#x20;3</a>", 1)
 283 handler.check(["1<2> \n 3"],
 284               "buffered text not properly collapsed")
 285
 286 parser, handler = setup(["StartElementHandler"])
 287 parser.Parse("<a>1<b/>2<c/>3</a>", 1)
 288 handler.check(["<a>", "1", "<b>", "2", "<c>", "3"],
 289               "buffered text not properly split")
 290
 291 parser, handler = setup(["StartElementHandler", "EndElementHandler"])
 292 parser.CharacterDataHandler = None
 293 parser.Parse("<a>1<b/>2<c/>3</a>", 1)
 294 handler.check(["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"],
 295               "huh?")
 296
 297 parser, handler = setup(["StartElementHandler", "EndElementHandler"])
 298 parser.Parse("<a>1<b></b>2<c/>3</a>", 1)
 299 handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"],
 300               "huh?")
 301
 302 parser, handler = setup(["CommentHandler", "EndElementHandler",
 303                          "StartElementHandler"])
 304 parser.Parse("<a>1<b/>2<c></c>345</a> ", 1)
 305 handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"],
 306               "buffered text not properly split")
 307
 308 parser, handler = setup(["CommentHandler", "EndElementHandler",
 309                          "StartElementHandler"])
 310 parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1)
 311 handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3",
 312                "<!--abc-->", "4", "<!--def-->", "5", "</a>"],
 313               "buffered text not properly split")
 314
# Test handling of exception from callback:
def StartElementHandler(name, attrs):
    # Always raises, carrying the element name so the check below can
    # tell which element triggered the exception.
    raise RuntimeError(name)

parser = expat.ParserCreate()
parser.StartElementHandler = StartElementHandler

try:
    parser.Parse("<a><b><c/></b></a>", 1)
except RuntimeError, e:
    # <a> is the first start tag in the document, so its name is the
    # payload expected on the exception that reaches this level.
    if e.args[0] != "a":
        print "Expected RuntimeError for element 'a'; found %r" % e.args[0]
else:
    # Parse() returned without a RuntimeError reaching this level.
    print "Expected RuntimeError for 'a'"
 329
 330 # Test Current* members:
 331 class PositionTest:
 332
 333     def __init__(self, expected_list, parser):
 334         self.parser = parser
 335         self.parser.StartElementHandler = self.StartElementHandler
 336         self.parser.EndElementHandler = self.EndElementHandler
 337         self.expected_list = expected_list
 338         self.upto = 0
 339
 340     def StartElementHandler(self, name, attrs):
 341         self.check_pos('s')
 342
 343     def EndElementHandler(self, name):
 344         self.check_pos('e')
 345
 346     def check_pos(self, event):
 347         pos = (event,
 348                self.parser.CurrentByteIndex,
 349                self.parser.CurrentLineNumber,
 350                self.parser.CurrentColumnNumber)
 351         require(self.upto < len(self.expected_list),
 352                 'too many parser events')
 353         expected = self.expected_list[self.upto]
 354         require(pos == expected,
 355                 'expected position %s, got %s' % (expected, pos))
 356         self.upto += 1
 357
 358
 359 parser = expat.ParserCreate()
 360 handler = PositionTest([('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2),
 361                         ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)],
 362                        parser)
 363 parser.Parse('''<a>
 364  <b>
 365   <c/>
 366  </b>
 367 </a>''', 1)