1 """A parser for XML, using the derived class as static DTD."""
3 # Author: Sjoerd Mullender.
9 warnings
.warn("The xmllib module is obsolete. Use xml.sax instead.",
10 DeprecationWarning, 2)
15 class Error(RuntimeError):
# Regular expressions used for parsing

# Building blocks (plain strings) that the compiled patterns are assembled from.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]')  # illegal chars in content
# The three characters the main loop dispatches on: ']', '&' and '<'.
interesting = re.compile('[]&<]')

# Each reference pattern also consumes the single character that follows the
# reference; the callers then check that this character is ';'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

# One attribute: leading white space, a name, and an optional "= value" part
# where the value is either a quoted string or a run of unquoted characters.
# The last fragment is a raw literal so that "\(" and "\)" reach the regex
# engine unchanged without relying on Python's invalid-escape fallback.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + r'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
40 starttagmatch
= re
.compile('<(?P<tagname>'+_Name
+')'
41 '(?P<attrs>(?:'+attrfind
.pattern
+')*)'+
# Patterns for end tags, closing brackets, bare tag names and CDATA sections.
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# Everything up to and including the next '>' that is not inside a quoted
# string.
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
#   SYSTEM SystemLiteral
#   PUBLIC PubidLiteral SystemLiteral
# The two literal templates receive the name of their capture group through
# %-formatting, which is why the literal '%' in the character class is
# written as '%%'.
_SystemLiteral = '(?P<%s>' + _QStr + ')'
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC' + _S + (_PublicLiteral % 'pubid') + \
              ')' + _S + (_SystemLiteral % 'syslit')
# Start of a DOCTYPE declaration: the document type name and an optional
# external identifier, followed by optional white space.
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')'
                     '(?:' + _S + _ExternalId + ')?' + _opS)
60 xmldecl
= re
.compile('<\?xml'+_S
+
61 'version'+_opS
+'='+_opS
+'(?P<version>'+_QStr
+')'+
62 '(?:'+_S
+'encoding'+_opS
+'='+_opS
+
63 "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
64 '"[A-Za-z][-A-Za-z0-9._]*"))?'
65 '(?:'+_S
+'standalone'+_opS
+'='+_opS
+
66 '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
# Processing instructions and comments.
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table applied to attribute values: each of space, CR, LF and
# TAB maps to a plain space.  string.maketrans() requires both arguments to
# have the same length, so the target is four spaces, one per source
# character.
attrtrans = string.maketrans(' \r\n\t', '    ')
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
# A qualified name: an optional "prefix:" followed by the local part.  Both
# parts are NCNames, so at most one colon is accepted.
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?'  # optional prefix
                   '(?P<local>' + _NCName + ')$')

# An attribute name that declares a namespace: "xmlns" or "xmlns:<ncname>".
xmlns = re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).
92 attributes
= {} # default, to be overridden
93 elements
= {} # default, to be overridden
95 # parsing options, settable using keyword args in __init__
96 __accept_unquoted_attributes
= 0
97 __accept_missing_endtag_name
= 0
100 __translate_attribute_references
= 1
102 # Interface -- initialize and reset this instance
103 def __init__(self
, **kw
):
105 if 'accept_unquoted_attributes' in kw
:
106 self
.__accept
_unquoted
_attributes
= kw
['accept_unquoted_attributes']
107 if 'accept_missing_endtag_name' in kw
:
108 self
.__accept
_missing
_endtag
_name
= kw
['accept_missing_endtag_name']
110 self
.__map
_case
= kw
['map_case']
111 if 'accept_utf8' in kw
:
112 self
.__accept
_utf
8 = kw
['accept_utf8']
113 if 'translate_attribute_references' in kw
:
114 self
.__translate
_attribute
_references
= kw
['translate_attribute_references']
117 def __fixelements(self
):
120 self
.__fixdict
(self
.__dict
__)
121 self
.__fixclass
(self
.__class
__)
123 def __fixclass(self
, kl
):
124 self
.__fixdict
(kl
.__dict
__)
125 for k
in kl
.__bases
__:
128 def __fixdict(self
, dict):
129 for key
in dict.keys():
130 if key
[:6] == 'start_':
132 start
, end
= self
.elements
.get(tag
, (None, None))
134 self
.elements
[tag
] = getattr(self
, key
), end
135 elif key
[:4] == 'end_':
137 start
, end
= self
.elements
.get(tag
, (None, None))
139 self
.elements
[tag
] = start
, getattr(self
, key
)
141 # Interface -- reset this instance. Loses all unprocessed data
149 self
.__seen
_doctype
= None
150 self
.__seen
_starttag
= 0
151 self
.__use
_namespaces
= 0
152 self
.__namespaces
= {'xml':None} # xml is implicitly declared
153 # backward compatibility hack: if elements not overridden,
154 # fill it in ourselves
155 if self
.elements
is XMLParser
.elements
:
158 # For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self):
    """Enter literal (CDATA) mode until end of input.

    Sets both ``self.nomoretags`` and ``self.literal`` to 1.  Meant to be
    called by derived classes only.
    """
    self.nomoretags = self.literal = 1
162 # For derived classes only -- enter literal mode (CDATA)
163 def setliteral(self
, *args
):
166 # Interface -- feed some data to the parser. Call this as
167 # often as you want, with as little or as much text as you
168 # want (may include '\n'). (This just saves the text, all the
169 # processing is done by goahead().)
170 def feed(self
, data
):
171 self
.rawdata
= self
.rawdata
+ data
174 # Interface -- handle the remaining data
179 # remove self.elements so that we don't leak
182 # Interface -- translate references
183 def translate_references(self
, data
, all
= 1):
184 if not self
.__translate
_attribute
_references
:
188 res
= amp
.search(data
, i
)
192 res
= ref
.match(data
, s
)
194 self
.syntax_error("bogus `&'")
202 str = chr(int(str[2:], 16))
204 str = chr(int(str[1:]))
205 if data
[i
- 1] != ';':
206 self
.syntax_error("`;' missing after char reference")
209 if str in self
.entitydefs
:
210 str = self
.entitydefs
[str]
212 elif data
[i
- 1] != ';':
213 self
.syntax_error("bogus `&'")
214 i
= s
+ 1 # just past the &
217 self
.syntax_error("reference to unknown entity `&%s;'" % str)
218 str = '&' + str + ';'
219 elif data
[i
- 1] != ';':
220 self
.syntax_error("bogus `&'")
221 i
= s
+ 1 # just past the &
224 # when we get here, str contains the translated text and i points
225 # to the end of the string that is to be replaced
226 data
= data
[:s
] + str + data
[i
:]
232 # Interface - return a dictionary of all namespaces currently valid
233 def getnamespace(self
):
235 for t
, d
, nst
in self
.stack
:
239 # Internal -- handle data as far as reasonable. May leave state
240 # and data to be processed by a subsequent call. If 'end' is
241 # true, force handling all data as if followed by EOF marker.
242 def goahead(self
, end
):
243 rawdata
= self
.rawdata
251 self
.handle_data(data
)
252 self
.lineno
= self
.lineno
+ data
.count('\n')
255 res
= interesting
.search(rawdata
, i
)
262 if self
.__at
_start
and space
.match(data
) is None:
263 self
.syntax_error('illegal data at start of file')
265 if not self
.stack
and space
.match(data
) is None:
266 self
.syntax_error('data not in content')
267 if not self
.__accept
_utf
8 and illegal
.search(data
):
268 self
.syntax_error('illegal character in content')
269 self
.handle_data(data
)
270 self
.lineno
= self
.lineno
+ data
.count('\n')
273 if rawdata
[i
] == '<':
274 if starttagopen
.match(rawdata
, i
):
277 self
.handle_data(data
)
278 self
.lineno
= self
.lineno
+ data
.count('\n')
281 k
= self
.parse_starttag(i
)
283 self
.__seen
_starttag
= 1
284 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
287 if endtagopen
.match(rawdata
, i
):
288 k
= self
.parse_endtag(i
)
290 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
293 if commentopen
.match(rawdata
, i
):
296 self
.handle_data(data
)
297 self
.lineno
= self
.lineno
+ data
.count('\n')
300 k
= self
.parse_comment(i
)
302 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
305 if cdataopen
.match(rawdata
, i
):
306 k
= self
.parse_cdata(i
)
308 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
311 res
= xmldecl
.match(rawdata
, i
)
313 if not self
.__at
_start
:
314 self
.syntax_error("<?xml?> declaration not at start of document")
315 version
, encoding
, standalone
= res
.group('version',
318 if version
[1:-1] != '1.0':
319 raise Error('only XML version 1.0 supported')
320 if encoding
: encoding
= encoding
[1:-1]
321 if standalone
: standalone
= standalone
[1:-1]
322 self
.handle_xml(encoding
, standalone
)
325 res
= procopen
.match(rawdata
, i
)
327 k
= self
.parse_proc(i
)
329 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
332 res
= doctype
.match(rawdata
, i
)
336 self
.handle_data(data
)
337 self
.lineno
= self
.lineno
+ data
.count('\n')
340 if self
.__seen
_doctype
:
341 self
.syntax_error('multiple DOCTYPE elements')
342 if self
.__seen
_starttag
:
343 self
.syntax_error('DOCTYPE not at beginning of document')
344 k
= self
.parse_doctype(res
)
346 self
.__seen
_doctype
= res
.group('name')
348 self
.__seen
_doctype
= self
.__seen
_doctype
.lower()
349 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
352 elif rawdata
[i
] == '&':
355 self
.handle_data(data
)
358 res
= charref
.match(rawdata
, i
)
361 if rawdata
[i
-1] != ';':
362 self
.syntax_error("`;' missing in charref")
365 self
.syntax_error('data not in content')
366 self
.handle_charref(res
.group('char')[:-1])
367 self
.lineno
= self
.lineno
+ res
.group(0).count('\n')
369 res
= entityref
.match(rawdata
, i
)
372 if rawdata
[i
-1] != ';':
373 self
.syntax_error("`;' missing in entityref")
375 name
= res
.group('name')
378 if name
in self
.entitydefs
:
379 self
.rawdata
= rawdata
= rawdata
[:res
.start(0)] + self
.entitydefs
[name
] + rawdata
[i
:]
383 self
.unknown_entityref(name
)
384 self
.lineno
= self
.lineno
+ res
.group(0).count('\n')
386 elif rawdata
[i
] == ']':
389 self
.handle_data(data
)
394 if cdataclose
.match(rawdata
, i
):
395 self
.syntax_error("bogus `]]>'")
396 self
.handle_data(rawdata
[i
])
400 raise Error('neither < nor & ??')
401 # We get here only if incomplete matches but
409 self
.syntax_error("bogus `%s'" % data
)
410 if not self
.__accept
_utf
8 and illegal
.search(data
):
411 self
.syntax_error('illegal character in content')
412 self
.handle_data(data
)
413 self
.lineno
= self
.lineno
+ data
.count('\n')
414 self
.rawdata
= rawdata
[i
+1:]
415 return self
.goahead(end
)
416 self
.rawdata
= rawdata
[i
:]
418 if not self
.__seen
_starttag
:
419 self
.syntax_error('no elements in file')
421 self
.syntax_error('missing end tags')
423 self
.finish_endtag(self
.stack
[-1][0])
425 # Internal -- parse comment, return length or -1 if not terminated
426 def parse_comment(self
, i
):
427 rawdata
= self
.rawdata
428 if rawdata
[i
:i
+4] != '<!--':
429 raise Error('unexpected call to handle_comment')
430 res
= commentclose
.search(rawdata
, i
+4)
433 if doubledash
.search(rawdata
, i
+4, res
.start(0)):
434 self
.syntax_error("`--' inside comment")
435 if rawdata
[res
.start(0)-1] == '-':
436 self
.syntax_error('comment cannot end in three dashes')
437 if not self
.__accept
_utf
8 and \
438 illegal
.search(rawdata
, i
+4, res
.start(0)):
439 self
.syntax_error('illegal character in comment')
440 self
.handle_comment(rawdata
[i
+4: res
.start(0)])
443 # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
444 def parse_doctype(self
, res
):
445 rawdata
= self
.rawdata
447 name
= res
.group('name')
450 pubid
, syslit
= res
.group('pubid', 'syslit')
451 if pubid
is not None:
452 pubid
= pubid
[1:-1] # remove quotes
453 pubid
= ' '.join(pubid
.split()) # normalize
454 if syslit
is not None: syslit
= syslit
[1:-1] # remove quotes
458 if rawdata
[k
] == '[':
464 if not sq
and c
== '"':
466 elif not dq
and c
== "'":
470 elif level
<= 0 and c
== ']':
471 res
= endbracket
.match(rawdata
, k
+1)
474 self
.handle_doctype(name
, pubid
, syslit
, rawdata
[j
+1:k
])
481 self
.syntax_error("bogus `>' in DOCTYPE")
483 res
= endbracketfind
.match(rawdata
, k
)
486 if endbracket
.match(rawdata
, k
) is None:
487 self
.syntax_error('garbage in DOCTYPE')
488 self
.handle_doctype(name
, pubid
, syslit
, None)
491 # Internal -- handle CDATA tag, return length or -1 if not terminated
492 def parse_cdata(self
, i
):
493 rawdata
= self
.rawdata
494 if rawdata
[i
:i
+9] != '<![CDATA[':
495 raise Error('unexpected call to parse_cdata')
496 res
= cdataclose
.search(rawdata
, i
+9)
499 if not self
.__accept
_utf
8 and \
500 illegal
.search(rawdata
, i
+9, res
.start(0)):
501 self
.syntax_error('illegal character in CDATA')
503 self
.syntax_error('CDATA not in content')
504 self
.handle_cdata(rawdata
[i
+9:res
.start(0)])
507 __xml_namespace_attributes
= {'ns':None, 'src':None, 'prefix':None}
508 # Internal -- handle a processing instruction tag
509 def parse_proc(self
, i
):
510 rawdata
= self
.rawdata
511 end
= procclose
.search(rawdata
, i
)
515 if not self
.__accept
_utf
8 and illegal
.search(rawdata
, i
+2, j
):
516 self
.syntax_error('illegal character in processing instruction')
517 res
= tagfind
.match(rawdata
, i
+2)
519 raise Error('unexpected call to parse_proc')
524 if name
== 'xml:namespace':
525 self
.syntax_error('old-fashioned namespace declaration')
526 self
.__use
_namespaces
= -1
527 # namespace declaration
528 # this must come after the <?xml?> declaration (if any)
529 # and before the <!DOCTYPE> (if any).
530 if self
.__seen
_doctype
or self
.__seen
_starttag
:
531 self
.syntax_error('xml:namespace declaration too late in document')
532 attrdict
, namespace
, k
= self
.parse_attributes(name
, k
, j
)
534 self
.syntax_error('namespace declaration inside namespace declaration')
535 for attrname
in attrdict
.keys():
536 if not attrname
in self
.__xml
_namespace
_attributes
:
537 self
.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname
)
538 if not 'ns' in attrdict
or not 'prefix' in attrdict
:
539 self
.syntax_error('xml:namespace without required attributes')
540 prefix
= attrdict
.get('prefix')
541 if ncname
.match(prefix
) is None:
542 self
.syntax_error('xml:namespace illegal prefix value')
544 if prefix
in self
.__namespaces
:
545 self
.syntax_error('xml:namespace prefix not unique')
546 self
.__namespaces
[prefix
] = attrdict
['ns']
548 if name
.lower() == 'xml':
549 self
.syntax_error('illegal processing instruction target name')
550 self
.handle_proc(name
, rawdata
[k
:j
])
553 # Internal -- parse attributes between i and j
554 def parse_attributes(self
, tag
, i
, j
):
555 rawdata
= self
.rawdata
559 res
= attrfind
.match(rawdata
, i
)
562 attrname
, attrvalue
= res
.group('name', 'value')
564 attrname
= attrname
.lower()
566 if attrvalue
is None:
567 self
.syntax_error("no value specified for attribute `%s'" % attrname
)
569 elif attrvalue
[:1] == "'" == attrvalue
[-1:] or \
570 attrvalue
[:1] == '"' == attrvalue
[-1:]:
571 attrvalue
= attrvalue
[1:-1]
572 elif not self
.__accept
_unquoted
_attributes
:
573 self
.syntax_error("attribute `%s' value not quoted" % attrname
)
574 res
= xmlns
.match(attrname
)
576 # namespace declaration
577 ncname
= res
.group('ncname')
578 namespace
[ncname
or ''] = attrvalue
or None
579 if not self
.__use
_namespaces
:
580 self
.__use
_namespaces
= len(self
.stack
)+1
583 self
.syntax_error("`<' illegal in attribute value")
584 if attrname
in attrdict
:
585 self
.syntax_error("attribute `%s' specified twice" % attrname
)
586 attrvalue
= attrvalue
.translate(attrtrans
)
587 attrdict
[attrname
] = self
.translate_references(attrvalue
)
588 return attrdict
, namespace
, i
590 # Internal -- handle starttag, return length or -1 if not terminated
591 def parse_starttag(self
, i
):
592 rawdata
= self
.rawdata
593 # i points to start of tag
594 end
= endbracketfind
.match(rawdata
, i
+1)
597 tag
= starttagmatch
.match(rawdata
, i
)
598 if tag
is None or tag
.end(0) != end
.end(0):
599 self
.syntax_error('garbage in starttag')
601 nstag
= tagname
= tag
.group('tagname')
603 nstag
= tagname
= nstag
.lower()
604 if not self
.__seen
_starttag
and self
.__seen
_doctype
and \
605 tagname
!= self
.__seen
_doctype
:
606 self
.syntax_error('starttag does not match DOCTYPE')
607 if self
.__seen
_starttag
and not self
.stack
:
608 self
.syntax_error('multiple elements on top level')
609 k
, j
= tag
.span('attrs')
610 attrdict
, nsdict
, k
= self
.parse_attributes(tagname
, k
, j
)
611 self
.stack
.append((tagname
, nsdict
, nstag
))
612 if self
.__use
_namespaces
:
613 res
= qname
.match(tagname
)
617 prefix
, nstag
= res
.group('prefix', 'local')
621 for t
, d
, nst
in self
.stack
:
624 if ns
is None and prefix
!= '':
625 ns
= self
.__namespaces
.get(prefix
)
627 nstag
= ns
+ ' ' + nstag
629 nstag
= prefix
+ ':' + nstag
# undo split
630 self
.stack
[-1] = tagname
, nsdict
, nstag
631 # translate namespace of attributes
632 attrnamemap
= {} # map from new name to old name (used for error reporting)
633 for key
in attrdict
.keys():
634 attrnamemap
[key
] = key
635 if self
.__use
_namespaces
:
637 for key
, val
in attrdict
.items():
639 res
= qname
.match(key
)
641 aprefix
, key
= res
.group('prefix', 'local')
644 if aprefix
is not None:
646 for t
, d
, nst
in self
.stack
:
650 ans
= self
.__namespaces
.get(aprefix
)
652 key
= ans
+ ' ' + key
654 key
= aprefix
+ ':' + key
656 attrnamemap
[key
] = okey
658 attributes
= self
.attributes
.get(nstag
)
659 if attributes
is not None:
660 for key
in attrdict
.keys():
661 if not key
in attributes
:
662 self
.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap
[key
], tagname
))
663 for key
, val
in attributes
.items():
664 if val
is not None and not key
in attrdict
:
666 method
= self
.elements
.get(nstag
, (None, None))[0]
667 self
.finish_starttag(nstag
, attrdict
, method
)
668 if tag
.group('slash') == '/':
669 self
.finish_endtag(tagname
)
672 # Internal -- parse endtag
673 def parse_endtag(self
, i
):
674 rawdata
= self
.rawdata
675 end
= endbracketfind
.match(rawdata
, i
+1)
678 res
= tagfind
.match(rawdata
, i
+2)
681 self
.handle_data(rawdata
[i
])
683 if not self
.__accept
_missing
_endtag
_name
:
684 self
.syntax_error('no name specified in end tag')
685 tag
= self
.stack
[-1][0]
692 if not self
.stack
or tag
!= self
.stack
[-1][0]:
693 self
.handle_data(rawdata
[i
])
696 if endbracket
.match(rawdata
, k
) is None:
697 self
.syntax_error('garbage in end tag')
698 self
.finish_endtag(tag
)
701 # Internal -- finish processing of start tag
702 def finish_starttag(self
, tagname
, attrdict
, method
):
703 if method
is not None:
704 self
.handle_starttag(tagname
, method
, attrdict
)
706 self
.unknown_starttag(tagname
, attrdict
)
708 # Internal -- finish processing of end tag
709 def finish_endtag(self
, tag
):
712 self
.syntax_error('name-less end tag')
713 found
= len(self
.stack
) - 1
715 self
.unknown_endtag(tag
)
719 for i
in range(len(self
.stack
)):
720 if tag
== self
.stack
[i
][0]:
723 self
.syntax_error('unopened end tag')
725 while len(self
.stack
) > found
:
726 if found
< len(self
.stack
) - 1:
727 self
.syntax_error('missing close tag for %s' % self
.stack
[-1][2])
728 nstag
= self
.stack
[-1][2]
729 method
= self
.elements
.get(nstag
, (None, None))[1]
730 if method
is not None:
731 self
.handle_endtag(nstag
, method
)
733 self
.unknown_endtag(nstag
)
734 if self
.__use
_namespaces
== len(self
.stack
):
735 self
.__use
_namespaces
= 0
738 # Overridable -- handle xml processing instruction
739 def handle_xml(self
, encoding
, standalone
):
742 # Overridable -- handle DOCTYPE
743 def handle_doctype(self
, tag
, pubid
, syslit
, data
):
746 # Overridable -- handle start tag
747 def handle_starttag(self
, tag
, method
, attrs
):
750 # Overridable -- handle end tag
751 def handle_endtag(self
, tag
, method
):
754 # Example -- handle character reference, no need to override
755 def handle_charref(self
, name
):
758 n
= int(name
[1:], 16)
762 self
.unknown_charref(name
)
764 if not 0 <= n
<= 255:
765 self
.unknown_charref(name
)
767 self
.handle_data(chr(n
))
769 # Definition of entities -- derived classes may override
770 entitydefs
= {'lt': '<', # must use charref
772 'amp': '&', # must use charref
777 # Example -- handle data, should be overridden
778 def handle_data(self
, data
):
781 # Example -- handle cdata, could be overridden
782 def handle_cdata(self
, data
):
785 # Example -- handle comment, could be overridden
786 def handle_comment(self
, data
):
789 # Example -- handle processing instructions, could be overridden
790 def handle_proc(self
, name
, data
):
793 # Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
    """Report a syntax error found while parsing.

    This default implementation raises ``Error`` with a text that
    combines the current line number (``self.lineno``) and *message*.
    Derived classes may override it to handle errors differently.
    """
    raise Error('Syntax error at line %d: %s' % (self.lineno, message))
797 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Hook called from finish_starttag() with the tag name and its
    attribute dictionary; does nothing unless overridden."""
    pass
def unknown_endtag(self, tag):
    """Hook called from finish_endtag() with the tag name; does nothing
    unless overridden."""
    pass
def unknown_charref(self, ref):
    """Hook called from handle_charref() with the text of a character
    reference it did not translate; does nothing unless overridden."""
    pass
def unknown_entityref(self, name):
    """Hook for an entity reference the parser could not resolve.

    The default implementation reports it through self.syntax_error();
    derived classes may override this to handle such references.
    """
    self.syntax_error("reference to unknown entity `&%s;'" % name)
805 class TestXMLParser(XMLParser
):
807 def __init__(self
, **kw
):
809 XMLParser
.__init
__(self
, **kw
)
811 def handle_xml(self
, encoding
, standalone
):
813 print 'xml: encoding =',encoding
,'standalone =',standalone
815 def handle_doctype(self
, tag
, pubid
, syslit
, data
):
817 print 'DOCTYPE:',tag
, repr(data
)
819 def handle_data(self
, data
):
820 self
.testdata
= self
.testdata
+ data
821 if len(repr(self
.testdata
)) >= 70:
828 print 'data:', repr(data
)
830 def handle_cdata(self
, data
):
832 print 'cdata:', repr(data
)
834 def handle_proc(self
, name
, data
):
836 print 'processing:',name
,repr(data
)
838 def handle_comment(self
, data
):
842 r
= r
[:32] + '...' + r
[-32:]
845 def syntax_error(self
, message
):
846 print 'error at line %d:' % self
.lineno
, message
848 def unknown_starttag(self
, tag
, attrs
):
851 print 'start tag: <' + tag
+ '>'
853 print 'start tag: <' + tag
,
854 for name
, value
in attrs
.items():
855 print name
+ '=' + '"' + value
+ '"',
858 def unknown_endtag(self
, tag
):
860 print 'end tag: </' + tag
+ '>'
862 def unknown_entityref(self
, ref
):
864 print '*** unknown entity ref: &' + ref
+ ';'
866 def unknown_charref(self
, ref
):
868 print '*** unknown char ref: &#' + ref
+ ';'
871 XMLParser
.close(self
)
874 def test(args
= None):
876 from time
import time
881 opts
, args
= getopt
.getopt(args
, 'st')
882 klass
= TestXMLParser
905 if f
is not sys
.stdin
:
922 print 'total time: %g' % (t1
-t0
)
926 print 'total time: %g' % (t1
-t0
)
929 if __name__
== '__main__':