1 # -*- coding: iso-8859-1 -*-
2 """ A SAX2 driver for libxml2, on top of its XmlReader API
5 # put this file (drv_libxml2.py) in PYTHONPATH
7 reader = xml.sax.make_parser(["drv_libxml2"])
8 # ...and the rest is standard python sax.
11 - Lexical handlers are supported, except for start/endEntity
12 (waiting for XmlReader.ResolveEntity) and start/endDTD
13 - Error callbacks are not exactly synchronous, they tend
14 to be invoked before the corresponding content callback,
15 because the underlying reader interface parses
16 data by chunks of 512 bytes
20 - some ErrorHandler events (warning)
21 - some ContentHandler events (setDocumentLocator, skippedEntity)
22 - EntityResolver (using libxml2.?)
23 - DTDHandler (if/when libxml2 exposes such node types)
24 - DeclHandler (if/when libxml2 exposes such node types)
25 - property_xml_string?
26 - feature_string_interning?
28 - additional performance tuning:
29 - one might cache callbacks to avoid some name lookups
30 - one might implement a smarter way to pass attributes to startElement
31 (some kind of lazy evaluation?)
32 - there might be room for improvement in start/endPrefixMapping
37 __author__
= u
"Stéphane Bidoul <sbi@skynet.be>"
41 from types
import StringType
, UnicodeType
42 StringTypes
= (StringType
,UnicodeType
)
44 from xml
.sax
._exceptions
import *
45 from xml
.sax
import xmlreader
, saxutils
46 from xml
.sax
.handler
import \
48 feature_namespace_prefixes
, \
49 feature_string_interning
, \
51 feature_external_ges
, \
52 feature_external_pes
, \
53 property_lexical_handler
, \
54 property_declaration_handler
, \
58 # libxml2 returns strings as UTF8
59 _decoder
= codecs
.lookup("utf8")[1]
68 except ImportError, e
:
69 raise SAXReaderNotAvailable("libxml2 not available: " \
70 "import error was: %s" % e
)
72 class Locator(xmlreader
.Locator
):
73 """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
75 def __init__(self
,locator
):
76 self
.__locator
= locator
78 def getColumnNumber(self
):
79 "Return the column number where the current event ends."
def getLineNumber(self):
    """Report the line number at which the current event ends.

    The value is whatever the wrapped locator's LineNumber() returns.
    """
    wrapped = self.__locator
    return wrapped.LineNumber()
86 def getPublicId(self
):
87 "Return the public identifier for the current event."
def getSystemId(self):
    """Report the system identifier for the current event.

    The value is whatever the wrapped locator's BaseURI() returns.
    """
    wrapped = self.__locator
    return wrapped.BaseURI()
94 class LibXml2Reader(xmlreader
.XMLReader
):
97 xmlreader
.XMLReader
.__init
__(self
)
105 # additional handlers
106 self
.__lex
_handler
= None
107 self
.__decl
_handler
= None
108 # error messages accumulator
111 def _errorHandler(self
,arg
,msg
,severity
,locator
):
112 if self
.__errors
is None:
114 self
.__errors
.append((severity
,
115 SAXParseException(msg
,None,
118 def _reportErrors(self
,fatal
):
119 for severity
,exception
in self
.__errors
:
120 if severity
in (libxml2
.PARSER_SEVERITY_VALIDITY_WARNING
,
121 libxml2
.PARSER_SEVERITY_WARNING
):
122 self
._err
_handler
.warning(exception
)
124 # when fatal is set, the parse will stop;
125 # we consider that the last error reported
127 if fatal
and exception
is self
.__errors
[-1][1]:
128 self
._err
_handler
.fatalError(exception
)
130 self
._err
_handler
.error(exception
)
133 def parse(self
, source
):
136 # prepare source and create reader
137 if type(source
) in StringTypes
:
138 reader
= libxml2
.newTextReaderFilename(source
)
140 source
= saxutils
.prepare_input_source(source
)
141 input = libxml2
.inputBuffer(source
.getByteStream())
142 reader
= input.newTextReader(source
.getSystemId())
143 reader
.SetErrorHandler(self
._errorHandler
,None)
146 reader
.SetParserProp(libxml2
.PARSER_LOADDTD
,1)
147 reader
.SetParserProp(libxml2
.PARSER_DEFAULTATTRS
,1)
148 reader
.SetParserProp(libxml2
.PARSER_SUBST_ENTITIES
,1)
149 reader
.SetParserProp(libxml2
.PARSER_VALIDATE
,self
.__validate
)
151 reader
.SetParserProp(libxml2
.PARSER_LOADDTD
, 0)
152 # we reuse attribute maps (for a slight performance gain)
154 attributesNSImpl
= xmlreader
.AttributesNSImpl({},{})
156 attributesImpl
= xmlreader
.AttributesImpl({})
157 # prefixes to pop (for endPrefixMapping)
160 self
._cont
_handler
.startDocument()
165 if not self
.__errors
is None:
166 self
._reportErrors
(0)
168 if not self
.__errors
is None:
169 self
._reportErrors
(0)
172 if not self
.__errors
is None:
173 self
._reportErrors
(1)
175 self
._err
_handler
.fatalError(\
176 SAXException("Read failed (no details available)"))
177 break # fatal parse error
179 nodeType
= reader
.NodeType()
183 eltName
= (_d(reader
.NamespaceUri()),\
184 _d(reader
.LocalName()))
185 eltQName
= _d(reader
.Name())
186 attributesNSImpl
._attrs
= attrs
= {}
187 attributesNSImpl
._qnames
= qnames
= {}
189 while reader
.MoveToNextAttribute():
190 qname
= _d(reader
.Name())
191 value
= _d(reader
.Value())
192 if qname
.startswith("xmlns"):
194 newPrefix
= qname
[6:]
197 newPrefixes
.append(newPrefix
)
198 self
._cont
_handler
.startPrefixMapping(\
201 continue # don't report xmlns attribute
202 attName
= (_d(reader
.NamespaceUri()),
203 _d(reader
.LocalName()))
204 qnames
[attName
] = qname
205 attrs
[attName
] = value
206 reader
.MoveToElement()
207 self
._cont
_handler
.startElementNS( \
208 eltName
,eltQName
,attributesNSImpl
)
209 if reader
.IsEmptyElement():
210 self
._cont
_handler
.endElementNS(eltName
,eltQName
)
211 for newPrefix
in newPrefixes
:
212 self
._cont
_handler
.endPrefixMapping(newPrefix
)
214 prefixes
.append(newPrefixes
)
216 eltName
= _d(reader
.Name())
217 attributesImpl
._attrs
= attrs
= {}
218 while reader
.MoveToNextAttribute():
219 attName
= _d(reader
.Name())
220 attrs
[attName
] = _d(reader
.Value())
221 reader
.MoveToElement()
222 self
._cont
_handler
.startElement( \
223 eltName
,attributesImpl
)
224 if reader
.IsEmptyElement():
225 self
._cont
_handler
.endElement(eltName
)
229 self
._cont
_handler
.endElementNS( \
230 (_d(reader
.NamespaceUri()),_d(reader
.LocalName())),
232 for prefix
in prefixes
.pop():
233 self
._cont
_handler
.endPrefixMapping(prefix
)
235 self
._cont
_handler
.endElement(_d(reader
.Name()))
238 self
._cont
_handler
.characters(_d(reader
.Value()))
241 self
._cont
_handler
.ignorableWhitespace(_d(reader
.Value()))
242 # SignificantWhitespace
244 self
._cont
_handler
.characters(_d(reader
.Value()))
247 if not self
.__lex
_handler
is None:
248 self
.__lex
_handler
.startCDATA()
249 self
._cont
_handler
.characters(_d(reader
.Value()))
250 if not self
.__lex
_handler
is None:
251 self
.__lex
_handler
.endCDATA()
254 if not self
.__lex
_handler
is None:
255 self
.startEntity(_d(reader
.Name()))
256 reader
.ResolveEntity()
259 if not self
.__lex
_handler
is None:
260 self
.endEntity(_d(reader
.Name()))
261 # ProcessingInstruction
263 self
._cont
_handler
.processingInstruction( \
264 _d(reader
.Name()),_d(reader
.Value()))
267 if not self
.__lex
_handler
is None:
268 self
.__lex
_handler
.comment(_d(reader
.Value()))
271 #if not self.__lex_handler is None:
272 # self.__lex_handler.startDTD()
273 pass # TODO (how to detect endDTD? on first non-dtd event?)
279 pass # TODO (entity decl)
283 # Attribute (never in this loop)
286 # Document (not exposed)
289 # DocumentFragment (never returned by XmlReader)
290 #elif nodeType == 11:
297 raise SAXException("Unexpected node type %d" % nodeType
)
299 self
._cont
_handler
.endDocument()
def setDTDHandler(self, handler):
    """Refuse to register a DTD handler.

    handler -- ignored.

    Always raises SAXNotSupportedException.
    """
    # TODO (when supported, the inherited method works just fine)
    unsupported = SAXNotSupportedException("DTDHandler not supported")
    raise unsupported
def setEntityResolver(self, resolver):
    """Refuse to register an entity resolver.

    resolver -- ignored.

    Always raises SAXNotSupportedException.
    """
    # TODO (when supported, the inherited method works just fine)
    unsupported = SAXNotSupportedException("EntityResolver not supported")
    raise unsupported
312 def getFeature(self
, name
):
313 if name
== feature_namespaces
:
315 elif name
== feature_namespace_prefixes
:
317 elif name
== feature_validation
:
318 return self
.__validate
319 elif name
== feature_external_ges
:
320 return 1 # TODO (does that relate to PARSER_LOADDTD)?
321 elif name
== feature_external_pes
:
322 return self
.__extparams
324 raise SAXNotRecognizedException("Feature '%s' not recognized" % \
327 def setFeature(self
, name
, state
):
329 raise SAXNotSupportedException("Cannot set feature %s " \
330 "while parsing" % name
)
331 if name
== feature_namespaces
:
333 elif name
== feature_namespace_prefixes
:
335 elif name
== feature_validation
:
336 self
.__validate
= state
337 elif name
== feature_external_ges
:
339 # TODO (does that relate to PARSER_LOADDTD)?
340 raise SAXNotSupportedException("Feature '%s' not supported" % \
342 elif name
== feature_external_pes
:
343 self
.__extparams
= state
345 raise SAXNotRecognizedException("Feature '%s' not recognized" % \
348 def getProperty(self
, name
):
349 if name
== property_lexical_handler
:
350 return self
.__lex
_handler
351 elif name
== property_declaration_handler
:
352 return self
.__decl
_handler
354 raise SAXNotRecognizedException("Property '%s' not recognized" % \
357 def setProperty(self
, name
, value
):
358 if name
== property_lexical_handler
:
359 self
.__lex
_handler
= value
360 elif name
== property_declaration_handler
:
361 # TODO: remove if/when libxml2 supports dtd events
362 raise SAXNotSupportedException("Property '%s' not supported" % \
364 self
.__decl
_handler
= value
366 raise SAXNotRecognizedException("Property '%s' not recognized" % \
370 return LibXml2Reader()