1 # $Id: utils.py 7073 2011-07-07 06:49:19Z milde $
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 Miscellaneous utilities for the documentation utilities.
9 __docformat__
= 'reStructuredText'
16 from docutils
import ApplicationError
, DataError
17 from docutils
import nodes
18 from docutils
.error_reporting
import ErrorOutput
, SafeString
21 class SystemMessage(ApplicationError
):
23 def __init__(self
, system_message
, level
):
24 Exception.__init
__(self
, system_message
.astext())
class SystemMessagePropagation(ApplicationError):
    """
    `ApplicationError` subclass that adds no state or behaviour of its own.

    NOTE(review): the raise/catch sites are not visible here -- confirm the
    intended use before relying on this description.
    """
34 Info/warning/error reporter and ``system_message`` element generator.
36 Five levels of system messages are defined, along with corresponding
37 methods: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.
39 There is typically one Reporter object per process. A Reporter object is
40 instantiated with thresholds for reporting (generating warnings) and
41 halting processing (raising exceptions), a switch to turn debug output on
42 or off, and an I/O stream for warnings. These are stored as instance
45 When a system message is generated, its level is compared to the stored
46 thresholds, and a warning or error is generated as appropriate. Debug
47 messages are produced if the stored debug switch is on, independently of
48 other thresholds. Message output is sent to the stored warning stream if
51 The Reporter class also employs a modified form of the "Observer" pattern
52 [GoF95]_ to track system messages generated. The `attach_observer` method
53 should be called before parsing, with a bound method or function which
54 accepts system messages. The observer can be removed with
55 `detach_observer`, and another added in its place.
57 .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
58 Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
62 levels
= 'DEBUG INFO WARNING ERROR SEVERE'.split()
63 """List of names for system message levels, indexed by level."""
65 # system message level constants:
70 SEVERE_LEVEL
) = range(5)
72 def __init__(self
, source
, report_level
, halt_level
, stream
=None,
73 debug
=0, encoding
=None, error_handler
='backslashreplace'):
76 - `source`: The path to or description of the source data.
77 - `report_level`: The level at or above which warning output will
79 - `halt_level`: The level at or above which `SystemMessage`
80 exceptions will be raised, halting execution.
81 - `debug`: Show debug (level=0) system messages?
82 - `stream`: Where warning output is sent. Can be file-like (has a
83 ``.write`` method), a string (file name, opened for writing),
84 '' (empty string) or `False` (for discarding all stream messages)
85 or `None` (implies `sys.stderr`; default).
86 - `encoding`: The output encoding.
87 - `error_handler`: The error handler for stderr output encoding.
91 """The path to or description of the source data."""
93 self
.error_handler
= error_handler
94 """The character encoding error handler."""
96 self
.debug_flag
= debug
97 """Show debug (level=0) system messages?"""
99 self
.report_level
= report_level
100 """The level at or above which warning output will be sent
103 self
.halt_level
= halt_level
104 """The level at or above which `SystemMessage` exceptions
105 will be raised, halting execution."""
107 if not isinstance(stream
, ErrorOutput
):
108 stream
= ErrorOutput(stream
, encoding
, error_handler
)
111 """Where warning output is sent."""
113 self
.encoding
= encoding
or getattr(stream
, 'encoding', 'ascii')
114 """The output character encoding."""
117 """List of bound methods or functions to call with each system_message
121 """The highest level system message generated so far."""
123 def set_conditions(self
, category
, report_level
, halt_level
,
124 stream
=None, debug
=0):
125 warnings
.warn('docutils.utils.Reporter.set_conditions deprecated; '
126 'set attributes via configuration settings or directly',
127 DeprecationWarning, stacklevel
=2)
128 self
.report_level
= report_level
129 self
.halt_level
= halt_level
130 if not isinstance(stream
, ErrorOutput
):
131 stream
= ErrorOutput(stream
, self
.encoding
, self
.error_handler
)
133 self
.debug_flag
= debug
def attach_observer(self, observer):
    """
    Append `observer` to this reporter's list of observers.

    The `observer` parameter is a function or bound method which takes one
    argument, a `nodes.system_message` instance.
    """
    registered = self.observers
    registered.append(observer)
def detach_observer(self, observer):
    """
    Remove `observer` from this reporter's list of observers.

    The removal is delegated to ``self.observers.remove()``; whatever that
    call raises for an `observer` that was never attached propagates to
    the caller.
    """
    registered = self.observers
    registered.remove(observer)
145 def notify_observers(self
, message
):
146 for observer
in self
.observers
:
149 def system_message(self
, level
, message
, *children
, **kwargs
):
151 Return a system_message object.
153 Raise an exception or generate a warning if appropriate.
155 # `message` can be a `string`, `unicode`, or `Exception` instance.
156 if isinstance(message
, Exception):
157 message
= SafeString(message
)
159 attributes
= kwargs
.copy()
160 if 'base_node' in kwargs
:
161 source
, line
= get_source_line(kwargs
['base_node'])
162 del attributes
['base_node']
163 if source
is not None:
164 attributes
.setdefault('source', source
)
166 attributes
.setdefault('line', line
)
167 # assert source is not None, "node has line- but no source-argument"
168 if not 'source' in attributes
: # 'line' is absolute line number
169 try: # look up (source, line-in-source)
170 source
, line
= self
.locator(attributes
.get('line'))
171 # print "locator lookup", kwargs.get('line'), "->", source, line
172 except AttributeError:
173 source
, line
= None, None
174 if source
is not None:
175 attributes
['source'] = source
177 attributes
['line'] = line
178 # assert attributes['line'] is not None, (message, kwargs)
179 # assert attributes['source'] is not None, (message, kwargs)
180 attributes
.setdefault('source', self
.source
)
182 msg
= nodes
.system_message(message
, level
=level
,
183 type=self
.levels
[level
],
184 *children
, **attributes
)
185 if self
.stream
and (level
>= self
.report_level
186 or self
.debug_flag
and level
== self
.DEBUG_LEVEL
187 or level
>= self
.halt_level
):
188 self
.stream
.write(msg
.astext() + '\n')
189 if level
>= self
.halt_level
:
190 raise SystemMessage(msg
, level
)
191 if level
> self
.DEBUG_LEVEL
or self
.debug_flag
:
192 self
.notify_observers(msg
)
193 self
.max_level
= max(level
, self
.max_level
)
196 def debug(self
, *args
, **kwargs
):
198 Level-0, "DEBUG": an internal reporting issue. Typically, there is no
199 effect on the processing. Level-0 system messages are handled
200 separately from the others.
203 return self
.system_message(self
.DEBUG_LEVEL
, *args
, **kwargs
)
def info(self, *args, **kwargs):
    """
    Level-1, "INFO": a minor issue that can be ignored. Typically there is
    no effect on processing, and level-1 system messages are not reported.

    All arguments are forwarded unchanged to `system_message()`, whose
    result is returned.
    """
    level = self.INFO_LEVEL
    return self.system_message(level, *args, **kwargs)
def warning(self, *args, **kwargs):
    """
    Level-2, "WARNING": an issue that should be addressed. If ignored,
    there may be unpredictable problems with the output.

    All arguments are forwarded unchanged to `system_message()`, whose
    result is returned.
    """
    level = self.WARNING_LEVEL
    return self.system_message(level, *args, **kwargs)
def error(self, *args, **kwargs):
    """
    Level-3, "ERROR": an error that should be addressed. If ignored, the
    output will contain errors.

    All arguments are forwarded unchanged to `system_message()`, whose
    result is returned.
    """
    level = self.ERROR_LEVEL
    return self.system_message(level, *args, **kwargs)
def severe(self, *args, **kwargs):
    """
    Level-4, "SEVERE": a severe error that must be addressed. If ignored,
    the output will contain severe errors. Typically level-4 system
    messages are turned into exceptions which halt processing.

    All arguments are forwarded unchanged to `system_message()`, whose
    result is returned.
    """
    level = self.SEVERE_LEVEL
    return self.system_message(level, *args, **kwargs)
class ExtensionOptionError(DataError):
    """Common base class for the extension-option errors of this module."""
class BadOptionError(ExtensionOptionError):
    """Raised for invalid extension option fields."""
class BadOptionDataError(ExtensionOptionError):
    """
    Raised for invalid extension option data (missing name, missing data,
    bad quotes, etc.).
    """
class DuplicateOptionError(ExtensionOptionError):
    """Raised when the same extension option is given more than once."""
241 def extract_extension_options(field_list
, options_spec
):
243 Return a dictionary mapping extension option names to converted values.
246 - `field_list`: A flat field list without field arguments, where each
247 field body consists of a single paragraph only.
248 - `options_spec`: Dictionary mapping known option names to a
249 conversion function such as `int` or `float`.
252 - `KeyError` for unknown option names.
253 - `ValueError` for invalid option values (raised by the conversion
255 - `TypeError` for invalid option value types (raised by conversion
257 - `DuplicateOptionError` for duplicate options.
258 - `BadOptionError` for invalid fields.
259 - `BadOptionDataError` for invalid option data (missing name,
260 missing data, bad quotes, etc.).
262 option_list
= extract_options(field_list
)
263 option_dict
= assemble_option_dict(option_list
, options_spec
)
266 def extract_options(field_list
):
268 Return a list of option (name, value) pairs from field names & bodies.
271 `field_list`: A flat field list, where each field name is a single
272 word and each field body consists of a single paragraph only.
275 - `BadOptionError` for invalid fields.
276 - `BadOptionDataError` for invalid option data (missing name,
277 missing data, bad quotes, etc.).
280 for field
in field_list
:
281 if len(field
[0].astext().split()) != 1:
282 raise BadOptionError(
283 'extension option field name may not contain multiple words')
284 name
= str(field
[0].astext().lower())
288 elif len(body
) > 1 or not isinstance(body
[0], nodes
.paragraph
) \
289 or len(body
[0]) != 1 or not isinstance(body
[0][0], nodes
.Text
):
290 raise BadOptionDataError(
291 'extension option field body may contain\n'
292 'a single paragraph only (option "%s")' % name
)
294 data
= body
[0][0].astext()
295 option_list
.append((name
, data
))
298 def assemble_option_dict(option_list
, options_spec
):
300 Return a mapping of option names to values.
303 - `option_list`: A list of (name, value) pairs (the output of
304 `extract_options()`).
305 - `options_spec`: Dictionary mapping known option names to a
306 conversion function such as `int` or `float`.
309 - `KeyError` for unknown option names.
310 - `DuplicateOptionError` for duplicate options.
311 - `ValueError` for invalid option values (raised by conversion
313 - `TypeError` for invalid option value types (raised by conversion
317 for name
, value
in option_list
:
318 convertor
= options_spec
[name
] # raises KeyError if unknown
319 if convertor
is None:
320 raise KeyError(name
) # or if explicitly disabled
322 raise DuplicateOptionError('duplicate option "%s"' % name
)
324 options
[name
] = convertor(value
)
325 except (ValueError, TypeError), detail
:
326 raise detail
.__class
__('(option: "%s"; value: %r)\n%s'
327 % (name
, value
, ' '.join(detail
.args
)))
class NameValueError(DataError):
    """Raised by `extract_name_value()` for invalid "name=value" input."""
334 def decode_path(path
):
336 Ensure `path` is Unicode. Return `nodes.reprunicode` object.
338 Decode file/path string in a failsafe manner if not already done.
340 # see also http://article.gmane.org/gmane.text.docutils.user/2905
341 if isinstance(path
, unicode):
344 path
= path
.decode(sys
.getfilesystemencoding(), 'strict')
345 except AttributeError: # default value None has no decode method
346 return nodes
.reprunicode(path
)
347 except UnicodeDecodeError:
349 path
= path
.decode('utf-8', 'strict')
350 except UnicodeDecodeError:
351 path
= path
.decode('ascii', 'replace')
352 return nodes
.reprunicode(path
)
355 def extract_name_value(line
):
357 Return a list of (name, value) from a line of the form "name=value ...".
360 `NameValueError` for invalid input (missing name, missing data, bad
365 equals
= line
.find('=')
367 raise NameValueError('missing "="')
368 attname
= line
[:equals
].strip()
369 if equals
== 0 or not attname
:
370 raise NameValueError(
371 'missing attribute name before "="')
372 line
= line
[equals
+1:].lstrip()
374 raise NameValueError(
375 'missing value after "%s="' % attname
)
377 endquote
= line
.find(line
[0], 1)
379 raise NameValueError(
380 'attribute "%s" missing end quote (%s)'
381 % (attname
, line
[0]))
382 if len(line
) > endquote
+ 1 and line
[endquote
+ 1].strip():
383 raise NameValueError(
384 'attribute "%s" end quote (%s) not followed by '
385 'whitespace' % (attname
, line
[0]))
386 data
= line
[1:endquote
]
387 line
= line
[endquote
+1:].lstrip()
389 space
= line
.find(' ')
395 line
= line
[space
+1:].lstrip()
396 attlist
.append((attname
.lower(), data
))
399 def new_reporter(source_path
, settings
):
401 Return a new Reporter object.
405 The path to or description of the source text of the document.
406 `settings` : optparse.Values object
410 source_path
, settings
.report_level
, settings
.halt_level
,
411 stream
=settings
.warning_stream
, debug
=settings
.debug
,
412 encoding
=settings
.error_encoding
,
413 error_handler
=settings
.error_encoding_error_handler
)
416 def new_document(source_path
, settings
=None):
418 Return a new empty document object.
421 `source_path` : string
422 The path to or description of the source text of the document.
423 `settings` : optparse.Values object
424 Runtime settings. If none are provided, a default core set will
425 be used. If you will use the document object with any Docutils
426 components, you must provide their default settings as well. For
427 example, if parsing, at least provide the parser settings,
428 obtainable as follows::
430 settings = docutils.frontend.OptionParser(
431 components=(docutils.parsers.rst.Parser,)
432 ).get_default_values()
434 from docutils
import frontend
436 settings
= frontend
.OptionParser().get_default_values()
437 source_path
= decode_path(source_path
)
438 reporter
= new_reporter(source_path
, settings
)
439 document
= nodes
.document(settings
, reporter
, source
=source_path
)
440 document
.note_source(source_path
, -1)
443 def clean_rcs_keywords(paragraph
, keyword_substitutions
):
444 if len(paragraph
) == 1 and isinstance(paragraph
[0], nodes
.Text
):
445 textnode
= paragraph
[0]
446 for pattern
, substitution
in keyword_substitutions
:
447 match
= pattern
.search(textnode
)
449 paragraph
[0] = nodes
.Text(pattern
.sub(substitution
, textnode
))
452 def relative_path(source
, target
):
454 Build and return a path to `target`, relative to `source` (both files).
456 If there is no common prefix, return the absolute path to `target`.
458 source_parts
= os
.path
.abspath(source
or 'dummy_file').split(os
.sep
)
459 target_parts
= os
.path
.abspath(target
).split(os
.sep
)
460 # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
461 if source_parts
[:2] != target_parts
[:2]:
462 # Nothing in common between paths.
463 # Return absolute path, using '/' for URLs:
464 return '/'.join(target_parts
)
465 source_parts
.reverse()
466 target_parts
.reverse()
467 while (source_parts
and target_parts
468 and source_parts
[-1] == target_parts
[-1]):
469 # Remove path components in common:
472 target_parts
.reverse()
473 parts
= ['..'] * (len(source_parts
) - 1) + target_parts
474 return '/'.join(parts
)
def get_stylesheet_reference(settings, relative_to=None):
    """
    Retrieve a stylesheet reference from the settings object.

    Deprecated. Use get_stylesheet_list() instead to enable
    specification of multiple stylesheets as a comma-separated list.

    Parameters:

    - `settings`: object with ``stylesheet`` and ``stylesheet_path``
      attributes (and ``_destination``, read only when a
      ``stylesheet_path`` is set and `relative_to` is None).
    - `relative_to`: file that ``stylesheet_path`` is made relative to;
      defaults to ``settings._destination``.

    Return ``settings.stylesheet`` unchanged when no ``stylesheet_path``
    is set, otherwise the result of
    ``relative_path(relative_to, settings.stylesheet_path)``.

    Raise `AssertionError` if both ``stylesheet`` and ``stylesheet_path``
    are set.
    """
    if not settings.stylesheet_path:
        return settings.stylesheet
    assert not settings.stylesheet, (
        'stylesheet and stylesheet_path are mutually exclusive.')
    # Identity test: an object with a permissive __eq__ must not be
    # mistaken for "no value given".
    if relative_to is None:
        relative_to = settings._destination
    return relative_path(relative_to, settings.stylesheet_path)
493 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
495 # The original settings arguments are kept unchanged: you can test
496 # with e.g. ``if settings.stylesheet_path:``
498 # Differences to ``get_stylesheet_reference``:
499 # * return value is a list
500 # * no re-writing of the path (and therefore no optional argument)
501 # (if required, use ``utils.relative_path(source, target)``
502 # in the calling script)
503 def get_stylesheet_list(settings
):
505 Retrieve list of stylesheet references from the settings object.
507 assert not (settings
.stylesheet
and settings
.stylesheet_path
), (
508 'stylesheet and stylesheet_path are mutually exclusive.')
509 if settings
.stylesheet_path
:
510 sheets
= settings
.stylesheet_path
.split(",")
511 elif settings
.stylesheet
:
512 sheets
= settings
.stylesheet
.split(",")
515 # strip whitespace (frequently occurring in config files)
516 return [sheet
.strip(u
' \t\n') for sheet
in sheets
]
def get_trim_footnote_ref_space(settings):
    """
    Return whether or not to trim footnote space.

    An explicit (non-None) ``settings.trim_footnote_reference_space`` is
    returned as is.

    When that setting is None, the answer is False unless `settings` has
    a ``footnote_references`` attribute equal to 'superscript'.
    """
    explicit = settings.trim_footnote_reference_space
    if explicit is not None:
        return explicit
    return (hasattr(settings, 'footnote_references')
            and settings.footnote_references == 'superscript')
533 def get_source_line(node
):
535 Return the "source" and "line" attributes from the `node` given or from
536 its closest ancestor.
539 if node
.source
or node
.line
:
540 return node
.source
, node
.line
544 def escape2null(text
):
545 """Return a string with escape-backslashes converted to nulls."""
549 found
= text
.find('\\', start
)
551 parts
.append(text
[start
:])
552 return ''.join(parts
)
553 parts
.append(text
[start
:found
])
554 parts
.append('\x00' + text
[found
+1:found
+2])
555 start
= found
+ 2 # skip character after escape
557 def unescape(text
, restore_backslashes
=0):
559 Return a string with nulls removed or restored to backslashes.
560 Backslash-escaped spaces are also removed.
562 if restore_backslashes
:
563 return text
.replace('\x00', '\\')
565 for sep
in ['\x00 ', '\x00\n', '\x00']:
566 text
= ''.join(text
.split(sep
))
569 east_asian_widths
= {'W': 2, # Wide
570 'F': 2, # Full-width (wide)
572 'H': 1, # Half-width (narrow)
573 'N': 1, # Neutral (not East Asian, treated as narrow)
574 'A': 1} # Ambiguous (s/b wide in East Asian context,
575 # narrow otherwise, but that doesn't work)
576 """Mapping of result codes from `unicodedata.east_asian_width()` to character
579 def column_width(text
):
580 """Return the column width of text.
582 Correct ``len(text)`` for wide East Asian and combining Unicode chars.
584 if isinstance(text
, str) and sys
.version_info
< (3,0):
586 combining_correction
= sum([-1 for c
in text
587 if unicodedata
.combining(c
)])
589 width
= sum([east_asian_widths
[unicodedata
.east_asian_width(c
)]
591 except AttributeError: # east_asian_width() New in version 2.4.
593 return width
+ combining_correction
602 # by Li Daobing http://code.activestate.com/recipes/190465/
603 # since Python 2.6 there is also itertools.combinations()
604 def unique_combinations(items
, n
):
605 """Return n-length tuples, in sorted order, no repeated elements"""
608 for i
in xrange(len(items
)-n
+1):
609 for cc
in unique_combinations(items
[i
+1:],n
-1):
612 def normalize_language_tag(tag
):
613 """Return a list of normalized combinations for a `BCP 47` language tag.
617 >>> normalize_language_tag('de-AT-1901')
618 ['de_at_1901', 'de_at', 'de_1901', 'de']
621 tag
= tag
.lower().replace('-','_')
622 # find all combinations of subtags
624 base_tag
= tag
.split('_')[:1]
625 subtags
= tag
.split('_')[1:]
626 # print base_tag, subtags
627 for n
in range(len(subtags
), 0, -1):
628 for tags
in unique_combinations(subtags
, n
):
630 taglist
.append('_'.join(base_tag
+ tags
))
634 class DependencyList
:
637 List of dependencies, with file recording support.
639 Note that the output file is not automatically closed. You have
640 to explicitly call the close() method.
643 def __init__(self
, output_file
=None, dependencies
=[]):
645 Initialize the dependency list, automatically setting the
646 output file to `output_file` (see `set_output()`) and adding
647 all supplied dependencies.
649 self
.set_output(output_file
)
650 for i
in dependencies
:
653 def set_output(self
, output_file
):
655 Set the output file and clear the list of already added
658 `output_file` must be a string. The specified file is
659 immediately overwritten.
661 If output_file is '-', the output will be written to stdout.
662 If it is None, no file output is done when calling add().
665 if output_file
== '-':
666 self
.file = sys
.stdout
668 self
.file = open(output_file
, 'w')
672 def add(self
, *filenames
):
674 If the dependency `filename` has not already been added,
675 append it to self.list and print it to self.file if self.file
678 for filename
in filenames
:
679 if not filename
in self
.list:
680 self
.list.append(filename
)
681 if self
.file is not None:
682 print >>self
.file, filename
686 Close the output file.
688 if self
.file not in (sys
.stdout
, sys
.stderr
):
694 output_file
= self
.file.name
697 return '%s(%r, %s)' % (self
.__class
__.__name
__, output_file
, self
.list)
699 import xml
.sax
.handler
700 from xml
.sax
.handler
import feature_namespaces
701 from StringIO
import StringIO
703 class CopyTree(xml
.sax
.ContentHandler
):
705 Helper class needed by the XmlStringToDocutilsNodes function.
706 Don't invoke this class directly.
710 def __init__(self
, default_namespace
= None, ns_dict
= None):
711 self
.__characters
= ''
712 self
.__current
_node
_list
= []
714 self
.__default
_namespace
= default_namespace
715 self
.__ns
_prefix
_dict
= {
716 'http://www.w3.org/XML/1998/namespace': 'xml',
717 'http://www.w3.org/1998/Math/MathML': 'ml',
718 'http://www.w3.org/1999/xhtml': 'xhtml',
719 'http://www.w3.org/1999/XSL/Transform':'xsl',
722 self
.__ns
_prefix
_dict
.update(ns_dict
)
724 self
.__ns
_prefix
_no
_write
_dict
= {
725 'http://www.w3.org/XML/1998/namespace': 'xml',
def characters(self, characters):
    """
    Append the `characters` chunk to the text buffered on this handler.

    The buffer is only extended here; it is read and reset elsewhere.
    """
    self.__characters = self.__characters + characters
733 def startElementNS(self
, name
, qname
, attrs
):
738 Get the information from the start of the element to write a docutils node Element.
740 If the default_namespace is set and it is the first element, get the
741 namespace from the first element. Write it as xmlns="http:/...." Do
742 not write any other namespaces. If the default_namespace is set but
743 no namespace is found, raise an error.
745 If there are no namespaces, just get the element's name, create an Element method, and
749 If there is namespace:
751 1. See if a convenient prefix exists. If so, write that prefix:
752 <math xmlns="http://www.w3.org/1998/Math/MathML" => <ml:math
754 2. If there is a namespace but no prefix, use ns1 as the prefix
755 <customElement xmlns="http://www.custom.org" => <ns1:customElement
757 3. If the namespace needs to be declared, then write it:
758 <math xmlns="http://www.w3.org/1998/Math/MathML" => <ml:math xmlns:ml="http://www.w3.org/1998/Math/MathML"
760 <customElement xmlns="http://www.custom.org" => <ns1:customElement xmlns:ns1="http://www.custom.org"
762 4. If the namespace does not need to be written, don't write it:
763 (Don't think any examples exist for elements.)
768 The same strategy is followed for the attributes, with the exception of using the ns# for a default prefix.
769 If no convenient prefix is found:
771 1. If the namespace for the attribute matches the namespace for the element, use it (ns1)
773 2. Otherwise, start with ns2, and use the next number (ns3) for the next prefix, and so on.
776 if len(self
.__current
_node
_list
) > 0:
778 ns
= name
[0] # for example, "http://www.w3.org/XML/1998/namespace"
779 ns_prefix
= self
.__ns
_prefix
_dict
.get(ns
)
780 el_name
= name
[1] # a string indicating the tag name, for example, "math"
781 element
= nodes
.Element()
782 element
.tagname
= el_name
783 if len(self
.__current
_node
_list
) > 0:
784 self
.__current
_node
_list
[-1].append(element
)
785 # if there is a namespace that does not match the root; and not an
786 # implicit namespace, like XML, raise an error
787 if ns
and self
.__default
_namespace
and ns
!= self
.__default
_namespace
and not(self
.__ns
_prefix
_no
_write
_dict
.get(ns
)):
788 raise SystemError('default namespace "%s" does not match root namespace "%s"' % (ns
, self
.__default
_namespace
))
790 self
.__tree
= element
791 if self
.__default
_namespace
:
793 raise SystemError('no default namespace found, yet default_namespace passed to function')
794 element
['xmlns'] = ns
795 self
.__default
_namespace
= ns
796 self
.__current
_node
_list
.append(element
)
797 if not self
.__default
_namespace
:
799 element
.tagname
= '%s:%s' % (ns_prefix
, el_name
)
801 element
.tagname
= 'ns1:%s' % el_name
803 if ns
and self
.__ns
_prefix
_no
_write
_dict
.get(ns
): # don't need to write certain namespaces, like xml
805 elif ns
and ns_prefix
:
806 element
['xmlns:%s' % ns_prefix
] = ns
808 element
['xmlns:ns1'] = ns
809 elif self
.__ns
_prefix
_no
_write
_dict
.get(ns
):
810 # unlikely to actually occur, but just in case
811 element
.tagname
= '%s:%s' % (ns_prefix
, el_name
)
814 the_keys
= attrs
.keys()
816 for the_key
in the_keys
:
819 att_name
= the_key
[1]
820 value
= attrs
[the_key
]
821 ns_prefix
= self
.__ns
_prefix
_dict
.get(ns_att
)
822 if not self
.__default
_namespace
:# all cases for non-default space, including no namespace and xml namespace
823 if ns_att
and ns_att
!= ns
:
824 if not(self
.__ns
_prefix
_no
_write
_dict
.get(ns_att
)):
825 att
= 'xmlns:ns%s' % counter
827 element
[att
] = the_value
828 if ns_att
and ns_prefix
:
829 att
= '%s:%s' % (ns_prefix
, att_name
)
831 elif ns_att
and ns_att
== ns
:
832 att
= 'ns1:%s' % att_name
835 att
= 'ns%s:%s' % (counter
, att_name
)
838 element
[att_name
] = value
839 else: # default namespace only write prefixes such as xml; otherwise just write attribute
840 if ns_att
and self
.__ns
_prefix
_no
_write
_dict
.get(ns_att
):
841 att_name
= '%s:%s' % (ns_prefix
, att_name
)
842 element
[att_name
] = value
844 element
[att_name
] = value
def __write_text(self):
    """
    Flush the buffered character data into the tree.

    The buffered text is wrapped in a `nodes.Text` and appended to the last
    node of the current-node list; the buffer is then reset to ''.
    """
    pending = self.__characters
    innermost = self.__current_node_list[-1]
    innermost.append(nodes.Text(pending))
    self.__characters = ''
851 def endElementNS(self
, name
, qname
):
853 self
.__current
_node
_list
.pop()
858 def endDocument(self
):
def XmlStringToDocutilsNodes(xml_string, encoding='utf8', default_namespace = None, ns_dict = None):
    """
    Converts an XML String into a docutils node tree, and returns that tree.

    xml_string can either be a unicode object or a string (for Python < 3); or
    a string or a byte string (for Python >= 3.0).

    The encoding is the encoding for the xml_string.

    The default_namespace should be set to some boolean value, such as True or
    False. If set, default_namespace makes easier-to-read XML by writing the
    namespace in only the first element. Instead of::

        <ml:math xmlns:ml="http://www.w3.org/1998/Math/MathML">
        <ml:style xmlns:ml="http://www.w3.org/1998/Math/MathML">

    the root element is written as::

        <math xmlns="http://www.w3.org/1998/Math/MathML">

    An error is raised if no namespace is found for the first element, or a
    namespace is found for subsequent elements that does not match.

    The ns_dict is a dictionary of namespaces mapped to a prefix. For example:

        {"http://www.tei-c.org/ns/1.0":'tei'}

    If any element is found with the namespace http://www.tei-c.org/ns/1.0,
    then the prefix "tei" is used. Note that this dictionary only makes the
    XML look more readable, and is not needed to create valid XML with the
    correct namespaces. For example, if the parser finds an element with a
    namespace "http://www.tei-c.org/ns/1.0", and no dict is passed to this
    function, the parser assigns its own prefix:

        <ns1:paragraph xmlns:ns1="http://www.tei-c.org/ns/1.0"

    External general entities are NOT resolved: `xml_string` may come from
    an untrusted document, and resolving them would let it read local files
    or trigger network requests (XXE).
    """
    if sys.version_info < (3,):
        # Python 2: the parser is fed a UTF-8 encoded byte string.
        if isinstance(xml_string, unicode):
            xml_string = xml_string.encode('utf8')
        elif isinstance(xml_string, str):
            xml_string = xml_string.decode(encoding).encode('utf8')
    elif isinstance(xml_string, bytes):
        # Python 3: the parser is fed text.
        xml_string = xml_string.decode(encoding)
    read_obj = StringIO(xml_string)
    the_handle = CopyTree(ns_dict=ns_dict, default_namespace=default_namespace)
    parser = xml.sax.make_parser()
    parser.setFeature(feature_namespaces, 1)
    parser.setContentHandler(the_handle)
    # Security: explicitly disable external general entities (was enabled).
    parser.setFeature("http://xml.org/sax/features/external-general-entities", False)
    parser.parse(read_obj)
    docutils_tree = the_handle.get_tree()
    return docutils_tree
925 import xml
.dom
.minidom
926 import xml
.sax
.saxutils
929 takes a dom element as current_element
933 def start_tag(local_name
):
934 sys
.stdout
.write('<%s>' % local_name
)
936 def end_tag(local_name
):
937 sys
.stdout
.write('</%s>' % local_name
)
939 dom
= xml
.dom
.minidom
.parse('test.xml')
940 out_doc
= xml
.dom
.minidom
.Document()
941 def copy_tree(current_element
):
942 elements
= current_element
.childNodes
943 for element
in elements
:
944 if element
.nodeType
== xml
.dom
.Node
.ELEMENT_NODE
:
945 element_name
= element
.localName
946 if element
.attributes
!= None:
947 for attr
in element
.attributes
.values():
948 ns
= attr
.namespaceURI
949 local_name
= attr
.localName
953 new_att
= out_doc
.createAttribute(name
)
954 start_tag(element_name
)
956 end_tag(element_name
)
957 elif element
.nodeType
== xml
.dom
.Node
.TEXT_NODE
:
958 parent
= element
.parentNode
959 if parent
.localName
== 'math':
960 sys
.stdout
.write(element
.data
)
962 sys
.stdout
.write(element
.data
)