Nearly complete
[docutils/kirr.git] / sandbox / paultremblay / other / utils.py
blob46d8fb7237ac6c4491bcdc950f9282be08c08841
1 # $Id: utils.py 7073 2011-07-07 06:49:19Z milde $
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 Miscellaneous utilities for the documentation utilities.
7 """
9 __docformat__ = 'reStructuredText'
11 import sys
12 import os
13 import os.path
14 import warnings
15 import unicodedata
16 from docutils import ApplicationError, DataError
17 from docutils import nodes
18 from docutils.error_reporting import ErrorOutput, SafeString
class SystemMessage(ApplicationError):

    """Exception carrying the text and the level of a system message."""

    def __init__(self, system_message, level):
        # The exception text is the plain-text rendering of the message node.
        text = system_message.astext()
        Exception.__init__(self, text)
        self.level = level
class SystemMessagePropagation(ApplicationError):

    """`ApplicationError` subclass; adds no behavior of its own."""
class Reporter:

    """
    Info/warning/error reporter and ``system_message`` element generator.

    Five levels of system messages are defined, along with corresponding
    methods: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.

    There is typically one Reporter object per process.  A Reporter object is
    instantiated with thresholds for reporting (generating warnings) and
    halting processing (raising exceptions), a switch to turn debug output on
    or off, and an I/O stream for warnings.  These are stored as instance
    attributes.

    When a system message is generated, its level is compared to the stored
    thresholds, and a warning or error is generated as appropriate.  Debug
    messages are produced if the stored debug switch is on, independently of
    other thresholds.  Message output is sent to the stored warning stream if
    not set to ''.

    The Reporter class also employs a modified form of the "Observer" pattern
    [GoF95]_ to track system messages generated.  The `attach_observer` method
    should be called before parsing, with a bound method or function which
    accepts system messages.  The observer can be removed with
    `detach_observer`, and another added in its place.

    .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
       Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
       1995.
    """

    levels = 'DEBUG INFO WARNING ERROR SEVERE'.split()
    """List of names for system message levels, indexed by level."""

    # system message level constants:
    DEBUG_LEVEL, INFO_LEVEL, WARNING_LEVEL, ERROR_LEVEL, SEVERE_LEVEL = \
        range(5)

    def __init__(self, source, report_level, halt_level, stream=None,
                 debug=0, encoding=None, error_handler='backslashreplace'):
        """
        :Parameters:
            - `source`: The path to or description of the source data.
            - `report_level`: The level at or above which warning output will
              be sent to `stream`.
            - `halt_level`: The level at or above which `SystemMessage`
              exceptions will be raised, halting execution.
            - `debug`: Show debug (level=0) system messages?
            - `stream`: Where warning output is sent.  Can be file-like (has a
              ``.write`` method), a string (file name, opened for writing),
              '' (empty string) or `False` (for discarding all stream messages)
              or `None` (implies `sys.stderr`; default).
            - `encoding`: The output encoding.
            - `error_handler`: The error handler for stderr output encoding.
        """

        self.source = source
        """The path to or description of the source data."""

        self.error_handler = error_handler
        """The character encoding error handler."""

        self.debug_flag = debug
        """Show debug (level=0) system messages?"""

        self.report_level = report_level
        """The level at or above which warning output will be sent
        to `self.stream`."""

        self.halt_level = halt_level
        """The level at or above which `SystemMessage` exceptions
        will be raised, halting execution."""

        # Anything that is not already an ErrorOutput gets wrapped in one.
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, encoding, error_handler)

        self.stream = stream
        """Where warning output is sent."""

        self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
        """The output character encoding."""

        self.observers = []
        """List of bound methods or functions to call with each system_message
        created."""

        self.max_level = -1
        """The highest level system message generated so far."""

    def set_conditions(self, category, report_level, halt_level,
                       stream=None, debug=0):
        """
        Deprecated: replace the thresholds, the stream and the debug switch.

        Emits a `DeprecationWarning`.  `category` is accepted but not used.
        """
        warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
                      'set attributes via configuration settings or directly',
                      DeprecationWarning, stacklevel=2)
        self.report_level = report_level
        self.halt_level = halt_level
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, self.encoding, self.error_handler)
        self.stream = stream
        self.debug_flag = debug

    def attach_observer(self, observer):
        """
        The `observer` parameter is a function or bound method which takes one
        argument, a `nodes.system_message` instance.
        """
        self.observers.append(observer)

    def detach_observer(self, observer):
        """Remove a previously attached `observer`."""
        self.observers.remove(observer)

    def notify_observers(self, message):
        """Call every attached observer with `message`, in attachment order."""
        for callback in self.observers:
            callback(message)

    def system_message(self, level, message, *children, **kwargs):
        """
        Return a system_message object.

        Raise an exception or generate a warning if appropriate.
        """
        # `message` can be a `string`, `unicode`, or `Exception` instance.
        if isinstance(message, Exception):
            message = SafeString(message)

        attributes = kwargs.copy()
        if 'base_node' in attributes:
            # Take the location from the node (or its closest ancestor)
            # without overriding explicitly passed 'source'/'line' values.
            source, line = get_source_line(attributes.pop('base_node'))
            if source is not None:
                attributes.setdefault('source', source)
            if line is not None:
                attributes.setdefault('line', line)
        if 'source' not in attributes:
            # 'line' is an absolute line number: look up
            # (source, line-in-source).  An AttributeError (e.g. no
            # `locator` attribute on this instance) means "no lookup".
            try:
                source, line = self.locator(attributes.get('line'))
            except AttributeError:
                source, line = None, None
            if source is not None:
                attributes['source'] = source
            if line is not None:
                attributes['line'] = line
        # Last resort: the reporter's own source description.
        attributes.setdefault('source', self.source)

        msg = nodes.system_message(message, level=level,
                                   type=self.levels[level],
                                   *children, **attributes)
        if self.stream and (level >= self.report_level
                            or self.debug_flag and level == self.DEBUG_LEVEL
                            or level >= self.halt_level):
            self.stream.write(msg.astext() + '\n')
        if level >= self.halt_level:
            raise SystemMessage(msg, level)
        if level > self.DEBUG_LEVEL or self.debug_flag:
            self.notify_observers(msg)
        self.max_level = max(level, self.max_level)
        return msg

    def debug(self, *args, **kwargs):
        """
        Level-0, "DEBUG": an internal reporting issue. Typically, there is no
        effect on the processing. Level-0 system messages are handled
        separately from the others.

        Returns `None` (and generates nothing) when the debug switch is off.
        """
        if self.debug_flag:
            return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)

    def info(self, *args, **kwargs):
        """
        Level-1, "INFO": a minor issue that can be ignored. Typically there is
        no effect on processing, and level-1 system messages are not reported.
        """
        return self.system_message(self.INFO_LEVEL, *args, **kwargs)

    def warning(self, *args, **kwargs):
        """
        Level-2, "WARNING": an issue that should be addressed. If ignored,
        there may be unpredictable problems with the output.
        """
        return self.system_message(self.WARNING_LEVEL, *args, **kwargs)

    def error(self, *args, **kwargs):
        """
        Level-3, "ERROR": an error that should be addressed. If ignored, the
        output will contain errors.
        """
        return self.system_message(self.ERROR_LEVEL, *args, **kwargs)

    def severe(self, *args, **kwargs):
        """
        Level-4, "SEVERE": a severe error that must be addressed. If ignored,
        the output will contain severe errors. Typically level-4 system
        messages are turned into exceptions which halt processing.
        """
        return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
class ExtensionOptionError(DataError):

    """Base class for the extension option errors defined below."""
class BadOptionError(ExtensionOptionError):

    """Raised for invalid extension option fields."""
class BadOptionDataError(ExtensionOptionError):

    """Raised for invalid extension option data."""
class DuplicateOptionError(ExtensionOptionError):

    """Raised when an extension option is given more than once."""
def extract_extension_options(field_list, options_spec):
    """
    Return a dictionary mapping extension option names to converted values.

    :Parameters:
        - `field_list`: A flat field list without field arguments, where each
          field body consists of a single paragraph only.
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `ValueError` for invalid option values (raised by the conversion
           function).
        - `TypeError` for invalid option value types (raised by conversion
           function).
        - `DuplicateOptionError` for duplicate options.
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    # Two steps: pull (name, value) pairs out of the fields, then convert.
    return assemble_option_dict(extract_options(field_list), options_spec)
def extract_options(field_list):
    """
    Return a list of option (name, value) pairs from field names & bodies.

    :Parameter:
        `field_list`: A flat field list, where each field name is a single
        word and each field body consists of a single paragraph only.

    :Exceptions:
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    pairs = []
    for field in field_list:
        raw_name = field[0].astext()
        if len(raw_name.split()) != 1:
            raise BadOptionError(
                'extension option field name may not contain multiple words')
        name = str(raw_name.lower())
        body = field[1]
        if len(body) == 0:
            # An option without a body has no value.
            data = None
        elif (len(body) > 1
              or not isinstance(body[0], nodes.paragraph)
              or len(body[0]) != 1
              or not isinstance(body[0][0], nodes.Text)):
            raise BadOptionDataError(
                'extension option field body may contain\n'
                'a single paragraph only (option "%s")' % name)
        else:
            data = body[0][0].astext()
        pairs.append((name, data))
    return pairs
def assemble_option_dict(option_list, options_spec):
    """
    Return a mapping of option names to values.

    :Parameters:
        - `option_list`: A list of (name, value) pairs (the output of
          `extract_options()`).
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `DuplicateOptionError` for duplicate options.
        - `ValueError` for invalid option values (raised by conversion
           function).
        - `TypeError` for invalid option value types (raised by conversion
           function).
    """
    options = {}
    for name, value in option_list:
        convertor = options_spec[name]  # raises KeyError if unknown
        if convertor is None:
            raise KeyError(name)        # or if explicitly disabled
        if name in options:
            raise DuplicateOptionError('duplicate option "%s"' % name)
        try:
            options[name] = convertor(value)
        except (ValueError, TypeError) as detail:
            # Re-raise the same exception type, prefixing the option name
            # and the offending value to the original message.
            raise detail.__class__('(option: "%s"; value: %r)\n%s'
                                   % (name, value, ' '.join(detail.args)))
    return options
class NameValueError(DataError):

    """Raised by `extract_name_value()` for invalid "name=value" input."""
def decode_path(path):
    """
    Ensure `path` is Unicode.

    A `unicode` argument is returned unchanged; anything else is returned
    as a `nodes.reprunicode` object.

    Decode file/path string in a failsafe manner if not already done: try
    the file system encoding first, then UTF-8, and finally ASCII with
    replacement of undecodable bytes.
    """
    # see also http://article.gmane.org/gmane.text.docutils.user/2905
    if isinstance(path, unicode):
        return path
    try:
        decoded = path.decode(sys.getfilesystemencoding(), 'strict')
    except AttributeError:  # default value None has no decode method
        return nodes.reprunicode(path)
    except UnicodeDecodeError:
        try:
            decoded = path.decode('utf-8', 'strict')
        except UnicodeDecodeError:
            decoded = path.decode('ascii', 'replace')
    return nodes.reprunicode(decoded)
def extract_name_value(line):
    """
    Return a list of (name, value) from a line of the form "name=value ...".

    Names are lower-cased.  A value is either quoted (single or double
    quotes, closing quote followed by whitespace or end of line) or runs
    up to the next space character.

    :Exception:
        `NameValueError` for invalid input (missing name, missing data, bad
        quotes, etc.).
    """
    pairs = []
    while line:
        before, equals_sign, after = line.partition('=')
        if not equals_sign:
            raise NameValueError('missing "="')
        attname = before.strip()
        if not attname:
            raise NameValueError(
                  'missing attribute name before "="')
        line = after.lstrip()
        if not line:
            raise NameValueError(
                  'missing value after "%s="' % attname)
        quote = line[0]
        if quote in '\'"':
            endquote = line.find(quote, 1)
            if endquote == -1:
                raise NameValueError(
                      'attribute "%s" missing end quote (%s)'
                      % (attname, quote))
            following = line[endquote + 1:endquote + 2]
            if following.strip():
                raise NameValueError(
                      'attribute "%s" end quote (%s) not followed by '
                      'whitespace' % (attname, quote))
            data = line[1:endquote]
            line = line[endquote + 1:].lstrip()
        else:
            # Unquoted value: everything up to the first space (or the end).
            data, _, remainder = line.partition(' ')
            line = remainder.lstrip()
        pairs.append((attname.lower(), data))
    return pairs
def new_reporter(source_path, settings):
    """
    Return a new Reporter object.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.
    """
    # Everything but the source description comes from the settings object.
    return Reporter(
        source_path,
        settings.report_level,
        settings.halt_level,
        stream=settings.warning_stream,
        debug=settings.debug,
        encoding=settings.error_encoding,
        error_handler=settings.error_encoding_error_handler)
def new_document(source_path, settings=None):
    """
    Return a new empty document object.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.  If none are provided, a default core set will
            be used.  If you will use the document object with any Docutils
            components, you must provide their default settings as well.  For
            example, if parsing, at least provide the parser settings,
            obtainable as follows::

                settings = docutils.frontend.OptionParser(
                    components=(docutils.parsers.rst.Parser,)
                    ).get_default_values()
    """
    # Imported here, not at module level (as in the original code).
    from docutils import frontend
    if settings is None:
        settings = frontend.OptionParser().get_default_values()
    decoded_source = decode_path(source_path)
    document = nodes.document(settings,
                              new_reporter(decoded_source, settings),
                              source=decoded_source)
    document.note_source(decoded_source, -1)
    return document
def clean_rcs_keywords(paragraph, keyword_substitutions):
    """
    Apply the first matching keyword substitution to `paragraph`, in place.

    Only a paragraph consisting of exactly one `nodes.Text` child is
    processed.  `keyword_substitutions` is a sequence of
    (compiled pattern, substitution) pairs; the first pattern that matches
    the text replaces the child with the substituted text.
    """
    if len(paragraph) != 1 or not isinstance(paragraph[0], nodes.Text):
        return
    original = paragraph[0]
    for pattern, substitution in keyword_substitutions:
        if pattern.search(original):
            paragraph[0] = nodes.Text(pattern.sub(substitution, original))
            return
def relative_path(source, target):
    """
    Build and return a path to `target`, relative to `source` (both files).

    If there is no common prefix, return the absolute path to `target`.
    The result always uses '/' as separator.
    """
    source_parts = os.path.abspath(source or 'dummy_file').split(os.sep)
    target_parts = os.path.abspath(target).split(os.sep)
    # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
    if source_parts[:2] != target_parts[:2]:
        # Nothing in common between paths.
        # Return absolute path, using '/' for URLs:
        return '/'.join(target_parts)
    # Count the leading path components both paths share.
    shared = 0
    limit = min(len(source_parts), len(target_parts))
    while shared < limit and source_parts[shared] == target_parts[shared]:
        shared += 1
    # One '..' per remaining source directory (the last source component is
    # the file itself, hence the "- 1").
    climb = ['..'] * (len(source_parts) - shared - 1)
    return '/'.join(climb + target_parts[shared:])
def get_stylesheet_reference(settings, relative_to=None):
    """
    Retrieve a stylesheet reference from the settings object.

    Deprecated. Use get_stylesheet_reference_list() instead to
    enable specification of multiple stylesheets as a comma-separated
    list.

    If `settings.stylesheet_path` is set, it is returned relative to
    `relative_to` (default: `settings._destination`); otherwise
    `settings.stylesheet` is returned unchanged.
    """
    if settings.stylesheet_path:
        assert not settings.stylesheet, (
            'stylesheet and stylesheet_path are mutually exclusive.')
        if relative_to is None:
            relative_to = settings._destination
        return relative_path(relative_to, settings.stylesheet_path)
    else:
        return settings.stylesheet
493 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
495 # The original settings arguments are kept unchanged: you can test
496 # with e.g. ``if settings.stylesheet_path:``
498 # Differences to ``get_stylesheet_reference``:
499 # * return value is a list
500 # * no re-writing of the path (and therefore no optional argument)
501 # (if required, use ``utils.relative_path(source, target)``
502 # in the calling script)
def get_stylesheet_list(settings):
    """
    Retrieve list of stylesheet references from the settings object.

    The comma-separated value of `settings.stylesheet_path` (preferred) or
    `settings.stylesheet` is split and each entry is stripped of
    surrounding whitespace.  An empty list is returned if neither is set.
    """
    assert not (settings.stylesheet and settings.stylesheet_path), (
        'stylesheet and stylesheet_path are mutually exclusive.')
    raw = settings.stylesheet_path or settings.stylesheet
    if not raw:
        return []
    # strip whitespace (frequently occurring in config files)
    return [entry.strip(u' \t\n') for entry in raw.split(",")]
def get_trim_footnote_ref_space(settings):
    """
    Return whether or not to trim footnote space.

    If trim_footnote_reference_space is not None, return it.

    If trim_footnote_reference_space is None, return False unless the
    footnote reference style is 'superscript'.
    """
    explicit = settings.trim_footnote_reference_space
    if explicit is not None:
        return explicit
    return (hasattr(settings, 'footnote_references')
            and settings.footnote_references == 'superscript')
def get_source_line(node):
    """
    Return the "source" and "line" attributes from the `node` given or from
    its closest ancestor.

    Return ``(None, None)`` if no node on the way up has either set.
    """
    current = node
    while current:
        source, line = current.source, current.line
        if source or line:
            return source, line
        current = current.parent
    return None, None
def escape2null(text):
    """Return a string with escape-backslashes converted to nulls."""
    pieces = []
    position = 0
    found = text.find('\\', position)
    while found != -1:
        pieces.append(text[position:found])
        # The backslash becomes a null; the escaped character is kept.
        pieces.append('\x00' + text[found+1:found+2])
        position = found + 2        # skip character after escape
        found = text.find('\\', position)
    pieces.append(text[position:])
    return ''.join(pieces)
def unescape(text, restore_backslashes=0):
    """
    Return a string with nulls removed or restored to backslashes.
    Backslash-escaped spaces are also removed.
    """
    if restore_backslashes:
        return text.replace('\x00', '\\')
    # Order matters: drop escaped spaces and newlines together with their
    # null before removing the remaining nulls.
    cleaned = text.replace('\x00 ', '')
    cleaned = cleaned.replace('\x00\n', '')
    return cleaned.replace('\x00', '')
east_asian_widths = {
    'W': 2,   # Wide
    'F': 2,   # Full-width (wide)
    'Na': 1,  # Narrow
    'H': 1,   # Half-width (narrow)
    'N': 1,   # Neutral (not East Asian, treated as narrow)
    'A': 1,   # Ambiguous (s/b wide in East Asian context,
              # narrow otherwise, but that doesn't work)
}
"""Mapping of result codes from `unicodedata.east_asian_width()` to character
column widths."""
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    Byte strings (``str`` under Python 2) are measured with plain ``len()``.
    """
    if isinstance(text, str) and sys.version_info < (3,0):
        return len(text)
    # Combining characters are each subtracted from the total width.
    combining_correction = -len([char for char in text
                                 if unicodedata.combining(char)])
    try:
        width = 0
        for char in text:
            width += east_asian_widths[unicodedata.east_asian_width(char)]
    except AttributeError:  # east_asian_width() New in version 2.4.
        width = len(text)
    return width + combining_correction
def uniq(L):
    """
    Return a list of the items of `L` without duplicates, keeping the
    order of first occurrence.  Items only need to support ``==``.
    """
    kept = []
    for candidate in L:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
602 # by Li Daobing http://code.activestate.com/recipes/190465/
603 # since Python 2.6 there is also itertools.combinations()
def unique_combinations(items, n):
    """
    Yield all `n`-length combinations of `items` as lists.

    Elements keep the order they have in `items`, and no position is used
    twice within one combination.  ``n == 0`` yields a single empty list;
    nothing is yielded if `n` exceeds ``len(items)``.
    """
    if n == 0:
        yield []
    else:
        # `range` instead of the Python 2-only `xrange`.
        for i in range(len(items) - n + 1):
            for cc in unique_combinations(items[i+1:], n-1):
                yield [items[i]] + cc
def normalize_language_tag(tag):
    """Return a list of normalized combinations for a `BCP 47` language tag.

    Example:

    >>> normalize_language_tag('de-AT-1901')
    ['de_at_1901', 'de_at', 'de_1901', 'de']
    """
    # normalize:
    components = tag.lower().replace('-', '_').split('_')
    base_tag, subtags = components[:1], components[1:]
    # find all combinations of subtags, longest combinations first
    taglist = []
    for n in range(len(subtags), 0, -1):
        for tags in unique_combinations(subtags, n):
            taglist.append('_'.join(base_tag + tags))
    # the bare base tag always comes last
    taglist += base_tag
    return taglist
class DependencyList:

    """
    List of dependencies, with file recording support.

    Note that the output file is not automatically closed.  You have
    to explicitly call the close() method.
    """

    def __init__(self, output_file=None, dependencies=()):
        """
        Initialize the dependency list, automatically setting the
        output file to `output_file` (see `set_output()`) and adding
        all supplied dependencies.
        """
        self.set_output(output_file)
        for dependency in dependencies:
            self.add(dependency)

    def set_output(self, output_file):
        """
        Set the output file and clear the list of already added
        dependencies.

        `output_file` must be a string.  The specified file is
        immediately overwritten.

        If output_file is '-', the output will be written to stdout.
        If it is None, no file output is done when calling add().
        """
        self.list = []
        if output_file == '-':
            self.file = sys.stdout
        elif output_file:
            self.file = open(output_file, 'w')
        else:
            self.file = None

    def add(self, *filenames):
        """
        If the dependency `filename` has not already been added,
        append it to self.list and print it to self.file if self.file
        is not None.
        """
        for filename in filenames:
            if filename not in self.list:
                self.list.append(filename)
                if self.file is not None:
                    # One dependency per line; write() works on both
                    # Python 2 and 3, unlike the ``print >>`` statement.
                    self.file.write('%s\n' % (filename,))

    def close(self):
        """
        Close the output file.

        The standard streams are never closed; calling this without an
        output file is a no-op.
        """
        if (self.file is not None
            and self.file not in (sys.stdout, sys.stderr)):
            self.file.close()
        self.file = None

    def __repr__(self):
        if self.file:
            output_file = self.file.name
        else:
            output_file = None
        return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)
699 import xml.sax.handler
700 from xml.sax.handler import feature_namespaces
701 from StringIO import StringIO
class CopyTree(xml.sax.ContentHandler):

    """
    SAX content handler that copies an XML document into a tree of
    docutils `nodes.Element` and `nodes.Text` objects.

    Helper class for the `XmlStringToDocutilsNodes()` function.
    Don't invoke this class directly.
    """

    def __init__(self, default_namespace=None, ns_dict=None):
        """
        :Parameters:
            - `default_namespace`: if true, the namespace of the root
              element is written once as ``xmlns`` and no prefixes are
              generated (see `startElementNS()`).
            - `ns_dict`: optional mapping of namespace URIs to preferred
              prefixes; it extends/overrides the built-in mapping.
        """
        # NOTE(review): the base class __init__ is not called, exactly as
        # in the original code.
        self.__characters = ''
        self.__current_node_list = []
        self.__tree = None
        self.__default_namespace = default_namespace
        # Namespace URI -> convenient prefix.
        self.__ns_prefix_dict = {
            'http://www.w3.org/XML/1998/namespace': 'xml',
            'http://www.w3.org/1998/Math/MathML': 'ml',
            'http://www.w3.org/1999/xhtml': 'xhtml',
            'http://www.w3.org/1999/XSL/Transform': 'xsl',
            }
        if ns_dict is not None:
            self.__ns_prefix_dict.update(ns_dict)
        # Namespaces whose prefix is used but never declared via xmlns:...
        self.__ns_prefix_no_write_dict = {
            'http://www.w3.org/XML/1998/namespace': 'xml',
            }

    def characters(self, characters):
        """Accumulate character data until the next element boundary."""
        self.__characters += characters

    def startElementNS(self, name, qname, attrs):
        """
        Create a docutils node Element for the start of an XML element.

        Elements
        ========

        If the default_namespace is set and this is the first element, get
        the namespace from the first element and write it as
        ``xmlns="http://..."``.  Do not write any other namespaces.  If the
        default_namespace is set but no namespace is found, raise an error.

        If there are no namespaces, just take the element's name, create an
        Element and set the tagname.

        If there is a namespace:

        1. See if a convenient prefix exists.  If so, use that prefix::

             <math xmlns="http://www.w3.org/1998/Math/MathML"  =>  <ml:math

        2. If there is a namespace but no prefix, use ns1 as the prefix::

             <customElement xmlns="http://www.custom.org"  =>  <ns1:customElement

        3. If the namespace needs to be declared, write the declaration::

             <ml:math xmlns:ml="http://www.w3.org/1998/Math/MathML"
             <ns1:customElement xmlns:ns1="http://www.custom.org"

        4. If the namespace does not need to be written (e.g. the XML
           namespace), don't write it.

        Attributes
        ==========

        The same strategy is followed for the attributes, except for the
        default ``ns#`` prefix used when no convenient prefix is found:

        1. If the namespace of the attribute matches the namespace of the
           element, use ns1.

        2. Otherwise use ``ns<counter>``; the counter starts at 2 and is
           incremented for every attribute of the element.
        """
        # Text collected so far belongs to the enclosing element.
        if self.__current_node_list:
            self.__write_text()
        ns = name[0]  # for example, "http://www.w3.org/XML/1998/namespace"
        ns_prefix = self.__ns_prefix_dict.get(ns)
        el_name = name[1]  # the tag name, for example, "math"
        element = nodes.Element()
        element.tagname = el_name
        if self.__current_node_list:
            self.__current_node_list[-1].append(element)
            # if there is a namespace that does not match the root; and not
            # an implicit namespace, like XML, raise an error
            if (ns and self.__default_namespace
                and ns != self.__default_namespace
                and not self.__ns_prefix_no_write_dict.get(ns)):
                raise SystemError('default namespace "%s" does not match root namespace "%s"' % (ns, self.__default_namespace))
        else:
            # First element: it becomes the root of the tree.
            self.__tree = element
            if self.__default_namespace:
                if not ns:
                    raise SystemError('no default namespace found, yet default_namespace passed to function')
                element['xmlns'] = ns
                # From here on the flag holds the actual root namespace.
                self.__default_namespace = ns
        self.__current_node_list.append(element)
        if not self.__default_namespace:
            if ns and ns_prefix:
                element.tagname = '%s:%s' % (ns_prefix, el_name)
            elif ns:
                element.tagname = 'ns1:%s' % el_name

            if ns and self.__ns_prefix_no_write_dict.get(ns):
                # don't need to write certain namespaces, like xml
                pass
            elif ns and ns_prefix:
                element['xmlns:%s' % ns_prefix] = ns
            elif ns:
                element['xmlns:ns1'] = ns
        elif self.__ns_prefix_no_write_dict.get(ns):
            # unlikely to actually occur, but just in case
            element.tagname = '%s:%s' % (ns_prefix, el_name)

        counter = 1
        for the_key in attrs.keys():
            counter += 1
            ns_att = the_key[0]
            att_name = the_key[1]
            value = attrs[the_key]
            ns_prefix = self.__ns_prefix_dict.get(ns_att)
            if not self.__default_namespace:
                # all cases for non-default space, including no namespace
                # and xml namespace
                if ns_att and ns_att != ns:
                    if not self.__ns_prefix_no_write_dict.get(ns_att):
                        # NOTE(review): the declaration always uses
                        # ns<counter>, even when a convenient prefix is
                        # used for the attribute itself below -- looks
                        # like a prefix/declaration mismatch; confirm.
                        element['xmlns:ns%s' % counter] = ns_att
                if ns_att and ns_prefix:
                    element['%s:%s' % (ns_prefix, att_name)] = value
                elif ns_att and ns_att == ns:
                    element['ns1:%s' % att_name] = value
                elif ns_att:
                    element['ns%s:%s' % (counter, att_name)] = value
                else:
                    element[att_name] = value
            else:
                # default namespace: only write prefixes such as xml;
                # otherwise just write the attribute
                if ns_att and self.__ns_prefix_no_write_dict.get(ns_att):
                    att_name = '%s:%s' % (ns_prefix, att_name)
                    element[att_name] = value
                else:
                    element[att_name] = value

    def __write_text(self):
        # Append the collected characters (possibly empty) as a Text node
        # to the current element and reset the buffer.
        text = self.__characters
        self.__current_node_list[-1].append(nodes.Text(text))
        self.__characters = ''

    def endElementNS(self, name, qname):
        """Flush pending text into the element and close it."""
        self.__write_text()
        self.__current_node_list.pop()

    def get_tree(self):
        """Return the root Element (None before any element was seen)."""
        return self.__tree

    def endDocument(self):
        pass
def XmlStringToDocutilsNodes(xml_string, encoding='utf8', default_namespace = None, ns_dict = None):
    """
    Convert an XML string into a docutils node tree and return that tree.

    `xml_string` can either be a unicode object or a string (for
    Python < 3), or a string or a byte string (for Python >= 3.0).

    `encoding` is the encoding of `xml_string`.

    `default_namespace` should be set to some boolean value, such as True
    or False.  If set, it makes for easier-to-read XML by writing the
    namespace in the first element only::

        <ml:math xmlns:ml="http://www.w3.org/1998/Math/MathML">
          <ml:style xmlns:ml="http://www.w3.org/1998/Math/MathML">
          </ml:style>
        </ml:math>

    becomes::

        <math xmlns="http://www.w3.org/1998/Math/MathML">
          <style>
          </style>
        </math>

    An error is raised if no namespace is found for the first element, or
    if a namespace is found for subsequent elements that does not match.

    `ns_dict` is a dictionary of namespaces mapped to a prefix, for
    example::

        {"http://www.tei-c.org/ns/1.0": 'tei'}

    If any element is found with the namespace
    http://www.tei-c.org/ns/1.0, then the prefix "tei" is used.  This
    dictionary only makes the XML more readable; it is not needed to
    create valid XML with the correct namespaces.  Without it, the parser
    assigns its own prefix::

        <ns1:paragraph xmlns:ns1="http://www.tei-c.org/ns/1.0"
    """
    # isinstance() also covers subclasses of the string types, which the
    # exact type() comparison would silently skip.
    if sys.version_info < (3,):
        if isinstance(xml_string, unicode):
            xml_string = xml_string.encode('utf8')
        elif isinstance(xml_string, str):
            xml_string = xml_string.decode(encoding)
            xml_string = xml_string.encode('utf8')
    else:
        if isinstance(xml_string, bytes):
            xml_string = xml_string.decode(encoding)
    read_obj = StringIO(xml_string)
    try:
        the_handle = CopyTree(ns_dict = ns_dict, default_namespace = default_namespace)
        parser = xml.sax.make_parser()
        parser.setFeature(feature_namespaces, 1)
        parser.setContentHandler(the_handle)
        # SECURITY NOTE(review): this lets the parser load external general
        # entities referenced by the document (XXE).  Left enabled because
        # existing input may rely on it; turn it off if `xml_string` can
        # come from an untrusted source.
        parser.setFeature("http://xml.org/sax/features/external-general-entities", True)
        parser.parse(read_obj)
    finally:
        # Release the buffer even if parsing fails.
        read_obj.close()
    return the_handle.get_tree()