sandbox/paultremblay/other/utils.py

   1 # $Id: utils.py 7073 2011-07-07 06:49:19Z milde $
   2 # Author: David Goodger <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 Miscellaneous utilities for the documentation utilities.
   7 """
   8
   9 __docformat__ = 'reStructuredText'
  10
  11 import sys
  12 import os
  13 import os.path
  14 import warnings
  15 import unicodedata
  16 from docutils import ApplicationError, DataError
  17 from docutils import nodes
  18 from docutils.error_reporting import ErrorOutput, SafeString
  19
  20
  21 class SystemMessage(ApplicationError):
  22
  23     def __init__(self, system_message, level):
  24         Exception.__init__(self, system_message.astext())
  25         self.level = level
  26
  27
  28 class SystemMessagePropagation(ApplicationError): pass
  29
  30
  31 class Reporter:
  32
  33     """
  34     Info/warning/error reporter and ``system_message`` element generator.
  35
  36     Five levels of system messages are defined, along with corresponding
  37     methods: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.
  38
  39     There is typically one Reporter object per process.  A Reporter object is
  40     instantiated with thresholds for reporting (generating warnings) and
  41     halting processing (raising exceptions), a switch to turn debug output on
  42     or off, and an I/O stream for warnings.  These are stored as instance
  43     attributes.
  44
  45     When a system message is generated, its level is compared to the stored
  46     thresholds, and a warning or error is generated as appropriate.  Debug
  47     messages are produced if the stored debug switch is on, independently of
  48     other thresholds.  Message output is sent to the stored warning stream if
  49     not set to ''.
  50
  51     The Reporter class also employs a modified form of the "Observer" pattern
  52     [GoF95]_ to track system messages generated.  The `attach_observer` method
  53     should be called before parsing, with a bound method or function which
  54     accepts system messages.  The observer can be removed with
  55     `detach_observer`, and another added in its place.
  56
  57     .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
  58        Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
  59        1995.
  60     """
  61
  62     levels = 'DEBUG INFO WARNING ERROR SEVERE'.split()
  63     """List of names for system message levels, indexed by level."""
  64
  65     # system message level constants:
  66     (DEBUG_LEVEL,
  67      INFO_LEVEL,
  68      WARNING_LEVEL,
  69      ERROR_LEVEL,
  70      SEVERE_LEVEL) = range(5)
  71
  72     def __init__(self, source, report_level, halt_level, stream=None,
  73                  debug=0, encoding=None, error_handler='backslashreplace'):
  74         """
  75         :Parameters:
  76             - `source`: The path to or description of the source data.
  77             - `report_level`: The level at or above which warning output will
  78               be sent to `stream`.
  79             - `halt_level`: The level at or above which `SystemMessage`
  80               exceptions will be raised, halting execution.
  81             - `debug`: Show debug (level=0) system messages?
  82             - `stream`: Where warning output is sent.  Can be file-like (has a
  83               ``.write`` method), a string (file name, opened for writing),
  84               '' (empty string) or `False` (for discarding all stream messages)
  85               or `None` (implies `sys.stderr`; default).
  86             - `encoding`: The output encoding.
  87             - `error_handler`: The error handler for stderr output encoding.
  88         """
  89
  90         self.source = source
  91         """The path to or description of the source data."""
  92
  93         self.error_handler = error_handler
  94         """The character encoding error handler."""
  95
  96         self.debug_flag = debug
  97         """Show debug (level=0) system messages?"""
  98
  99         self.report_level = report_level
 100         """The level at or above which warning output will be sent
 101         to `self.stream`."""
 102
 103         self.halt_level = halt_level
 104         """The level at or above which `SystemMessage` exceptions
 105         will be raised, halting execution."""
 106
 107         if not isinstance(stream, ErrorOutput):
 108             stream = ErrorOutput(stream, encoding, error_handler)
 109
 110         self.stream = stream
 111         """Where warning output is sent."""
 112
 113         self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
 114         """The output character encoding."""
 115
 116         self.observers = []
 117         """List of bound methods or functions to call with each system_message
 118         created."""
 119
 120         self.max_level = -1
 121         """The highest level system message generated so far."""
 122
 123     def set_conditions(self, category, report_level, halt_level,
 124                        stream=None, debug=0):
 125         warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
 126                       'set attributes via configuration settings or directly',
 127                       DeprecationWarning, stacklevel=2)
 128         self.report_level = report_level
 129         self.halt_level = halt_level
 130         if not isinstance(stream, ErrorOutput):
 131             stream = ErrorOutput(stream, self.encoding, self.error_handler)
 132         self.stream = stream
 133         self.debug_flag = debug
 134
 135     def attach_observer(self, observer):
 136         """
 137         The `observer` parameter is a function or bound method which takes one
 138         argument, a `nodes.system_message` instance.
 139         """
 140         self.observers.append(observer)
 141
 142     def detach_observer(self, observer):
 143         self.observers.remove(observer)
 144
 145     def notify_observers(self, message):
 146         for observer in self.observers:
 147             observer(message)
 148
 149     def system_message(self, level, message, *children, **kwargs):
 150         """
 151         Return a system_message object.
 152
 153         Raise an exception or generate a warning if appropriate.
 154         """
 155         # `message` can be a `string`, `unicode`, or `Exception` instance.
 156         if isinstance(message, Exception):
 157             message = SafeString(message)
 158
 159         attributes = kwargs.copy()
 160         if 'base_node' in kwargs:
 161             source, line = get_source_line(kwargs['base_node'])
 162             del attributes['base_node']
 163             if source is not None:
 164                 attributes.setdefault('source', source)
 165             if line is not None:
 166                 attributes.setdefault('line', line)
 167                 # assert source is not None, "node has line- but no source-argument"
 168         if not 'source' in attributes: # 'line' is absolute line number
 169             try: # look up (source, line-in-source)
 170                 source, line = self.locator(attributes.get('line'))
 171                 # print "locator lookup", kwargs.get('line'), "->", source, line
 172             except AttributeError:
 173                 source, line = None, None
 174             if source is not None:
 175                 attributes['source'] = source
 176             if line is not None:
 177                 attributes['line'] = line
 178         # assert attributes['line'] is not None, (message, kwargs)
 179         # assert attributes['source'] is not None, (message, kwargs)
 180         attributes.setdefault('source', self.source)
 181
 182         msg = nodes.system_message(message, level=level,
 183                                    type=self.levels[level],
 184                                    *children, **attributes)
 185         if self.stream and (level >= self.report_level
 186                             or self.debug_flag and level == self.DEBUG_LEVEL
 187                             or level >= self.halt_level):
 188             self.stream.write(msg.astext() + '\n')
 189         if level >= self.halt_level:
 190             raise SystemMessage(msg, level)
 191         if level > self.DEBUG_LEVEL or self.debug_flag:
 192             self.notify_observers(msg)
 193         self.max_level = max(level, self.max_level)
 194         return msg
 195
 196     def debug(self, *args, **kwargs):
 197         """
 198         Level-0, "DEBUG": an internal reporting issue. Typically, there is no
 199         effect on the processing. Level-0 system messages are handled
 200         separately from the others.
 201         """
 202         if self.debug_flag:
 203             return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)
 204
 205     def info(self, *args, **kwargs):
 206         """
 207         Level-1, "INFO": a minor issue that can be ignored. Typically there is
 208         no effect on processing, and level-1 system messages are not reported.
 209         """
 210         return self.system_message(self.INFO_LEVEL, *args, **kwargs)
 211
 212     def warning(self, *args, **kwargs):
 213         """
 214         Level-2, "WARNING": an issue that should be addressed. If ignored,
 215         there may be unpredictable problems with the output.
 216         """
 217         return self.system_message(self.WARNING_LEVEL, *args, **kwargs)
 218
 219     def error(self, *args, **kwargs):
 220         """
 221         Level-3, "ERROR": an error that should be addressed. If ignored, the
 222         output will contain errors.
 223         """
 224         return self.system_message(self.ERROR_LEVEL, *args, **kwargs)
 225
 226     def severe(self, *args, **kwargs):
 227         """
 228         Level-4, "SEVERE": a severe error that must be addressed. If ignored,
 229         the output will contain severe errors. Typically level-4 system
 230         messages are turned into exceptions which halt processing.
 231         """
 232         return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
 233
 234
 235 class ExtensionOptionError(DataError): pass
 236 class BadOptionError(ExtensionOptionError): pass
 237 class BadOptionDataError(ExtensionOptionError): pass
 238 class DuplicateOptionError(ExtensionOptionError): pass
 239
 240
 241 def extract_extension_options(field_list, options_spec):
 242     """
 243     Return a dictionary mapping extension option names to converted values.
 244
 245     :Parameters:
 246         - `field_list`: A flat field list without field arguments, where each
 247           field body consists of a single paragraph only.
 248         - `options_spec`: Dictionary mapping known option names to a
 249           conversion function such as `int` or `float`.
 250
 251     :Exceptions:
 252         - `KeyError` for unknown option names.
 253         - `ValueError` for invalid option values (raised by the conversion
 254            function).
 255         - `TypeError` for invalid option value types (raised by conversion
 256            function).
 257         - `DuplicateOptionError` for duplicate options.
 258         - `BadOptionError` for invalid fields.
 259         - `BadOptionDataError` for invalid option data (missing name,
 260           missing data, bad quotes, etc.).
 261     """
 262     option_list = extract_options(field_list)
 263     option_dict = assemble_option_dict(option_list, options_spec)
 264     return option_dict
 265
 266 def extract_options(field_list):
 267     """
 268     Return a list of option (name, value) pairs from field names & bodies.
 269
 270     :Parameter:
 271         `field_list`: A flat field list, where each field name is a single
 272         word and each field body consists of a single paragraph only.
 273
 274     :Exceptions:
 275         - `BadOptionError` for invalid fields.
 276         - `BadOptionDataError` for invalid option data (missing name,
 277           missing data, bad quotes, etc.).
 278     """
 279     option_list = []
 280     for field in field_list:
 281         if len(field[0].astext().split()) != 1:
 282             raise BadOptionError(
 283                 'extension option field name may not contain multiple words')
 284         name = str(field[0].astext().lower())
 285         body = field[1]
 286         if len(body) == 0:
 287             data = None
 288         elif len(body) > 1 or not isinstance(body[0], nodes.paragraph) \
 289               or len(body[0]) != 1 or not isinstance(body[0][0], nodes.Text):
 290             raise BadOptionDataError(
 291                   'extension option field body may contain\n'
 292                   'a single paragraph only (option "%s")' % name)
 293         else:
 294             data = body[0][0].astext()
 295         option_list.append((name, data))
 296     return option_list
 297
 298 def assemble_option_dict(option_list, options_spec):
 299     """
 300     Return a mapping of option names to values.
 301
 302     :Parameters:
 303         - `option_list`: A list of (name, value) pairs (the output of
 304           `extract_options()`).
 305         - `options_spec`: Dictionary mapping known option names to a
 306           conversion function such as `int` or `float`.
 307
 308     :Exceptions:
 309         - `KeyError` for unknown option names.
 310         - `DuplicateOptionError` for duplicate options.
 311         - `ValueError` for invalid option values (raised by conversion
 312            function).
 313         - `TypeError` for invalid option value types (raised by conversion
 314            function).
 315     """
 316     options = {}
 317     for name, value in option_list:
 318         convertor = options_spec[name]  # raises KeyError if unknown
 319         if convertor is None:
 320             raise KeyError(name)        # or if explicitly disabled
 321         if name in options:
 322             raise DuplicateOptionError('duplicate option "%s"' % name)
 323         try:
 324             options[name] = convertor(value)
 325         except (ValueError, TypeError), detail:
 326             raise detail.__class__('(option: "%s"; value: %r)\n%s'
 327                                    % (name, value, ' '.join(detail.args)))
 328     return options
 329
 330
 331 class NameValueError(DataError): pass
 332
 333
 334 def decode_path(path):
 335     """
 336     Ensure `path` is Unicode. Return `nodes.reprunicode` object.
 337
 338     Decode file/path string in a failsave manner if not already done.
 339     """
 340     # see also http://article.gmane.org/gmane.text.docutils.user/2905
 341     if isinstance(path, unicode):
 342         return path
 343     try:
 344         path = path.decode(sys.getfilesystemencoding(), 'strict')
 345     except AttributeError: # default value None has no decode method
 346         return nodes.reprunicode(path)
 347     except UnicodeDecodeError:
 348         try:
 349             path = path.decode('utf-8', 'strict')
 350         except UnicodeDecodeError:
 351             path = path.decode('ascii', 'replace')
 352     return nodes.reprunicode(path)
 353
 354
 355 def extract_name_value(line):
 356     """
 357     Return a list of (name, value) from a line of the form "name=value ...".
 358
 359     :Exception:
 360         `NameValueError` for invalid input (missing name, missing data, bad
 361         quotes, etc.).
 362     """
 363     attlist = []
 364     while line:
 365         equals = line.find('=')
 366         if equals == -1:
 367             raise NameValueError('missing "="')
 368         attname = line[:equals].strip()
 369         if equals == 0 or not attname:
 370             raise NameValueError(
 371                   'missing attribute name before "="')
 372         line = line[equals+1:].lstrip()
 373         if not line:
 374             raise NameValueError(
 375                   'missing value after "%s="' % attname)
 376         if line[0] in '\'"':
 377             endquote = line.find(line[0], 1)
 378             if endquote == -1:
 379                 raise NameValueError(
 380                       'attribute "%s" missing end quote (%s)'
 381                       % (attname, line[0]))
 382             if len(line) > endquote + 1 and line[endquote + 1].strip():
 383                 raise NameValueError(
 384                       'attribute "%s" end quote (%s) not followed by '
 385                       'whitespace' % (attname, line[0]))
 386             data = line[1:endquote]
 387             line = line[endquote+1:].lstrip()
 388         else:
 389             space = line.find(' ')
 390             if space == -1:
 391                 data = line
 392                 line = ''
 393             else:
 394                 data = line[:space]
 395                 line = line[space+1:].lstrip()
 396         attlist.append((attname.lower(), data))
 397     return attlist
 398
 399 def new_reporter(source_path, settings):
 400     """
 401     Return a new Reporter object.
 402
 403     :Parameters:
 404         `source` : string
 405             The path to or description of the source text of the document.
 406         `settings` : optparse.Values object
 407             Runtime settings.
 408     """
 409     reporter = Reporter(
 410         source_path, settings.report_level, settings.halt_level,
 411         stream=settings.warning_stream, debug=settings.debug,
 412         encoding=settings.error_encoding,
 413         error_handler=settings.error_encoding_error_handler)
 414     return reporter
 415
 416 def new_document(source_path, settings=None):
 417     """
 418     Return a new empty document object.
 419
 420     :Parameters:
 421         `source_path` : string
 422             The path to or description of the source text of the document.
 423         `settings` : optparse.Values object
 424             Runtime settings.  If none are provided, a default core set will
 425             be used.  If you will use the document object with any Docutils
 426             components, you must provide their default settings as well.  For
 427             example, if parsing, at least provide the parser settings,
 428             obtainable as follows::
 429
 430                 settings = docutils.frontend.OptionParser(
 431                     components=(docutils.parsers.rst.Parser,)
 432                     ).get_default_values()
 433     """
 434     from docutils import frontend
 435     if settings is None:
 436         settings = frontend.OptionParser().get_default_values()
 437     source_path = decode_path(source_path)
 438     reporter = new_reporter(source_path, settings)
 439     document = nodes.document(settings, reporter, source=source_path)
 440     document.note_source(source_path, -1)
 441     return document
 442
 443 def clean_rcs_keywords(paragraph, keyword_substitutions):
 444     if len(paragraph) == 1 and isinstance(paragraph[0], nodes.Text):
 445         textnode = paragraph[0]
 446         for pattern, substitution in keyword_substitutions:
 447             match = pattern.search(textnode)
 448             if match:
 449                 paragraph[0] = nodes.Text(pattern.sub(substitution, textnode))
 450                 return
 451
 452 def relative_path(source, target):
 453     """
 454     Build and return a path to `target`, relative to `source` (both files).
 455
 456     If there is no common prefix, return the absolute path to `target`.
 457     """
 458     source_parts = os.path.abspath(source or 'dummy_file').split(os.sep)
 459     target_parts = os.path.abspath(target).split(os.sep)
 460     # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
 461     if source_parts[:2] != target_parts[:2]:
 462         # Nothing in common between paths.
 463         # Return absolute path, using '/' for URLs:
 464         return '/'.join(target_parts)
 465     source_parts.reverse()
 466     target_parts.reverse()
 467     while (source_parts and target_parts
 468            and source_parts[-1] == target_parts[-1]):
 469         # Remove path components in common:
 470         source_parts.pop()
 471         target_parts.pop()
 472     target_parts.reverse()
 473     parts = ['..'] * (len(source_parts) - 1) + target_parts
 474     return '/'.join(parts)
 475
 476 def get_stylesheet_reference(settings, relative_to=None):
 477     """
 478     Retrieve a stylesheet reference from the settings object.
 479
 480     Deprecated. Use get_stylesheet_reference_list() instead to
 481     enable specification of multiple stylesheets as a comma-separated
 482     list.
 483     """
 484     if settings.stylesheet_path:
 485         assert not settings.stylesheet, (
 486             'stylesheet and stylesheet_path are mutually exclusive.')
 487         if relative_to == None:
 488             relative_to = settings._destination
 489         return relative_path(relative_to, settings.stylesheet_path)
 490     else:
 491         return settings.stylesheet
 492
 493 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
 494 #
 495 # The original settings arguments are kept unchanged: you can test
 496 # with e.g. ``if settings.stylesheet_path:``
 497 #
 498 # Differences to ``get_stylesheet_reference``:
 499 # * return value is a list
 500 # * no re-writing of the path (and therefore no optional argument)
 501 #   (if required, use ``utils.relative_path(source, target)``
 502 #   in the calling script)
 503 def get_stylesheet_list(settings):
 504     """
 505     Retrieve list of stylesheet references from the settings object.
 506     """
 507     assert not (settings.stylesheet and settings.stylesheet_path), (
 508             'stylesheet and stylesheet_path are mutually exclusive.')
 509     if settings.stylesheet_path:
 510         sheets = settings.stylesheet_path.split(",")
 511     elif settings.stylesheet:
 512         sheets = settings.stylesheet.split(",")
 513     else:
 514         sheets = []
 515     # strip whitespace (frequently occuring in config files)
 516     return [sheet.strip(u' \t\n') for sheet in sheets]
 517
 518 def get_trim_footnote_ref_space(settings):
 519     """
 520     Return whether or not to trim footnote space.
 521
 522     If trim_footnote_reference_space is not None, return it.
 523
 524     If trim_footnote_reference_space is None, return False unless the
 525     footnote reference style is 'superscript'.
 526     """
 527     if settings.trim_footnote_reference_space is None:
 528         return hasattr(settings, 'footnote_references') and \
 529                settings.footnote_references == 'superscript'
 530     else:
 531         return settings.trim_footnote_reference_space
 532
 533 def get_source_line(node):
 534     """
 535     Return the "source" and "line" attributes from the `node` given or from
 536     its closest ancestor.
 537     """
 538     while node:
 539         if node.source or node.line:
 540             return node.source, node.line
 541         node = node.parent
 542     return None, None
 543
 544 def escape2null(text):
 545     """Return a string with escape-backslashes converted to nulls."""
 546     parts = []
 547     start = 0
 548     while 1:
 549         found = text.find('\\', start)
 550         if found == -1:
 551             parts.append(text[start:])
 552             return ''.join(parts)
 553         parts.append(text[start:found])
 554         parts.append('\x00' + text[found+1:found+2])
 555         start = found + 2               # skip character after escape
 556
 557 def unescape(text, restore_backslashes=0):
 558     """
 559     Return a string with nulls removed or restored to backslashes.
 560     Backslash-escaped spaces are also removed.
 561     """
 562     if restore_backslashes:
 563         return text.replace('\x00', '\\')
 564     else:
 565         for sep in ['\x00 ', '\x00\n', '\x00']:
 566             text = ''.join(text.split(sep))
 567         return text
 568
 569 east_asian_widths = {'W': 2,   # Wide
 570                      'F': 2,   # Full-width (wide)
 571                      'Na': 1,  # Narrow
 572                      'H': 1,   # Half-width (narrow)
 573                      'N': 1,   # Neutral (not East Asian, treated as narrow)
 574                      'A': 1}   # Ambiguous (s/b wide in East Asian context,
 575                                # narrow otherwise, but that doesn't work)
 576 """Mapping of result codes from `unicodedata.east_asian_widt()` to character
 577 column widths."""
 578
 579 def column_width(text):
 580     """Return the column width of text.
 581
 582     Correct ``len(text)`` for wide East Asian and combining Unicode chars.
 583     """
 584     if isinstance(text, str) and sys.version_info < (3,0):
 585         return len(text)
 586     combining_correction = sum([-1 for c in text
 587                                 if unicodedata.combining(c)])
 588     try:
 589         width = sum([east_asian_widths[unicodedata.east_asian_width(c)]
 590                      for c in text])
 591     except AttributeError:  # east_asian_width() New in version 2.4.
 592         width = len(text)
 593     return width + combining_correction
 594
 595 def uniq(L):
 596      r = []
 597      for item in L:
 598          if not item in r:
 599              r.append(item)
 600      return r
 601
 602 # by Li Daobing http://code.activestate.com/recipes/190465/
 603 # since Python 2.6 there is also itertools.combinations()
 604 def unique_combinations(items, n):
 605     """Return r-length tuples, in sorted order, no repeated elements"""
 606     if n==0: yield []
 607     else:
 608         for i in xrange(len(items)-n+1):
 609             for cc in unique_combinations(items[i+1:],n-1):
 610                 yield [items[i]]+cc
 611
 612 def normalize_language_tag(tag):
 613     """Return a list of normalized combinations for a `BCP 47` language tag.
 614
 615     Example:
 616
 617       >>> normalize_language_tag('de-AT-1901')
 618       ['de_at_1901', 'de_at', 'de_1901', 'de']
 619     """
 620     # normalize:
 621     tag = tag.lower().replace('-','_')
 622     # find all combinations of subtags
 623     taglist = []
 624     base_tag= tag.split('_')[:1]
 625     subtags = tag.split('_')[1:]
 626     # print base_tag, subtags
 627     for n in range(len(subtags), 0, -1):
 628         for tags in unique_combinations(subtags, n):
 629             # print tags
 630             taglist.append('_'.join(base_tag + tags))
 631     taglist += base_tag
 632     return taglist
 633
 634 class DependencyList:
 635
 636     """
 637     List of dependencies, with file recording support.
 638
 639     Note that the output file is not automatically closed.  You have
 640     to explicitly call the close() method.
 641     """
 642
 643     def __init__(self, output_file=None, dependencies=[]):
 644         """
 645         Initialize the dependency list, automatically setting the
 646         output file to `output_file` (see `set_output()`) and adding
 647         all supplied dependencies.
 648         """
 649         self.set_output(output_file)
 650         for i in dependencies:
 651             self.add(i)
 652
 653     def set_output(self, output_file):
 654         """
 655         Set the output file and clear the list of already added
 656         dependencies.
 657
 658         `output_file` must be a string.  The specified file is
 659         immediately overwritten.
 660
 661         If output_file is '-', the output will be written to stdout.
 662         If it is None, no file output is done when calling add().
 663         """
 664         self.list = []
 665         if output_file == '-':
 666             self.file = sys.stdout
 667         elif output_file:
 668             self.file = open(output_file, 'w')
 669         else:
 670             self.file = None
 671
 672     def add(self, *filenames):
 673         """
 674         If the dependency `filename` has not already been added,
 675         append it to self.list and print it to self.file if self.file
 676         is not None.
 677         """
 678         for filename in filenames:
 679             if not filename in self.list:
 680                 self.list.append(filename)
 681                 if self.file is not None:
 682                     print >>self.file, filename
 683
 684     def close(self):
 685         """
 686         Close the output file.
 687         """
 688         if self.file not in (sys.stdout, sys.stderr):
 689             self.file.close()
 690             self.file = None
 691
 692     def __repr__(self):
 693         if self.file:
 694             output_file = self.file.name
 695         else:
 696             output_file = None
 697         return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)
 698
 699 import xml.sax.handler
 700 from xml.sax.handler import feature_namespaces
 701 from StringIO import StringIO
 702
 703 class CopyTree(xml.sax.ContentHandler):
 704     """
 705     Needed class for the function  XmlStringToDocutilsNodes function.
 706     Don't invoke this class directly.
 707
 708     """
 709
 710     def __init__(self, default_namespace = None, ns_dict = None):
 711           self.__characters = ''
 712           self.__current_node_list = []
 713           self.__tree = None
 714           self.__default_namespace = default_namespace
 715           self.__ns_prefix_dict = {
 716                   'http://www.w3.org/XML/1998/namespace': 'xml',
 717                   'http://www.w3.org/1998/Math/MathML': 'ml',
 718                   'http://www.w3.org/1999/xhtml': 'xhtml',
 719                   'http://www.w3.org/1999/XSL/Transform':'xsl',
 720                   }
 721           if ns_dict != None:
 722                self.__ns_prefix_dict.update(ns_dict)
 723
 724           self.__ns_prefix_no_write_dict = {
 725                   'http://www.w3.org/XML/1998/namespace': 'xml',
 726                   }
 727
 728
 729     def characters (self, characters):
 730         self.__characters += characters
 731
 732
 733     def startElementNS(self, name, qname, attrs):
 734         """
 735         Elements
 736         ========
 737
 738         Get the information from the start of the element to write a docutis node Element.
 739
 740         If the default_namespace is set and it is the first element, get the
 741         namespace from the first element. Write it as xmlns="http:/...." Do
 742         not write any other namespaces.  If the default_namespace is set but
 743         no namespace is found, raise an error.
 744
 745         If there are no namespaces, just get the element's name, create an Element method, and
 746         set the tagname.
 747
 748
 749         If there is namespace:
 750
 751           1. See if a convenient prefix exists. If so, write that prefix:
 752              <math xmlns="http://www.w3.org/XML/1998/namespace" => <ml:math
 753
 754           2. If there is a namespace but no prefix,  use ns1 as the prefix
 755             <customElement xmlns="http://www.custom.org" => <ns1:customElement
 756
 757           3. If the namespace needs to be decalred, then write it:
 758              <math xmlsn="http://www.w3.org/XML/1998/namespace" => <ml:math xmlsn:ml=http://www.w3.org/XML/1998/namespace"
 759              of
 760             <customElement xmlns="http://www.custom.org" => <ns1:customElement xmlns:ns1="http://www.custom.org"
 761
 762           4. If the namespace does not need to be written, don't write it:
 763             (Don't think any examles exist for elements.)
 764
 765         Attributes
 766         ==========
 767
 768         The same strategy is followd for the attributes, with the exception of using the ns# for a default prefix.
 769         If no convenient prefix is found:
 770
 771         1. If the namespace for the attribute matches the namespace for the element, use it (ns1)
 772
 773         2. Otherwise, start with ns2, and use the next number (ns3) for the next prefix, and so on.
 774
 775         """
 776         if len(self.__current_node_list) > 0:
 777             self.__write_text()
 778         ns = name[0] # for example, "http://www.w3.org/XML/1998/namespace"
 779         ns_prefix = self.__ns_prefix_dict.get(ns)
 780         el_name = name[1] # a string indicating the tag name, for example, "math"
 781         element = nodes.Element()
 782         element.tagname = el_name
 783         if len(self.__current_node_list) > 0:
 784             self.__current_node_list[-1].append(element)
 785             # if there is a namespace that does not match the root; and not an
 786             # implicit namespace, like XML, raise an error
 787             if ns and self.__default_namespace and ns != self.__default_namespace and not(self.__ns_prefix_no_write_dict.get(ns)):
 788                 raise SystemError('default namespace "%s"  does not match root namespace "%s"' % (ns, self.__default_namespace))
 789         else:
 790             self.__tree = element
 791             if self.__default_namespace:
 792                 if not ns:
 793                     raise SystemError('no default namespace found, yet default_namespace passed to function')
 794                 element['xmlns'] = ns
 795                 self.__default_namespace = ns
 796         self.__current_node_list.append(element)
 797         if not self.__default_namespace:
 798             if ns and ns_prefix:
 799                 element.tagname = '%s:%s' % (ns_prefix, el_name)
 800             elif ns:
 801                 element.tagname = 'ns1:%s' % el_name
 802
 803             if ns and self.__ns_prefix_no_write_dict.get(ns): # don't need to write certain namespaces, like xml
 804                 pass
 805             elif ns and ns_prefix:
 806                 element['xmlns:%s' % ns_prefix] = ns
 807             elif ns:
 808                 element['xmlns:ns1'] = ns
 809         elif self.__ns_prefix_no_write_dict.get(ns):
 810             # unlikey to actually occurr, but just in case
 811             element.tagname = '%s:%s' % (ns_prefix, el_name)
 812
 813
 814         the_keys = attrs.keys()
 815         counter = 1
 816         for the_key in the_keys:
 817             counter +=1
 818             ns_att = the_key[0]
 819             att_name = the_key[1]
 820             value = attrs[the_key]
 821             ns_prefix = self.__ns_prefix_dict.get(ns_att)
 822             if not self.__default_namespace:# all cases for non-default space, including no namespace and xml namespace
 823                 if ns_att and ns_att != ns:
 824                     if not(self.__ns_prefix_no_write_dict.get(ns_att)):
 825                         att = 'xmlns:ns%s' % counter
 826                         the_value = ns_att
 827                         element[att] = the_value
 828                 if ns_att and ns_prefix:
 829                     att = '%s:%s' % (ns_prefix, att_name)
 830                     element[att] = value
 831                 elif ns_att and ns_att == ns:
 832                     att = 'ns1:%s' % att_name
 833                     element[att] = value
 834                 elif ns_att:
 835                     att = 'ns%s:%s' % (counter, att_name)
 836                     element[att] = value
 837                 else:
 838                     element[att_name] = value
 839             else: # default namespace only write prefixes such as xml; otherwise just write attribute
 840                 if ns_att and self.__ns_prefix_no_write_dict.get(ns_att):
 841                     att_name = '%s:%s' % (ns_prefix, att_name)
 842                     element[att_name] = value
 843                 else:
 844                     element[att_name] = value
 845
 846     def __write_text(self):
 847         text = self.__characters
 848         self.__current_node_list[-1].append(nodes.Text(text))
 849         self.__characters = ''
 850
 851     def endElementNS(self, name, qname):
 852         self.__write_text()
 853         self.__current_node_list.pop()
 854
 855     def get_tree(self):
 856         return self.__tree
 857
 858     def endDocument(self):
 859         pass
 860
 861 def XmlStringToDocutilsNodes(xml_string, encoding='utf8', default_namespace = None, ns_dict = None):
 862     """
 863     Converts an XML String into a docutils node tree, and returns that tree.
 864
 865     xml_string can either be a unicode object or a string (for Python < 3); or
 866     a string or a byte string (for pyton >=3.0).
 867
 868     The encoding is the encoding for the xm_string.
 869
 870     The default_namespace should be set to some boolean value, such as True or
 871     False. If set, default_namespace makes easier-to read XML by writing the
 872     namespace in only the first element:
 873
 874     <ml:math xmlns:ml="http://www.w3.org/1998/Math/MathML>
 875       <ml:style xmlns:ml="http://www.w3.org/1998/Math/MathMl">
 876       </ml:style>
 877      </ml:math
 878
 879      Becomes:
 880
 881     <math xmlns="http://www.w3.org/1998/Math/MathML>
 882       <style >
 883       </style>
 884      </math
 885
 886      An error is raised if no namespace is found for the first element, or a namespace is found
 887      for subequent elements that does not match.
 888
 889     The ns_dict is a dictionary of namespaces mapped to a prefix. For example:
 890
 891      {"http://www.tei-c.org/ns/1.0":'tei'}
 892
 893      If any element is found with the namespace http://www.tei-c.org/ns/1.0,
 894      then the prefix "tei" is used.  Note that this dictionary only makes the
 895      XML look more readable, and is not needed to create valid XML with the
 896      correct namespaces. For example, if the parser finds an element with a
 897      namespace "http://www.tei-c.org/ns/1.0", and no dict is passed to this
 898      function, the parser assigns its own prefix:
 899
 900      <ns1:paragraph xmlns:ns1="http://www.tei-c.org/ns/1.0"
 901
 902
 903     """
 904
 905     if sys.version_info < (3,):
 906         if type(xml_string) == type(unicode('x')):
 907             xml_string = xml_string.encode('utf8')
 908         elif type(xml_string) == type('x'):
 909             xml_string = xml_string.decode(encoding)
 910             xml_string = xml_string.encode('utf8')
 911     else:
 912         if type(xml_string) == type(b'x'):
 913             xml_string = xml_string.decode(encoding)
 914     read_obj = StringIO(xml_string)
 915     the_handle=CopyTree(ns_dict = ns_dict, default_namespace = default_namespace)
 916     parser = xml.sax.make_parser()
 917     parser.setFeature(feature_namespaces, 1)
 918     parser.setContentHandler(the_handle)
 919     parser.setFeature("http://xml.org/sax/features/external-general-entities", True)
 920     parser.parse(read_obj)
 921     read_obj.close()
 922     docutils_tree = the_handle.get_tree()
 923     return docutils_tree
 924
 925 import xml.dom.minidom
 926 import xml.sax.saxutils
 927
# Experimental DOM-copying code follows.  copy_tree() takes a DOM element
# as current_element.
 932
 933 def start_tag(local_name):
 934     sys.stdout.write('<%s>' % local_name)
 935
 936 def end_tag(local_name):
 937     sys.stdout.write('</%s>' % local_name)
 938
 939 dom = xml.dom.minidom.parse('test.xml')
 940 out_doc = xml.dom.minidom.Document()
 941 def copy_tree(current_element):
 942     elements = current_element.childNodes
 943     for element in elements:
 944         if element.nodeType == xml.dom.Node.ELEMENT_NODE:
 945             element_name = element.localName
 946         if element.attributes!= None:
 947             for attr in element.attributes.values():
 948                 ns = attr.namespaceURI
 949                 local_name = attr.localName
 950                 name = attr.name
 951                 value = attr.value
 952                 prefix = attr.prefix
 953                 new_att = out_doc.createAttribute(name )
 954             start_tag(element_name)
 955             copy_tree(element)
 956             end_tag(element_name)
 957         elif element.nodeType == xml.dom.Node.TEXT_NODE:
 958             parent = element.parentNode
 959             if parent.localName == 'math':
 960                 sys.stdout.write(element.data)
 961             else:
 962                 sys.stdout.write(element.data)
 963
 964
 965
 966 copy_tree(dom)