math, error_reporting, and urischemes moved to the utils package.
[docutils.git] / docutils / utils / __init__.py
blob6ab4ff8e22761060da216360b39075f1d8052491
1 # coding: utf8
2 # $Id$
3 # Author: David Goodger <goodger@python.org>
4 # Copyright: This module has been placed in the public domain.
6 """
7 Miscellaneous utilities for the documentation utilities.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import os.path
15 import warnings
16 import unicodedata
17 from docutils import ApplicationError, DataError
18 from docutils import nodes
19 from docutils.io import FileOutput
20 from docutils.utils.error_reporting import ErrorOutput, SafeString
class SystemMessage(ApplicationError):

    """
    Exception wrapping a system message together with its severity level.

    The exception text is the plain-text rendering (``astext()``) of the
    `system_message` object passed in; the numeric level is stored in
    `self.level`.
    """

    def __init__(self, system_message, level):
        text = system_message.astext()
        Exception.__init__(self, text)
        self.level = level
class SystemMessagePropagation(ApplicationError):

    """`ApplicationError` subclass that adds no behaviour of its own."""
class Reporter:

    """
    Reporter of info/warning/error conditions; ``system_message`` factory.

    There are five system message levels, each with its own convenience
    method: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.

    Usually a single Reporter exists per process.  It is created with a
    reporting threshold (at or above which warnings are written), a halting
    threshold (at or above which an exception is raised), a debug switch,
    and an I/O stream for warning output.  All of these are kept as
    instance attributes.

    Whenever a system message is created, its level is checked against the
    stored thresholds and a warning is written and/or an exception raised
    accordingly.  Debug messages are emitted whenever the debug switch is
    on, regardless of the other thresholds.  Output goes to the stored
    warning stream unless that is set to ''.

    The class also uses a variant of the "Observer" pattern [GoF95]_ to keep
    track of generated system messages.  Call `attach_observer` before
    parsing, passing a function or bound method that accepts system
    messages.  `detach_observer` removes an observer again, after which a
    different one may be attached.

    .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
       Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
       1995.
    """

    levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'SEVERE']
    """Names of the system message levels; the list index is the level."""

    # Numeric system message levels (indices into `levels`):
    DEBUG_LEVEL = 0
    INFO_LEVEL = 1
    WARNING_LEVEL = 2
    ERROR_LEVEL = 3
    SEVERE_LEVEL = 4

    def __init__(self, source, report_level, halt_level, stream=None,
                 debug=False, encoding=None, error_handler='backslashreplace'):
        """
        Set up thresholds, output stream and bookkeeping attributes.

        :Parameters:
            - `source`: Path to, or description of, the source data.
            - `report_level`: Messages at or above this level are written
              to `stream`.
            - `halt_level`: Messages at or above this level raise a
              `SystemMessage` exception, halting execution.
            - `stream`: Destination of warning output.  May be a file-like
              object (with a ``.write`` method), a string (file name,
              opened for writing), '' (empty string) or `False` (discard
              all stream messages), or `None` (implies `sys.stderr`;
              default).
            - `debug`: Show debug (level=0) system messages?
            - `encoding`: The output encoding.
            - `error_handler`: The error handler for stderr output encoding.
        """
        self.source = source
        """Path to, or description of, the source data."""

        self.report_level = report_level
        """Messages at or above this level are written to `self.stream`."""

        self.halt_level = halt_level
        """Messages at or above this level raise `SystemMessage`,
        halting execution."""

        self.debug_flag = debug
        """Show debug (level=0) system messages?"""

        self.error_handler = error_handler
        """The character encoding error handler."""

        # Anything that is not already an `ErrorOutput` gets wrapped in one.
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, encoding, error_handler)

        self.stream = stream
        """Destination of warning output."""

        self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
        """The output character encoding."""

        self.observers = []
        """Functions or bound methods called with every system_message
        created."""

        self.max_level = -1
        """Highest system message level generated so far."""

    def set_conditions(self, category, report_level, halt_level,
                       stream=None, debug=False):
        """
        Deprecated: replace thresholds, stream and debug switch in one call.

        Emits a `DeprecationWarning`.  `category` is accepted but not used.
        """
        warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
                      'set attributes via configuration settings or directly',
                      DeprecationWarning, stacklevel=2)
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, self.encoding, self.error_handler)
        self.stream = stream
        self.report_level = report_level
        self.halt_level = halt_level
        self.debug_flag = debug

    def attach_observer(self, observer):
        """
        Register `observer`, a function or bound method taking a single
        argument (a `nodes.system_message` instance).
        """
        self.observers.append(observer)

    def detach_observer(self, observer):
        """Unregister a previously attached `observer`."""
        self.observers.remove(observer)

    def notify_observers(self, message):
        """Call every attached observer with `message`."""
        for callback in self.observers:
            callback(message)

    def system_message(self, level, message, *children, **kwargs):
        """
        Build and return a system_message object.

        Depending on `level` and the stored thresholds, the message is also
        written to the warning stream and/or a `SystemMessage` exception is
        raised.
        """
        # `message` may be a `string`, `unicode`, or `Exception` instance.
        if isinstance(message, Exception):
            message = SafeString(message)

        attributes = kwargs.copy()
        if 'base_node' in attributes:
            # Take missing source/line information from the node (or one
            # of its ancestors); explicitly passed values take precedence.
            base_node = attributes.pop('base_node')
            source, line = get_source_line(base_node)
            if source is not None:
                attributes.setdefault('source', source)
            if line is not None:
                attributes.setdefault('line', line)
        if 'source' not in attributes:
            # 'line' is an absolute line number here: look up the
            # corresponding (source, line-in-source) pair.  The lookup
            # method may not be present on this instance.
            try:
                source, line = self.get_source_and_line(
                    attributes.get('line'))
            except AttributeError:
                source, line = None, None
            if source is not None:
                attributes['source'] = source
            if line is not None:
                attributes['line'] = line
        attributes.setdefault('source', self.source)

        msg = nodes.system_message(message, level=level,
                                   type=self.levels[level],
                                   *children, **attributes)
        if self.stream:
            reportable = (level >= self.report_level
                          or self.debug_flag and level == self.DEBUG_LEVEL
                          or level >= self.halt_level)
            if reportable:
                self.stream.write(msg.astext() + '\n')
        if level >= self.halt_level:
            raise SystemMessage(msg, level)
        if level > self.DEBUG_LEVEL or self.debug_flag:
            self.notify_observers(msg)
        self.max_level = max(level, self.max_level)
        return msg

    def debug(self, *args, **kwargs):
        """
        Level-0, "DEBUG": an internal reporting issue.  Typically without
        effect on the processing.  Level-0 system messages are treated
        separately from the other levels: nothing is created (and `None`
        is returned) unless the debug switch is on.
        """
        if not self.debug_flag:
            return None
        return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)

    def info(self, *args, **kwargs):
        """
        Level-1, "INFO": a minor issue that can be ignored.  Typically
        without effect on processing; level-1 system messages are not
        reported.
        """
        return self.system_message(self.INFO_LEVEL, *args, **kwargs)

    def warning(self, *args, **kwargs):
        """
        Level-2, "WARNING": an issue that should be addressed.  Ignoring it
        may lead to unpredictable problems with the output.
        """
        return self.system_message(self.WARNING_LEVEL, *args, **kwargs)

    def error(self, *args, **kwargs):
        """
        Level-3, "ERROR": an error that should be addressed.  If ignored,
        the output will contain errors.
        """
        return self.system_message(self.ERROR_LEVEL, *args, **kwargs)

    def severe(self, *args, **kwargs):
        """
        Level-4, "SEVERE": a severe error that must be addressed.  If
        ignored, the output will contain severe errors.  Level-4 system
        messages are typically turned into exceptions which halt
        processing.
        """
        return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
class ExtensionOptionError(DataError):
    """Base class of the extension option exceptions defined below."""


class BadOptionError(ExtensionOptionError):
    """Raised for invalid extension option fields."""


class BadOptionDataError(ExtensionOptionError):
    """Raised for invalid extension option data."""


class DuplicateOptionError(ExtensionOptionError):
    """Raised when an extension option is given more than once."""
def extract_extension_options(field_list, options_spec):
    """
    Convert a field list into a dictionary of extension options.

    The (name, value) pairs are first pulled out of `field_list` with
    `extract_options()` and then converted with `assemble_option_dict()`.

    :Parameters:
        - `field_list`: A flat field list without field arguments, each
          field body consisting of a single paragraph only.
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `ValueError` for invalid option values (raised by the conversion
          function).
        - `TypeError` for invalid option value types (raised by the
          conversion function).
        - `DuplicateOptionError` for duplicate options.
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    return assemble_option_dict(extract_options(field_list), options_spec)
def extract_options(field_list):
    """
    Collect option (name, value) pairs from field names and field bodies.

    :Parameter:
        `field_list`: A flat field list in which every field name is a
        single word and every field body is a single paragraph only.

    :Exceptions:
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    pairs = []
    for field in field_list:
        raw_name = field[0].astext()
        if len(raw_name.split()) != 1:
            raise BadOptionError(
                'extension option field name may not contain multiple words')
        name = str(raw_name.lower())
        body = field[1]
        if len(body) == 0:
            # An empty field body means "option given without a value".
            data = None
        else:
            # The body must be exactly one paragraph holding one Text node.
            is_plain_paragraph = (len(body) == 1
                                  and isinstance(body[0], nodes.paragraph)
                                  and len(body[0]) == 1
                                  and isinstance(body[0][0], nodes.Text))
            if not is_plain_paragraph:
                raise BadOptionDataError(
                    'extension option field body may contain\n'
                    'a single paragraph only (option "%s")' % name)
            data = body[0][0].astext()
        pairs.append((name, data))
    return pairs
def assemble_option_dict(option_list, options_spec):
    """
    Return a mapping of option names to converted values.

    :Parameters:
        - `option_list`: A list of (name, value) pairs (the output of
          `extract_options()`).
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names (also raised when the
          conversion function registered for a name is `None`).
        - `DuplicateOptionError` for duplicate options.
        - `ValueError` for invalid option values (raised by conversion
          function).
        - `TypeError` for invalid option value types (raised by conversion
          function).
    """
    options = {}
    for name, value in option_list:
        convertor = options_spec[name]  # raises KeyError if unknown
        if convertor is None:
            raise KeyError(name)        # or if explicitly disabled
        if name in options:
            raise DuplicateOptionError('duplicate option "%s"' % name)
        try:
            options[name] = convertor(value)
        except (ValueError, TypeError):
            # Fetch the exception via sys.exc_info() so that the handler is
            # valid syntax under both Python 2 and Python 3.
            detail = sys.exc_info()[1]
            # Exception arguments are not necessarily strings; convert
            # them so that building the message cannot itself fail.
            reason = ' '.join([str(arg) for arg in detail.args])
            # Re-raise the same exception type, prefixed with the option
            # name and value that triggered it.
            raise detail.__class__('(option: "%s"; value: %r)\n%s'
                                   % (name, value, reason))
    return options
class NameValueError(DataError):
    """Raised by `extract_name_value()` for malformed "name=value" input."""
def decode_path(path):
    """
    Make sure `path` is Unicode.

    A `path` that already is a `unicode` instance is returned unchanged;
    anything else is returned wrapped in `nodes.reprunicode`.

    Byte strings are decoded in a fail-safe manner: first with the file
    system encoding, then with UTF-8, and finally as ASCII with
    replacement of undecodable bytes.
    """
    # see also http://article.gmane.org/gmane.text.docutils.user/2905
    if isinstance(path, unicode):
        return path
    try:
        decoded = path.decode(sys.getfilesystemencoding(), 'strict')
    except AttributeError:
        # e.g. the default value None has no decode method: wrap as is
        decoded = path
    except UnicodeDecodeError:
        try:
            decoded = path.decode('utf-8', 'strict')
        except UnicodeDecodeError:
            decoded = path.decode('ascii', 'replace')
    return nodes.reprunicode(decoded)
def extract_name_value(line):
    """
    Parse a line of the form "name=value ..." into (name, value) pairs.

    Names are lower-cased.  A value is either a run of characters up to
    the next space, or a string enclosed in single or double quotes (the
    quotes are removed).

    :Exception:
        `NameValueError` for invalid input (missing name, missing data, bad
        quotes, etc.).
    """
    pairs = []
    rest = line
    while rest:
        eq_pos = rest.find('=')
        if eq_pos == -1:
            raise NameValueError('missing "="')
        attname = rest[:eq_pos].strip()
        if not attname:
            # covers both a leading "=" and a whitespace-only name
            raise NameValueError(
                'missing attribute name before "="')
        rest = rest[eq_pos + 1:].lstrip()
        if not rest:
            raise NameValueError(
                'missing value after "%s="' % attname)
        quote = rest[0]
        if quote in '\'"':
            close_pos = rest.find(quote, 1)
            if close_pos == -1:
                raise NameValueError(
                    'attribute "%s" missing end quote (%s)'
                    % (attname, quote))
            after = rest[close_pos + 1:close_pos + 2]
            if after.strip():
                # a non-whitespace character directly follows the end quote
                raise NameValueError(
                    'attribute "%s" end quote (%s) not followed by '
                    'whitespace' % (attname, quote))
            data = rest[1:close_pos]
            rest = rest[close_pos + 1:].lstrip()
        else:
            # unquoted value: everything up to the first space
            chunks = rest.split(' ', 1)
            data = chunks[0]
            if len(chunks) == 1:
                rest = ''
            else:
                rest = chunks[1].lstrip()
        pairs.append((attname.lower(), data))
    return pairs
def new_reporter(source_path, settings):
    """
    Create a `Reporter` configured from `settings` and return it.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.
    """
    return Reporter(source_path,
                    settings.report_level,
                    settings.halt_level,
                    stream=settings.warning_stream,
                    debug=settings.debug,
                    encoding=settings.error_encoding,
                    error_handler=settings.error_encoding_error_handler)
def new_document(source_path, settings=None):
    """
    Create and return a new, empty document object.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.  When omitted, a default core set is used.
            If the document object is going to be used with any Docutils
            components, their default settings must be provided as well.
            For example, for parsing, provide at least the parser
            settings, which can be obtained as follows::

                settings = docutils.frontend.OptionParser(
                    components=(docutils.parsers.rst.Parser,)
                    ).get_default_values()
    """
    from docutils import frontend
    if settings is None:
        settings = frontend.OptionParser().get_default_values()
    source_path = decode_path(source_path)
    document = nodes.document(settings,
                              new_reporter(source_path, settings),
                              source=source_path)
    document.note_source(source_path, -1)
    return document
def clean_rcs_keywords(paragraph, keyword_substitutions):
    """
    Apply the first matching RCS keyword substitution to `paragraph`.

    Nothing happens unless `paragraph` consists of exactly one
    `nodes.Text` child.  `keyword_substitutions` is a sequence of
    (compiled pattern, substitution) pairs; the text node is replaced
    using the first pattern that matches, and the remaining pairs are
    ignored.
    """
    if len(paragraph) != 1 or not isinstance(paragraph[0], nodes.Text):
        return
    textnode = paragraph[0]
    for pattern, substitution in keyword_substitutions:
        if pattern.search(textnode):
            paragraph[0] = nodes.Text(pattern.sub(substitution, textnode))
            return
def relative_path(source, target):
    """
    Return a path to `target` relative to `source` (both are files).

    The result always uses '/' as separator.  If the two paths share no
    common prefix, the absolute path to `target` is returned.
    """
    # A missing `source` is replaced by a dummy file name of target's type.
    source_file = source or type(target)('dummy_file')
    source_parts = os.path.abspath(source_file).split(os.sep)
    target_parts = os.path.abspath(target).split(os.sep)
    # Compare the first 2 parts because '/dir'.split('/') == ['', 'dir']:
    if source_parts[:2] != target_parts[:2]:
        # Nothing in common between the paths.
        # Return the absolute path, using '/' for URLs:
        return '/'.join(target_parts)
    # Count the leading path components the two paths have in common:
    shared = 0
    limit = min(len(source_parts), len(target_parts))
    while shared < limit and source_parts[shared] == target_parts[shared]:
        shared += 1
    # One '..' for every remaining source directory (the last remaining
    # source component is the file itself, hence the "- 1").
    climb = ['..'] * (len(source_parts) - shared - 1)
    return '/'.join(climb + target_parts[shared:])
def get_stylesheet_reference(settings, relative_to=None):
    """
    Retrieve a stylesheet reference from the settings object.

    If `settings.stylesheet_path` is set, it is returned as a path
    relative to `relative_to` (default: `settings._destination`);
    otherwise `settings.stylesheet` is returned unchanged.

    Deprecated. Use get_stylesheet_list() instead to
    enable specification of multiple stylesheets as a comma-separated
    list.
    """
    if settings.stylesheet_path:
        assert not settings.stylesheet, (
            'stylesheet and stylesheet_path are mutually exclusive.')
        # Identity test: only a missing argument selects the default.
        if relative_to is None:
            relative_to = settings._destination
        return relative_path(relative_to, settings.stylesheet_path)
    else:
        return settings.stylesheet
496 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
498 # The original settings arguments are kept unchanged: you can test
499 # with e.g. ``if settings.stylesheet_path:``
501 # Differences to ``get_stylesheet_reference``:
502 # * return value is a list
503 # * no re-writing of the path (and therefore no optional argument)
504 # (if required, use ``utils.relative_path(source, target)``
505 # in the calling script)
def get_stylesheet_list(settings):
    """
    Return the stylesheet references from the settings object as a list.

    The comma-separated value of `settings.stylesheet_path` (or, if that
    is not set, of `settings.stylesheet`) is split and every entry is
    stripped of surrounding spaces, tabs and newlines.  The list is empty
    if neither setting is set.
    """
    assert not (settings.stylesheet and settings.stylesheet_path), (
        'stylesheet and stylesheet_path are mutually exclusive.')
    value = settings.stylesheet_path or settings.stylesheet
    if not value:
        return []
    # strip whitespace (frequently occurring in config files)
    return [entry.strip(u' \t\n') for entry in value.split(",")]
def get_trim_footnote_ref_space(settings):
    """
    Tell whether the space before footnote references is to be trimmed.

    An explicit `trim_footnote_reference_space` setting (anything but
    None) is returned as is.

    With the setting at None, the result is False unless the footnote
    reference style is 'superscript'.
    """
    explicit = settings.trim_footnote_reference_space
    if explicit is not None:
        return explicit
    return (hasattr(settings, 'footnote_references')
            and settings.footnote_references == 'superscript')
def get_source_line(node):
    """
    Return the "source" and "line" attributes of `node` or, if neither is
    set there, of its closest ancestor that has one of them.

    Returns ``(None, None)`` if no such node is found.
    """
    current = node
    while current:
        source, line = current.source, current.line
        if source or line:
            return source, line
        current = current.parent
    return None, None
def escape2null(text):
    """Return `text` with every escaping backslash replaced by a null."""
    pieces = []
    position = 0
    backslash = text.find('\\', position)
    while backslash != -1:
        pieces.append(text[position:backslash])
        # The escaped character is copied verbatim right after the null ...
        pieces.append('\x00' + text[backslash + 1:backslash + 2])
        # ... and skipped, so an escaped backslash is not an escape itself.
        position = backslash + 2
        backslash = text.find('\\', position)
    pieces.append(text[position:])
    return ''.join(pieces)
def unescape(text, restore_backslashes=False):
    """
    Return `text` with nulls either removed or turned back into
    backslashes (if `restore_backslashes` is true).

    When nulls are removed, a space or newline directly following a null
    (i.e. a backslash-escaped space or newline) is removed as well.
    """
    if restore_backslashes:
        return text.replace('\x00', '\\')
    # Order matters: the two-character markers must go before the bare null.
    for marker in ('\x00 ', '\x00\n', '\x00'):
        text = text.replace(marker, '')
    return text
def strip_combining_chars(text):
    """
    Return Unicode string `text` without its combining characters.

    Under Python 2, byte strings are returned unchanged.
    """
    if sys.version_info < (3, 0) and isinstance(text, str):
        return text
    kept = [char for char in text if not unicodedata.combining(char)]
    return u''.join(kept)
def find_combining_chars(text):
    """Return the indices of all combining chars in Unicode string `text`.

    Under Python 2, byte strings yield an empty list.

    >>> find_combining_chars(u'A t̆ab̆lĕ')
    [3, 6, 9]
    """
    if sys.version_info < (3, 0) and isinstance(text, str):
        return []
    positions = []
    for position, char in enumerate(text):
        if unicodedata.combining(char):
            positions.append(position)
    return positions
def column_indices(text):
    """Indices of Unicode string `text` when skipping combining characters.

    The positions reported by `find_combining_chars()` are left out of the
    result.

    >>> column_indices(u'A t̆ab̆lĕ')
    [0, 1, 2, 4, 5, 7, 8]
    """
    # TODO: account for asian wide chars here instead of using dummy
    # replacements in the tableparser?
    #
    # Filter instead of assigning None into the result of range():
    # under Python 3 a range object does not support item assignment.
    combining = set(find_combining_chars(text))
    return [index for index in range(len(text)) if index not in combining]
east_asian_widths = {'W': 2,   # Wide
                     'F': 2,   # Full-width (wide)
                     'Na': 1,  # Narrow
                     'H': 1,   # Half-width (narrow)
                     'N': 1,   # Neutral (not East Asian, treated as narrow)
                     'A': 1}   # Ambiguous (s/b wide in East Asian context,
                               # narrow otherwise, but that doesn't work)
"""Mapping of result codes from `unicodedata.east_asian_width()` to character
column widths."""
def column_width(text):
    """Return the number of columns `text` occupies.

    This is ``len(text)`` corrected for wide East Asian characters (which
    count as two columns, see `east_asian_widths`) and for combining
    Unicode characters (which do not count).  Under Python 2, byte strings
    are simply measured with ``len()``.
    """
    if sys.version_info < (3, 0) and isinstance(text, str):
        return len(text)
    try:
        widths = [east_asian_widths[unicodedata.east_asian_width(char)]
                  for char in text]
    except AttributeError:  # east_asian_width() New in version 2.4.
        total = len(text)
    else:
        total = sum(widths)
    # correction for combining chars:
    return total - len(find_combining_chars(text))
def uniq(L):
    """
    Return a list of the items of `L` without duplicates, keeping the
    order of first occurrence.

    Items are compared by equality and need not be hashable.
    """
    distinct = []
    for candidate in L:
        if candidate in distinct:
            continue
        distinct.append(candidate)
    return distinct
# by Li Daobing http://code.activestate.com/recipes/190465/
# since Python 2.6 there is also itertools.combinations()
def unique_combinations(items, n):
    """Yield all `n`-element combinations of `items`, as lists.

    The combinations keep the order of `items` and contain each position
    of `items` at most once.  For ``n == 0`` a single empty list is
    yielded; nothing is yielded if `n` exceeds ``len(items)``.
    """
    if n == 0:
        yield []
    else:
        # range() instead of xrange(): the latter is gone in Python 3.
        for i in range(len(items) - n + 1):
            for cc in unique_combinations(items[i+1:], n - 1):
                yield [items[i]] + cc
def normalize_language_tag(tag):
    """Return a list of normalized combinations for a `BCP 47` language tag.

    The tag is lower-cased and hyphens are replaced by underscores.  The
    result lists the base tag combined with every combination of the
    subtags, longest combinations first, and ends with the bare base tag.

    Example:

    >>> normalize_language_tag('de-AT-1901')
    ['de_at_1901', 'de_at', 'de_1901', 'de']
    """
    pieces = tag.lower().replace('-', '_').split('_')
    base_tag, subtags = pieces[:1], pieces[1:]
    variants = []
    # find all combinations of subtags, from most to least specific
    for size in range(len(subtags), 0, -1):
        for chosen in unique_combinations(subtags, size):
            variants.append('_'.join(base_tag + chosen))
    variants.extend(base_tag)
    return variants
class DependencyList(object):

    """
    List of dependencies, with file recording support.

    Note that the output file is not automatically closed.  You have
    to explicitly call the close() method.
    """

    def __init__(self, output_file=None, dependencies=()):
        """
        Initialize the dependency list, automatically setting the
        output file to `output_file` (see `set_output()`) and adding
        all supplied dependencies.

        `dependencies` may be any iterable of file names; it is only
        iterated, never modified.
        """
        self.set_output(output_file)
        for i in dependencies:
            self.add(i)

    def set_output(self, output_file):
        """
        Set the output file and clear the list of already added
        dependencies.

        `output_file` must be a string.  The specified file is
        immediately overwritten.

        If output_file is '-', the output will be written to stdout.
        If it is None, no file output is done when calling add().
        """
        self.list = []
        if output_file:
            if output_file == '-':
                # no destination path is passed on for stdout output
                of = None
            else:
                of = output_file
            self.file = FileOutput(destination_path=of,
                                   encoding='utf8', autoclose=False)
        else:
            self.file = None

    def add(self, *filenames):
        """
        Append every one of `filenames` that has not already been added
        to self.list and write it to self.file if self.file is not None.
        """
        for filename in filenames:
            if filename not in self.list:
                self.list.append(filename)
                if self.file is not None:
                    self.file.write(filename+'\n')

    def close(self):
        """
        Close the output file.

        Does nothing if there is no output file (none was set, or it has
        been closed already).
        """
        if self.file is not None:
            self.file.close()
            self.file = None

    def __repr__(self):
        try:
            output_file = self.file.name
        except AttributeError:
            # no output file set (self.file is None)
            output_file = None
        return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)