docutils.utils is now a package (providing a place for sub-modules)
[docutils.git] / docutils / utils / __init__.py
blobed3abfd2169596f363516c6dd0a55186270fdfe7
1 # coding: utf8
2 # $Id$
3 # Author: David Goodger <goodger@python.org>
4 # Copyright: This module has been placed in the public domain.
6 """
7 Miscellaneous utilities for the documentation utilities.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import os.path
15 import warnings
16 import unicodedata
17 from docutils import ApplicationError, DataError
18 from docutils import nodes
19 from docutils.io import FileOutput
20 from docutils.error_reporting import ErrorOutput, SafeString
class SystemMessage(ApplicationError):

    """
    Exception wrapping a system message node together with its level.

    Raised by `Reporter.system_message()` when the level of a generated
    message reaches the reporter's halt level.
    """

    def __init__(self, system_message, level):
        # The exception text is the plain-text rendering of the message node.
        Exception.__init__(self, system_message.astext())

        # Numeric level of the message (an index into `Reporter.levels`).
        self.level = level
# NOTE(review): not raised anywhere in this module; presumably used by other
# Docutils components to hand a system message up the call stack -- confirm
# against the callers.
class SystemMessagePropagation(ApplicationError): pass
class Reporter:

    """
    Info/warning/error reporter and ``system_message`` element generator.

    Five levels of system messages are defined, along with corresponding
    methods: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.

    There is typically one Reporter object per process. A Reporter object is
    instantiated with thresholds for reporting (generating warnings) and
    halting processing (raising exceptions), a switch to turn debug output on
    or off, and an I/O stream for warnings. These are stored as instance
    attributes.

    When a system message is generated, its level is compared to the stored
    thresholds, and a warning or error is generated as appropriate. Debug
    messages are produced if the stored debug switch is on, independently of
    other thresholds. Message output is sent to the stored warning stream if
    not set to ''.

    The Reporter class also employs a modified form of the "Observer" pattern
    [GoF95]_ to track system messages generated. The `attach_observer` method
    should be called before parsing, with a bound method or function which
    accepts system messages. The observer can be removed with
    `detach_observer`, and another added in its place.

    .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
       Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
       1995.
    """

    levels = 'DEBUG INFO WARNING ERROR SEVERE'.split()
    """List of names for system message levels, indexed by level."""

    # system message level constants:
    (DEBUG_LEVEL,
     INFO_LEVEL,
     WARNING_LEVEL,
     ERROR_LEVEL,
     SEVERE_LEVEL) = range(5)

    def __init__(self, source, report_level, halt_level, stream=None,
                 debug=0, encoding=None, error_handler='backslashreplace'):
        """
        :Parameters:
            - `source`: The path to or description of the source data.
            - `report_level`: The level at or above which warning output will
              be sent to `stream`.
            - `halt_level`: The level at or above which `SystemMessage`
              exceptions will be raised, halting execution.
            - `stream`: Where warning output is sent.  Can be file-like (has a
              ``.write`` method), a string (file name, opened for writing),
              '' (empty string) or `False` (for discarding all stream messages)
              or `None` (implies `sys.stderr`; default).
            - `debug`: Show debug (level=0) system messages?
            - `encoding`: The output encoding.
            - `error_handler`: The error handler for stderr output encoding.
        """

        self.source = source
        """The path to or description of the source data."""

        self.error_handler = error_handler
        """The character encoding error handler."""

        self.debug_flag = debug
        """Show debug (level=0) system messages?"""

        self.report_level = report_level
        """The level at or above which warning output will be sent
        to `self.stream`."""

        self.halt_level = halt_level
        """The level at or above which `SystemMessage` exceptions
        will be raised, halting execution."""

        # Anything that is not already an `ErrorOutput` is wrapped in one.
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, encoding, error_handler)

        self.stream = stream
        """Where warning output is sent."""

        # An explicit `encoding` wins; otherwise use the stream's own
        # `encoding` attribute, falling back to 'ascii' if it has none.
        self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
        """The output character encoding."""

        self.observers = []
        """List of bound methods or functions to call with each system_message
        created."""

        self.max_level = -1
        """The highest level system message generated so far."""

    def set_conditions(self, category, report_level, halt_level,
                       stream=None, debug=0):
        """
        Deprecated: reset thresholds, output stream and debug switch.

        Emits a `DeprecationWarning` and then overwrites `report_level`,
        `halt_level`, `stream` and `debug_flag`.  The `category` argument
        is accepted but not used.
        """
        warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
                      'set attributes via configuration settings or directly',
                      DeprecationWarning, stacklevel=2)
        self.report_level = report_level
        self.halt_level = halt_level
        # Wrap the stream as in `__init__`, reusing the stored encoding and
        # error handler.
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, self.encoding, self.error_handler)
        self.stream = stream
        self.debug_flag = debug

    def attach_observer(self, observer):
        """
        The `observer` parameter is a function or bound method which takes one
        argument, a `nodes.system_message` instance.
        """
        self.observers.append(observer)

    def detach_observer(self, observer):
        """
        Remove `observer` from the list of observers.

        Uses ``list.remove()``, which raises `ValueError` if `observer` is
        not currently attached.
        """
        self.observers.remove(observer)

    def notify_observers(self, message):
        """Call every attached observer, in attachment order, with `message`."""
        for observer in self.observers:
            observer(message)

    def system_message(self, level, message, *children, **kwargs):
        """
        Return a system_message object.

        Raise an exception or generate a warning if appropriate.

        Keyword arguments become attributes of the returned node, except for
        `base_node`, which is only used to look up default values for the
        "source" and "line" attributes.
        """
        # `message` can be a `string`, `unicode`, or `Exception` instance.
        if isinstance(message, Exception):
            message = SafeString(message)

        attributes = kwargs.copy()
        if 'base_node' in kwargs:
            # Take source/line from the node (or its closest ancestor);
            # explicitly passed 'source'/'line' keywords take precedence.
            source, line = get_source_line(kwargs['base_node'])
            del attributes['base_node']
            if source is not None:
                attributes.setdefault('source', source)
            if line is not None:
                attributes.setdefault('line', line)
                # assert source is not None, "node has line- but no source-argument"
        if not 'source' in attributes: # 'line' is absolute line number
            try: # look up (source, line-in-source)
                # `self.locator` is not set by this class; when the
                # attribute is missing the AttributeError is caught below.
                source, line = self.locator(attributes.get('line'))
                # print "locator lookup", kwargs.get('line'), "->", source, line
            except AttributeError:
                source, line = None, None
            if source is not None:
                attributes['source'] = source
            if line is not None:
                attributes['line'] = line
        # assert attributes['line'] is not None, (message, kwargs)
        # assert attributes['source'] is not None, (message, kwargs)
        # Last resort: the reporter's own source description.
        attributes.setdefault('source', self.source)

        msg = nodes.system_message(message, level=level,
                                   type=self.levels[level],
                                   *children, **attributes)
        # Write the message if a stream is set and the level reaches the
        # report threshold, or it is a debug message with debugging on, or
        # it is about to halt processing.
        if self.stream and (level >= self.report_level
                            or self.debug_flag and level == self.DEBUG_LEVEL
                            or level >= self.halt_level):
            self.stream.write(msg.astext() + '\n')
        # Halting happens before observers are notified and before
        # `max_level` is updated.
        if level >= self.halt_level:
            raise SystemMessage(msg, level)
        # Debug messages reach the observers only when debugging is on.
        if level > self.DEBUG_LEVEL or self.debug_flag:
            self.notify_observers(msg)
        self.max_level = max(level, self.max_level)
        return msg

    def debug(self, *args, **kwargs):
        """
        Level-0, "DEBUG": an internal reporting issue. Typically, there is no
        effect on the processing. Level-0 system messages are handled
        separately from the others.

        Returns `None` (no message is created) when the debug switch is off.
        """
        if self.debug_flag:
            return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)

    def info(self, *args, **kwargs):
        """
        Level-1, "INFO": a minor issue that can be ignored. Typically there is
        no effect on processing, and level-1 system messages are not reported.
        """
        return self.system_message(self.INFO_LEVEL, *args, **kwargs)

    def warning(self, *args, **kwargs):
        """
        Level-2, "WARNING": an issue that should be addressed. If ignored,
        there may be unpredictable problems with the output.
        """
        return self.system_message(self.WARNING_LEVEL, *args, **kwargs)

    def error(self, *args, **kwargs):
        """
        Level-3, "ERROR": an error that should be addressed. If ignored, the
        output will contain errors.
        """
        return self.system_message(self.ERROR_LEVEL, *args, **kwargs)

    def severe(self, *args, **kwargs):
        """
        Level-4, "SEVERE": a severe error that must be addressed. If ignored,
        the output will contain severe errors. Typically level-4 system
        messages are turned into exceptions which halt processing.
        """
        return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
# Exceptions for extension option parsing (see `extract_options()` and
# `assemble_option_dict()`):
# common base class
class ExtensionOptionError(DataError): pass
# invalid field, e.g. a field name consisting of several words
class BadOptionError(ExtensionOptionError): pass
# invalid field body (anything but a single text-only paragraph)
class BadOptionDataError(ExtensionOptionError): pass
# the same option name was given more than once
class DuplicateOptionError(ExtensionOptionError): pass
def extract_extension_options(field_list, options_spec):
    """
    Build a dictionary of extension options with converted values.

    The field list is first reduced to (name, value) pairs by
    `extract_options()`; the pairs are then converted and collected by
    `assemble_option_dict()`.

    :Parameters:
        - `field_list`: A flat field list without field arguments, where each
          field body consists of a single paragraph only.
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `ValueError` for invalid option values (raised by the conversion
          function).
        - `TypeError` for invalid option value types (raised by conversion
          function).
        - `DuplicateOptionError` for duplicate options.
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    name_value_pairs = extract_options(field_list)
    return assemble_option_dict(name_value_pairs, options_spec)
def extract_options(field_list):
    """
    Turn field names and bodies into a list of option (name, value) pairs.

    Names are lower-cased; the value of a field with an empty body is
    `None`.

    :Parameter:
        `field_list`: A flat field list, where each field name is a single
        word and each field body consists of a single paragraph only.

    :Exceptions:
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    pairs = []
    for field in field_list:
        raw_name = field[0].astext()
        if len(raw_name.split()) != 1:
            raise BadOptionError(
                'extension option field name may not contain multiple words')
        name = str(raw_name.lower())
        body = field[1]
        if len(body) == 0:
            # option given without a value
            pairs.append((name, None))
            continue
        # The only accepted non-empty body is one paragraph holding one
        # Text node.
        single_text_paragraph = (
            len(body) == 1
            and isinstance(body[0], nodes.paragraph)
            and len(body[0]) == 1
            and isinstance(body[0][0], nodes.Text))
        if not single_text_paragraph:
            raise BadOptionDataError(
                'extension option field body may contain\n'
                'a single paragraph only (option "%s")' % name)
        pairs.append((name, body[0][0].astext()))
    return pairs
def assemble_option_dict(option_list, options_spec):
    """
    Return a mapping of option names to values.

    :Parameters:
        - `option_list`: A list of (name, value) pairs (the output of
          `extract_options()`).
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names (or names whose conversion
          function is `None`).
        - `DuplicateOptionError` for duplicate options.
        - `ValueError` for invalid option values (raised by conversion
          function).
        - `TypeError` for invalid option value types (raised by conversion
          function).

    A `ValueError` or `TypeError` from the conversion function is re-raised
    as a new instance of the same class, with the option name and value
    prepended to the original message.
    """
    options = {}
    for name, value in option_list:
        convertor = options_spec[name]  # raises KeyError if unknown
        if convertor is None:
            raise KeyError(name)        # or if explicitly disabled
        if name in options:
            raise DuplicateOptionError('duplicate option "%s"' % name)
        try:
            options[name] = convertor(value)
        except (ValueError, TypeError) as detail:
            # Exception arguments need not be strings; format each one so
            # that building the message cannot itself raise a TypeError.
            reason = ' '.join(['%s' % (arg,) for arg in detail.args])
            raise detail.__class__('(option: "%s"; value: %r)\n%s'
                                   % (name, value, reason))
    return options
# Raised by `extract_name_value()` for malformed "name=value" input.
class NameValueError(DataError): pass
def decode_path(path):
    """
    Ensure `path` is Unicode.

    A `unicode` argument is returned unchanged.  Anything else is decoded
    in a failsafe manner and wrapped in a `nodes.reprunicode` object:
    first with the file system encoding, then with UTF-8, and finally
    with ASCII, replacing undecodable bytes.  Objects without a
    ``decode`` method (e.g. the default `None`) are wrapped as they are.
    """
    # see also http://article.gmane.org/gmane.text.docutils.user/2905
    if isinstance(path, unicode):
        return path
    try:
        decoded = path.decode(sys.getfilesystemencoding(), 'strict')
    except AttributeError: # default value None has no decode method
        decoded = path
    except UnicodeDecodeError:
        try:
            decoded = path.decode('utf-8', 'strict')
        except UnicodeDecodeError:
            decoded = path.decode('ascii', 'replace')
    return nodes.reprunicode(decoded)
def extract_name_value(line):
    """
    Parse a line of the form "name=value ..." into (name, value) pairs.

    Names are lower-cased.  A value is either quoted with single or
    double quotes (and must then be followed by whitespace or the end of
    the line) or extends up to the next space.

    :Exception:
        `NameValueError` for invalid input (missing name, missing data, bad
        quotes, etc.).
    """
    pairs = []
    rest = line
    while rest:
        eq_pos = rest.find('=')
        if eq_pos == -1:
            raise NameValueError('missing "="')
        attname = rest[:eq_pos].strip()
        if not attname:         # also covers a leading "="
            raise NameValueError(
                'missing attribute name before "="')
        rest = rest[eq_pos+1:].lstrip()
        if not rest:
            raise NameValueError(
                'missing value after "%s="' % attname)
        quote = rest[0]
        if quote in '\'"':
            closing = rest.find(quote, 1)
            if closing == -1:
                raise NameValueError(
                    'attribute "%s" missing end quote (%s)'
                    % (attname, quote))
            # the slice is empty when the quote ends the line
            if rest[closing+1:closing+2].strip():
                raise NameValueError(
                    'attribute "%s" end quote (%s) not followed by '
                    'whitespace' % (attname, quote))
            data = rest[1:closing]
            rest = rest[closing+1:].lstrip()
        else:
            space = rest.find(' ')
            if space == -1:
                data, rest = rest, ''
            else:
                data = rest[:space]
                rest = rest[space+1:].lstrip()
        pairs.append((attname.lower(), data))
    return pairs
def new_reporter(source_path, settings):
    """
    Create a `Reporter` configured from the runtime settings.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings; `report_level`, `halt_level`, `warning_stream`,
            `debug`, `error_encoding` and `error_encoding_error_handler`
            are read.
    """
    return Reporter(
        source_path, settings.report_level, settings.halt_level,
        stream=settings.warning_stream,
        debug=settings.debug,
        encoding=settings.error_encoding,
        error_handler=settings.error_encoding_error_handler)
def new_document(source_path, settings=None):
    """
    Create and return an empty document tree root.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.  If none are provided, a default core set will
            be used.  If you will use the document object with any Docutils
            components, you must provide their default settings as well.  For
            example, if parsing, at least provide the parser settings,
            obtainable as follows::

                settings = docutils.frontend.OptionParser(
                    components=(docutils.parsers.rst.Parser,)
                    ).get_default_values()
    """
    # imported here rather than at module level
    from docutils import frontend
    if settings is None:
        settings = frontend.OptionParser().get_default_values()
    decoded_source = decode_path(source_path)
    doc = nodes.document(settings, new_reporter(decoded_source, settings),
                         source=decoded_source)
    doc.note_source(decoded_source, -1)
    return doc
def clean_rcs_keywords(paragraph, keyword_substitutions):
    """
    Apply the first matching keyword substitution to `paragraph`, in place.

    Only paragraphs consisting of a single `nodes.Text` child are touched.
    `keyword_substitutions` is a sequence of (compiled pattern,
    substitution) pairs; the first pattern found in the text is applied
    and the text node is replaced by the result.
    """
    if len(paragraph) != 1 or not isinstance(paragraph[0], nodes.Text):
        return
    text = paragraph[0]
    for regex, replacement in keyword_substitutions:
        if regex.search(text):
            paragraph[0] = nodes.Text(regex.sub(replacement, text))
            return
def relative_path(source, target):
    """
    Build and return a path to `target`, relative to `source` (both files).

    The result always uses '/' as separator.  If there is no common prefix,
    return the absolute path to `target`.
    """
    source_parts = os.path.abspath(source or 'dummy_file').split(os.sep)
    target_parts = os.path.abspath(target).split(os.sep)
    # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
    if source_parts[:2] != target_parts[:2]:
        # Nothing in common between paths.
        # Return absolute path, using '/' for URLs:
        return '/'.join(target_parts)
    # Count the leading path components both paths share:
    shared = 0
    limit = min(len(source_parts), len(target_parts))
    while shared < limit and source_parts[shared] == target_parts[shared]:
        shared += 1
    # One '..' for every remaining source directory (the last remaining
    # source component is the file itself):
    climb = ['..'] * (len(source_parts) - shared - 1)
    return '/'.join(climb + target_parts[shared:])
def get_stylesheet_reference(settings, relative_to=None):
    """
    Retrieve a stylesheet reference from the settings object.

    If `settings.stylesheet_path` is set, it is returned as a path relative
    to `relative_to` (default: `settings._destination`); otherwise
    `settings.stylesheet` is returned unchanged.

    Deprecated. Use get_stylesheet_list() instead to
    enable specification of multiple stylesheets as a comma-separated
    list.
    """
    if settings.stylesheet_path:
        assert not settings.stylesheet, (
            'stylesheet and stylesheet_path are mutually exclusive.')
        # identity test: only a missing argument selects the default
        if relative_to is None:
            relative_to = settings._destination
        return relative_path(relative_to, settings.stylesheet_path)
    else:
        return settings.stylesheet
495 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
497 # The original settings arguments are kept unchanged: you can test
498 # with e.g. ``if settings.stylesheet_path:``
500 # Differences to ``get_stylesheet_reference``:
501 # * return value is a list
502 # * no re-writing of the path (and therefore no optional argument)
503 # (if required, use ``utils.relative_path(source, target)``
504 # in the calling script)
def get_stylesheet_list(settings):
    """
    Return the stylesheet references from the settings object as a list.

    The comma-separated value of `settings.stylesheet_path` (preferred) or
    `settings.stylesheet` is split, and surrounding whitespace is removed
    from every item.  The result is empty if neither setting is set.
    """
    assert not (settings.stylesheet and settings.stylesheet_path), (
        'stylesheet and stylesheet_path are mutually exclusive.')
    chosen = settings.stylesheet_path or settings.stylesheet
    if not chosen:
        return []
    # strip whitespace (frequently occurring in config files)
    return [reference.strip(u' \t\n') for reference in chosen.split(",")]
def get_trim_footnote_ref_space(settings):
    """
    Return whether or not to trim footnote space.

    An explicit (non-None) `trim_footnote_reference_space` setting is
    returned as it is.

    Without an explicit setting, the result is True only if the settings
    have a `footnote_references` attribute equal to 'superscript'.
    """
    explicit = settings.trim_footnote_reference_space
    if explicit is not None:
        return explicit
    if not hasattr(settings, 'footnote_references'):
        return False
    return settings.footnote_references == 'superscript'
def get_source_line(node):
    """
    Return the "source" and "line" attributes of `node` or, if both are
    unset, of its closest ancestor that has at least one of them set.

    Return ``(None, None)`` if the search runs past the root.
    """
    current = node
    while current:
        source, line = current.source, current.line
        if source or line:
            return source, line
        current = current.parent
    return None, None
def escape2null(text):
    """Return a string with escape-backslashes converted to nulls.

    The character following an escape-backslash is copied verbatim, so an
    escaped backslash is not itself treated as an escape.
    """
    pieces = []
    position = 0
    backslash = text.find('\\', position)
    while backslash != -1:
        pieces.append(text[position:backslash])
        pieces.append('\x00' + text[backslash+1:backslash+2])
        position = backslash + 2    # skip character after escape
        backslash = text.find('\\', position)
    pieces.append(text[position:])
    return ''.join(pieces)
def unescape(text, restore_backslashes=0):
    """
    Return a string with nulls removed or restored to backslashes.
    Backslash-escaped spaces are also removed.

    With a true `restore_backslashes`, every null becomes a backslash.
    Otherwise nulls are dropped, together with a directly following space
    or newline.
    """
    if restore_backslashes:
        return text.replace('\x00', '\\')
    # escaped whitespace must go before the remaining lone nulls
    for marker in ('\x00 ', '\x00\n', '\x00'):
        text = text.replace(marker, '')
    return text
def strip_combining_chars(text):
    """Return Unicode string `text` without its combining characters.

    Byte strings (Python 2 `str`) are returned unchanged.
    """
    if sys.version_info < (3,0) and isinstance(text, str):
        return text
    base_chars = []
    for char in text:
        if not unicodedata.combining(char):
            base_chars.append(char)
    return u''.join(base_chars)
def find_combining_chars(text):
    """Return indices of all combining chars in Unicode string `text`.

    For byte strings (Python 2 `str`) the result is always empty.

    >>> find_combining_chars(u'A t̆ab̆lĕ')
    [3, 6, 9]

    """
    if sys.version_info < (3,0) and isinstance(text, str):
        return []
    positions = []
    for position, char in enumerate(text):
        if unicodedata.combining(char):
            positions.append(position)
    return positions
def column_indices(text):
    """Indices of Unicode string `text` when skipping combining characters.

    The result lists, in ascending order, every index of `text` that is not
    reported by `find_combining_chars()`.

    >>> column_indices(u'A t̆ab̆lĕ')
    [0, 1, 2, 4, 5, 7, 8]

    """
    # TODO: account for asian wide chars here instead of using dummy
    # replacements in the tableparser?
    # Filter instead of assigning into the result of range(): a Python 3
    # range object does not support item assignment.
    combining = set(find_combining_chars(text))
    return [index for index in range(len(text)) if index not in combining]
east_asian_widths = {'W': 2,   # Wide
                     'F': 2,   # Full-width (wide)
                     'Na': 1,  # Narrow
                     'H': 1,   # Half-width (narrow)
                     'N': 1,   # Neutral (not East Asian, treated as narrow)
                     'A': 1}   # Ambiguous (s/b wide in East Asian context,
                               # narrow otherwise, but that doesn't work)
"""Mapping of result codes from `unicodedata.east_asian_width()` to character
column widths."""
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    For byte strings (Python 2 `str`) this is simply ``len(text)``.
    """
    if sys.version_info < (3,0) and isinstance(text, str):
        return len(text)
    try:
        total = 0
        for char in text:
            total += east_asian_widths[unicodedata.east_asian_width(char)]
    except AttributeError:  # east_asian_width() New in version 2.4.
        total = len(text)
    # correction for combining chars:
    return total - len(find_combining_chars(text))
def uniq(L):
    """Return a list of the items of `L` without duplicates.

    The order of first occurrence is kept.  Items are compared with ``==``
    and need not be hashable.
    """
    kept = []
    for element in L:
        if element in kept:
            continue
        kept.append(element)
    return kept
632 # by Li Daobing http://code.activestate.com/recipes/190465/
633 # since Python 2.6 there is also itertools.combinations()
def unique_combinations(items, n):
    """Generate all `n`-element combinations of `items`, without repetition.

    Each combination is yielded as a list; the elements keep the order
    they have in `items`.  For ``n == 0`` a single empty list is yielded;
    if `n` exceeds ``len(items)`` nothing is yielded.
    """
    if n == 0:
        yield []
    else:
        # range() instead of the Python-2-only xrange()
        for i in range(len(items)-n+1):
            for cc in unique_combinations(items[i+1:], n-1):
                yield [items[i]] + cc
def normalize_language_tag(tag):
    """Return a list of normalized combinations for a `BCP 47` language tag.

    The tag is lower-cased and '-' is replaced by '_'.  The result starts
    with the combinations using the most subtags and ends with the bare
    base tag.

    Example:

    >>> normalize_language_tag('de-AT-1901')
    ['de_at_1901', 'de_at', 'de_1901', 'de']

    """
    # normalize:
    components = tag.lower().replace('-','_').split('_')
    base_tag = components[:1]
    subtags = components[1:]
    # find all combinations of subtags, longest first
    variants = []
    count = len(subtags)
    while count > 0:
        for chosen in unique_combinations(subtags, count):
            variants.append('_'.join(base_tag + chosen))
        count -= 1
    variants += base_tag
    return variants
class DependencyList(object):

    """
    List of dependencies, with file recording support.

    Note that the output file is not automatically closed.  You have
    to explicitly call the close() method.
    """

    def __init__(self, output_file=None, dependencies=()):
        """
        Initialize the dependency list, automatically setting the
        output file to `output_file` (see `set_output()`) and adding
        all supplied dependencies.
        """
        self.set_output(output_file)
        for i in dependencies:
            self.add(i)

    def set_output(self, output_file):
        """
        Set the output file and clear the list of already added
        dependencies.

        `output_file` must be a string.  The specified file is
        immediately overwritten.

        If output_file is '-', the output will be written to stdout.
        If it is None, no file output is done when calling add().
        """
        self.list = []
        if output_file:
            if output_file == '-':
                of = None
            else:
                of = output_file
            self.file = FileOutput(destination_path=of,
                                   encoding='utf8', autoclose=False)
        else:
            self.file = None

    def add(self, *filenames):
        """
        If the dependency `filename` has not already been added,
        append it to self.list and print it to self.file if self.file
        is not None.
        """
        for filename in filenames:
            if filename not in self.list:
                self.list.append(filename)
                if self.file is not None:
                    self.file.write(filename+'\n')

    def close(self):
        """
        Close the output file.

        Does nothing if no output file is set (or it was closed before).
        """
        if self.file is not None:
            self.file.close()
        self.file = None

    def __repr__(self):
        try:
            output_file = self.file.name
        except AttributeError:
            # no output file set, or the file object has no name
            output_file = None
        return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)