More robust guess of input/output encoding.
[docutils.git] / docutils / utils.py
bloba5d75734c743554584946a7df92d68968c635f45
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 Miscellaneous utilities for the documentation utilities.
7 """
9 __docformat__ = 'reStructuredText'
11 import sys
12 import os
13 import os.path
14 import warnings
15 import unicodedata
16 from docutils import ApplicationError, DataError
17 from docutils import nodes
18 from docutils._compat import bytes
class SystemMessage(ApplicationError):

    """
    Exception wrapping a system message together with its severity level.

    The exception text is whatever ``system_message.astext()`` returns; the
    numeric level is stored in the `level` attribute.
    """

    def __init__(self, system_message, level):
        text = system_message.astext()
        Exception.__init__(self, text)
        self.level = level
class SystemMessagePropagation(ApplicationError):

    """`ApplicationError` subclass that adds no behavior of its own."""
class Reporter:

    """
    Info/warning/error reporter and ``system_message`` element generator.

    Five levels of system messages are defined, along with corresponding
    methods: `debug()`, `info()`, `warning()`, `error()`, and `severe()`.

    There is typically one Reporter object per process.  A Reporter object is
    instantiated with thresholds for reporting (generating warnings) and
    halting processing (raising exceptions), a switch to turn debug output on
    or off, and an I/O stream for warnings.  These are stored as instance
    attributes.

    When a system message is generated, its level is compared to the stored
    thresholds, and a warning or error is generated as appropriate.  Debug
    messages are produced if the stored debug switch is on, independently of
    other thresholds.  Message output is sent to the stored warning stream if
    not set to ''.

    The Reporter class also employs a modified form of the "Observer" pattern
    [GoF95]_ to track system messages generated.  The `attach_observer` method
    should be called before parsing, with a bound method or function which
    accepts system messages.  The observer can be removed with
    `detach_observer`, and another added in its place.

    .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
       Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
       1995.
    """

    levels = 'DEBUG INFO WARNING ERROR SEVERE'.split()
    """List of names for system message levels, indexed by level."""

    # system message level constants:
    (DEBUG_LEVEL,
     INFO_LEVEL,
     WARNING_LEVEL,
     ERROR_LEVEL,
     SEVERE_LEVEL) = range(5)

    def __init__(self, source, report_level, halt_level, stream=None,
                 debug=0, encoding=None, error_handler='backslashreplace'):
        """
        :Parameters:
            - `source`: The path to or description of the source data.
            - `report_level`: The level at or above which warning output will
              be sent to `stream`.
            - `halt_level`: The level at or above which `SystemMessage`
              exceptions will be raised, halting execution.
            - `debug`: Show debug (level=0) system messages?
            - `stream`: Where warning output is sent.  Can be file-like (has a
              ``.write`` method), a string (file name, opened for writing),
              '' (empty string, for discarding all stream messages) or
              `None` (implies `sys.stderr`; default).
            - `encoding`: The output encoding.  If not given, the stream's
              ``encoding`` attribute is used; if that is missing or empty,
              'ascii' is used.
            - `error_handler`: The error handler for stderr output encoding.
        """

        self.source = source
        """The path to or description of the source data."""

        self.error_handler = error_handler
        """The character encoding error handler."""

        self.debug_flag = debug
        """Show debug (level=0) system messages?"""

        self.report_level = report_level
        """The level at or above which warning output will be sent
        to `self.stream`."""

        self.halt_level = halt_level
        """The level at or above which `SystemMessage` exceptions
        will be raised, halting execution."""

        if stream is None:
            stream = sys.stderr
        elif stream and isinstance(stream, (unicode, bytes)):
            # `stream` is a file name: open it.  isinstance() is used so that
            # string subclasses are recognised as file names as well.
            if isinstance(stream, bytes):
                stream = open(stream, 'w')
            else:
                stream = open(stream.encode(), 'w')

        self.stream = stream
        """Where warning output is sent."""

        # A stream may expose ``encoding = None``; fall back to ASCII in that
        # case too, otherwise the ``encode()`` call in `system_message()`
        # would be handed None as the codec name.
        self.encoding = (encoding or getattr(stream, 'encoding', None)
                         or 'ascii')
        """The output character encoding."""

        self.observers = []
        """List of bound methods or functions to call with each system_message
        created."""

        self.max_level = -1
        """The highest level system message generated so far."""

    def set_conditions(self, category, report_level, halt_level,
                       stream=None, debug=0):
        """
        Deprecated: reset the thresholds, stream and debug switch.

        Emits a `DeprecationWarning`.  `category` is accepted but not used.
        A `stream` of `None` selects `sys.stderr`.
        """
        warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
                      'set attributes via configuration settings or directly',
                      DeprecationWarning, stacklevel=2)
        self.report_level = report_level
        self.halt_level = halt_level
        if stream is None:
            stream = sys.stderr
        self.stream = stream
        self.debug_flag = debug

    def attach_observer(self, observer):
        """
        The `observer` parameter is a function or bound method which takes one
        argument, a `nodes.system_message` instance.
        """
        self.observers.append(observer)

    def detach_observer(self, observer):
        """Remove a previously attached `observer`."""
        self.observers.remove(observer)

    def notify_observers(self, message):
        """Call every attached observer with `message`."""
        for observer in self.observers:
            observer(message)

    def system_message(self, level, message, *children, **kwargs):
        """
        Return a system_message object.

        Raise an exception or generate a warning if appropriate.
        """
        # `message` can be a `string`, `unicode`, or `Exception` instance.
        # Convert now to detect errors:
        try:
            message = unicode(message)
        except UnicodeError:
            # Fetch the exception through sys.exc_info() so that this clause
            # parses on every supported Python version.
            err = sys.exc_info()[1]
            # In Python < 2.6, unicode(<exception instance>) uses __str__
            # and fails with non-ASCII chars in arguments
            if sys.version_info < (2,6):
                try:
                    message = u', '.join(message.args)
                except AttributeError:
                    raise err
            else:
                raise err

        attributes = kwargs.copy()
        if 'base_node' in kwargs:
            source, line = get_source_line(kwargs['base_node'])
            del attributes['base_node']
            if source is not None:
                attributes.setdefault('source', source)
            if line is not None:
                attributes.setdefault('line', line)
        if not 'source' in attributes: # 'line' is absolute line number
            try: # look up (source, line-in-source)
                source, line = self.locator(attributes.get('line'))
            except AttributeError:
                # `self.locator` is not set by this class; without it there
                # is nothing to look up.
                source, line = None, None
            if source is not None:
                attributes['source'] = source
            if line is not None:
                attributes['line'] = line
        attributes.setdefault('source', self.source)

        msg = nodes.system_message(message, level=level,
                                   type=self.levels[level],
                                   *children, **attributes)
        if self.stream and (level >= self.report_level
                            or self.debug_flag and level == self.DEBUG_LEVEL
                            or level >= self.halt_level):
            msgtext = msg.astext() + '\n'
            try:
                self.stream.write(msgtext)
            except UnicodeEncodeError:
                # The stream could not encode the text itself: encode
                # explicitly with the configured codec and error handler.
                self.stream.write(msgtext.encode(self.encoding,
                                                 self.error_handler))
        if level >= self.halt_level:
            raise SystemMessage(msg, level)
        if level > self.DEBUG_LEVEL or self.debug_flag:
            self.notify_observers(msg)
        self.max_level = max(level, self.max_level)
        return msg

    def debug(self, *args, **kwargs):
        """
        Level-0, "DEBUG": an internal reporting issue. Typically, there is no
        effect on the processing. Level-0 system messages are handled
        separately from the others.

        Returns `None` when the debug switch is off.
        """
        if self.debug_flag:
            return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)

    def info(self, *args, **kwargs):
        """
        Level-1, "INFO": a minor issue that can be ignored. Typically there is
        no effect on processing, and level-1 system messages are not reported.
        """
        return self.system_message(self.INFO_LEVEL, *args, **kwargs)

    def warning(self, *args, **kwargs):
        """
        Level-2, "WARNING": an issue that should be addressed. If ignored,
        there may be unpredictable problems with the output.
        """
        return self.system_message(self.WARNING_LEVEL, *args, **kwargs)

    def error(self, *args, **kwargs):
        """
        Level-3, "ERROR": an error that should be addressed. If ignored, the
        output will contain errors.
        """
        return self.system_message(self.ERROR_LEVEL, *args, **kwargs)

    def severe(self, *args, **kwargs):
        """
        Level-4, "SEVERE": a severe error that must be addressed. If ignored,
        the output will contain severe errors. Typically level-4 system
        messages are turned into exceptions which halt processing.
        """
        return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
class ExtensionOptionError(DataError):

    """Base class of the extension option errors defined below."""


class BadOptionError(ExtensionOptionError):

    """Raised for invalid extension option fields."""


class BadOptionDataError(ExtensionOptionError):

    """Raised for invalid extension option data."""


class DuplicateOptionError(ExtensionOptionError):

    """Raised when an extension option is given more than once."""
def extract_extension_options(field_list, options_spec):
    """
    Convert a field list into a dictionary of extension options.

    The (name, value) pairs are read with `extract_options()` and then
    converted with `assemble_option_dict()`.

    :Parameters:
        - `field_list`: A flat field list without field arguments, where each
          field body consists of a single paragraph only.
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `ValueError` for invalid option values (raised by the conversion
          function).
        - `TypeError` for invalid option value types (raised by conversion
          function).
        - `DuplicateOptionError` for duplicate options.
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    return assemble_option_dict(extract_options(field_list), options_spec)
def extract_options(field_list):
    """
    Collect (name, value) option pairs from the fields of `field_list`.

    The name is the lower-cased text of the field name; the value is the
    text of the field body, or `None` when the body is empty.

    :Parameter:
        `field_list`: A flat field list, where each field name is a single
        word and each field body consists of a single paragraph only.

    :Exceptions:
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    pairs = []
    for field in field_list:
        name_text = field[0].astext()
        if len(name_text.split()) != 1:
            raise BadOptionError(
                'extension option field name may not contain multiple words')
        name = str(name_text.lower())
        body = field[1]
        if len(body) == 0:
            data = None
        else:
            # Only a body made of exactly one paragraph holding exactly one
            # text node is accepted.
            single_text_paragraph = (
                len(body) == 1
                and isinstance(body[0], nodes.paragraph)
                and len(body[0]) == 1
                and isinstance(body[0][0], nodes.Text))
            if not single_text_paragraph:
                raise BadOptionDataError(
                    'extension option field body may contain\n'
                    'a single paragraph only (option "%s")' % name)
            data = body[0][0].astext()
        pairs.append((name, data))
    return pairs
def assemble_option_dict(option_list, options_spec):
    """
    Build a dictionary of converted option values.

    :Parameters:
        - `option_list`: A list of (name, value) pairs (the output of
          `extract_options()`).
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `DuplicateOptionError` for duplicate options.
        - `ValueError` for invalid option values (raised by conversion
          function).
        - `TypeError` for invalid option value types (raised by conversion
          function).
    """
    converted = {}
    for name, value in option_list:
        convertor = options_spec[name]  # raises KeyError if unknown
        if convertor is None:
            raise KeyError(name)        # or if explicitly disabled
        if name in converted:
            raise DuplicateOptionError('duplicate option "%s"' % name)
        try:
            converted[name] = convertor(value)
        except (ValueError, TypeError):
            # The exception is fetched via sys.exc_info() so that the clause
            # parses on every Python version.
            detail = sys.exc_info()[1]
            # Re-raise the same exception type, prefixed with the option
            # name and the offending value.
            raise detail.__class__('(option: "%s"; value: %r)\n%s'
                                   % (name, value, ' '.join(detail.args)))
    return converted
class NameValueError(DataError):

    """Raised by `extract_name_value()` for invalid "name=value" input."""
def decode_path(path):
    """
    Ensure `path` is Unicode.

    A `unicode` argument is returned unchanged.  Anything else is decoded in
    a failsafe manner and returned as a `nodes.reprunicode` object: first
    with the file system encoding, then as UTF-8, and finally as ASCII with
    undecodable bytes replaced.  An object without a ``decode`` method (such
    as the default value `None`) is passed to `nodes.reprunicode` as is.
    """
    # see also http://article.gmane.org/gmane.text.docutils.user/2905
    if isinstance(path, unicode):
        return path
    try:
        decoded = path.decode(sys.getfilesystemencoding(), 'strict')
    except AttributeError: # default value None has no decode method
        decoded = path
    except UnicodeDecodeError:
        try:
            decoded = path.decode('utf-8', 'strict')
        except UnicodeDecodeError:
            decoded = path.decode('ascii', 'replace')
    return nodes.reprunicode(decoded)
def extract_name_value(line):
    """
    Parse a line of the form "name=value ..." into (name, value) pairs.

    Names are lower-cased.  A value is either a run of characters up to the
    next space, or a string enclosed in single or double quotes.

    :Exception:
        `NameValueError` for invalid input (missing name, missing data, bad
        quotes, etc.).
    """
    pairs = []
    while line:
        if '=' not in line:
            raise NameValueError('missing "="')
        halves = line.split('=', 1)
        attname = halves[0].strip()
        if not attname:
            raise NameValueError(
                'missing attribute name before "="')
        line = halves[1].lstrip()
        if not line:
            raise NameValueError(
                'missing value after "%s="' % attname)
        quote = line[0]
        if quote == '"' or quote == "'":
            endquote = line.find(quote, 1)
            if endquote == -1:
                raise NameValueError(
                    'attribute "%s" missing end quote (%s)'
                    % (attname, quote))
            # The character after the closing quote, if there is one, must
            # be whitespace.
            if line[endquote+1:endquote+2].strip():
                raise NameValueError(
                    'attribute "%s" end quote (%s) not followed by '
                    'whitespace' % (attname, quote))
            data = line[1:endquote]
            line = line[endquote+1:].lstrip()
        else:
            words = line.split(' ', 1)
            data = words[0]
            if len(words) == 2:
                line = words[1].lstrip()
            else:
                line = ''
        pairs.append((attname.lower(), data))
    return pairs
def new_reporter(source_path, settings):
    """
    Create a `Reporter` configured from `settings`.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.  The attributes read are `report_level`,
            `halt_level`, `warning_stream`, `debug`, `error_encoding` and
            `error_encoding_error_handler`.
    """
    return Reporter(
        source=source_path,
        report_level=settings.report_level,
        halt_level=settings.halt_level,
        stream=settings.warning_stream,
        debug=settings.debug,
        encoding=settings.error_encoding,
        error_handler=settings.error_encoding_error_handler)
def new_document(source_path, settings=None):
    """
    Create and return an empty document object.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.  If none are provided, a default core set will
            be used.  If you will use the document object with any Docutils
            components, you must provide their default settings as well.  For
            example, if parsing, at least provide the parser settings,
            obtainable as follows::

                settings = docutils.frontend.OptionParser(
                    components=(docutils.parsers.rst.Parser,)
                    ).get_default_values()
    """
    from docutils import frontend
    if settings is None:
        settings = frontend.OptionParser().get_default_values()
    decoded_path = decode_path(source_path)
    doc = nodes.document(settings, new_reporter(decoded_path, settings),
                         source=decoded_path)
    doc.note_source(decoded_path, -1)
    return doc
def clean_rcs_keywords(paragraph, keyword_substitutions):
    """
    Apply the first matching keyword substitution to `paragraph`, in place.

    Nothing is done unless `paragraph` holds exactly one `nodes.Text` child.
    `keyword_substitutions` is a sequence of (compiled pattern, substitution)
    pairs; only the first pattern that matches is applied.
    """
    if len(paragraph) != 1 or not isinstance(paragraph[0], nodes.Text):
        return
    text = paragraph[0]
    for pattern, substitution in keyword_substitutions:
        if pattern.search(text):
            paragraph[0] = nodes.Text(pattern.sub(substitution, text))
            return
def relative_path(source, target):
    """
    Build and return a path to `target`, relative to `source` (both files).

    If there is no common prefix, return the absolute path to `target`.
    Path components are always joined with '/'.
    """
    source_parts = os.path.abspath(source or 'dummy_file').split(os.sep)
    target_parts = os.path.abspath(target).split(os.sep)
    # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
    if source_parts[:2] != target_parts[:2]:
        # Nothing in common between paths.
        # Return absolute path, using '/' for URLs:
        return '/'.join(target_parts)
    # Count the leading path components the two paths share:
    shared = 0
    limit = min(len(source_parts), len(target_parts))
    while shared < limit and source_parts[shared] == target_parts[shared]:
        shared += 1
    # One '..' for every remaining source directory (the last remaining
    # source component is the file itself, hence the -1):
    climb = ['..'] * (len(source_parts) - shared - 1)
    return '/'.join(climb + target_parts[shared:])
def get_stylesheet_reference(settings, relative_to=None):
    """
    Retrieve a stylesheet reference from the settings object.

    Deprecated. Use get_stylesheet_list() instead to
    enable specification of multiple stylesheets as a comma-separated
    list.

    If `settings.stylesheet_path` is set, it is returned rewritten relative
    to `relative_to` (default: `settings._destination`); otherwise
    `settings.stylesheet` is returned unchanged.
    """
    if settings.stylesheet_path:
        assert not settings.stylesheet, (
            'stylesheet and stylesheet_path are mutually exclusive.')
        if relative_to is None:
            relative_to = settings._destination
        return relative_path(relative_to, settings.stylesheet_path)
    else:
        return settings.stylesheet
515 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
517 # The original settings arguments are kept unchanged: you can test
518 # with e.g. ``if settings.stylesheet_path:``
520 # Differences to ``get_stylesheet_reference``:
521 # * return value is a list
522 # * no re-writing of the path (and therefore no optional argument)
523 # (if required, use ``utils.relative_path(source, target)``
524 # in the calling script)
def get_stylesheet_list(settings):
    """
    Return the stylesheet references from the settings object as a list.

    The comma-separated value of `settings.stylesheet_path` or, failing
    that, `settings.stylesheet` is split; each entry has surrounding
    spaces, tabs and newlines removed.  The result is an empty list if
    neither setting has a value.
    """
    assert not (settings.stylesheet and settings.stylesheet_path), (
            'stylesheet and stylesheet_path are mutually exclusive.')
    raw = settings.stylesheet_path or settings.stylesheet
    if not raw:
        return []
    # strip whitespace (frequently occuring in config files)
    cleaned = []
    for entry in raw.split(","):
        cleaned.append(entry.strip(u' \t\n'))
    return cleaned
def get_trim_footnote_ref_space(settings):
    """
    Return whether or not to trim footnote space.

    An explicit `trim_footnote_reference_space` setting (anything but
    `None`) is returned as is.  When it is `None`, the result is true only
    if the settings have a `footnote_references` value of 'superscript'.
    """
    explicit = settings.trim_footnote_reference_space
    if explicit is not None:
        return explicit
    return getattr(settings, 'footnote_references', None) == 'superscript'
def get_source_line(node):
    """
    Return the "source" and "line" attributes from the `node` given or from
    its closest ancestor.

    The search follows the ``parent`` links until a node with a true
    `source` or `line` is found; ``(None, None)`` is returned otherwise.
    """
    # Climb while the current node carries neither attribute:
    while node and not (node.source or node.line):
        node = node.parent
    if node:
        return node.source, node.line
    return None, None
def escape2null(text):
    """Return a string with escape-backslashes converted to nulls.

    The character following an escape-backslash is copied unchanged and is
    never treated as an escape itself.
    """
    pieces = []
    pos = 0
    found = text.find('\\')
    while found != -1:
        pieces.append(text[pos:found])
        pieces.append('\x00' + text[found+1:found+2])
        pos = found + 2             # skip character after escape
        found = text.find('\\', pos)
    pieces.append(text[pos:])
    return ''.join(pieces)
def unescape(text, restore_backslashes=0):
    """
    Return a string with nulls removed or restored to backslashes.
    Backslash-escaped spaces are also removed.

    With a true `restore_backslashes`, every null becomes a backslash and
    nothing else is changed.
    """
    if restore_backslashes:
        return text.replace('\x00', '\\')
    # Null-escaped spaces and newlines go first, then any remaining nulls.
    for sep in ('\x00 ', '\x00\n', '\x00'):
        text = text.replace(sep, '')
    return text
east_asian_widths = {
    'W': 2,     # Wide
    'F': 2,     # Full-width (wide)
    'Na': 1,    # Narrow
    'H': 1,     # Half-width (narrow)
    'N': 1,     # Neutral (not East Asian, treated as narrow)
    'A': 1,     # Ambiguous (s/b wide in East Asian context,
                # narrow otherwise, but that doesn't work)
    }
"""Mapping of result codes from `unicodedata.east_asian_width()` to character
column widths."""
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    Byte strings under Python 2 are measured with plain ``len()``.
    """
    if isinstance(text, str) and sys.version_info < (3,0):
        return len(text)
    # Each combining character takes one column off the total:
    combining = len([c for c in text if unicodedata.combining(c)])
    try:
        width = 0
        for c in text:
            width += east_asian_widths[unicodedata.east_asian_width(c)]
    except AttributeError:  # east_asian_width() New in version 2.4.
        width = len(text)
    return width - combining
def uniq(L):
    """
    Return a list of the items of `L` without duplicates.

    The first occurrence of each item is kept and the original order is
    preserved.  Items are compared with ``==``; they need not be hashable.
    """
    kept = []
    for candidate in L:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
624 # by Li Daobing http://code.activestate.com/recipes/190465/
625 # since Python 2.6 there is also itertools.combinations()
def unique_combinations(items, n):
    """Yield every `n`-length combination of `items` as a list.

    Elements keep the order they have in `items`; no position is used twice
    within a combination.
    """
    if n == 0:
        yield []
        return
    for index in range(len(items) - n + 1):
        head = [items[index]]
        # Complete the combination from the items after `index`:
        for tail in unique_combinations(items[index+1:], n - 1):
            yield head + tail
def normalize_language_tag(tag):
    """Return a list of normalized combinations for a `BCP 47` language tag.

    The tag is lower-cased and '-' is replaced by '_'.  The result lists the
    base tag combined with every selection of subtags, longest first, and
    ends with the base tag alone.

    Example:

    >>> normalize_language_tag('de-AT-1901')
    ['de_at_1901', 'de_at', 'de_1901', 'de']
    """
    parts = tag.lower().replace('-', '_').split('_')
    base_tag = parts[:1]
    subtags = parts[1:]
    combinations = []
    count = len(subtags)
    while count > 0:
        for chosen in unique_combinations(subtags, count):
            combinations.append('_'.join(base_tag + chosen))
        count -= 1
    combinations.extend(base_tag)
    return combinations
class DependencyList:

    """
    List of dependencies, with file recording support.

    Note that the output file is not automatically closed.  You have
    to explicitly call the close() method.
    """

    def __init__(self, output_file=None, dependencies=()):
        """
        Initialize the dependency list, automatically setting the
        output file to `output_file` (see `set_output()`) and adding
        all supplied dependencies.
        """
        self.set_output(output_file)
        for i in dependencies:
            self.add(i)

    def set_output(self, output_file):
        """
        Set the output file and clear the list of already added
        dependencies.

        `output_file` must be a string.  The specified file is
        immediately overwritten.

        If output_file is '-', the output will be written to stdout.
        If it is None, no file output is done when calling add().
        """
        self.list = []
        if output_file == '-':
            self.file = sys.stdout
        elif output_file:
            self.file = open(output_file, 'w')
        else:
            self.file = None

    def add(self, *filenames):
        """
        If the dependency `filename` has not already been added,
        append it to self.list and print it to self.file if self.file
        is not None.
        """
        for filename in filenames:
            if not filename in self.list:
                self.list.append(filename)
                if self.file is not None:
                    print >>self.file, filename

    def close(self):
        """
        Close the output file.

        Does nothing if no output file is set.  `sys.stdout` is released but
        not closed, so that later output by the process still works.
        """
        if self.file is not None and self.file is not sys.stdout:
            self.file.close()
        self.file = None

    def __repr__(self):
        if self.file:
            output_file = self.file.name
        else:
            output_file = None
        return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)