normalize_language_tag() now returns `BCP 47`_ conformant tags
[docutils.git] / docutils / utils / __init__.py
blob1aead4884694b2eaa8fbb395b6e6441a2cafdf35
1 # coding: utf8
2 # $Id$
3 # Author: David Goodger <goodger@python.org>
4 # Copyright: This module has been placed in the public domain.
6 """
7 Miscellaneous utilities for the documentation utilities.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import os.path
15 import re
16 import warnings
17 import unicodedata
18 from docutils import ApplicationError, DataError
19 from docutils import nodes
20 from docutils.io import FileOutput
21 from docutils.utils.error_reporting import ErrorOutput, SafeString
class SystemMessage(ApplicationError):

    """
    Exception raised for a system message, remembering its severity level.

    The exception text is the plain-text rendering (``astext()``) of the
    `system_message` object passed in; `level` is stored unchanged on the
    instance.
    """

    def __init__(self, system_message, level):
        text = system_message.astext()
        Exception.__init__(self, text)
        self.level = level
class SystemMessagePropagation(ApplicationError):

    """`ApplicationError` subclass that adds no behaviour of its own."""
class Reporter:

    """
    Info/warning/error reporter and ``system_message`` element generator.

    Five levels of system messages exist, each with a matching method:
    `debug()`, `info()`, `warning()`, `error()`, and `severe()`.

    Typically a process has a single Reporter object.  It is created with
    a threshold for reporting (generating warnings), a threshold for
    halting processing (raising exceptions), a switch turning debug output
    on or off, and an I/O stream for warnings.  All of these are kept as
    instance attributes.

    Whenever a system message is generated, its level is compared with the
    stored thresholds and a warning or error is produced as appropriate.
    Debug messages are produced when the stored debug switch is on,
    independently of the other thresholds.  Message output goes to the
    stored warning stream unless that is set to ''.

    The class also uses a modified form of the "Observer" pattern [GoF95]_
    to track generated system messages.  Call `attach_observer` before
    parsing, passing a function or bound method that accepts system
    messages.  `detach_observer` removes an observer again, so another one
    can be added in its place.

    .. [GoF95] Gamma, Helm, Johnson, Vlissides. *Design Patterns: Elements of
       Reusable Object-Oriented Software*. Addison-Wesley, Reading, MA, USA,
       1995.
    """

    levels = 'DEBUG INFO WARNING ERROR SEVERE'.split()
    """Names of the system message levels; the list index is the level."""

    # Numeric system message level constants (0 .. 4):
    DEBUG_LEVEL, INFO_LEVEL, WARNING_LEVEL, ERROR_LEVEL, SEVERE_LEVEL = \
        range(5)

    def __init__(self, source, report_level, halt_level, stream=None,
                 debug=False, encoding=None, error_handler='backslashreplace'):
        """
        :Parameters:
            - `source`: The path to or description of the source data.
            - `report_level`: The level at or above which warning output will
              be sent to `stream`.
            - `halt_level`: The level at or above which `SystemMessage`
              exceptions will be raised, halting execution.
            - `debug`: Show debug (level=0) system messages?
            - `stream`: Where warning output is sent.  Can be file-like (has a
              ``.write`` method), a string (file name, opened for writing),
              '' (empty string) or `False` (for discarding all stream messages)
              or `None` (implies `sys.stderr`; default).
            - `encoding`: The output encoding.
            - `error_handler`: The error handler for stderr output encoding.
        """

        self.source = source
        """Path to, or description of, the source data."""

        self.error_handler = error_handler
        """Character encoding error handler."""

        self.debug_flag = debug
        """Whether debug (level=0) system messages are shown."""

        self.report_level = report_level
        """Level at or above which warning output is sent to
        `self.stream`."""

        self.halt_level = halt_level
        """Level at or above which `SystemMessage` exceptions are raised,
        halting execution."""

        # Anything that is not already an ErrorOutput gets wrapped in one.
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, encoding, error_handler)

        self.stream = stream
        """Destination of warning output."""

        # Evaluated after wrapping: the fallback is read from the wrapper.
        self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
        """Output character encoding."""

        self.observers = []
        """Functions or bound methods called with every system_message
        created."""

        self.max_level = -1
        """Highest system message level generated so far."""

    def set_conditions(self, category, report_level, halt_level,
                       stream=None, debug=False):
        """
        Deprecated: replace thresholds, stream and debug switch in one call.

        Emits a `DeprecationWarning`.  The `category` argument is accepted
        but not used.
        """
        warnings.warn('docutils.utils.Reporter.set_conditions deprecated; '
                      'set attributes via configuration settings or directly',
                      DeprecationWarning, stacklevel=2)
        if not isinstance(stream, ErrorOutput):
            stream = ErrorOutput(stream, self.encoding, self.error_handler)
        self.stream = stream
        self.report_level = report_level
        self.halt_level = halt_level
        self.debug_flag = debug

    def attach_observer(self, observer):
        """
        Register `observer`, a function or bound method taking a single
        argument (a `nodes.system_message` instance).
        """
        self.observers.append(observer)

    def detach_observer(self, observer):
        """Unregister a previously attached `observer`."""
        self.observers.remove(observer)

    def notify_observers(self, message):
        """Call every attached observer with `message`, in attachment order."""
        for callback in self.observers:
            callback(message)

    def system_message(self, level, message, *children, **kwargs):
        """
        Return a system_message object.

        Raise an exception or generate a warning if appropriate.
        """
        # `message` can be a `string`, `unicode`, or `Exception` instance.
        if isinstance(message, Exception):
            message = SafeString(message)

        attributes = kwargs.copy()
        if 'base_node' in kwargs:
            # Location found via the node never overrides explicit keywords.
            node_source, node_line = get_source_line(kwargs['base_node'])
            del attributes['base_node']
            for key, value in (('source', node_source), ('line', node_line)):
                if value is not None:
                    attributes.setdefault(key, value)
        if 'source' not in attributes:
            # 'line' is an absolute line number here; try to map it to
            # (source, line-in-source).  `get_source_and_line` is not defined
            # by this class -- presumably attached from outside; when it is
            # absent the AttributeError is swallowed.
            try:
                found_source, found_line = self.get_source_and_line(
                    attributes.get('line'))
            except AttributeError:
                found_source, found_line = None, None
            if found_source is not None:
                attributes['source'] = found_source
            if found_line is not None:
                attributes['line'] = found_line
        # Last resort: the reporter's own source description.
        attributes.setdefault('source', self.source)

        msg = nodes.system_message(message, level=level,
                                   type=self.levels[level],
                                   *children, **attributes)
        if self.stream:
            if (level >= self.report_level
                or self.debug_flag and level == self.DEBUG_LEVEL
                or level >= self.halt_level):
                self.stream.write(msg.astext() + '\n')
        if level >= self.halt_level:
            raise SystemMessage(msg, level)
        if level > self.DEBUG_LEVEL or self.debug_flag:
            self.notify_observers(msg)
        self.max_level = max(level, self.max_level)
        return msg

    def debug(self, *args, **kwargs):
        """
        Level-0, "DEBUG": an internal reporting issue. Typically, there is no
        effect on the processing. Level-0 system messages are handled
        separately from the others.

        Returns `None` without creating a message when the debug switch
        is off.
        """
        if not self.debug_flag:
            return None
        return self.system_message(self.DEBUG_LEVEL, *args, **kwargs)

    def info(self, *args, **kwargs):
        """
        Level-1, "INFO": a minor issue that can be ignored. Typically there is
        no effect on processing, and level-1 system messages are not reported.
        """
        return self.system_message(self.INFO_LEVEL, *args, **kwargs)

    def warning(self, *args, **kwargs):
        """
        Level-2, "WARNING": an issue that should be addressed. If ignored,
        there may be unpredictable problems with the output.
        """
        return self.system_message(self.WARNING_LEVEL, *args, **kwargs)

    def error(self, *args, **kwargs):
        """
        Level-3, "ERROR": an error that should be addressed. If ignored, the
        output will contain errors.
        """
        return self.system_message(self.ERROR_LEVEL, *args, **kwargs)

    def severe(self, *args, **kwargs):
        """
        Level-4, "SEVERE": a severe error that must be addressed. If ignored,
        the output will contain severe errors. Typically level-4 system
        messages are turned into exceptions which halt processing.
        """
        return self.system_message(self.SEVERE_LEVEL, *args, **kwargs)
class ExtensionOptionError(DataError):
    """Base class of the extension option errors defined below."""


class BadOptionError(ExtensionOptionError):
    """Raised for invalid extension option fields."""


class BadOptionDataError(ExtensionOptionError):
    """Raised for invalid extension option data."""


class DuplicateOptionError(ExtensionOptionError):
    """Raised when an extension option occurs more than once."""
def extract_extension_options(field_list, options_spec):
    """
    Return a dictionary mapping extension option names to converted values.

    This is `extract_options()` followed by `assemble_option_dict()`.

    :Parameters:
        - `field_list`: A flat field list without field arguments, where each
          field body consists of a single paragraph only.
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `ValueError` for invalid option values (raised by the conversion
           function).
        - `TypeError` for invalid option value types (raised by conversion
           function).
        - `DuplicateOptionError` for duplicate options.
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    return assemble_option_dict(extract_options(field_list), options_spec)
def extract_options(field_list):
    """
    Return a list of option (name, value) pairs from field names & bodies.

    Names are lower-cased; the value is `None` for an empty field body.

    :Parameter:
        `field_list`: A flat field list, where each field name is a single
        word and each field body consists of a single paragraph only.

    :Exceptions:
        - `BadOptionError` for invalid fields.
        - `BadOptionDataError` for invalid option data (missing name,
          missing data, bad quotes, etc.).
    """
    pairs = []
    for field in field_list:
        raw_name = field[0].astext()
        if len(raw_name.split()) != 1:
            raise BadOptionError(
                'extension option field name may not contain multiple words')
        name = str(raw_name.lower())
        body = field[1]
        if len(body) == 0:
            data = None
        elif (len(body) == 1 and isinstance(body[0], nodes.paragraph)
              and len(body[0]) == 1 and isinstance(body[0][0], nodes.Text)):
            # Exactly one paragraph holding exactly one text node.
            data = body[0][0].astext()
        else:
            raise BadOptionDataError(
                'extension option field body may contain\n'
                'a single paragraph only (option "%s")' % name)
        pairs.append((name, data))
    return pairs
def assemble_option_dict(option_list, options_spec):
    """
    Return a mapping of option names to values.

    :Parameters:
        - `option_list`: A list of (name, value) pairs (the output of
          `extract_options()`).
        - `options_spec`: Dictionary mapping known option names to a
          conversion function such as `int` or `float`.

    :Exceptions:
        - `KeyError` for unknown option names.
        - `DuplicateOptionError` for duplicate options.
        - `ValueError` for invalid option values (raised by conversion
           function).
        - `TypeError` for invalid option value types (raised by conversion
           function).
    """
    options = {}
    for name, value in option_list:
        convertor = options_spec[name]  # raises KeyError if unknown
        if convertor is None:
            raise KeyError(name)        # or if explicitly disabled
        if name in options:
            raise DuplicateOptionError('duplicate option "%s"' % name)
        try:
            options[name] = convertor(value)
        except (ValueError, TypeError):
            # Fetch the exception via sys.exc_info(): unlike the
            # ``except X, name`` / ``except X as name`` spellings, this is
            # valid syntax on every Python version.
            detail = sys.exc_info()[1]
            # Re-raise the same exception type, prefixed with the option
            # name and the offending value.
            raise detail.__class__('(option: "%s"; value: %r)\n%s'
                                   % (name, value, ' '.join(detail.args)))
    return options
class NameValueError(DataError):
    """Raised by `extract_name_value()` for malformed "name=value" input."""
def decode_path(path):
    """
    Ensure `path` is Unicode.

    A `path` that already is a `unicode` object is returned unchanged.
    Anything else is decoded in a fail-safe manner and returned wrapped in
    `nodes.reprunicode`.  Decoding tries the file system encoding first,
    then UTF-8, and finally ASCII with replacement characters.
    """
    # see also http://article.gmane.org/gmane.text.docutils.user/2905
    if isinstance(path, unicode):
        return path
    try:
        decoded = path.decode(sys.getfilesystemencoding(), 'strict')
    except AttributeError:  # default value None has no decode method
        decoded = path
    except UnicodeDecodeError:
        try:
            decoded = path.decode('utf-8', 'strict')
        except UnicodeDecodeError:
            decoded = path.decode('ascii', 'replace')
    return nodes.reprunicode(decoded)
def extract_name_value(line):
    """
    Return a list of (name, value) from a line of the form "name=value ...".

    Names are lower-cased.  A value may be enclosed in single or double
    quotes (the quotes are dropped); an unquoted value ends at the next
    space character.

    :Exception:
        `NameValueError` for invalid input (missing name, missing data, bad
        quotes, etc.).
    """
    pairs = []
    remaining = line
    while remaining:
        if '=' not in remaining:
            raise NameValueError('missing "="')
        raw_name, remaining = remaining.split('=', 1)
        attname = raw_name.strip()
        if not attname:
            raise NameValueError(
                'missing attribute name before "="')
        remaining = remaining.lstrip()
        if not remaining:
            raise NameValueError(
                'missing value after "%s="' % attname)
        quote = remaining[0]
        if quote in '\'"':
            endquote = remaining.find(quote, 1)
            if endquote == -1:
                raise NameValueError(
                    'attribute "%s" missing end quote (%s)'
                    % (attname, quote))
            trailing = remaining[endquote+1:]
            # The closing quote must end the line or be followed by
            # whitespace.
            if trailing and trailing[0].strip():
                raise NameValueError(
                    'attribute "%s" end quote (%s) not followed by '
                    'whitespace' % (attname, quote))
            data = remaining[1:endquote]
            remaining = trailing.lstrip()
        else:
            chunks = remaining.split(' ', 1)
            data = chunks[0]
            if len(chunks) == 2:
                remaining = chunks[1].lstrip()
            else:
                remaining = ''
        pairs.append((attname.lower(), data))
    return pairs
def new_reporter(source_path, settings):
    """
    Return a new Reporter object.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.
    """
    return Reporter(source_path,
                    settings.report_level,
                    settings.halt_level,
                    stream=settings.warning_stream,
                    debug=settings.debug,
                    encoding=settings.error_encoding,
                    error_handler=settings.error_encoding_error_handler)
def new_document(source_path, settings=None):
    """
    Return a new empty document object.

    :Parameters:
        `source_path` : string
            The path to or description of the source text of the document.
        `settings` : optparse.Values object
            Runtime settings.  If none are provided, a default core set will
            be used.  If you will use the document object with any Docutils
            components, you must provide their default settings as well.  For
            example, if parsing, at least provide the parser settings,
            obtainable as follows::

                settings = docutils.frontend.OptionParser(
                    components=(docutils.parsers.rst.Parser,)
                    ).get_default_values()
    """
    from docutils import frontend
    if settings is None:
        settings = frontend.OptionParser().get_default_values()
    decoded_path = decode_path(source_path)
    document = nodes.document(settings,
                              new_reporter(decoded_path, settings),
                              source=decoded_path)
    document.note_source(decoded_path, -1)
    return document
def clean_rcs_keywords(paragraph, keyword_substitutions):
    """
    Apply the first matching keyword substitution to `paragraph`, in place.

    Nothing happens unless `paragraph` consists of exactly one `nodes.Text`
    child.  `keyword_substitutions` is a sequence of (pattern, substitution)
    pairs; each pattern must provide ``search()`` and ``sub()``.  Only the
    first pattern that matches is applied.
    """
    if len(paragraph) != 1 or not isinstance(paragraph[0], nodes.Text):
        return
    textnode = paragraph[0]
    for pattern, substitution in keyword_substitutions:
        if pattern.search(textnode):
            paragraph[0] = nodes.Text(pattern.sub(substitution, textnode))
            return
def relative_path(source, target):
    """
    Build and return a path to `target`, relative to `source` (both files).

    If there is no common prefix, return the absolute path to `target`.
    The result always uses '/' as separator.  A false `source` (e.g. `None`)
    is replaced by the dummy file name 'dummy_file'.
    """
    source_name = source or type(target)('dummy_file')
    source_parts = os.path.abspath(source_name).split(os.sep)
    target_parts = os.path.abspath(target).split(os.sep)
    # Check first 2 parts because '/dir'.split('/') == ['', 'dir']:
    if source_parts[:2] != target_parts[:2]:
        # Nothing in common between paths.
        # Return absolute path, using '/' for URLs:
        return '/'.join(target_parts)
    # Count the leading path components both paths share:
    common = 0
    limit = min(len(source_parts), len(target_parts))
    while common < limit and source_parts[common] == target_parts[common]:
        common += 1
    # One '..' per remaining source directory (the last remaining part is
    # the source file itself, hence the -1):
    ups = ['..'] * (len(source_parts) - common - 1)
    return '/'.join(ups + target_parts[common:])
def get_stylesheet_reference(settings, relative_to=None):
    """
    Retrieve a stylesheet reference from the settings object.

    With `settings.stylesheet_path` set, return that path relative to
    `relative_to` (default: `settings._destination`); otherwise return
    `settings.stylesheet`.

    Deprecated. Use get_stylesheet_list() instead to
    enable specification of multiple stylesheets as a comma-separated
    list.
    """
    if not settings.stylesheet_path:
        return settings.stylesheet
    assert not settings.stylesheet, (
        'stylesheet and stylesheet_path are mutually exclusive.')
    if relative_to == None:
        relative_to = settings._destination
    return relative_path(relative_to, settings.stylesheet_path)
497 # Return 'stylesheet' or 'stylesheet_path' arguments as list.
499 # The original settings arguments are kept unchanged: you can test
500 # with e.g. ``if settings.stylesheet_path:``
502 # Differences to ``get_stylesheet_reference``:
503 # * return value is a list
504 # * no re-writing of the path (and therefore no optional argument)
505 # (if required, use ``utils.relative_path(source, target)``
506 # in the calling script)
def get_stylesheet_list(settings):
    """
    Retrieve list of stylesheet references from the settings object.

    `settings.stylesheet_path` takes precedence over `settings.stylesheet`;
    with neither set, the result is an empty list.  A value that is not a
    list is split at commas, and each entry is stripped of surrounding
    whitespace.
    """
    assert not (settings.stylesheet and settings.stylesheet_path), (
        'stylesheet and stylesheet_path are mutually exclusive.')
    stylesheets = settings.stylesheet_path or settings.stylesheet or []
    if isinstance(stylesheets, list):
        return stylesheets
    # programmatically set default can be string or unicode:
    return [entry.strip() for entry in stylesheets.split(',')]
def get_trim_footnote_ref_space(settings):
    """
    Return whether or not to trim footnote space.

    If trim_footnote_reference_space is not None, return it.

    If trim_footnote_reference_space is None, return False unless the
    footnote reference style is 'superscript'.
    """
    explicit = settings.trim_footnote_reference_space
    if explicit is not None:
        return explicit
    # A settings object without `footnote_references` counts as "not
    # superscript".
    return getattr(settings, 'footnote_references', None) == 'superscript'
def get_source_line(node):
    """
    Return the "source" and "line" attributes from the `node` given or from
    its closest ancestor.

    The search follows ``.parent`` links and stops at the first node with a
    true `source` or `line`.  ``(None, None)`` is returned when no such node
    is found.
    """
    current = node
    while current:
        source, line = current.source, current.line
        if source or line:
            return source, line
        current = current.parent
    return None, None
def escape2null(text):
    """
    Return a string with escape-backslashes converted to nulls.

    The character following an escape-backslash is copied unchanged and is
    never treated as an escape itself.
    """
    pieces = []
    pos = 0
    found = text.find('\\', pos)
    while found != -1:
        pieces.append(text[pos:found])
        # The null replaces the backslash; the escaped character (if any)
        # follows it.
        pieces.append('\x00' + text[found+1:found+2])
        pos = found + 2             # skip character after escape
        found = text.find('\\', pos)
    pieces.append(text[pos:])
    return ''.join(pieces)
def unescape(text, restore_backslashes=False):
    """
    Return a string with nulls removed or restored to backslashes.
    Backslash-escaped spaces are also removed.

    A null followed by a space or by a newline is removed together with
    that character.
    """
    if restore_backslashes:
        return text.replace('\x00', '\\')
    # The two-character separators come first, so the escaped space or
    # newline disappears along with the null.
    for sep in ('\x00 ', '\x00\n', '\x00'):
        text = text.replace(sep, '')
    return text
def strip_combining_chars(text):
    """
    Return `text` with all Unicode combining characters removed.

    Under Python 2, byte strings are returned unchanged.
    """
    if isinstance(text, str) and sys.version_info < (3,0):
        return text
    kept = []
    for char in text:
        if not unicodedata.combining(char):
            kept.append(char)
    return u''.join(kept)
def find_combining_chars(text):
    """Return indices of all combining chars in  Unicode string `text`.

    Under Python 2, the result for a byte string is always empty.

    >>> find_combining_chars(u'A t̆ab̆lĕ')
    [3, 6, 9]
    """
    if isinstance(text, str) and sys.version_info < (3,0):
        return []
    positions = []
    for position, char in enumerate(text):
        if unicodedata.combining(char):
            positions.append(position)
    return positions
def column_indices(text):
    """Indices of Unicode string `text` when skipping combining characters.

    The result holds one index per visible column: the index in `text` of
    every character that is not a combining character.  Under Python 2,
    byte strings are not examined and every index is returned.

    >>> column_indices(u'A t̆ab̆lĕ')
    [0, 1, 2, 4, 5, 7, 8]
    """
    # TODO: account for asian wide chars here instead of using dummy
    # replacements in the tableparser?
    if isinstance(text, str) and sys.version_info < (3,0):
        # Same guard as the other combining-character helpers: a Python 2
        # byte string is treated as having no combining characters.
        return list(range(len(text)))
    # Filtering in a single pass avoids item assignment into the result of
    # range(), which is immutable on Python 3.
    return [index for index, char in enumerate(text)
            if not unicodedata.combining(char)]
east_asian_widths = {
    'W': 2,   # Wide
    'F': 2,   # Full-width (wide)
    'Na': 1,  # Narrow
    'H': 1,   # Half-width (narrow)
    'N': 1,   # Neutral (not East Asian, treated as narrow)
    'A': 1,   # Ambiguous (s/b wide in East Asian context,
              # narrow otherwise, but that doesn't work)
    }
"""Mapping of result codes from `unicodedata.east_asian_width()` to character
column widths."""
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    Under Python 2, byte strings are measured with plain ``len()``.
    """
    if isinstance(text, str) and sys.version_info < (3,0):
        return len(text)
    try:
        char_widths = [east_asian_widths[unicodedata.east_asian_width(char)]
                       for char in text]
        width = sum(char_widths)
    except AttributeError:  # east_asian_width() New in version 2.4.
        width = len(text)
    # correction for combining chars:
    return width - len(find_combining_chars(text))
def uniq(L):
    """
    Return a list of the items of `L` without duplicates, keeping the
    order of first occurrence.

    Items are compared by equality, so they need not be hashable.
    """
    unique_items = []
    for candidate in L:
        if candidate in unique_items:
            continue
        unique_items.append(candidate)
    return unique_items
631 # by Li Daobing http://code.activestate.com/recipes/190465/
632 # since Python 2.6 there is also itertools.combinations()
def unique_combinations(items, n):
    """Yield all `n`-element combinations of `items`, each as a list.

    Element order within a combination follows `items`; combinations are
    produced in order of their positions in `items`.  An `n` of 0 yields a
    single empty list; an `n` larger than ``len(items)`` yields nothing.

    >>> list(unique_combinations(['a', 'b', 'c'], 2))
    [['a', 'b'], ['a', 'c'], ['b', 'c']]
    """
    if n == 0:
        yield []
    else:
        # range() instead of xrange(): available on Python 2 and 3 alike.
        for i in range(len(items) - n + 1):
            # Choose items[i] as first element and complete the combination
            # from the items that follow it.
            for rest in unique_combinations(items[i+1:], n - 1):
                yield [items[i]] + rest
def normalize_language_tag(tag):
    """Return a list of normalized combinations for a `BCP 47` language tag.

    The tag is lower-cased and underscores become hyphens.  The result
    lists the first subtag combined with every subset of the remaining
    subtags, largest subsets first, and ends with the first subtag alone.

    Example:

    >>> normalize_language_tag('de_AT-1901')
    ['de-at-1901', 'de-at', 'de-1901', 'de']
    """
    # normalize:
    normalized = tag.lower().replace('_', '-')
    # Glue singletons (which mark the following tag as non-standard) to
    # the subtag that follows them, so that they survive the split:
    normalized = re.sub(r'-([a-zA-Z0-9])-', r'-\1_', normalized)
    parts = [part.replace('_', '-') for part in normalized.split('-')]
    primary, extensions = parts[:1], parts[1:]
    # find all combinations of subtags, longest first:
    variants = []
    size = len(extensions)
    while size > 0:
        variants.extend(['-'.join(primary + combo)
                         for combo in unique_combinations(extensions, size)])
        size -= 1
    variants.extend(primary)
    return variants
class DependencyList(object):

    """
    List of dependencies, with file recording support.

    Note that the output file is not automatically closed.  You have
    to explicitly call the close() method.
    """

    def __init__(self, output_file=None, dependencies=()):
        """
        Initialize the dependency list, automatically setting the
        output file to `output_file` (see `set_output()`) and adding
        all supplied dependencies.

        `dependencies` may be any iterable of file names; it is only
        read, never modified.
        """
        self.set_output(output_file)
        for dependency in dependencies:
            self.add(dependency)

    def set_output(self, output_file):
        """
        Set the output file and clear the list of already added
        dependencies.

        `output_file` must be a string.  The specified file is
        immediately overwritten.

        If output_file is '-', the output will be written to stdout.
        If it is None, no file output is done when calling add().
        """
        self.list = []
        if output_file:
            if output_file == '-':
                of = None
            else:
                of = output_file
            self.file = FileOutput(destination_path=of,
                                   encoding='utf8', autoclose=False)
        else:
            self.file = None

    def add(self, *filenames):
        """
        If the dependency `filename` has not already been added,
        append it to self.list and print it to self.file if self.file
        is not None.
        """
        for filename in filenames:
            if filename not in self.list:
                self.list.append(filename)
                if self.file is not None:
                    self.file.write(filename + '\n')

    def close(self):
        """
        Close the output file.

        Does nothing when no output file is set, so it is safe to call
        without an output file and to call repeatedly.
        """
        if self.file is not None:
            self.file.close()
        self.file = None

    def __repr__(self):
        # `self.file` may be None or lack a `name` attribute; both cases
        # are reported as output file `None`.
        try:
            output_file = self.file.name
        except AttributeError:
            output_file = None
        return '%s(%r, %s)' % (self.__class__.__name__, output_file, self.list)