docutils/utils/punctuation_chars.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # :Id: $Id$
   4 # :Copyright: © 2011, 2017 Günter Milde.
   5 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
   6 #
   7 #    Copying and distribution of this file, with or without modification,
   8 #    are permitted in any medium without royalty provided the copyright
   9 #    notice and this notice are preserved.
  10 #    This file is offered as-is, without any warranty.
  11 #
  12 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
  13 #
  14 # This file is generated by
  15 # ``docutils/tools/dev/generate_punctuation_chars.py``.
  16 # ::
  17
  18 import sys, re
  19 import unicodedata
  20
  21 """Docutils character category patterns.
  22
  23    Patterns for the implementation of the `inline markup recognition rules`_
  24    in the reStructuredText parser `docutils.parsers.rst.states.py` based
  25    on Unicode character categories.
  26    The patterns are used inside ``[ ]`` in regular expressions.
  27
  28    Rule (5) requires determination of matching open/close pairs. However, the
  29    pairing of open/close quotes is ambiguous due to different typographic
  30    conventions in different languages. The ``match_chars`` function tests
  31    whether two characters form an open/close pair.
  32
  33    The patterns are generated by
  34    ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
  35    on the Python version and avoid the time-consuming generation with every
  36    Docutils run. See there for motives and implementation details.
  37
  38    The category of some characters changed with the development of the
  39    Unicode standard. The current lists are generated with the help of the
  40    "unicodedata" module of Python 2.7.13 (based on Unicode version 5.2.0).
  41
  42    .. _inline markup recognition rules:
  43       http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
  44 """
  45
  46 openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
  47            u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
  48            u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
  49            u'\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28'
  50            u'\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d'
  51            u'\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41'
  52            u'\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
  53            u'\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20'
  54            u'\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d'
  55            u'\u2e1d\u2e21\u201b\u201f')
  56 closers = (u'"\')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769'
  57            u'\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb'
  58            u'\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992'
  59            u'\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29'
  60            u'\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e'
  61            u'\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42'
  62            u'\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63'
  63            u'\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21'
  64            u'\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c'
  65            u'\u2e1c\u2e20\u201a\u201e')
  66 delimiters = (u'\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589'
  67               u'\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c'
  68               u'\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d'
  69               u'\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f'
  70               u'\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f'
  71               u'\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735'
  72               u'\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945'
  73               u'\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-'
  74               u'\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-'
  75               u'\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-'
  76               u'\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00'
  77               u'\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-'
  78               u'\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0'
  79               u'\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7'
  80               u'\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f'
  81               u'\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb'
  82               u'\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c'
  83               u'\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a'
  84               u'\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a'
  85               u'\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65')
  86 if sys.maxunicode >= 0x10FFFF: # "wide" build
  87     delimiters += (u'\U00010100\U00010101\U0001039f\U000103d0\U00010857'
  88                    u'\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f'
  89                    u'\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-'
  90                    u'\U000110c1\U00012470-\U00012473')
  91 closing_delimiters = u'\\\\.,;!?'
  92
  93
  94 # Matching open/close quotes
  95 # --------------------------
  96
  97 quote_pairs = {# open char: matching closing characters # usage example
  98                u'\xbb':   u'\xbb',         # » » Swedish
  99                u'\u2018': u'\u201a',       # ‘ ‚ Albanian/Greek/Turkish
 100                u'\u2019': u'\u2019',       # ’ ’ Swedish
 101                u'\u201a': u'\u2018\u2019', # ‚ ‘ German ‚ ’ Polish
 102                u'\u201c': u'\u201e',       # “ „ Albanian/Greek/Turkish
 103                u'\u201e': u'\u201c\u201d', # „ “ German „ ” Polish
 104                u'\u201d': u'\u201d',       # ” ” Swedish
 105                u'\u203a': u'\u203a',       # › › Swedish
 106               }
 107 """Additional open/close quote pairs."""
 108
 109 def match_chars(c1, c2):
 110     """Test whether `c1` and `c2` are a matching open/close character pair.
 111
 112     Matching open/close pairs are at the same position in
 113     `punctuation_chars.openers` and `punctuation_chars.closers`.
 114     The pairing of open/close quotes is ambiguous due to  different
 115     typographic conventions in different languages,
 116     so we test for additional matches stored in `quote_pairs`.
 117     """
 118     try:
 119         i = openers.index(c1)
 120     except ValueError:  # c1 not in openers
 121         return False
 122     return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')