docutils/utils/punctuation_chars.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # :Copyright: © 2011 Günter Milde.
   4 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
   5 #
   6 #    Copying and distribution of this file, with or without modification,
   7 #    are permitted in any medium without royalty provided the copyright
   8 #    notice and this notice are preserved.
   9 #    This file is offered as-is, without any warranty.
  10 #
  11 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
  12
  13 # :Id: $Id$
  14
  15 import sys, re
  16 import unicodedata
  17
  18 # punctuation characters around inline markup
  19 # ===========================================
  20 #
  21 # This module provides the lists of characters for the implementation of
  22 # the `inline markup recognition rules`_ in the reStructuredText parser
  23 # (states.py)
  24 #
  25 # .. _inline markup recognition rules:
  26 #     ../../docs/ref/rst/restructuredtext.html#inline-markup
  27
  28 # Docutils punctuation category sample strings
  29 # --------------------------------------------
  30 #
  31 # The sample strings are generated by punctuation_samples() and put here
  32 # literal to avoid the time-consuming generation with every Docutils run.
  33 # As the samples are used inside ``[ ]`` in regular expressions, hyphen and
  34 # square brackets are escaped. ::
  35
# Characters allowed before inline markup if there is a matching closer.
# The string starts with the ASCII openers (the backslash only escapes the
# following square bracket for use inside ``[ ]`` in regular expressions).
# `openers` and `closers` are index-aligned: match_chars() pairs the
# character found in `openers` with the one at the same position in
# `closers`, so the order of both strings must be kept.
# u'\u301d' is listed twice so that the following characters stay aligned
# with `closers`, which contains both u'\u301e' and u'\u301f'.
openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
           u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
           u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
           u'\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28'
           u'\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d'
           u'\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41'
           u'\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
           u'\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20'
           u'\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d'
           u'\u2e1d\u2e21\u201b\u201f')
# Characters allowed after inline markup if there is a matching opener;
# position N holds the closing counterpart of ``openers[N]``.
closers = (u'"\')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769'
           u'\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb'
           u'\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992'
           u'\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29'
           u'\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e'
           u'\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42'
           u'\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63'
           u'\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21'
           u'\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c'
           u'\u2e1c\u2e20\u201a\u201e')
# Non-matching characters allowed on both sides of inline markup.
# After the leading ASCII part (with the hyphen escaped), an unescaped ``-``
# between two characters denotes an interval of consecutive code points,
# as understood inside ``[ ]`` in regular expressions.
delimiters = (u'\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589'
              u'\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c'
              u'\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d'
              u'\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f'
              u'\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f'
              u'\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735'
              u'\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945'
              u'\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-'
              u'\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-'
              u'\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-'
              u'\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00'
              u'\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-'
              u'\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0'
              u'\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7'
              u'\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f'
              u'\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb'
              u'\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c'
              u'\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a'
              u'\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a'
              u'\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65')
# Delimiters above 0xFFFF can only be represented in a "wide" build.
if sys.maxunicode >= 0x10FFFF: # "wide" build
    delimiters += (u'\U00010100\U00010101\U0001039f\U000103d0\U00010857'
                   u'\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f'
                   u'\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-'
                   u'\U000110c1\U00012470-\U00012473')
# Non-matching ASCII characters allowed after inline markup only
# (starts with an escaped backslash).
closing_delimiters = u'\\\\.,;!?'
  82
  83
  84 # Matching open/close quotes
  85 # --------------------------
  86
# Rule (5) requires determination of matching open/close pairs. However,
# the pairing of open/close quotes is ambiguous due to different typographic
# conventions in different languages.
  90
  91 quote_pairs = {u'\xbb': u'\xbb', # Swedish
  92             u'\u2018': u'\u201a', # Greek
  93             u'\u2019': u'\u2019', # Swedish
  94             u'\u201a': u'\u2018\u2019', # German, Polish
  95             u'\u201c': u'\u201e', # German
  96             u'\u201e': u'\u201c\u201d',
  97             u'\u201d': u'\u201d', # Swedish
  98             u'\u203a': u'\u203a', # Swedish
  99             }
 100
 101 def match_chars(c1, c2):
 102     try:
 103         i = openers.index(c1)
 104     except ValueError:  # c1 not in openers
 105         return False
 106     return c2 == closers[i] or c2 in quote_pairs.get(c1, '')
 107
 108
 109 # Running this file as a standalone module checks the definitions against a
 110 # re-calculation::
 111
 112 if __name__ == '__main__':
 113
 114
 115 # Unicode punctuation character categories
 116 # ----------------------------------------
 117
 118     unicode_punctuation_categories = {
 119         # 'Pc': 'Connector', # not used in Docutils inline markup recognition
 120         'Pd': 'Dash',
 121         'Ps': 'Open',
 122         'Pe': 'Close',
 123         'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
 124         'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
 125         'Po': 'Other'
 126         }
 127     """Unicode character categories for punctuation"""
 128
 129
 130 # generate character pattern strings
 131 # ==================================
 132
 133     def unicode_charlists(categories, cp_min=0, cp_max=None):
 134         """Return dictionary of Unicode character lists.
 135
 136         For each of the `catagories`, an item contains a list with all Unicode
 137         characters with `cp_min` <= code-point <= `cp_max` that belong to
 138         the category.
 139
 140         The default values check every code-point supported by Python
 141         (`sys.maxint` is 0x10FFFF in a "wide" build and 0xFFFF in a "narrow"
 142         build, i.e. ucs4 and ucs2 respectively).
 143         """
 144         # Determine highest code point with one of the given categories
 145         # (may shorten the search time considerably if there are many
 146         # categories with not too high characters):
 147         if cp_max is None:
 148             cp_max = max(x for x in xrange(sys.maxunicode+1)
 149                         if unicodedata.category(unichr(x)) in categories)
 150             # print cp_max # => 74867 for unicode_punctuation_categories
 151         charlists = {}
 152         for cat in categories:
 153             charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
 154                               if unicodedata.category(unichr(x)) == cat]
 155         return charlists
 156
 157
 158 # Character categories in Docutils
 159 # --------------------------------
 160
 161     def punctuation_samples():
 162
 163         """Docutils punctuation category sample strings.
 164
 165         Return list of sample strings for the categories "Open", "Close",
 166         "Delimiters" and "Closing-Delimiters" used in the `inline markup
 167         recognition rules`_.
 168         """
 169
 170         # Lists with characters in Unicode punctuation character categories
 171         cp_min = 160 # ASCII chars have special rules for backwards compatibility
 172         ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
 173
 174         # match opening/closing characters
 175         # --------------------------------
 176         # Rearange the lists to ensure matching characters at the same
 177         # index position.
 178
 179         # low quotation marks are also used as closers (e.g. in Greek)
 180         # move them to category Pi:
 181         ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
 182         ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
 183         ucharlists['Pi'] += [u'‚', u'„']
 184
 185         ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
 186         ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
 187         ucharlists['Pf'] += [u'‛', u'‟']
 188
 189         # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
 190         ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
 191
 192         # print u''.join(ucharlists['Ps']).encode('utf8')
 193         # print u''.join(ucharlists['Pe']).encode('utf8')
 194         # print u''.join(ucharlists['Pi']).encode('utf8')
 195         # print u''.join(ucharlists['Pf']).encode('utf8')
 196
 197         # The Docutils character categories
 198         # ---------------------------------
 199         #
 200         # The categorization of ASCII chars is non-standard to reduce
 201         # both false positives and need for escaping. (see `inline markup
 202         # recognition rules`_)
 203
 204         # allowed before markup if there is a matching closer
 205         openers = [u'"\'(<\\[{']
 206         for cat in ('Ps', 'Pi', 'Pf'):
 207             openers.extend(ucharlists[cat])
 208
 209         # allowed after markup if there is a matching opener
 210         closers = [u'"\')>\\]}']
 211         for cat in ('Pe', 'Pf', 'Pi'):
 212             closers.extend(ucharlists[cat])
 213
 214         # non-matching, allowed on both sides
 215         delimiters = [u'\\-/:']
 216         for cat in ('Pd', 'Po'):
 217             delimiters.extend(ucharlists[cat])
 218
 219         # non-matching, after markup
 220         closing_delimiters = [r'\\.,;!?']
 221
 222         # # Test open/close matching:
 223         # for i in range(min(len(openers),len(closers))):
 224         #     print '%4d    %s    %s' % (i, openers[i].encode('utf8'),
 225         #                                closers[i].encode('utf8'))
 226
 227         return [u''.join(chars) for chars in (openers, closers, delimiters,
 228                                               closing_delimiters)]
 229
 230     def separate_wide_chars(s):
 231         """Return (s1,s2) with characters above 0xFFFF in s2"""
 232         maxunicode_narrow = 0xFFFF
 233         l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow]
 234         l2 = [ch for ch in s if ord(ch) > maxunicode_narrow]
 235         return ''.join(l1), ''.join(l2)
 236
 237     def mark_intervals(s):
 238         """Return s with shortcut notation for runs of consecutive characters
 239
 240         Sort string and replace 'cdef' by 'c-f' and similar.
 241         """
 242         l =[]
 243         s = [ord(ch) for ch in s]
 244         s.sort()
 245         for n in s:
 246             try:
 247                 if l[-1][-1]+1 == n:
 248                     l[-1].append(n)
 249                 else:
 250                     l.append([n])
 251             except IndexError:
 252                 l.append([n])
 253
 254         l2 = []
 255         for i in l:
 256             i = [unichr(n) for n in i]
 257             if len(i) > 2:
 258                 i = i[0], u'-', i[-1]
 259             l2.extend(i)
 260
 261         return ''.join(l2)
 262
 263     def wrap_string(s, startstring= "(",
 264                        endstring = ")", wrap=65):
 265         """Line-wrap a unicode string literal definition."""
 266         c = len(startstring)
 267         contstring = "'\n" + ' ' * len(startstring) + "u'"
 268         l = [startstring]
 269         for ch in s:
 270             c += 1
 271             if ch == '\\' and c > wrap:
 272                 c = len(startstring)
 273                 ch = contstring + ch
 274             l.append(ch)
 275         l.append(endstring)
 276         return ''.join(l)
 277
 278
 279 # print results
 280 # =============
 281
 282 # (re) create and compare the samples:
 283
    # Re-calculate the four sample strings from the Unicode database.
    (o, c, d, cd) = punctuation_samples()
    # Split off the characters above 0xFFFF (returned in the `*_wide` part).
    o, o_wide = separate_wide_chars(o)
    c, c_wide = separate_wide_chars(c)
    d, d_wide = separate_wide_chars(d)
    # Keep the start of the delimiters verbatim, sort the remainder and
    # abbreviate runs of consecutive characters.
    # NOTE(review): the ASCII part put in front by punctuation_samples()
    # (u'\\-/:') is four characters long, so slicing at 5 also keeps the
    # first non-ASCII delimiter out of the sorting; the literal
    # `delimiters` definition reflects this -- TODO confirm the intent.
    d = d[:5] + mark_intervals(d[5:])
    d_wide = mark_intervals(d_wide)
    if sys.maxunicode >= 0x10FFFF: # "wide" build
        d += d_wide
    # Report differences between the literal definitions (``-`` lines) and
    # the re-calculated samples (``+`` lines).
    if o != openers:
        print '- openers = ur"""%s"""' % openers.encode('utf8')
        print '+ openers = ur"""%s"""' % o.encode('utf8')
    # Openers above 0xFFFF are reported whenever there are any.
    if o_wide:
        print '+ openers-wide = ur"""%s"""' % o_wide.encode('utf8')
    if c != closers:
        print '- closers = ur"""%s"""' % closers.encode('utf8')
        print '+ closers = ur"""%s"""' % c.encode('utf8')
    # Closers above 0xFFFF are reported whenever there are any.
    if c_wide:
        print '+ closers-wide = ur"""%s"""' % c_wide.encode('utf8')
    if d != delimiters:
        print '- delimiters = ur"%s"' % delimiters.encode('utf8')
        print '+ delimiters = ur"%s"' % d.encode('utf8')
    if cd != closing_delimiters:
        print '- closing_delimiters = ur"%s"' % closing_delimiters.encode('utf8')
        print '+ closing_delimiters = ur"%s"' % cd.encode('utf8')
    # closing_delimiters are all ASCII characters

# Print literal code to define the character sets:

    # `openers` and `closers` must be verbose and keep order because they are
    # also used in `match_chars()`.
    print wrap_string(repr(o), startstring='openers = (')
    print wrap_string(repr(c), startstring='closers = (')
    # delimiters: sort and use shortcut for intervals (saves ~150 characters):
    print wrap_string(repr(d), startstring='delimiters = (')
    # add characters in the upper plane only in a "wide" build:
    print 'if sys.maxunicode >= 0x10FFFF: # "wide" build'
    print wrap_string(repr(d_wide), startstring='    delimiters += (')
    # The closing delimiters are short enough to be printed on one line.
    print 'closing_delimiters =', repr(cd)
 322
 323 # test prints
 324
 325     # print "wide" Unicode characters:
 326     # ucharlists = unicode_charlists(unicode_punctuation_categories)
 327     # for key in ucharlists:
 328     #     if key.endswith('wide'):
 329     #         print key, ucharlists[key]
 330
 331     # print 'openers = ', repr(openers)
 332     # print 'closers = ', repr(closers)
 333     # print 'delimiters = ', repr(delimiters)
 334     # print 'closing_delimiters = ', repr(closing_delimiters)
 335
 336     # ucharlists = unicode_charlists(unicode_punctuation_categories)
 337     # for cat, chars in ucharlists.items():
 338     #     # print cat, chars
 339     #     # compact output (visible with a comprehensive font):
 340     #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
 341
 342 # verbose print
 343
 344     # print 'openers:'
 345     # for ch in openers:
 346     #     print ch.encode('utf8'), unicodedata.name(ch)
 347     # print 'closers:'
 348     # for ch in closers:
 349     #     print ch.encode('utf8'), unicodedata.name(ch)
 350     # print 'delimiters:'
 351     # for ch in delimiters:
 352     #     print ch.encode('utf8'), unicodedata.name(ch)
 353     # print 'closing_delimiters:'
 354     # for ch in closing_delimiters:
 355     #     print ch.encode('utf8'), unicodedata.name(ch)