docutils/utils/punctuation_chars.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2011 Günter Milde.
   4 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
   5 #
   6 #    Copying and distribution of this file, with or without modification,
   7 #    are permitted in any medium without royalty provided the copyright
   8 #    notice and this notice are preserved.
   9 #    This file is offered as-is, without any warranty.
  10 #
  11 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
  12
  13 # :Id: $Id$
  14
  15 import sys, re
  16 import unicodedata
  17
  18 # punctuation characters around inline markup
  19 # ===========================================
  20 #
  21 # This module provides the lists of characters for the implementation of
  22 # the `inline markup recognition rules`_ in the reStructuredText parser
  23 # (states.py)
  24 #
  25 # .. _inline markup recognition rules:
  26 #     ../../../docs/ref/rst/restructuredtext.html#inline-markup
  27
# Docutils punctuation category sample strings
# --------------------------------------------
#
# The sample strings are generated by punctuation_samples() and stored here
# as literals to avoid the time-consuming generation with every Docutils
# run. Running this file as a standalone module checks the definitions below
# against a re-calculation.
#
# Each string starts with the backslash-escaped ASCII characters of its
# class, followed by the non-ASCII Unicode punctuation characters.

# Characters allowed before inline markup. An opener and its matching
# closer share the same index in `openers` and `closers` (match_chars()
# relies on this alignment).
openers = ur"""\"\'\(\<\[\{༺༼᚛⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢«‘“‹⸂⸄⸉⸌⸜⸠‚„»’”›⸃⸅⸊⸍⸝⸡‛‟"""
# Characters allowed after inline markup, index-aligned with `openers`.
closers = ur"""\"\'\)\>\]\}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣»’”›⸃⸅⸊⸍⸝⸡‛‟«‘“‹⸂⸄⸉⸌⸜⸠‚„"""
# Non-matching characters, allowed on both sides of inline markup.
delimiters = ur"\-\/\:֊־᐀᠆‐‑‒–—―⸗⸚〜〰゠︱︲﹘﹣－¡·¿;·՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾।॥॰෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒྅࿐࿑࿒࿓࿔၊။၌၍၎၏჻፡።፣፤፥፦፧፨᙭᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᧞᧟᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᰻᰼᰽᰾᰿᱾᱿᳓‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱、。〃〽・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫！＂＃％＆＇＊，．／：；？＠＼｡､･𐄀𐄁𐎟𐏐𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐬹𐬺𐬻𐬼𐬽𐬾𐬿𑂻𑂼𑂾𑂿𑃀𑃁𒑰𒑱𒑲𒑳"
# Non-matching characters, allowed only after inline markup.
closing_delimiters = ur"\.\,\;\!\?"
  40
  41
  42 # Unicode punctuation character categories
  43 # ----------------------------------------
  44
  45 unicode_punctuation_categories = {
  46     # 'Pc': 'Connector', # not used in Docutils inline markup recognition
  47     'Pd': 'Dash',
  48     'Ps': 'Open',
  49     'Pe': 'Close',
  50     'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
  51     'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
  52     'Po': 'Other'
  53     }
  54 """Unicode character categories for punctuation"""
  55
  56
  57 # generate character pattern strings
  58 # ==================================
  59
  60 def unicode_charlists(categories, cp_min=0, cp_max=None):
  61     """Return dictionary of Unicode character lists.
  62
  63     For each of the `catagories`, an item contains a list with all Unicode
  64     characters with `cp_min` <= code-point <= `cp_max` that belong to the
  65     category. (The default values check every code-point supported by Python.)
  66     """
  67     # Determine highest code point with one of the given categories
  68     # (may shorten the search time considerably if there are many
  69     # categories with not too high characters):
  70     if cp_max is None:
  71         cp_max = max(x for x in xrange(sys.maxunicode + 1)
  72                      if unicodedata.category(unichr(x)) in categories)
  73         # print cp_max # => 74867 for unicode_punctuation_categories
  74     charlists = {}
  75     for cat in categories:
  76         charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
  77                           if unicodedata.category(unichr(x)) == cat]
  78     return charlists
  79
  80
  81 # Character categories in Docutils
  82 # --------------------------------
  83
  84 def punctuation_samples():
  85
  86     """Docutils punctuation category sample strings.
  87
  88     Return list of sample strings for the categories "Open", "Close",
  89     "Delimiters" and "Closing-Delimiters" used in the `inline markup
  90     recognition rules`_.
  91     """
  92
  93     # Lists with characters in Unicode punctuation character categories
  94     cp_min = 160 # ASCII chars have special rules for backwards compatibility
  95     ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
  96
  97     # match opening/closing characters
  98     # --------------------------------
  99     # Rearange the lists to ensure matching characters at the same
 100     # index position.
 101
 102     # low quotation marks are also used as closers (e.g. in Greek)
 103     # move them to category Pi:
 104     ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
 105     ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
 106     ucharlists['Pi'] += [u'‚', u'„']
 107
 108     ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
 109     ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
 110     ucharlists['Pf'] += [u'‛', u'‟']
 111
 112     # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
 113     ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
 114
 115     # print u''.join(ucharlists['Ps']).encode('utf8')
 116     # print u''.join(ucharlists['Pe']).encode('utf8')
 117     # print u''.join(ucharlists['Pi']).encode('utf8')
 118     # print u''.join(ucharlists['Pf']).encode('utf8')
 119
 120     # The Docutils character categories
 121     # ---------------------------------
 122     #
 123     # The categorization of ASCII chars is non-standard to reduce both
 124     # false positives and need for escaping. (see `inline markup recognition
 125     # rules`_)
 126
 127     # matching, allowed before markup
 128     openers = [re.escape('"\'(<[{')]
 129     for cat in ('Ps', 'Pi', 'Pf'):
 130         openers.extend(ucharlists[cat])
 131
 132     # matching, allowed after markup
 133     closers = [re.escape('"\')>]}')]
 134     for cat in ('Pe', 'Pf', 'Pi'):
 135         closers.extend(ucharlists[cat])
 136
 137     # non-matching, allowed on both sides
 138     delimiters = [re.escape('-/:')]
 139     for cat in ('Pd', 'Po'):
 140         delimiters.extend(ucharlists[cat])
 141
 142     # non-matching, after markup
 143     closing_delimiters = [re.escape('.,;!?')]
 144
 145     # # Test open/close matching:
 146     # for i in range(min(len(openers),len(closers))):
 147     #     print '%4d    %s    %s' % (i, openers[i].encode('utf8'),
 148     #                                closers[i].encode('utf8'))
 149
 150     return [u''.join(chars)
 151             for chars in (openers, closers, delimiters, closing_delimiters)]
 152
 153
 154 # Matching open/close quotes
 155 # --------------------------
 156
 157 # Rule (5) requires determination of matching open/close pairs. However,
 158 # the pairing of open/close quotes is ambigue due to  different typographic
 159 # conventions in different languages.
 160
 161 quote_pairs = {u'\xbb': u'\xbb', # Swedish
 162                u'\u2018': u'\u201a', # Greek
 163                u'\u2019': u'\u2019', # Swedish
 164                u'\u201a': u'\u2018\u2019', # German, Polish
 165                u'\u201c': u'\u201e', # German
 166                u'\u201e': u'\u201c\u201d',
 167                u'\u201d': u'\u201d', # Swedish
 168                u'\u203a': u'\u203a', # Swedish
 169               }
 170
 171 def match_chars(c1, c2):
 172     try:
 173         i = openers.index(c1)
 174     except ValueError:  # c1 not in openers
 175         return False
 176     return c2 == closers[i] or c2 in quote_pairs.get(c1, '')
 177
 178
 179
 180
 181 # print results
 182 # =============
 183
 184 if __name__ == '__main__':
 185
 186     # (re) create and compare the samples:
 187     (o, c, d, cd) = punctuation_samples()
 188     if o != openers:
 189         print '- openers = ur"""%s"""' % openers.encode('utf8')
 190         print '+ openers = ur"""%s"""' % o.encode('utf8')
 191     if c != closers:
 192         print '- closers = ur"""%s"""' % closers.encode('utf8')
 193         print '+ closers = ur"""%s"""' % c.encode('utf8')
 194     if d != delimiters:
 195         print '- delimiters = ur"%s"' % delimiters.encode('utf8')
 196         print '+ delimiters = ur"%s"' % d.encode('utf8')
 197     if cd != closing_delimiters:
 198         print '- closing_delimiters = ur"%s"' % closing_delimiters.encode('utf8')
 199         print '+ closing_delimiters = ur"%s"' % cd.encode('utf8')
 200
 201     # # test prints
 202     # print 'openers = ', repr(openers)
 203     # print 'closers = ', repr(closers)
 204     # print 'delimiters = ', repr(delimiters)
 205     # print 'closing_delimiters = ', repr(closing_delimiters)
 206
 207     # ucharlists = unicode_charlists(unicode_punctuation_categories)
 208     # for cat, chars in ucharlists.items():
 209     #     # print cat, chars
 210     #     # compact output (visible with a comprehensive font):
 211     #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')