docutils.utils is now a package (providing a place for sub-modules)
[docutils.git] / docutils / utils / punctuation_chars.py
blob b8dbe2b43c315a0a7794acd101b6b5e77efc749f
1 #!/usr/bin/env python
2 # -*- coding: utf8 -*-
3 # :Copyright: © 2011 Günter Milde.
4 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
6 # Copying and distribution of this file, with or without modification,
7 # are permitted in any medium without royalty provided the copyright
8 # notice and this notice are preserved.
9 # This file is offered as-is, without any warranty.
11 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
13 # :Id: $Id$
15 import sys, re
16 import unicodedata
18 # punctuation characters around inline markup
19 # ===========================================
21 # This module provides the lists of characters for the implementation of
22 # the `inline markup recognition rules`_ in the reStructuredText parser
23 # (states.py)
25 # .. _inline markup recognition rules:
26 # ../../../docs/ref/rst/restructuredtext.html#inline-markup
28 # Docutils punctuation category sample strings
29 # --------------------------------------------
31 # The sample strings are generated by punctuation_samples() and stored here
32 # as literals to avoid the time-consuming generation on every Docutils
33 # run. Running this file as a standalone module checks the definitions below
34 # against a re-calculation.
# Pre-computed punctuation sample strings (see `punctuation_samples()`).
#
# The strings are meant for use inside regular expressions, hence the
# backslash-escaped ASCII characters at the start of each one.  They are
# written as plain ``u"..."`` literals with doubled backslashes instead of
# ``ur"..."`` so that the same source is accepted by Python 2 and Python 3.
#
# Characters that are easily altered by Unicode normalisation or width
# folding are spelled as ``\uXXXX`` escapes rather than literally:
# U+2329/U+232A (angle brackets), U+037E, U+0387 and the fullwidth/halfwidth
# forms U+FF01..U+FF65.  Written literally they degrade to ASCII look-alikes;
# an unescaped `"` or `]` among them would truncate the literal or close a
# character class early.

# matching, allowed before markup
openers = (
    u"\\\"\\'\\(\\<\\[\\{"  # ASCII
    # Ps (Open)
    u"༺༼᚛⁅⁽₍\u2329❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝"
    u"\uff08\uff3b\uff5b\uff5f\uff62"
    # Pi (Initial quote), followed by the low-9 quotes
    u"«‘“‹⸂⸄⸉⸌⸜⸠‚„"
    # Pf (Final quote), followed by the high-reversed-9 quotes
    u"»’”›⸃⸅⸊⸍⸝⸡‛‟"
)

# matching, allowed after markup; the character at index i closes openers[i]
closers = (
    u"\\\"\\'\\)\\>\\]\\}"  # ASCII
    # Pe (Close)
    u"༻༽᚜⁆⁾₎\u232a❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞"
    u"\uff09\uff3d\uff5d\uff60\uff63"
    # Pf (Final quote)
    u"»’”›⸃⸅⸊⸍⸝⸡‛‟"
    # Pi (Initial quote)
    u"«‘“‹⸂⸄⸉⸌⸜⸠‚„"
)

# non-matching, allowed on both sides of markup
delimiters = (
    u"\\-\\/\\:"  # ASCII
    # Pd (Dash)
    u"֊־᐀᠆‐‑‒–—―⸗⸚〜〰゠︱︲﹘﹣\uff0d"
    # Po (Other)
    u"¡·¿\u037e\u0387՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾"
    u"।॥॰෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒྅࿐࿑࿒࿓࿔၊။၌၍၎၏჻፡።፣፤፥፦፧፨᙭᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᧞᧟᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᰻᰼᰽᰾᰿᱾᱿᳓"
    u"‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱、。〃〽・"
    u"꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫"
    u"\uff01\uff02\uff03\uff05\uff06\uff07\uff0a\uff0c\uff0e\uff0f"
    u"\uff1a\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65"
    u"𐄀𐄁𐎟𐏐𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐬹𐬺𐬻𐬼𐬽𐬾𐬿𑂻𑂼𑂾𑂿𑃀𑃁𒑰𒑱𒑲𒑳"
)

# non-matching, allowed after markup only
closing_delimiters = u"\\.\\,\\;\\!\\?"
42 # Unicode punctuation character categories
43 # ----------------------------------------
# Maps the two-letter Unicode general category of each punctuation class
# used by this module to a human-readable label.
unicode_punctuation_categories = {
    # 'Pc': 'Connector',  # not used in Docutils inline markup recognition
    'Pd': 'Dash',
    'Ps': 'Open',
    'Pe': 'Close',
    'Pi': 'Initial quote',  # may behave like Ps or Pe depending on usage
    'Pf': 'Final quote',  # may behave like Ps or Pe depending on usage
    'Po': 'Other',
}
"""Unicode character categories for punctuation"""
57 # generate character pattern strings
58 # ==================================
def unicode_charlists(categories, cp_min=0, cp_max=None):
    """Return dictionary of Unicode character lists.

    For each of the `categories`, an item contains a list with all Unicode
    characters with `cp_min` <= code-point <= `cp_max` that belong to the
    category, in code-point order.  A category without any character in
    that range maps to an empty list.

    `categories` is an iterable of Unicode general category names such as
    ``'Pd'`` (a dictionary keyed by category name works as well).
    `cp_max` defaults to `sys.maxunicode`, i.e. every code-point supported
    by the running Python is checked.
    """
    # `unichr` and `xrange` only exist in Python 2; use the Python 3
    # equivalents when they are missing.
    try:
        to_char, code_points = unichr, xrange
    except NameError:
        to_char, code_points = chr, range

    if cp_max is None:
        cp_max = sys.maxunicode

    charlists = {}
    for cat in categories:
        charlists[cat] = []
    if not charlists:
        # nothing to look for: skip the scan of the code-point range
        return charlists

    # One pass over the range serves all categories at once.
    category = unicodedata.category
    for cp in code_points(cp_min, cp_max + 1):
        char = to_char(cp)
        chars = charlists.get(category(char))
        if chars is not None:
            chars.append(char)
    return charlists
81 # Character categories in Docutils
82 # --------------------------------
def punctuation_samples():
    """Docutils punctuation category sample strings.

    Return list of sample strings for the categories "Open", "Close",
    "Delimiters" and "Closing-Delimiters" used in the `inline markup
    recognition rules`_.

    Each string starts with the regexp-escaped ASCII characters of the
    category; the characters taken from the Unicode database follow.
    """
    # ASCII chars have special rules for backwards compatibility, so the
    # Unicode character lists only start at code point 160.
    by_category = unicode_charlists(unicode_punctuation_categories, 160)
    opening, closing = by_category['Ps'], by_category['Pe']
    initial_quotes, final_quotes = by_category['Pi'], by_category['Pf']

    # Match opening/closing characters: rearrange the lists so that
    # matching characters end up at the same index position.

    # Low quotation marks are also used as closers (e.g. in Greek),
    # move them to category Pi:
    low_quotes = [u'‚',  # 201A SINGLE LOW-9 QUOTATION MARK
                  u'„']  # 201E DOUBLE LOW-9 QUOTATION MARK
    for quote in low_quotes:
        opening.remove(quote)
    initial_quotes.extend(low_quotes)

    # Move the high-reversed-9 quotation marks from Pi to Pf:
    reversed_quotes = [u'‛',  # 201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
                       u'‟']  # 201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    for quote in reversed_quotes:
        initial_quotes.remove(quote)
    final_quotes.extend(reversed_quotes)

    # 301F LOW DOUBLE PRIME QUOTATION MARK has no opening counterpart:
    # repeat 301D at the corresponding position of the openers.
    opening.insert(closing.index(u'\u301f'), u'\u301d')

    def sample(ascii_chars, *categories):
        # escaped ASCII chars followed by the chars of each category
        parts = [re.escape(ascii_chars)]
        for cat in categories:
            parts += by_category[cat]
        return u''.join(parts)

    # The categorization of ASCII chars is non-standard to reduce both
    # false positives and need for escaping (see `inline markup
    # recognition rules`_).
    return [
        sample('"\'(<[{', 'Ps', 'Pi', 'Pf'),  # matching, before markup
        sample('"\')>]}', 'Pe', 'Pf', 'Pi'),  # matching, after markup
        sample('-/:', 'Pd', 'Po'),            # non-matching, both sides
        sample('.,;!?'),                      # non-matching, after markup
    ]
154 # Matching open/close quotes
155 # --------------------------
157 # Rule (5) requires determination of matching open/close pairs. However,
158 # the pairing of open/close quotes is ambiguous due to different typographic
159 # conventions in different languages.
# Additional open/close quote pairs: maps an opening quote character to a
# string of the characters that may close it, on top of the pair found at
# the same index position in `openers`/`closers`.
quote_pairs = {
    u'\xbb': u'\xbb',            # Swedish
    u'\u2018': u'\u201a',        # Greek
    u'\u2019': u'\u2019',        # Swedish
    u'\u201a': u'\u2018\u2019',  # German, Polish
    u'\u201c': u'\u201e',        # German
    u'\u201e': u'\u201c\u201d',
    u'\u201d': u'\u201d',        # Swedish
    u'\u203a': u'\u203a',        # Swedish
}
def match_chars(c1, c2):
    """Test whether `c1` and `c2` are a matching open/close pair.

    The pair matches if `c2` is found in `closers` at the index position
    that `c1` has in `openers`, or if `c2` is one of the additional
    closing characters listed for `c1` in `quote_pairs`.
    Return False if `c1` is not in `openers`.
    """
    if c1 not in openers:
        return False
    if c2 == closers[openers.index(c1)]:
        return True
    return c2 in quote_pairs.get(c1, '')
181 # print results
182 # =============
if __name__ == '__main__':

    # Re-create the samples and compare them with the stored literals.
    # Every definition that is out of date is reported as a pair of lines:
    # "-" shows the stored value, "+" the re-calculated one.
    (o, c, d, cd) = punctuation_samples()
    checks = (
        (openers, o, 'openers = ur"""%s"""'),
        (closers, c, 'closers = ur"""%s"""'),
        (delimiters, d, 'delimiters = ur"%s"'),
        (closing_delimiters, cd, 'closing_delimiters = ur"%s"'),
    )
    for stored, fresh, template in checks:
        if fresh != stored:
            print('- ' + template % stored.encode('utf8'))
            print('+ ' + template % fresh.encode('utf8'))

    # # test prints
    # print 'openers = ', repr(openers)
    # print 'closers = ', repr(closers)
    # print 'delimiters = ', repr(delimiters)
    # print 'closing_delimiters = ', repr(closing_delimiters)

    # ucharlists = unicode_charlists(unicode_punctuation_categories)
    # for cat, chars in ucharlists.items():
    #     # print cat, chars
    #     # compact output (visible with a comprehensive font):
    #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')