2 # -*- coding: utf-8 -*-
3 # :Copyright: © 2011 Günter Milde.
4 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
6 # Copying and distribution of this file, with or without modification,
7 # are permitted in any medium without royalty provided the copyright
8 # notice and this notice are preserved.
9 # This file is offered as-is, without any warranty.
11 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
18 # punctuation characters around inline markup
19 # ===========================================
21 # This module provides the lists of characters for the implementation of
22 # the `inline markup recognition rules`_ in the reStructuredText parser
25 # .. _inline markup recognition rules:
26 # ../../docs/ref/rst/restructuredtext.html#inline-markup
28 # Docutils punctuation category sample strings
29 # --------------------------------------------
31 # The sample strings are generated by punctuation_samples() and put here
32 # literal to avoid the time-consuming generation with every Docutils run.
33 # As the samples are used inside ``[ ]`` in regular expressions, hyphen and
34 # square brackets are escaped. ::
# Sample string of punctuation characters allowed immediately *before*
# inline markup: the non-standard ASCII set first, then characters from the
# Unicode categories Ps (Open), Pi (Initial quote) and Pf (Final quote), in
# the order produced by punctuation_samples() below (order matters: it is
# used for index-wise open/close pairing in match_chars()).
# NOTE: \u301d appears twice -- it is inserted a second time as the opening
# pendant for \u301f LOW DOUBLE PRIME QUOTATION MARK (see
# punctuation_samples()).
openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
           u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
           u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
           u'\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28'
           u'\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d'
           u'\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41'
           u'\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
           u'\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20'
           u'\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d'
           u'\u2e1d\u2e21\u201b\u201f')
# Sample string of punctuation characters allowed immediately *after*
# inline markup: the non-standard ASCII set first, then characters from the
# Unicode categories Pe (Close), Pf (Final quote) and Pi (Initial quote), in
# the order produced by punctuation_samples() below.  The character at
# index i closes the opener at the same index in `openers`
# (see match_chars()).
closers = (u'"\')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769'
           u'\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb'
           u'\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992'
           u'\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29'
           u'\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e'
           u'\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42'
           u'\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63'
           u'\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21'
           u'\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c'
           u'\u2e1c\u2e20\u201a\u201e')
# Sample string of non-matching punctuation allowed on both sides of inline
# markup: the ASCII set first, then characters from the Unicode categories
# Pd (Dash) and Po (Other).  Runs of consecutive code points are
# abbreviated as ranges with an (unescaped) hyphen, because this string is
# used inside ``[ ]`` in regular expressions; hence the leading escaped
# hyphen ``\\-``.
delimiters = (u'\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589'
              u'\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c'
              u'\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d'
              u'\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f'
              u'\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f'
              u'\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735'
              u'\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945'
              u'\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-'
              u'\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-'
              u'\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-'
              u'\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00'
              u'\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-'
              u'\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0'
              u'\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7'
              u'\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f'
              u'\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb'
              u'\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c'
              u'\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a'
              u'\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a'
              u'\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65')
# Delimiters from the supplementary planes (code points above 0xFFFF) are
# only added when this Python supports them (a "wide"/UCS-4 build):
if sys.maxunicode >= 0x10FFFF: # "wide" build
    delimiters += (u'\U00010100\U00010101\U0001039f\U000103d0\U00010857'
                   u'\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f'
                   u'\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-'
                   u'\U000110c1\U00012470-\U00012473')
# Non-matching punctuation allowed only *after* inline markup
# (sentence-final punctuation).  The backslash is doubled because, like the
# samples above, this string is used inside ``[ ]`` in regular expressions.
closing_delimiters = u'\\\\' u'.,;!?'
84 # Matching open/close quotes
85 # --------------------------
# Rule (5) requires determination of matching open/close pairs. However,
# the pairing of open/close quotes is ambiguous due to different typographic
# conventions in different languages.
# Additional valid closing quotes per opening quote, keyed by the opener.
# These supplement the index-wise pairing of `openers`/`closers` in
# match_chars() to cover language-specific quoting conventions.
quote_pairs = {
    u'\xbb':   u'\xbb',          # Swedish
    u'\u2018': u'\u201a',        # Greek
    u'\u2019': u'\u2019',        # Swedish
    u'\u201a': u'\u2018\u2019',  # German, Polish
    u'\u201c': u'\u201e',        # German
    u'\u201e': u'\u201c\u201d',
    u'\u201d': u'\u201d',        # Swedish
    u'\u203a': u'\u203a',        # Swedish
    }
def match_chars(c1, c2):
    """Test whether `c1` and `c2` form a matching open/close pair.

    `c1` is looked up in `openers`; `c2` matches if it is the character at
    the same index in `closers`, or one of the alternative closers
    registered for `c1` in `quote_pairs`.  Returns False when `c1` is not
    an opener at all.
    """
    try:
        i = openers.index(c1)
    except ValueError:  # c1 not in openers
        return False
    return c2 == closers[i] or c2 in quote_pairs.get(c1, '')
109 # Running this file as a standalone module checks the definitions against a
112 if __name__
== '__main__':
115 # Unicode punctuation character categories
116 # ----------------------------------------
118 unicode_punctuation_categories
= {
119 # 'Pc': 'Connector', # not used in Docutils inline markup recognition
123 'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
124 'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
127 """Unicode character categories for punctuation"""
130 # generate character pattern strings
131 # ==================================
133 def unicode_charlists(categories
, cp_min
=0, cp_max
=None):
134 """Return dictionary of Unicode character lists.
136 For each of the `catagories`, an item contains a list with all Unicode
137 characters with `cp_min` <= code-point <= `cp_max` that belong to
140 The default values check every code-point supported by Python
141 (`sys.maxint` is 0x10FFFF in a "wide" build and 0xFFFF in a "narrow"
142 build, i.e. ucs4 and ucs2 respectively).
144 # Determine highest code point with one of the given categories
145 # (may shorten the search time considerably if there are many
146 # categories with not too high characters):
148 cp_max
= max(x
for x
in xrange(sys
.maxunicode
+1)
149 if unicodedata
.category(unichr(x
)) in categories
)
150 # print cp_max # => 74867 for unicode_punctuation_categories
152 for cat
in categories
:
153 charlists
[cat
] = [unichr(x
) for x
in xrange(cp_min
, cp_max
+1)
154 if unicodedata
.category(unichr(x
)) == cat
]
158 # Character categories in Docutils
159 # --------------------------------
161 def punctuation_samples():
163 """Docutils punctuation category sample strings.
165 Return list of sample strings for the categories "Open", "Close",
166 "Delimiters" and "Closing-Delimiters" used in the `inline markup
170 # Lists with characters in Unicode punctuation character categories
171 cp_min
= 160 # ASCII chars have special rules for backwards compatibility
172 ucharlists
= unicode_charlists(unicode_punctuation_categories
, cp_min
)
174 # match opening/closing characters
175 # --------------------------------
176 # Rearange the lists to ensure matching characters at the same
179 # low quotation marks are also used as closers (e.g. in Greek)
180 # move them to category Pi:
181 ucharlists
['Ps'].remove(u
'‚') # 201A SINGLE LOW-9 QUOTATION MARK
182 ucharlists
['Ps'].remove(u
'„') # 201E DOUBLE LOW-9 QUOTATION MARK
183 ucharlists
['Pi'] += [u
'‚', u
'„']
185 ucharlists
['Pi'].remove(u
'‛') # 201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
186 ucharlists
['Pi'].remove(u
'‟') # 201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
187 ucharlists
['Pf'] += [u
'‛', u
'‟']
189 # 301F LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
190 ucharlists
['Ps'].insert(ucharlists
['Pe'].index(u
'\u301f'), u
'\u301d')
192 # print u''.join(ucharlists['Ps']).encode('utf8')
193 # print u''.join(ucharlists['Pe']).encode('utf8')
194 # print u''.join(ucharlists['Pi']).encode('utf8')
195 # print u''.join(ucharlists['Pf']).encode('utf8')
197 # The Docutils character categories
198 # ---------------------------------
200 # The categorization of ASCII chars is non-standard to reduce
201 # both false positives and need for escaping. (see `inline markup
202 # recognition rules`_)
204 # allowed before markup if there is a matching closer
205 openers
= [u
'"\'(<\\[{']
206 for cat
in ('Ps', 'Pi', 'Pf'):
207 openers
.extend(ucharlists
[cat
])
209 # allowed after markup if there is a matching opener
210 closers
= [u
'"\')>\\]}']
211 for cat
in ('Pe', 'Pf', 'Pi'):
212 closers
.extend(ucharlists
[cat
])
214 # non-matching, allowed on both sides
215 delimiters
= [u
'\\-/:']
216 for cat
in ('Pd', 'Po'):
217 delimiters
.extend(ucharlists
[cat
])
219 # non-matching, after markup
220 closing_delimiters
= [r
'\\.,;!?']
222 # # Test open/close matching:
223 # for i in range(min(len(openers),len(closers))):
224 # print '%4d %s %s' % (i, openers[i].encode('utf8'),
225 # closers[i].encode('utf8'))
227 return [u
''.join(chars
) for chars
in (openers
, closers
, delimiters
,
230 def separate_wide_chars(s
):
231 """Return (s1,s2) with characters above 0xFFFF in s2"""
232 maxunicode_narrow
= 0xFFFF
233 l1
= [ch
for ch
in s
if ord(ch
) <= maxunicode_narrow
]
234 l2
= [ch
for ch
in s
if ord(ch
) > maxunicode_narrow
]
235 return ''.join(l1
), ''.join(l2
)
237 def mark_intervals(s
):
238 """Return s with shortcut notation for runs of consecutive characters
240 Sort string and replace 'cdef' by 'c-f' and similar.
243 s
= [ord(ch
) for ch
in s
]
256 i
= [unichr(n
) for n
in i
]
258 i
= i
[0], u
'-', i
[-1]
263 def wrap_string(s
, startstring
= "(",
264 endstring
= ")", wrap
=65):
265 """Line-wrap a unicode string literal definition."""
267 contstring
= "'\n" + ' ' * len(startstring
) + "u'"
271 if ch
== '\\' and c
> wrap
:
282 # (re) create and compare the samples:
284 (o
, c
, d
, cd
) = punctuation_samples()
285 o
, o_wide
= separate_wide_chars(o
)
286 c
, c_wide
= separate_wide_chars(c
)
287 d
, d_wide
= separate_wide_chars(d
)
288 d
= d
[:5] + mark_intervals(d
[5:])
289 d_wide
= mark_intervals(d_wide
)
290 if sys
.maxunicode
>= 0x10FFFF: # "wide" build
293 print '- openers = ur"""%s"""' % openers
.encode('utf8')
294 print '+ openers = ur"""%s"""' % o
.encode('utf8')
296 print '+ openers-wide = ur"""%s"""' % o_wide
.encode('utf8')
298 print '- closers = ur"""%s"""' % closers
.encode('utf8')
299 print '+ closers = ur"""%s"""' % c
.encode('utf8')
301 print '+ closers-wide = ur"""%s"""' % c_wide
.encode('utf8')
303 print '- delimiters = ur"%s"' % delimiters
.encode('utf8')
304 print '+ delimiters = ur"%s"' % d
.encode('utf8')
305 if cd
!= closing_delimiters
:
306 print '- closing_delimiters = ur"%s"' % closing_delimiters
.encode('utf8')
307 print '+ closing_delimiters = ur"%s"' % cd
.encode('utf8')
308 # closing_delimiters are all ASCII characters
310 # Print literal code to define the character sets:
312 # `openers` and `closers` must be verbose and keep order because they are
313 # also used in `match_chars()`.
314 print wrap_string(repr(o
), startstring
='openers = (')
315 print wrap_string(repr(c
), startstring
='closers = (')
316 # delimiters: sort and use shortcut for intervals (saves ~150 characters):
317 print wrap_string(repr(d
), startstring
='delimiters = (')
318 # add characters in the upper plane only in a "wide" build:
319 print 'if sys.maxunicode >= 0x10FFFF: # "wide" build'
320 print wrap_string(repr(d_wide
), startstring
=' delimiters += (')
321 print 'closing_delimiters =', repr(cd
)
325 # print "wide" Unicode characters:
326 # ucharlists = unicode_charlists(unicode_punctuation_categories)
327 # for key in ucharlists:
328 # if key.endswith('wide'):
329 # print key, ucharlists[key]
331 # print 'openers = ', repr(openers)
332 # print 'closers = ', repr(closers)
333 # print 'delimiters = ', repr(delimiters)
334 # print 'closing_delimiters = ', repr(closing_delimiters)
336 # ucharlists = unicode_charlists(unicode_punctuation_categories)
337 # for cat, chars in ucharlists.items():
339 # # compact output (visible with a comprehensive font):
340 # print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
346 # print ch.encode('utf8'), unicodedata.name(ch)
349 # print ch.encode('utf8'), unicodedata.name(ch)
350 # print 'delimiters:'
351 # for ch in delimiters:
352 # print ch.encode('utf8'), unicodedata.name(ch)
353 # print 'closing_delimiters:'
354 # for ch in closing_delimiters:
355 # print ch.encode('utf8'), unicodedata.name(ch)