2 # -*- coding: utf-8 -*-
5 # :Copyright: © 2010 Günter Milde,
6 # original `SmartyPants`_: © 2003 John Gruber
7 # smartypants.py: © 2004, 2007 Chad Miller
8 # :Maintainer: docutils-develop@lists.sourceforge.net
9 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
11 # Copying and distribution of this file, with or without modification,
12 # are permitted in any medium without royalty provided the copyright
13 # notices and this notice are preserved.
14 # This file is offered as-is, without any warranty.
16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
20 =========================
21 Smart Quotes for Docutils
22 =========================
27 "SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and
28 BBEdit that easily translates plain ASCII punctuation characters into "smart"
29 typographic punctuation characters.
31 ``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_.
33 * Using Unicode instead of HTML entities for typographic punctuation
34 characters, it works for any output format that supports Unicode.
35 * Supports `language specific quote characters`__.
37 __ http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
43 `John Gruber`_ did all of the hard work of writing this software in Perl for
44 `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_
45 ported it to Python to use with Pyblosxom_.
46 Adapted to Docutils_ by Günter Milde.
51 Portions of the SmartyPants original work are based on Brad Choate's nifty
52 MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to
53 this plug-in. Brad Choate is a fine hacker indeed.
55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta
56 testing of the original SmartyPants.
58 `Rael Dornfest`_ ported SmartyPants to Blosxom.
60 .. _Brad Choate: http://bradchoate.com/
61 .. _Jeremy Hedley: http://antipixel.com/
62 .. _Charles Wiltgen: http://playbacktime.com/
63 .. _Rael Dornfest: http://raelity.org/
69 SmartyPants_ license (3-Clause BSD license):
71 Copyright (c) 2003 John Gruber (http://daringfireball.net/)
74 Redistribution and use in source and binary forms, with or without
75 modification, are permitted provided that the following conditions are
78 * Redistributions of source code must retain the above copyright
79 notice, this list of conditions and the following disclaimer.
81 * Redistributions in binary form must reproduce the above copyright
82 notice, this list of conditions and the following disclaimer in
83 the documentation and/or other materials provided with the
86 * Neither the name "SmartyPants" nor the names of its contributors
87 may be used to endorse or promote products derived from this
88 software without specific prior written permission.
90 This software is provided by the copyright holders and contributors
91 "as is" and any express or implied warranties, including, but not
92 limited to, the implied warranties of merchantability and fitness for
93 a particular purpose are disclaimed. In no event shall the copyright
94 owner or contributors be liable for any direct, indirect, incidental,
95 special, exemplary, or consequential damages (including, but not
96 limited to, procurement of substitute goods or services; loss of use,
97 data, or profits; or business interruption) however caused and on any
98 theory of liability, whether in contract, strict liability, or tort
99 (including negligence or otherwise) arising in any way out of the use
100 of this software, even if advised of the possibility of such damage.
102 smartypants.py license (2-Clause BSD license):
104 smartypants.py is a derivative work of SmartyPants.
106 Redistribution and use in source and binary forms, with or without
107 modification, are permitted provided that the following conditions are
110 * Redistributions of source code must retain the above copyright
111 notice, this list of conditions and the following disclaimer.
113 * Redistributions in binary form must reproduce the above copyright
114 notice, this list of conditions and the following disclaimer in
115 the documentation and/or other materials provided with the
118 This software is provided by the copyright holders and contributors
119 "as is" and any express or implied warranties, including, but not
120 limited to, the implied warranties of merchantability and fitness for
121 a particular purpose are disclaimed. In no event shall the copyright
122 owner or contributors be liable for any direct, indirect, incidental,
123 special, exemplary, or consequential damages (including, but not
124 limited to, procurement of substitute goods or services; loss of use,
125 data, or profits; or business interruption) however caused and on any
126 theory of liability, whether in contract, strict liability, or tort
127 (including negligence or otherwise) arising in any way out of the use
128 of this software, even if advised of the possibility of such damage.
130 .. _John Gruber: http://daringfireball.net/
131 .. _Chad Miller: http://web.chad.org/
133 .. _Pyblosxom: http://pyblosxom.bluesock.org/
134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/
135 .. _Movable Type: http://www.movabletype.org/
136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
137 .. _Docutils: http://docutils.sf.net/
142 SmartyPants can perform the following transformations:
144 - Straight quotes ( " and ' ) into "curly" quote characters
145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters
146 - Dashes (``--`` and ``---``) into en- and em-dash entities
147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
149 This means you can write, edit, and save your posts using plain old
150 ASCII straight quotes, plain dashes, and plain dots, but your published
151 posts (and final HTML output) will appear with smart quotes, em-dashes,
154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to
156 display text where smart quotes and other "smart punctuation" would not be
157 appropriate, such as source code or example markup.
163 If you need to use literal straight quotes (or plain hyphens and periods),
164 `smartquotes` accepts the following backslash escape sequences to force
165 ASCII-punctuation. Mind, that you need two backslashes as Docutils expands it,
179 This is useful, for example, when you want to use straight quotes as
180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac.
186 Why You Might Not Want to Use Smart Quotes in Your Weblog
187 ---------------------------------------------------------
189 For one thing, you might not care.
191 Most normal, mentally stable individuals do not take notice of proper
192 typographic punctuation. Many design and typography nerds, however, break
193 out in a nasty rash when they encounter, say, a restaurant sign that uses
194 a straight apostrophe to spell "Joe's".
196 If you're the sort of person who just doesn't care, you might well want to
197 continue not caring. Using straight quotes -- and sticking to the 7-bit
198 ASCII character set in general -- is certainly a simpler way to live.
200 Even if you *do* care about accurate typography, you still might want to
201 think twice before educating the quote characters in your weblog. One side
202 effect of publishing curly quote characters is that it makes your
203 weblog a bit harder for others to quote from using copy-and-paste. What
204 happens is that when someone copies text from your blog, the copied text
205 contains the 8-bit curly quote characters (as well as the 8-bit characters
206 for em-dashes and ellipses, if you use these options). These characters
207 are not standard across different text encoding methods, which is why they
208 need to be encoded as characters.
210 People copying text from your weblog, however, may not notice that you're
211 using curly quotes, and they'll go ahead and paste the unencoded 8-bit
212 characters copied from their browser into an email message or their own
213 weblog. When pasted as raw "smart quotes", these characters are likely to
214 get mangled beyond recognition.
216 That said, my own opinion is that any decent text editor or email client
217 makes it easy to stupefy smart quote characters into their 7-bit
218 equivalents, and I don't consider it my problem if you're using an
219 indecent text editor or email client.
222 Algorithmic Shortcomings
223 ------------------------
225 One situation in which quotes will get curled the wrong way is when
226 apostrophes are used at the start of leading contractions. For example::
228 'Twas the night before Christmas.
230 In the case above, SmartyPants will turn the apostrophe into an opening
231 single-quote, when in fact it should be the `right single quotation mark`
232 character which is also "the preferred character to use for apostrophe"
233 (Unicode). I don't think this problem can be solved in the general case --
234 every word processor I've tried gets this wrong as well. In such cases, it's
235 best to use the proper character for closing single-quotes (’) by hand.
237 In English, the same character is used for apostrophe and closing single
quote (both plain and "smart" ones). For other locales (French, Italian,
239 Swiss, ...) "smart" single closing quotes differ from the curly apostrophe.
241 .. class:: language-fr
243 Il dit : "C'est 'super' !"
245 If the apostrophe is used at the end of a word, it cannot be distinguished
246 from a single quote by the algorithm. Therefore, a text like::
248 .. class:: language-de-CH
250 "Er sagt: 'Ich fass' es nicht.'"
252 will get a single closing guillemet instead of an apostrophe.
This can be prevented by use of the curly apostrophe character (’) in
257 - "Er sagt: 'Ich fass' es nicht.'"
258 + "Er sagt: 'Ich fass’ es nicht.'"
265 - Command line front-end.
268 - Update and extend language-dependent quotes.
269 - Differentiate apostrophe from single quote.
272 - Internationalization: language-dependent quotes.
275 - Refactor code, code cleanup,
276 - `educate_tokens()` generator as interface for Docutils.
279 - Adaption to Docutils:
280 - Use Unicode instead of HTML entities,
281 - Remove code special to pyblosxom.
283 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
284 - Fixed bug where blocks of precious unalterable text was instead
285 interpreted. Thanks to Le Roux and Dirk van Oosterbosch.
287 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
288 - Fix bogus magical quotation when there is no hint that the
289 user wants it, e.g., in "21st century". Thanks to Nathan Hamblen.
290 - Be smarter about quotes before terminating numbers in an en-dash'ed
293 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
294 - Fix a date-processing bug, as reported by jacob childress.
295 - Begin a test-suite for ensuring correct output.
296 - Removed import of "string", since I didn't really need it.
(This was my first ever Python program. Sue me!)
299 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400
300 - Abort processing if the flavour is in forbidden-list. Default of
301 [ "rss" ] (Idea of Wolfgang SCHNERRING.)
302 - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING.
304 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400
305 - Some single quotes weren't replaced properly. Diff-tesuji played
308 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500
309 - Support upcoming pyblosxom 0.9 plugin verification feature.
311 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500
319 Numeric values are the easiest way to configure SmartyPants' behavior:
321 :0: Suppress all transformations. (Do nothing.)
323 :1: Performs default SmartyPants transformations: quotes (including
324 \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
325 is used to signify an em-dash; there is no support for en-dashes
327 :2: Same as smarty_pants="1", except that it uses the old-school typewriter
328 shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``"
332 :3: Same as smarty_pants="2", but inverts the shorthand for dashes:
333 "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
:-1: Stupefy mode. Reverses the SmartyPants transformation process, turning
     the characters produced by SmartyPants into their ASCII equivalents.
     E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple
     double-quote (\"), "—" is turned into two dashes, etc.
The following single-character attribute values can be combined to toggle
individual transformations from within the smarty_pants attribute. For
example, ``"1"`` is equivalent to ``"qBde"``.
:q: Educates normal quote characters: (") and (').
348 :b: Educates \`\`backticks'' -style double quotes.
350 :B: Educates \`\`backticks'' -style double quotes and \`single' quotes.
352 :d: Educates em-dashes.
354 :D: Educates em-dashes and en-dashes, using old-school typewriter shorthand:
355 (dash dash) for en-dashes, (dash dash dash) for em-dashes.
357 :i: Educates em-dashes and en-dashes, using inverted old-school typewriter
358 shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.
360 :e: Educates ellipses.
:w: Translates any instance of ``&quot;`` into a normal double-quote character.
    This should be of no interest to most people, but of particular interest
    to anyone who writes their posts using Dreamweaver, as Dreamweaver
    inexplicably uses this entity to represent a literal double-quote
    character. SmartyPants only educates normal quotes, not entities (because
    ordinarily, entities are used for the explicit purpose of representing the
    specific character they represent). The "w" option must be used in
    conjunction with one (or both) of the other quote options ("q" or "b").
    Thus, if you wish to apply all SmartyPants transformations (quotes, en-
    and em-dashes, and ellipses) and also translate ``&quot;`` entities into
    regular quotes so SmartyPants can educate them, you should pass the
    following to the smarty_pants attribute:
# Default value of the `attr` argument of `smartyPants()` and
# `educate_tokens()`; "1" turns all default transformations on.
default_smartypants_attr = "1"
382 class smartchars(object):
383 """Smart quotes and dashes
386 endash = u'–' # "–" EN DASH
387 emdash
= u
'—' # "—" EM DASH
388 ellipsis
= u
'…' # "…" HORIZONTAL ELLIPSIS
389 apostrophe
= u
'’' # "’" RIGHT SINGLE QUOTATION MARK
391 # quote characters (language-specific, set in __init__())
392 # [1] http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
393 # [2] http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
394 # [3] https://fr.wikipedia.org/wiki/Guillemet
395 # [4] http://typographisme.net/post/Les-espaces-typographiques-et-le-web
396 # [5] http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html
397 # [6] https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks
398 # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf
400 # TODO: configuration option, e.g.::
402 # smartquote-locales: nl: „“’’, # apostrophe for ``'s Gravenhage``
404 # fr: « : »:‹ : ›, # :-separated list with NBSPs
405 quotes
= {'af': u
'“”‘’',
406 'af-x-altquot': u
'„”‚’',
408 'ca-x-altquot': u
'“”‘’',
410 'cs-x-altquot': u
'»«›‹',
412 'da-x-altquot': u
'„“‚‘',
413 # 'da-x-altquot2': u'””’’',
415 'de-x-altquot': u
'»«›‹',
419 'en-uk-x-altquot': u
'‘’“”', # Attention: " → ‘ and ' → “ !
422 'es-x-altquot': u
'“”‘’',
423 'et': u
'„“‚‘', # no secondary quote listed in
424 'et-x-altquot': u
'«»‹›', # the sources above (wikipedia.org)
427 'fi-x-altquot': u
'»»››',
428 'fr': (u
'« ', u
' »', u
'“', u
'”'), # full no-break space
429 'fr-x-altquot': (u
'« ', u
' »', u
'“', u
'”'), # narrow no-break space
431 'fr-ch-x-altquot': (u
'« ', u
' »', u
'‹ ', u
' ›'), # narrow no-break space, http://typoguide.ch/
433 'he': u
'”“»«', # Hebrew is RTL, test position:
434 'he-x-altquot': u
'„”‚’', # low quotation marks are opening.
435 # 'he-x-altquot': u'“„‘‚', # RTL: low quotation marks opening
436 'hr': u
'„”‘’', # http://hrvatska-tipografija.com/polunavodnici/
437 'hr-x-altquot': u
'»«›‹',
439 'hsb-x-altquot':u
'»«›‹',
444 'it-x-altquot': u
'“”‘’',
445 # 'it-x-altquot2': u'“„‘‚', # [7] antiquated?
450 'nl-x-altquot': u
'„”‚’',
451 # 'nl-x-altquot2': u'””’’',
453 'pl-x-altquot': u
'«»‚’',
454 # 'pl-x-altquot2': u'„”‚’', # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w
459 'sh': u
'„”‚’', # Serbo-Croatian
460 'sh-x-altquot': u
'»«›‹',
461 'sk': u
'„“‚‘', # Slovak
462 'sk-x-altquot': u
'»«›‹',
463 'sl': u
'„“‚‘', # Slovenian
464 'sl-x-altquot': u
'»«›‹',
465 'sq': u
'«»‹›', # Albanian
466 'sq-x-altquot': u
'“„‘‚',
468 'sr-x-altquot': u
'»«›‹',
470 'sv-x-altquot': u
'»»››',
472 'tr-x-altquot': u
'«»‹›',
473 # 'tr-x-altquot2': u'“„‘‚', # [7] antiquated?
475 'uk-x-altquot': u
'„“‚‘',
480 def __init__(self
, language
='en'):
481 self
.language
= language
483 (self
.opquote
, self
.cpquote
,
484 self
.osquote
, self
.csquote
) = self
.quotes
[language
.lower()]
486 self
.opquote
, self
.cpquote
, self
.osquote
, self
.csquote
= u
'""\'\''
def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Main function for "traditional" use.

    Split `text` into tag and text tokens with `tokenize()`, pass the
    tokens through `educate_tokens()` and return the joined result.

    Parameter:  - text: string, may contain HTML/XML markup.
                - attr: string selecting the transformations; handed on
                  to `educate_tokens()` unchanged.
                - language: language tag selecting the quote characters;
                  handed on to `educate_tokens()` unchanged.
    Returns:    The processed `text` as a single string.
    """
    # str.join() consumes the generator directly, so no intermediate
    # list is required.
    return "".join(educate_tokens(tokenize(text), attr, language))
496 def educate_tokens(text_tokens
, attr
=default_smartypants_attr
, language
='en'):
497 """Return iterator that "educates" the items of `text_tokens`.
503 # 2 : set all, using old school en- and em- dash shortcuts
504 # 3 : set all, using inverted old school en and em- dash shortcuts
507 # b : backtick quotes (``double'' only)
508 # B : backtick quotes (``double'' and `single')
510 # D : old school dashes
511 # i : inverted old school dashes
    # w : convert &quot; entities to " for Dreamweaver users
515 convert_quot
= False # translate " entities into normal quotes?
522 # if attr == "0": # pass tokens unchanged (see below).
523 if attr
== "1": # Do everything, turn all options on.
529 # Do everything, turn all options on, use old school dash shorthand.
535 # Do everything, use inverted old school dash shorthand.
540 elif attr
== "-1": # Special "stupefy" mode.
543 if "q" in attr
: do_quotes
= True
544 if "b" in attr
: do_backticks
= True
545 if "B" in attr
: do_backticks
= 2
546 if "d" in attr
: do_dashes
= 1
547 if "D" in attr
: do_dashes
= 2
548 if "i" in attr
: do_dashes
= 3
549 if "e" in attr
: do_ellipses
= True
550 if "w" in attr
: convert_quot
= True
552 prev_token_last_char
= " "
553 # Last character of the previous text token. Used as
554 # context to curl leading quote characters correctly.
556 for (ttype
, text
) in text_tokens
:
558 # skip HTML and/or XML tags as well as empty text tokens
559 # without updating the last character
560 if ttype
== 'tag' or not text
:
564 # skip literal text (math, literal, raw, ...)
565 if ttype
== 'literal':
566 prev_token_last_char
= text
[-1:]
570 last_char
= text
[-1:] # Remember last char before processing.
572 text
= processEscapes(text
)
575 text
= re
.sub('"', '"', text
)
578 text
= educateDashes(text
)
580 text
= educateDashesOldSchool(text
)
582 text
= educateDashesOldSchoolInverted(text
)
585 text
= educateEllipses(text
)
587 # Note: backticks need to be processed before quotes.
589 text
= educateBackticks(text
, language
)
591 if do_backticks
== 2:
592 text
= educateSingleBackticks(text
, language
)
595 # Replace plain quotes in context to prevent conversion to
596 # 2-character sequence in French.
597 context
= prev_token_last_char
.replace('"',';').replace("'",';')
598 text
= educateQuotes(context
+text
, language
)[1:]
601 text
= stupefyEntities(text
, language
)
603 # Remember last char as context for the next token
604 prev_token_last_char
= last_char
606 text
= processEscapes(text
, restore
=True)
612 def educateQuotes(text
, language
='en'):
614 Parameter: - text string (unicode or bytes).
615 - language (`BCP 47` language tag.)
616 Returns: The `text`, with "educated" curly quote characters.
618 Example input: "Isn't this fun?"
    Example output: “Isn’t this fun?”
622 smart
= smartchars(language
)
625 punct_class
= r
"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
627 # Special case if the very first character is a quote
628 # followed by punctuation at a non-word-break.
629 # Close the quotes by brute force:
630 text
= re
.sub(r
"""^'(?=%s\\B)""" % (punct_class
,), smart
.csquote
, text
)
631 text
= re
.sub(r
"""^"(?=%s\\B)""" % (punct_class
,), smart
.cpquote
, text
)
633 # Special case for double sets of quotes, e.g.:
634 # <p>He said, "'Quoted' words in a larger quote."</p>
635 text
= re
.sub(r
""""'(?=\w)""", smart
.opquote
+smart
.osquote
, text
)
636 text
= re
.sub(r
"""'"(?=\w)""", smart
.osquote
+smart
.opquote
, text
)
638 # Special case for decade abbreviations (the '80s):
639 if language
.startswith('en'): # TODO similar cases in other languages?
640 text
= re
.sub(r
"""'(?=\d{2}s)""", smart
.apostrophe
, text
, re
.UNICODE
)
642 close_class
= r
"""[^\ \t\r\n\[\{\(\-]"""
643 dec_dashes
= r
"""–|—"""
645 # Get most opening single quotes:
646 opening_single_quotes_regex
= re
.compile(r
"""
648 \s | # a whitespace char, or
649 | # a non-breaking space entity, or
651 &[mn]dash; | # named dash entities
652 %s | # or decimal entities
653 &\#x201[34]; # or hex
656 (?=\w) # followed by a word character
657 """ % (dec_dashes
,), re
.VERBOSE | re
.UNICODE
)
658 text
= opening_single_quotes_regex
.sub(r
'\1'+smart
.osquote
, text
)
660 # In many locales, single closing quotes are different from apostrophe:
661 if smart
.csquote
!= smart
.apostrophe
:
662 apostrophe_regex
= re
.compile(r
"(?<=(\w|\d))'(?=\w)", re
.UNICODE
)
663 text
= apostrophe_regex
.sub(smart
.apostrophe
, text
)
664 # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
665 # "Ich fass' es nicht."
667 closing_single_quotes_regex
= re
.compile(r
"""
674 """ % (close_class
,), re
.VERBOSE | re
.UNICODE
)
675 text
= closing_single_quotes_regex
.sub(r
'\1'+smart
.csquote
, text
)
677 closing_single_quotes_regex
= re
.compile(r
"""
681 """ % (close_class
,), re
.VERBOSE | re
.UNICODE
)
682 text
= closing_single_quotes_regex
.sub(r
'\1%s\2' % smart
.csquote
, text
)
684 # Any remaining single quotes should be opening ones:
685 text
= re
.sub(r
"""'""", smart
.osquote
, text
)
687 # Get most opening double quotes:
688 opening_double_quotes_regex
= re
.compile(r
"""
690 \s | # a whitespace char, or
691 | # a non-breaking space entity, or
693 &[mn]dash; | # named dash entities
694 %s | # or decimal entities
695 &\#x201[34]; # or hex
698 (?=\w) # followed by a word character
699 """ % (dec_dashes
,), re
.VERBOSE
)
700 text
= opening_double_quotes_regex
.sub(r
'\1'+smart
.opquote
, text
)
702 # Double closing quotes:
703 closing_double_quotes_regex
= re
.compile(r
"""
704 #(%s)? # character that indicates the quote should be closing
707 """ % (close_class
,), re
.VERBOSE
)
708 text
= closing_double_quotes_regex
.sub(smart
.cpquote
, text
)
710 closing_double_quotes_regex
= re
.compile(r
"""
711 (%s) # character that indicates the quote should be closing
713 """ % (close_class
,), re
.VERBOSE
)
714 text
= closing_double_quotes_regex
.sub(r
'\1'+smart
.cpquote
, text
)
716 # Any remaining quotes should be opening ones.
717 text
= re
.sub(r
'"', smart
.opquote
, text
)
def educateBackticks(text, language='en'):
    """Replace ``backticks''-style double quotes with curly quotes.

    Parameter:  - text: string (unicode or bytes).
                - language: language tag, selects the quote characters
                  via `smartchars`.
    Returns:    The `text`, with each double backtick replaced by the
                primary opening quote and each pair of apostrophes
                replaced by the primary closing quote of `language`.
                Single apostrophes are left untouched.

    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    smart = smartchars(language)

    # The search strings are fixed literals, so plain substring
    # replacement suffices.  Unlike re.sub(), str.replace() also inserts
    # the quote strings verbatim (no backslash/group template expansion).
    text = text.replace('``', smart.opquote)
    text = text.replace("''", smart.cpquote)
    return text
def educateSingleBackticks(text, language='en'):
    """Replace `backticks'-style single quotes with curly quotes.

    Parameter:  - text: string (unicode or bytes).
                - language: language tag, selects the quote characters
                  via `smartchars`.
    Returns:    The `text`, with each backtick replaced by the secondary
                opening quote and each apostrophe replaced by the
                secondary closing quote of `language`.  This includes
                apostrophes inside words.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    smart = smartchars(language)

    # Fixed literal search strings: substring replacement is enough and
    # inserts the quote strings verbatim.
    text = text.replace('`', smart.osquote)
    text = text.replace("'", smart.csquote)
    return text
def educateDashes(text):
    """Translate "--" to an em-dash (and "---" to an en-dash).

    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character.  A run of three hyphens ("---")
                is translated to an en-dash character (yes, backwards).
    """
    # The triple hyphen has to be replaced first; otherwise its first two
    # hyphens would already be consumed by the "--" rule.
    text = text.replace('---', smartchars.endash)
    text = text.replace('--', smartchars.emdash)
    return text
def educateDashesOldSchool(text):
    """Translate "--" to an en-dash and "---" to an em-dash.

    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an en-dash character, and each "---" translated to
                an em-dash character.
    """
    # Longest sequence first, so that "---" is not split into "--" + "-".
    text = text.replace('---', smartchars.emdash)
    text = text.replace('--', smartchars.endash)
    return text
def educateDashesOldSchoolInverted(text):
    """Translate "--" to an em-dash and "---" to an en-dash.

    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character. Two reasons why: First, unlike the
                en- and em-dash syntax supported by
                educateDashesOldSchool(), it's compatible with existing
                entries written before SmartyPants 1.1, back when "--" was
                only used for em-dashes.  Second, em-dashes are more
                common than en-dashes, and so it sort of makes sense that
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
    # Longest sequence first, so that "---" is not split into "--" + "-".
    text = text.replace('---', smartchars.endash)  # "---" -> en-dash
    text = text.replace('--', smartchars.emdash)   # "--"  -> em-dash
    return text
def educateEllipses(text):
    """Translate three consecutive dots to an ellipsis character.

    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "..." (and of the
                spaced form ". . .") translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Both spellings are fixed literals; the unspaced form is replaced
    # first, then the spaced one.
    for dots in ('...', '. . .'):
        text = text.replace(dots, smartchars.ellipsis)
    return text
def stupefyEntities(text, language='en'):
    """Translate "smart" punctuation back to its ASCII counterpart.

    Parameter:  - text: string (unicode or bytes).
                - language: language tag, selects the quote characters
                  via `smartchars`.
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    smart = smartchars(language)

    # (smart character(s), ASCII replacement), applied in this order.
    # The smart strings are replaced as literal substrings: used as regular
    # expression patterns, a quote string containing a metacharacter would
    # be misinterpreted.
    replacements = (
        (smart.endash, "-"),        # en-dash
        (smart.emdash, "--"),       # em-dash
        (smart.osquote, "'"),       # open single quote
        (smart.csquote, "'"),       # close single quote
        (smart.opquote, '"'),       # open double quote
        (smart.cpquote, '"'),       # close double quote
        (smart.ellipsis, '...'),    # ellipsis
    )
    for smart_chars, ascii_chars in replacements:
        text = text.replace(smart_chars, ascii_chars)
    return text
838 def processEscapes(text
, restore
=False):
840 Parameter: String (unicode or bytes).
841 Returns: The `text`, with after processing the following backslash
842 escape sequences. This is useful if you want to force a "dumb"
843 quote or other character to appear.
854 replacements = ((r'\\', r'\'),
861 for (ch, rep) in replacements:
862 text = text.replace(rep, ch[1])
864 for (ch, rep) in replacements:
865 text = text.replace(ch, rep)
def tokenize(text):
    """Split `text` into markup tags and the runs of text between them.

    Parameter:  String containing HTML markup.
    Returns:    An iterator that yields the tokens comprising the input
                string. Each token is either a tag or a run of text
                between tags. Each yielded element is a two-element
                tuple; the first is either 'tag' or 'text'; the second
                is the actual value.  Empty text runs are not yielded.

    A tag extends from a "<" to the next ">".  Nested tags such as
    <a href="<MTFoo>"> are therefore not returned as one token: the tag
    token ends at the first ">" and the remainder becomes text.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
        <http://www.bradchoate.com/past/mtregex.php>
    """
    # A (possibly empty) run of text, followed by one tag.
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    previous_end = 0
    for token_match in tag_soup.finditer(text):
        if token_match.group(1):
            yield ('text', token_match.group(1))
        yield ('tag', token_match.group(2))
        previous_end = token_match.end()

    # Text after the last tag (or the complete input if it has no tag).
    if previous_end < len(text):
        yield ('text', text[previous_end:])
912 if __name__ == "__main__":
916 import locale # module missing in Jython
917 locale.setlocale(locale.LC_ALL, '') # set to user defaults
918 defaultlanguage = locale.getdefaultlocale()[0]
920 defaultlanguage = 'en
'
922 # Normalize and drop unsupported subtags:
923 defaultlanguage = defaultlanguage.lower().replace('-','_
')
924 # split (except singletons, which mark the following tag as non-standard):
925 defaultlanguage = re.sub(r'_([a
-zA
-Z0
-9])_
', r'_\
1-', defaultlanguage)
926 _subtags = [subtag for subtag in defaultlanguage.split('_
')]
927 _basetag = _subtags.pop(0)
928 # find all combinations of subtags
929 for n in range(len(_subtags), 0, -1):
930 for tags in itertools.combinations(_subtags, n):
931 _tag = '-'.join((_basetag,)+tags)
932 if _tag in smartchars.quotes:
933 defaultlanguage = _tag
936 if _basetag in smartchars.quotes:
937 defaultlanguage = _basetag
939 defaultlanguage = 'en
'
943 parser = argparse.ArgumentParser(
944 description='Filter stdin making ASCII punctuation
"smart".')
945 # parser.add_argument("text", help="text to be acted on")
946 parser.add_argument("-a", "--action", default="1",
947 help="what to do with the input (see --actionhelp)")
948 parser.add_argument("-e", "--encoding", default="utf8",
949 help="text encoding")
950 parser.add_argument("-l", "--language", default=defaultlanguage,
951 help="text language (BCP47 tag), Default: %s"%defaultlanguage)
952 parser.add_argument("-q", "--alternative-quotes", action="store_true",
953 help="use alternative quote style")
954 parser.add_argument("--doc", action="store_true",
955 help="print documentation")
956 parser.add_argument("--actionhelp", action="store_true",
957 help="list available actions")
958 parser.add_argument("--stylehelp", action="store_true",
959 help="list available quote styles")
960 parser.add_argument("--test", action="store_true",
961 help="perform short self-test")
962 args = parser.parse_args()
966 elif args.actionhelp:
970 print "Available styles (primary open/close, secondary open/close)"
971 print "language tag quotes"
972 print "============ ======"
973 for key in sorted(smartchars.quotes.keys()):
974 print "%-14s %s" % (key, smartchars.quotes[key])
976 # Unit test output goes to stderr.
979 class TestSmartypantsAllAttributes(unittest.TestCase):
980 # the default attribute is "1", which means "all".
981 def test_dates(self):
982 self.assertEqual(smartyPants("1440-80's
"), u"1440-80’s
")
983 self.assertEqual(smartyPants("1440-'80s"), u"1440-’80s")
984 self.assertEqual(smartyPants("1440---'80s
"), u"1440–’
80s
")
985 self.assertEqual(smartyPants("1960's"), u"1960’s")
986 self.assertEqual(smartyPants("one two '60s
"), u"one two ’
60s
")
987 self.assertEqual(smartyPants("'60s"), u"’60s")
989 def test_educated_quotes(self):
990 self.assertEqual(smartyPants('"Isn\'t this fun?"'), u'“Isn’t this fun?”
')
992 def test_html_tags(self):
993 text = '<a src
="foo">more
</a
>'
994 self.assertEqual(smartyPants(text), text)
996 suite = unittest.TestLoader().loadTestsFromTestCase(
997 TestSmartypantsAllAttributes)
998 unittest.TextTestRunner().run(suite)
1001 if args.alternative_quotes:
1002 if '-x
-altquot
' in args.language:
1003 args.language = args.language.replace('-x
-altquot
', '')
1005 args.language += '-x
-altquot
'
1006 text = sys.stdin.read().decode(args.encoding)
1007 print smartyPants(text, attr=args.action,
1008 language=args.language).encode(args.encoding)