docutils/utils/smartquotes.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   4 # :Id: $Id$
   5 # :Copyright: © 2010 Günter Milde,
   6 #             original `SmartyPants`_: © 2003 John Gruber
   7 #             smartypants.py:          © 2004, 2007 Chad Miller
   8 # :Maintainer: docutils-develop@lists.sourceforge.net
   9 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
  10 #
  11 #    Copying and distribution of this file, with or without modification,
  12 #    are permitted in any medium without royalty provided the copyright
  13 #    notices and this notice are preserved.
  14 #    This file is offered as-is, without any warranty.
  15 #
  16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
  17
  18
  19 r"""
  20 =========================
  21 Smart Quotes for Docutils
  22 =========================
  23
  24 Synopsis
  25 ========
  26
  27 "SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and
  28 BBEdit that easily translates plain ASCII punctuation characters into "smart"
  29 typographic punctuation characters.
  30
  31 ``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_.
  32
  33 * Using Unicode instead of HTML entities for typographic punctuation
  34   characters, it works for any output format that supports Unicode.
  35 * Supports `language specific quote characters`__.
  36
  37 __ http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
  38
  39
  40 Authors
  41 =======
  42
  43 `John Gruber`_ did all of the hard work of writing this software in Perl for
  44 `Movable Type`_ and almost all of this useful documentation.  `Chad Miller`_
  45 ported it to Python to use with Pyblosxom_.
  46 Adapted to Docutils_ by Günter Milde.
  47
  48 Additional Credits
  49 ==================
  50
  51 Portions of the SmartyPants original work are based on Brad Choate's nifty
  52 MTRegex plug-in.  `Brad Choate`_ also contributed a few bits of source code to
  53 this plug-in.  Brad Choate is a fine hacker indeed.
  54
  55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta
  56 testing of the original SmartyPants.
  57
  58 `Rael Dornfest`_ ported SmartyPants to Blosxom.
  59
  60 .. _Brad Choate: http://bradchoate.com/
  61 .. _Jeremy Hedley: http://antipixel.com/
  62 .. _Charles Wiltgen: http://playbacktime.com/
  63 .. _Rael Dornfest: http://raelity.org/
  64
  65
  66 Copyright and License
  67 =====================
  68
  69 SmartyPants_ license (3-Clause BSD license):
  70
  71   Copyright (c) 2003 John Gruber (http://daringfireball.net/)
  72   All rights reserved.
  73
  74   Redistribution and use in source and binary forms, with or without
  75   modification, are permitted provided that the following conditions are
  76   met:
  77
  78   * Redistributions of source code must retain the above copyright
  79     notice, this list of conditions and the following disclaimer.
  80
  81   * Redistributions in binary form must reproduce the above copyright
  82     notice, this list of conditions and the following disclaimer in
  83     the documentation and/or other materials provided with the
  84     distribution.
  85
  86   * Neither the name "SmartyPants" nor the names of its contributors
  87     may be used to endorse or promote products derived from this
  88     software without specific prior written permission.
  89
  90   This software is provided by the copyright holders and contributors
  91   "as is" and any express or implied warranties, including, but not
  92   limited to, the implied warranties of merchantability and fitness for
  93   a particular purpose are disclaimed. In no event shall the copyright
  94   owner or contributors be liable for any direct, indirect, incidental,
  95   special, exemplary, or consequential damages (including, but not
  96   limited to, procurement of substitute goods or services; loss of use,
  97   data, or profits; or business interruption) however caused and on any
  98   theory of liability, whether in contract, strict liability, or tort
  99   (including negligence or otherwise) arising in any way out of the use
 100   of this software, even if advised of the possibility of such damage.
 101
 102 smartypants.py license (2-Clause BSD license):
 103
 104   smartypants.py is a derivative work of SmartyPants.
 105
 106   Redistribution and use in source and binary forms, with or without
 107   modification, are permitted provided that the following conditions are
 108   met:
 109
 110   * Redistributions of source code must retain the above copyright
 111     notice, this list of conditions and the following disclaimer.
 112
 113   * Redistributions in binary form must reproduce the above copyright
 114     notice, this list of conditions and the following disclaimer in
 115     the documentation and/or other materials provided with the
 116     distribution.
 117
 118   This software is provided by the copyright holders and contributors
 119   "as is" and any express or implied warranties, including, but not
 120   limited to, the implied warranties of merchantability and fitness for
 121   a particular purpose are disclaimed. In no event shall the copyright
 122   owner or contributors be liable for any direct, indirect, incidental,
 123   special, exemplary, or consequential damages (including, but not
 124   limited to, procurement of substitute goods or services; loss of use,
 125   data, or profits; or business interruption) however caused and on any
 126   theory of liability, whether in contract, strict liability, or tort
 127   (including negligence or otherwise) arising in any way out of the use
 128   of this software, even if advised of the possibility of such damage.
 129
 130 .. _John Gruber: http://daringfireball.net/
 131 .. _Chad Miller: http://web.chad.org/
 132
 133 .. _Pyblosxom: http://pyblosxom.bluesock.org/
 134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/
 135 .. _Movable Type: http://www.movabletype.org/
 136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
 137 .. _Docutils: http://docutils.sf.net/
 138
 139 Description
 140 ===========
 141
 142 SmartyPants can perform the following transformations:
 143
 144 - Straight quotes ( " and ' ) into "curly" quote characters
 145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters
 146 - Dashes (``--`` and ``---``) into en- and em-dash entities
 147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
 148
 149 This means you can write, edit, and save your posts using plain old
 150 ASCII straight quotes, plain dashes, and plain dots, but your published
 151 posts (and final HTML output) will appear with smart quotes, em-dashes,
 152 and proper ellipses.
 153
 154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
 155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to
 156 display text where smart quotes and other "smart punctuation" would not be
 157 appropriate, such as source code or example markup.
 158
 159
 160 Backslash Escapes
 161 =================
 162
 163 If you need to use literal straight quotes (or plain hyphens and periods),
 164 `smartquotes` accepts the following backslash escape sequences to force
 165 ASCII-punctuation. Mind, that you need two backslashes as Docutils expands it,
 166 too.
 167
 168 ========  =========
 169 Escape    Character
 170 ========  =========
 171 ``\\``    \\
 172 ``\\"``   \\"
 173 ``\\'``   \\'
 174 ``\\.``   \\.
 175 ``\\-``   \\-
 176 ``\\```   \\`
 177 ========  =========
 178
 179 This is useful, for example, when you want to use straight quotes as
 180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac.
 181
 182
 183 Caveats
 184 =======
 185
 186 Why You Might Not Want to Use Smart Quotes in Your Weblog
 187 ---------------------------------------------------------
 188
 189 For one thing, you might not care.
 190
 191 Most normal, mentally stable individuals do not take notice of proper
 192 typographic punctuation. Many design and typography nerds, however, break
 193 out in a nasty rash when they encounter, say, a restaurant sign that uses
 194 a straight apostrophe to spell "Joe's".
 195
 196 If you're the sort of person who just doesn't care, you might well want to
 197 continue not caring. Using straight quotes -- and sticking to the 7-bit
 198 ASCII character set in general -- is certainly a simpler way to live.
 199
 200 Even if you *do* care about accurate typography, you still might want to
 201 think twice before educating the quote characters in your weblog. One side
 202 effect of publishing curly quote characters is that it makes your
 203 weblog a bit harder for others to quote from using copy-and-paste. What
 204 happens is that when someone copies text from your blog, the copied text
 205 contains the 8-bit curly quote characters (as well as the 8-bit characters
 206 for em-dashes and ellipses, if you use these options). These characters
 207 are not standard across different text encoding methods, which is why they
 208 need to be encoded as characters.
 209
 210 People copying text from your weblog, however, may not notice that you're
 211 using curly quotes, and they'll go ahead and paste the unencoded 8-bit
 212 characters copied from their browser into an email message or their own
 213 weblog. When pasted as raw "smart quotes", these characters are likely to
 214 get mangled beyond recognition.
 215
 216 That said, my own opinion is that any decent text editor or email client
 217 makes it easy to stupefy smart quote characters into their 7-bit
 218 equivalents, and I don't consider it my problem if you're using an
 219 indecent text editor or email client.
 220
 221
 222 Algorithmic Shortcomings
 223 ------------------------
 224
 225 One situation in which quotes will get curled the wrong way is when
 226 apostrophes are used at the start of leading contractions. For example::
 227
 228   'Twas the night before Christmas.
 229
 230 In the case above, SmartyPants will turn the apostrophe into an opening
 231 single-quote, when in fact it should be the `right single quotation mark`
 232 character which is also "the preferred character to use for apostrophe"
 233 (Unicode). I don't think this problem can be solved in the general case --
 234 every word processor I've tried gets this wrong as well. In such cases, it's
 235 best to use the proper character for closing single-quotes (’) by hand.
 236
 237 In English, the same character is used for apostrophe and  closing single
 238 quote (both plain and "smart" ones). For other locales (French, Italian,
 239 Swiss, ...) "smart" single closing quotes differ from the curly apostrophe.
 240
 241    .. class:: language-fr
 242
 243    Il dit : "C'est 'super' !"
 244
 245 If the apostrophe is used at the end of a word, it cannot be distinguished
 246 from a single quote by the algorithm. Therefore, a text like::
 247
 248    .. class:: language-de-CH
 249
 250    "Er sagt: 'Ich fass' es nicht.'"
 251
 252 will get a single closing guillemet instead of an apostrophe.
 253
 254 This can be prevented by use of the curly apostrophe character (’) in
 255 the source::
 256
 257    -  "Er sagt: 'Ich fass' es nicht.'"
 258    +  "Er sagt: 'Ich fass’ es nicht.'"
 259
 260
 261 Version History
 262 ===============
 263
 264 1.8:    2017-04-24
 265         - Command line front-end.
 266
 267 1.7.1:  2017-03-19
 268         - Update and extend language-dependent quotes.
 269         - Differentiate apostrophe from single quote.
 270
 271 1.7:    2012-11-19
 272         - Internationalization: language-dependent quotes.
 273
 274 1.6.1:  2012-11-06
 275         - Refactor code, code cleanup,
 276         - `educate_tokens()` generator as interface for Docutils.
 277
 278 1.6:    2010-08-26
 279         - Adaption to Docutils:
 280           - Use Unicode instead of HTML entities,
 281           - Remove code special to pyblosxom.
 282
 283 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
 284         - Fixed bug where blocks of precious unalterable text was instead
 285           interpreted.  Thanks to Le Roux and Dirk van Oosterbosch.
 286
 287 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
 288         - Fix bogus magical quotation when there is no hint that the
 289           user wants it, e.g., in "21st century".  Thanks to Nathan Hamblen.
 290         - Be smarter about quotes before terminating numbers in an en-dash'ed
 291           range.
 292
 293 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
 294         - Fix a date-processing bug, as reported by jacob childress.
 295         - Begin a test-suite for ensuring correct output.
 296         - Removed import of "string", since I didn't really need it.
 297           (This was my first ever Python program.  Sue me!)
 298
 299 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400
 300         - Abort processing if the flavour is in forbidden-list.  Default of
 301           [ "rss" ]   (Idea of Wolfgang SCHNERRING.)
 302         - Remove stray virgules from en-dashes.  Patch by Wolfgang SCHNERRING.
 303
 304 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400
 305         - Some single quotes weren't replaced properly.  Diff-tesuji played
 306           by Benjamin GEIGER.
 307
 308 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500
 309         - Support upcoming pyblosxom 0.9 plugin verification feature.
 310
 311 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500
 312         - Initial release
 313 """
 314
# reStructuredText description of the values understood by the `attr`
# argument of `smartyPants()` and `educate_tokens()`.  Kept as a raw string
# so that the backslash escapes below reach the reader unchanged.
options = r"""
Options
=======

Numeric values are the easiest way to configure SmartyPants' behavior:

:0:     Suppress all transformations. (Do nothing.)

:1:     Performs default SmartyPants transformations: quotes (including
        \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
        is used to signify an em-dash; there is no support for en-dashes

:2:     Same as smarty_pants="1", except that it uses the old-school typewriter
        shorthand for dashes:  "``--``" (dash dash) for en-dashes, "``---``"
        (dash dash dash)
        for em-dashes.

:3:     Same as smarty_pants="2", but inverts the shorthand for dashes:
        "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
        en-dashes.

:-1:    Stupefy mode. Reverses the SmartyPants transformation process, turning
        the characters produced by SmartyPants into their ASCII equivalents.
        E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple
        double-quote (\"), "—" is turned into two dashes, etc.


The following single-character attribute values can be combined to toggle
individual transformations from within the smarty_pants attribute. For
example, ``"1"`` is equivalent to ``"qBde"``.

:q:     Educates normal quote characters: (") and (').

:b:     Educates \`\`backticks'' -style double quotes.

:B:     Educates \`\`backticks'' -style double quotes and \`single' quotes.

:d:     Educates em-dashes.

:D:     Educates em-dashes and en-dashes, using old-school typewriter shorthand:
        (dash dash) for en-dashes, (dash dash dash) for em-dashes.

:i:     Educates em-dashes and en-dashes, using inverted old-school typewriter
        shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.

:e:     Educates ellipses.

:w:     Translates any instance of ``&quot;`` into a normal double-quote character.
        This should be of no interest to most people, but of particular interest
        to anyone who writes their posts using Dreamweaver, as Dreamweaver
        inexplicably uses this entity to represent a literal double-quote
        character. SmartyPants only educates normal quotes, not entities (because
        ordinarily, entities are used for the explicit purpose of representing the
        specific character they represent). The "w" option must be used in
        conjunction with one (or both) of the other quote options ("q" or "b").
        Thus, if you wish to apply all SmartyPants transformations (quotes, en-
        and em-dashes, and ellipses) and also translate ``&quot;`` entities into
        regular quotes so SmartyPants can educate them, you should pass the
        following to the smarty_pants attribute:
"""
 375
 376
# Default value of the `attr` argument of `smartyPants()` and
# `educate_tokens()`: "1" switches on quotes, backticks, dashes (``--`` as
# em-dash) and ellipses.
default_smartypants_attr = "1"
 378
 379
 380 import re, sys
 381
 382 class smartchars(object):
 383     """Smart quotes and dashes
 384     """
 385
 386     endash   = u'–' # "&#8211;" EN DASH
 387     emdash   = u'—' # "&#8212;" EM DASH
 388     ellipsis = u'…' # "&#8230;" HORIZONTAL ELLIPSIS
 389     apostrophe = u'’' # "&#8217;" RIGHT SINGLE QUOTATION MARK
 390
 391     # quote characters (language-specific, set in __init__())
 392     # [1] http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
 393     # [2] http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
 394     # [3] https://fr.wikipedia.org/wiki/Guillemet
 395     # [4] http://typographisme.net/post/Les-espaces-typographiques-et-le-web
 396     # [5] http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html
 397     # [6] https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks
 398     # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf
 399     # [8] http://www.korrekturavdelingen.no/anforselstegn.htm
 400     # [9] Typografisk håndbok. Oslo: Spartacus. 2000. s. 67. ISBN 8243001530.
 401     # [10] http://www.typografi.org/sitat/sitatart.html
 402     #
 403     # TODO: configuration option, e.g.::
 404     #
 405     #   smartquote-locales: nl: „“’’,  # apostrophe for ``'s Gravenhage``
 406     #                       nr: se,    # alias
 407     #                       fr: « : »:‹ : ›, # :-separated list with NBSPs
 408     quotes = {'af':           u'“”‘’',
 409               'af-x-altquot': u'„”‚’',
 410               'bg':           u'„“‚‘', # Bulgarian, https://bg.wikipedia.org/wiki/Кавички
 411               'ca':           u'«»“”',
 412               'ca-x-altquot': u'“”‘’',
 413               'cs':           u'„“‚‘',
 414               'cs-x-altquot': u'»«›‹',
 415               'da':           u'»«›‹',
 416               'da-x-altquot': u'„“‚‘',
 417               # 'da-x-altquot2': u'””’’',
 418               'de':           u'„“‚‘',
 419               'de-x-altquot': u'»«›‹',
 420               'de-ch':        u'«»‹›',
 421               'el':           u'«»“”',
 422               'en':           u'“”‘’',
 423               'en-uk-x-altquot': u'‘’“”', # Attention: " → ‘ and ' → “ !
 424               'eo':           u'“”‘’',
 425               'es':           u'«»“”',
 426               'es-x-altquot': u'“”‘’',
 427               'et':           u'„“‚‘', # no secondary quote listed in
 428               'et-x-altquot': u'«»‹›', # the sources above (wikipedia.org)
 429               'eu':           u'«»‹›',
 430               'fi':           u'””’’',
 431               'fi-x-altquot': u'»»››',
 432               'fr':           (u'« ', u' »', u'“', u'”'), # full no-break space
 433               'fr-x-altquot': (u'« ', u' »', u'“', u'”'), # narrow no-break space
 434               'fr-ch':        u'«»‹›',
 435               'fr-ch-x-altquot': (u'« ',  u' »', u'‹ ', u' ›'), # narrow no-break space, http://typoguide.ch/
 436               'gl':           u'«»“”',
 437               'he':           u'”“»«', # Hebrew is RTL, test position:
 438               'he-x-altquot': u'„”‚’', # low quotation marks are opening.
 439               # 'he-x-altquot': u'“„‘‚', # RTL: low quotation marks opening
 440               'hr':           u'„”‘’', # http://hrvatska-tipografija.com/polunavodnici/
 441               'hr-x-altquot': u'»«›‹',
 442               'hsb':          u'„“‚‘',
 443               'hsb-x-altquot':u'»«›‹',
 444               'hu':           u'„”«»',
 445               'is':           u'„“‚‘',
 446               'it':           u'«»“”',
 447               'it-ch':        u'«»‹›',
 448               'it-x-altquot': u'“”‘’',
 449               # 'it-x-altquot2': u'“„‘‚', # [7] in headlines
 450               'ja':           u'「」『』',
 451               'lt':           u'„“‚‘',
 452               'lv':           u'„“‚‘',
 453               'mk':           u'„“‚‘', # Macedonian, https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик
 454               'nl':           u'“”‘’',
 455               'nl-x-altquot': u'„”‚’',
 456               # 'nl-x-altquot2': u'””’’',
 457               'nb':           u'«»’’', # Norsk bokmål (canonical form 'no')
 458               'nn':           u'«»’’', # Nynorsk [10]
 459               'nn-x-altquot': u'«»‘’', # [8], [10]
 460               # 'nn-x-altquot2': u'«»«»', # [9], [10
 461               # 'nn-x-altquot3': u'„“‚‘', # [10]
 462               'no':           u'«»’’', # Norsk bokmål [10]
 463               'no-x-altquot': u'«»‘’', # [8], [10]
 464               # 'no-x-altquot2': u'«»«»', # [9], [10
 465               # 'no-x-altquot3': u'„“‚‘', # [10]
 466               'pl':           u'„”«»',
 467               'pl-x-altquot': u'«»‚’',
 468               # 'pl-x-altquot2': u'„”‚’', # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w
 469               'pt':           u'«»“”',
 470               'pt-br':        u'“”‘’',
 471               'ro':           u'„”«»',
 472               'ru':           u'«»„“',
 473               'sh':           u'„”‚’', # Serbo-Croatian
 474               'sh-x-altquot': u'»«›‹',
 475               'sk':           u'„“‚‘', # Slovak
 476               'sk-x-altquot': u'»«›‹',
 477               'sl':           u'„“‚‘', # Slovenian
 478               'sl-x-altquot': u'»«›‹',
 479               'sq':           u'«»‹›', # Albanian
 480               'sq-x-altquot': u'“„‘‚',
 481               'sr':           u'„”’’',
 482               'sr-x-altquot': u'»«›‹',
 483               'sv':           u'””’’',
 484               'sv-x-altquot': u'»»››',
 485               'tr':           u'“”‘’',
 486               'tr-x-altquot': u'«»‹›',
 487               # 'tr-x-altquot2': u'“„‘‚', # [7] antiquated?
 488               'uk':           u'«»„“',
 489               'uk-x-altquot': u'„“‚‘',
 490               'zh-cn':        u'“”‘’',
 491               'zh-tw':        u'「」『』',
 492              }
 493
 494     def __init__(self, language='en'):
 495         self.language = language
 496         try:
 497             (self.opquote, self.cpquote,
 498              self.osquote, self.csquote) = self.quotes[language.lower()]
 499         except KeyError:
 500             self.opquote, self.cpquote, self.osquote, self.csquote = u'""\'\''
 501
 502
 503 def smartyPants(text, attr=default_smartypants_attr, language='en'):
 504     """Main function for "traditional" use."""
 505
 506     return "".join([t for t in educate_tokens(tokenize(text),
 507                                               attr, language)])
 508
 509
 510 def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
 511     """Return iterator that "educates" the items of `text_tokens`.
 512     """
 513
 514     # Parse attributes:
 515     # 0 : do nothing
 516     # 1 : set all
 517     # 2 : set all, using old school en- and em- dash shortcuts
 518     # 3 : set all, using inverted old school en and em- dash shortcuts
 519     #
 520     # q : quotes
 521     # b : backtick quotes (``double'' only)
 522     # B : backtick quotes (``double'' and `single')
 523     # d : dashes
 524     # D : old school dashes
 525     # i : inverted old school dashes
 526     # e : ellipses
 527     # w : convert &quot; entities to " for Dreamweaver users
 528
 529     convert_quot = False  # translate &quot; entities into normal quotes?
 530     do_dashes = False
 531     do_backticks = False
 532     do_quotes = False
 533     do_ellipses = False
 534     do_stupefy = False
 535
 536     # if attr == "0": # pass tokens unchanged (see below).
 537     if attr == "1": # Do everything, turn all options on.
 538         do_quotes    = True
 539         do_backticks = True
 540         do_dashes    = 1
 541         do_ellipses  = True
 542     elif attr == "2":
 543         # Do everything, turn all options on, use old school dash shorthand.
 544         do_quotes    = True
 545         do_backticks = True
 546         do_dashes    = 2
 547         do_ellipses  = True
 548     elif attr == "3":
 549         # Do everything, use inverted old school dash shorthand.
 550         do_quotes    = True
 551         do_backticks = True
 552         do_dashes    = 3
 553         do_ellipses  = True
 554     elif attr == "-1": # Special "stupefy" mode.
 555         do_stupefy   = True
 556     else:
 557         if "q" in attr: do_quotes = True
 558         if "b" in attr: do_backticks = True
 559         if "B" in attr: do_backticks = 2
 560         if "d" in attr: do_dashes = 1
 561         if "D" in attr: do_dashes = 2
 562         if "i" in attr: do_dashes = 3
 563         if "e" in attr: do_ellipses = True
 564         if "w" in attr: convert_quot = True
 565
 566     prev_token_last_char = " "
 567     # Last character of the previous text token. Used as
 568     # context to curl leading quote characters correctly.
 569
 570     for (ttype, text) in text_tokens:
 571
 572         # skip HTML and/or XML tags as well as emtpy text tokens
 573         # without updating the last character
 574         if ttype == 'tag' or not text:
 575             yield text
 576             continue
 577
 578         # skip literal text (math, literal, raw, ...)
 579         if ttype == 'literal':
 580             prev_token_last_char = text[-1:]
 581             yield text
 582             continue
 583
 584         last_char = text[-1:] # Remember last char before processing.
 585
 586         text = processEscapes(text)
 587
 588         if convert_quot:
 589             text = re.sub('&quot;', '"', text)
 590
 591         if do_dashes == 1:
 592             text = educateDashes(text)
 593         elif do_dashes == 2:
 594             text = educateDashesOldSchool(text)
 595         elif do_dashes == 3:
 596             text = educateDashesOldSchoolInverted(text)
 597
 598         if do_ellipses:
 599             text = educateEllipses(text)
 600
 601         # Note: backticks need to be processed before quotes.
 602         if do_backticks:
 603             text = educateBackticks(text, language)
 604
 605         if do_backticks == 2:
 606             text = educateSingleBackticks(text, language)
 607
 608         if do_quotes:
 609             # Replace plain quotes in context to prevent converstion to
 610             # 2-character sequence in French.
 611             context = prev_token_last_char.replace('"',';').replace("'",';')
 612             text = educateQuotes(context+text, language)[1:]
 613
 614         if do_stupefy:
 615             text = stupefyEntities(text, language)
 616
 617         # Remember last char as context for the next token
 618         prev_token_last_char = last_char
 619
 620         text = processEscapes(text, restore=True)
 621
 622         yield text
 623
 624
 625
 626 def educateQuotes(text, language='en'):
 627     """
 628     Parameter:  - text string (unicode or bytes).
 629                 - language (`BCP 47` language tag.)
 630     Returns:    The `text`, with "educated" curly quote characters.
 631
 632     Example input:  "Isn't this fun?"
 633     Example output: “Isn’t this fun?“;
 634     """
 635
 636     smart = smartchars(language)
 637
 638     # oldtext = text
 639     punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
 640
 641     # Special case if the very first character is a quote
 642     # followed by punctuation at a non-word-break.
 643     # Close the quotes by brute force:
 644     text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.csquote, text)
 645     text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.cpquote, text)
 646
 647     # Special case for double sets of quotes, e.g.:
 648     #   <p>He said, "'Quoted' words in a larger quote."</p>
 649     text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text)
 650     text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text)
 651
 652     # Special case for decade abbreviations (the '80s):
 653     if language.startswith('en'): # TODO similar cases in other languages?
 654         text = re.sub(r"""'(?=\d{2}s)""", smart.apostrophe, text, re.UNICODE)
 655
 656     close_class = r"""[^\ \t\r\n\[\{\(\-]"""
 657     dec_dashes = r"""&#8211;|&#8212;"""
 658
 659     # Get most opening single quotes:
 660     opening_single_quotes_regex = re.compile(r"""
 661                     (
 662                             \s          |   # a whitespace char, or
 663                             &nbsp;      |   # a non-breaking space entity, or
 664                             --          |   # dashes, or
 665                             &[mn]dash;  |   # named dash entities
 666                             %s          |   # or decimal entities
 667                             &\#x201[34];    # or hex
 668                     )
 669                     '                 # the quote
 670                     (?=\w)            # followed by a word character
 671                     """ % (dec_dashes,), re.VERBOSE | re.UNICODE)
 672     text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text)
 673
 674     # In many locales, single closing quotes are different from apostrophe:
 675     if smart.csquote != smart.apostrophe:
 676         apostrophe_regex = re.compile(r"(?<=(\w|\d))'(?=\w)", re.UNICODE)
 677         text = apostrophe_regex.sub(smart.apostrophe, text)
 678     # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
 679     # "Ich fass' es nicht."
 680
 681     closing_single_quotes_regex = re.compile(r"""
 682                     (%s)
 683                     '
 684                     (?!\s  |       # whitespace
 685                        s\b |
 686                         \d         # digits   ('80s)
 687                     )
 688                     """ % (close_class,), re.VERBOSE | re.UNICODE)
 689     text = closing_single_quotes_regex.sub(r'\1'+smart.csquote, text)
 690
 691     closing_single_quotes_regex = re.compile(r"""
 692                     (%s)
 693                     '
 694                     (\s | s\b)
 695                     """ % (close_class,), re.VERBOSE | re.UNICODE)
 696     text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)
 697
 698     # Any remaining single quotes should be opening ones:
 699     text = re.sub(r"""'""", smart.osquote, text)
 700
 701     # Get most opening double quotes:
 702     opening_double_quotes_regex = re.compile(r"""
 703                     (
 704                             \s          |   # a whitespace char, or
 705                             &nbsp;      |   # a non-breaking space entity, or
 706                             --          |   # dashes, or
 707                             &[mn]dash;  |   # named dash entities
 708                             %s          |   # or decimal entities
 709                             &\#x201[34];    # or hex
 710                     )
 711                     "                 # the quote
 712                     (?=\w)            # followed by a word character
 713                     """ % (dec_dashes,), re.VERBOSE)
 714     text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text)
 715
 716     # Double closing quotes:
 717     closing_double_quotes_regex = re.compile(r"""
 718                     #(%s)?   # character that indicates the quote should be closing
 719                     "
 720                     (?=\s)
 721                     """ % (close_class,), re.VERBOSE)
 722     text = closing_double_quotes_regex.sub(smart.cpquote, text)
 723
 724     closing_double_quotes_regex = re.compile(r"""
 725                     (%s)   # character that indicates the quote should be closing
 726                     "
 727                     """ % (close_class,), re.VERBOSE)
 728     text = closing_double_quotes_regex.sub(r'\1'+smart.cpquote, text)
 729
 730     # Any remaining quotes should be opening ones.
 731     text = re.sub(r'"', smart.opquote, text)
 732
 733     return text
 734
 735
 736 def educateBackticks(text, language='en'):
 737     """
 738     Parameter:  String (unicode or bytes).
 739     Returns:    The `text`, with ``backticks'' -style double quotes
 740                 translated into HTML curly quote entities.
 741     Example input:  ``Isn't this fun?''
 742     Example output: “Isn't this fun?“;
 743     """
 744     smart = smartchars(language)
 745
 746     text = re.sub(r"""``""", smart.opquote, text)
 747     text = re.sub(r"""''""", smart.cpquote, text)
 748     return text
 749
 750
 751 def educateSingleBackticks(text, language='en'):
 752     """
 753     Parameter:  String (unicode or bytes).
 754     Returns:    The `text`, with `backticks' -style single quotes
 755                 translated into HTML curly quote entities.
 756
 757     Example input:  `Isn't this fun?'
 758     Example output: ‘Isn’t this fun?’
 759     """
 760     smart = smartchars(language)
 761
 762     text = re.sub(r"""`""", smart.osquote, text)
 763     text = re.sub(r"""'""", smart.csquote, text)
 764     return text
 765
 766
 767 def educateDashes(text):
 768     """
 769     Parameter:  String (unicode or bytes).
 770     Returns:    The `text`, with each instance of "--" translated to
 771                 an em-dash character.
 772     """
 773
 774     text = re.sub(r"""---""", smartchars.endash, text) # en  (yes, backwards)
 775     text = re.sub(r"""--""", smartchars.emdash, text) # em (yes, backwards)
 776     return text
 777
 778
 779 def educateDashesOldSchool(text):
 780     """
 781     Parameter:  String (unicode or bytes).
 782     Returns:    The `text`, with each instance of "--" translated to
 783                 an en-dash character, and each "---" translated to
 784                 an em-dash character.
 785     """
 786
 787     text = re.sub(r"""---""", smartchars.emdash, text)
 788     text = re.sub(r"""--""", smartchars.endash, text)
 789     return text
 790
 791
 792 def educateDashesOldSchoolInverted(text):
 793     """
 794     Parameter:  String (unicode or bytes).
 795     Returns:    The `text`, with each instance of "--" translated to
 796                 an em-dash character, and each "---" translated to
 797                 an en-dash character. Two reasons why: First, unlike the
 798                 en- and em-dash syntax supported by
 799                 EducateDashesOldSchool(), it's compatible with existing
 800                 entries written before SmartyPants 1.1, back when "--" was
 801                 only used for em-dashes.  Second, em-dashes are more
 802                 common than en-dashes, and so it sort of makes sense that
 803                 the shortcut should be shorter to type. (Thanks to Aaron
 804                 Swartz for the idea.)
 805     """
 806     text = re.sub(r"""---""", smartchars.endash, text)    # em
 807     text = re.sub(r"""--""", smartchars.emdash, text)    # en
 808     return text
 809
 810
 811
 812 def educateEllipses(text):
 813     """
 814     Parameter:  String (unicode or bytes).
 815     Returns:    The `text`, with each instance of "..." translated to
 816                 an ellipsis character.
 817
 818     Example input:  Huh...?
 819     Example output: Huh&#8230;?
 820     """
 821
 822     text = re.sub(r"""\.\.\.""", smartchars.ellipsis, text)
 823     text = re.sub(r"""\. \. \.""", smartchars.ellipsis, text)
 824     return text
 825
 826
 827 def stupefyEntities(text, language='en'):
 828     """
 829     Parameter:  String (unicode or bytes).
 830     Returns:    The `text`, with each SmartyPants character translated to
 831                 its ASCII counterpart.
 832
 833     Example input:  “Hello — world.”
 834     Example output: "Hello -- world."
 835     """
 836     smart = smartchars(language)
 837
 838     text = re.sub(smart.endash, "-", text)  # en-dash
 839     text = re.sub(smart.emdash, "--", text) # em-dash
 840
 841     text = re.sub(smart.osquote, "'", text)  # open single quote
 842     text = re.sub(smart.csquote, "'", text)  # close single quote
 843
 844     text = re.sub(smart.opquote, '"', text)  # open double quote
 845     text = re.sub(smart.cpquote, '"', text)  # close double quote
 846
 847     text = re.sub(smart.ellipsis, '...', text)# ellipsis
 848
 849     return text
 850
 851
 852 def processEscapes(text, restore=False):
 853     r"""
 854     Parameter:  String (unicode or bytes).
 855     Returns:    The `text`, with after processing the following backslash
 856                 escape sequences. This is useful if you want to force a "dumb"
 857                 quote or other character to appear.
 858
 859                 Escape  Value
 860                 ------  -----
 861                 \\      &#92;
 862                 \"      &#34;
 863                 \'      &#39;
 864                 \.      &#46;
 865                 \-      &#45;
 866                 \`      &#96;
 867     """
 868     replacements = ((r'\\', r'&#92;'),
 869                     (r'\"', r'&#34;'),
 870                     (r"\'", r'&#39;'),
 871                     (r'\.', r'&#46;'),
 872                     (r'\-', r'&#45;'),
 873                     (r'\`', r'&#96;'))
 874     if restore:
 875         for (ch, rep) in replacements:
 876             text = text.replace(rep, ch[1])
 877     else:
 878         for (ch, rep) in replacements:
 879             text = text.replace(ch, rep)
 880
 881     return text
 882
 883
 884 def tokenize(text):
 885     """
 886     Parameter:  String containing HTML markup.
 887     Returns:    An iterator that yields the tokens comprising the input
 888                 string. Each token is either a tag (possibly with nested,
 889                 tags contained therein, such as <a href="<MTFoo>">, or a
 890                 run of text between tags. Each yielded element is a
 891                 two-element tuple; the first is either 'tag' or 'text';
 892                 the second is the actual value.
 893
 894     Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
 895         <http://www.bradchoate.com/past/mtregex.php>
 896     """
 897
 898     pos = 0
 899     length = len(text)
 900     # tokens = []
 901
 902     depth = 6
 903     nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth)
 904     #match = r"""(?: <! ( -- .*? -- \s* )+ > ) |  # comments
 905     #               (?: <\? .*? \?> ) |  # directives
 906     #               %s  # nested tags       """ % (nested_tags,)
 907     tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")
 908
 909     token_match = tag_soup.search(text)
 910
 911     previous_end = 0
 912     while token_match is not None:
 913         if token_match.group(1):
 914             yield ('text', token_match.group(1))
 915
 916         yield ('tag', token_match.group(2))
 917
 918         previous_end = token_match.end()
 919         token_match = tag_soup.search(text, token_match.end())
 920
 921     if previous_end < len(text):
 922         yield ('text', text[previous_end:])
 923
 924
 925
 926 if __name__ == "__main__":
 927
 928     import itertools
 929     try:
 930         import locale # module missing in Jython
 931         locale.setlocale(locale.LC_ALL, '') # set to user defaults
 932         defaultlanguage = locale.getdefaultlocale()[0]
 933     except:
 934         defaultlanguage = 'en'
 935
 936     # Normalize and drop unsupported subtags:
 937     defaultlanguage = defaultlanguage.lower().replace('-','_')
 938     # split (except singletons, which mark the following tag as non-standard):
 939     defaultlanguage = re.sub(r'_([a-zA-Z0-9])_', r'_\1-', defaultlanguage)
 940     _subtags = [subtag for subtag in defaultlanguage.split('_')]
 941     _basetag = _subtags.pop(0)
 942     # find all combinations of subtags
 943     for n in range(len(_subtags), 0, -1):
 944         for tags in itertools.combinations(_subtags, n):
 945             _tag = '-'.join((_basetag,)+tags)
 946             if _tag in smartchars.quotes:
 947                 defaultlanguage = _tag
 948                 break
 949         else:
 950             if _basetag in smartchars.quotes:
 951                 defaultlanguage = _basetag
 952             else:
 953                 defaultlanguage = 'en'
 954
 955
 956     import argparse
 957     parser = argparse.ArgumentParser(
 958                 description='Filter stdin making ASCII punctuation "smart".')
 959     # parser.add_argument("text", help="text to be acted on")
 960     parser.add_argument("-a", "--action", default="1",
 961                         help="what to do with the input (see --actionhelp)")
 962     parser.add_argument("-e", "--encoding", default="utf8",
 963                         help="text encoding")
 964     parser.add_argument("-l", "--language", default=defaultlanguage,
 965                         help="text language (BCP47 tag), Default: %s"%defaultlanguage)
 966     parser.add_argument("-q", "--alternative-quotes", action="store_true",
 967                         help="use alternative quote style")
 968     parser.add_argument("--doc", action="store_true",
 969                         help="print documentation")
 970     parser.add_argument("--actionhelp", action="store_true",
 971                         help="list available actions")
 972     parser.add_argument("--stylehelp", action="store_true",
 973                         help="list available quote styles")
 974     parser.add_argument("--test", action="store_true",
 975                         help="perform short self-test")
 976     args = parser.parse_args()
 977
 978     if args.doc:
 979         print (__doc__)
 980     elif args.actionhelp:
 981         print options
 982     elif args.stylehelp:
 983         print
 984         print "Available styles (primary open/close, secondary open/close)"
 985         print "language tag   quotes"
 986         print "============   ======"
 987         for key in sorted(smartchars.quotes.keys()):
 988             print "%-14s %s" % (key, smartchars.quotes[key])
 989     elif args.test:
 990         # Unit test output goes to stderr.
 991         import unittest
 992
 993         class TestSmartypantsAllAttributes(unittest.TestCase):
 994             # the default attribute is "1", which means "all".
 995             def test_dates(self):
 996                 self.assertEqual(smartyPants("1440-80's"), u"1440-80’s")
 997                 self.assertEqual(smartyPants("1440-'80s"), u"1440-’80s")
 998                 self.assertEqual(smartyPants("1440---'80s"), u"1440–’80s")
 999                 self.assertEqual(smartyPants("1960's"), u"1960’s")
1000                 self.assertEqual(smartyPants("one two '60s"), u"one two ’60s")
1001                 self.assertEqual(smartyPants("'60s"), u"’60s")
1002
1003             def test_educated_quotes(self):
1004                 self.assertEqual(smartyPants('"Isn\'t this fun?"'), u'“Isn’t this fun?”')
1005
1006             def test_html_tags(self):
1007                 text = '<a src="foo">more</a>'
1008                 self.assertEqual(smartyPants(text), text)
1009
1010         suite = unittest.TestLoader().loadTestsFromTestCase(
1011                                             TestSmartypantsAllAttributes)
1012         unittest.TextTestRunner().run(suite)
1013
1014     else:
1015         if args.alternative_quotes:
1016             if '-x-altquot' in args.language:
1017                 args.language = args.language.replace('-x-altquot', '')
1018             else:
1019                 args.language += '-x-altquot'
1020         text = sys.stdin.read().decode(args.encoding)
1021         print smartyPants(text, attr=args.action,
1022                           language=args.language).encode(args.encoding)