webapps/qooxdoo-0.6.3-sdk/frontend/framework/tool/modules/textile.py

   1 #!/usr/bin/env python
   2 # _*_ coding: latin1 _*_
   3
   4 """This is Textile
   5 A Humane Web Text Generator
   6
   7 TODO:
   8 * Make it work with Python 2.1.
   9 * Make it work with Python 1.5.2? Or that's too optimistic?
  10
  11 ---
  12 To get an overview of all PyTextile's features, simply
  13 type 'tell me about textile.' in a single line.
  14 """
  15
  16 __authors__ = ["Roberto A. F. De Almeida (roberto@dealmeida.net)",
  17                "Mark Pilgrim (f8dy@diveintomark.org)"]
  18 __version__ = "2.0.10"
  19 __date__ = "2004/10/06"
  20 __copyright__ = """
  21 Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
  22 Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/
  23 All rights reserved.
  24
  25 Original PHP version:
  26 Version 1.0
  27 21 Feb, 2003
  28
  29 Copyright (c) 2003, Dean Allen, www.textism.com
  30 All rights reserved.
  31
  32 Parts of the documentation and some of the regular expressions are (c) Brad
  33 Choate, http://bradchoate.com/. Thanks, Brad!
  34 """
  35 __license__ = """
  36 Redistribution and use in source and binary forms, with or without
  37 modification, are permitted provided that the following conditions are met:
  38
  39 * Redistributions of source code must retain the above copyright notice,
  40   this list of conditions and the following disclaimer.
  41
  42 * Redistributions in binary form must reproduce the above copyright notice,
  43   this list of conditions and the following disclaimer in the documentation
  44   and/or other materials provided with the distribution.
  45
  46 * Neither the name Textile nor the names of its contributors may be used to
  47   endorse or promote products derived from this software without specific
  48   prior written permission.
  49
  50 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  51 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  52 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  53 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  54 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  55 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  56 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  57 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  58 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  59 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  60 POSSIBILITY OF SUCH DAMAGE.
  61 """
  62 __history__ = """
  63 1.0 - 2003/03/19 - MAP - initial release
  64 1.01 - 2003/03/19 - MAP - don't strip whitespace within <pre> tags;
  65   map high-bit ASCII to HTML numeric entities
  66 1.02 - 2003/03/19 - MAP - changed hyperlink qtag expression to only
  67   match valid URL characters (per RFC 2396); fixed preg_replace to
  68   not match across line breaks (solves lots of problems with
  69   mistakenly matching overlapping inline markup); fixed whitespace
  70   stripping to only strip whitespace from beginning and end of lines,
  71   not immediately before and after HTML tags.
  72 1.03 - 2003/03/20 - MAP - changed hyperlink qtag again to more
  73   closely match original Textile (fixes problems with links
  74   immediately followed by punctuation -- somewhere Dean is
  75   grinning right now); handle curly apostrophe with "ve"
  76   contraction; clean up empty titles at end.
  77 1.04 - 2003/03/23 - MAP - lstrip input to deal with extra spaces at
  78   beginning of first line; tweaked list loop to handle consecutive lists
  79 1.1 - 2003/06/06 - MAP - created initial test suite for links and images,
  80   and fixed a bunch of related bugs to pass them
  81 1.11 - 2003/07/20 - CL - don't demoronise unicode strings; handle
  82   "they're" properly
  83 1.12 - 2003/07/23 - GW - print debug messages to stderr; handle bq(cite).
  84 1.13 - 2003/07/23 - MAP - wrap bq. text in <p>...</p>
  85 2 - 2004/03/26 - RAFA - rewritten from (almost) scratch to include
  86   all features from Textile 2 and a little bit more.
  87 2.0.1 - 2004/04/02 - RAFA - Fixed validating function that uses uTidyLib.
  88 2.0.2 - 2004/04/02 - RAFA - Fixed problem with caps letters in URLs.
  89 2.0.3 - 2004/04/19 - RAFA - Multiple classes are allowed, thanks to Dave
  90   Anderson. The "lang" attribute is now removed from <code>, to be valid
  91   XHTML. Fixed <span class="caps">UCAS</span> problem.
  92 2.0.4 - 2004/05/20 - RAFA, CLB - Added inline formatting to table cells.
  93   Curt Bergmann fixed a bug with the colspan formatting. Added Amazon
  94   Associated id.
  95 2.0.5 - 2004/06/01 - CL - Applied patch from Chris Lawrence to (1) fix
  96   that Amazon associates ID was being added to all search URIs, (2)
  97   customize the Amazon site used with the AMAZON variable, and (3) added
  98   an "isbn" URI type that links directly to an Amazon product by ISBN or
  99   Amazon ASIN.
 100 2.0.6 - 2004/06/02 - RAFA - Fixed CAPS problem, again. I hope this is
 101   the last time.
 102 2.0.7 - 2004/06/04 - RAFA, MW - Fixed bullet macro, thanks to Adam
 103   Messinger. Added patch from Michal Wallace changing {}.pop() for
 104   compatibility with Python 2.2.x.
 105 2.0.8 - 2004/06/25 - RAFA - Strip tags when adding the content from a
 106   footnote to the reference link. Escaped '<' and '>' in the self-
 107   generated documentation.
 108 2.0.9 - 2004/10/04 - RAFA - In images, if ALT is not defined, add an
 109   empty attribute. Added "LaTeX" style open/close quotes. Fixed a bug
 110   where the acronym definition was being formatted with inline rules.
 111   Handle "broken" lines correctly, removing the <br /> from inside
 112   split HTML tags.
 113 2.0.10 - 2004/10/06 - RAFA, LO - Escape all non-escaped ampersands.
 114   Applied "trivial patch" from Ludvig Omholt to remove newline right
 115   after the <pre> tag.
 116 """
 117
# ---------------------------------------------------------------------
# Module-level configuration. Edit these values to change the defaults.
# ---------------------------------------------------------------------

# Set your encoding here.
ENCODING = 'utf-8'

# Output encoding. Non-ASCII characters will be automatically
# converted to XML entities if you choose ASCII.
OUTPUT = 'utf-8'

# PyTextile can optionally validate the generated
# XHTML code. We can use either mxTidy or uTidyLib.
# You can change the default behaviour here (0 = off).
VALIDATE = 0

# If you want h1. to be translated to something other
# than <h1>, change this offset. You can also pass it
# as an argument to textile().
HEAD_OFFSET = 0

# If you want to use itex2mml, specify the full path
# to the binary here. You can download it from here:
# http://golem.ph.utexas.edu/~distler/blog/files/itexToMML.tar.gz
itex2mml = None
#itex2mml = '/usr/local/bin/itex2MML'
#itex2mml = '/usr/people/almeida/bin/itex2MML'

# PyTextile can optionally sanitize the generated XHTML,
# which is good for weblog comments or if you don't trust
# yourself (1 = on).
SANITIZE = 1

# Debug verbosity. _debug() prints a message to sys.stderr only when
# DEBUGLEVEL is greater than or equal to the message's level, so 0
# silences the default level-1 messages.
DEBUGLEVEL = 0

# Amazon associate for links: "keywords":amazon
# If you don't have one, please consider leaving mine here as
# a small compensation for writing PyTextile. It's commented
# out by default.
# When set, Textiler.__init__ appends this id to the 'isbn' and
# 'amazon' search URLs.
#amazon_associate_id = 'bomtempo-21'
amazon_associate_id = None

# Host name used to build the 'isbn' and 'amazon' search URLs.
#AMAZON = 'www.amazon.co.uk'
AMAZON = 'www.amazon.com'
 159
 160 import re
 161 import sys
 162 import os
 163 import sgmllib
 164 import unicodedata
 165
 166
 167 def _in_tag(text, tag):
 168     """Extracts text from inside a tag.
 169
 170     This function extracts the text from inside a given tag.
 171     It's useful to get the text between <body></body> or
 172     <pre></pre> when using the validators or the colorizer.
 173     """
 174     if text.count('<%s' % tag):
 175         text = text.split('<%s' % tag, 1)[1]
 176         if text.count('>'):
 177             text = text.split('>', 1)[1]
 178     if text.count('</%s' % tag):
 179         text = text.split('</%s' % tag, 1)[0]
 180
 181     text = text.strip().replace('\r\n', '\n')
 182
 183     return text
 184
 185
 186 # If you want PyTextile to automatically colorize
 187 # your Python code, you need the htmlizer module
 188 # from Twisted. (You can just grab this file from
 189 # the distribution, it has no other dependencies.)
# Optional dependency: _color() is only defined when both htmlizer and
# StringIO can be imported. Otherwise the name `htmlizer` is bound to
# None and _color is left undefined.
try:
    #from twisted.python import htmlizer
    import htmlizer
    from StringIO import StringIO

    def _color(code):
        """Colorize Python code.

        This function wraps a text string in a StringIO,
        passes it to the htmlizer filter from Twisted and
        returns the markup found inside the generated <pre>
        element.
        """
        # Fix line continuations: a space + backslash at the end of a
        # line gets its backslash doubled before being fed to htmlizer.
        code = preg_replace(r' \\\n', ' \\\\\n', code)

        code_in  = StringIO(code)
        code_out = StringIO()

        htmlizer.filter(code_in, code_out)

        # Keep only what is inside <pre></pre> in the htmlizer output.
        code = _in_tag(code_out.getvalue(), 'pre')

        # Fix newlines: move the line break out of the newline <span>.
        code = code.replace('<span class="py-src-newline">\n</span>', '<span class="py-src-newline"></span>\n')

        return code

except ImportError:
    # Colorizing is unavailable; htmlizer is set to None as a marker.
    htmlizer = None
 220
 221
 222 # PyTextile can optionally validate the generated
 223 # XHTML code using either mxTidy or uTidyLib.
# Pick a validator backend at import time: mxTidy is tried first, then
# uTidyLib. `_tidy` ends up bound to the matching wrapper, or to None
# when neither library can be imported.
try:
    # This is mxTidy.
    from mx.Tidy import Tidy

    def _tidy1(text):
        """mxTidy's XHTML validator.

        This function is a wrapper to mxTidy's validator. It returns
        the content of the <body> element of the tidied document; the
        error/warning counts and messages reported by Tidy are
        discarded.
        """
        nerrors, nwarnings, text, errortext = Tidy.tidy(text, output_xhtml=1, numeric_entities=1, wrap=0)
        return _in_tag(text, 'body')

    _tidy = _tidy1

except ImportError:
    try:
        # This is uTidyLib.
        import tidy

        def _tidy2(text):
            """uTidyLib's XHTML validator.

            This function is a wrapper to uTidyLib's validator. It
            returns the content of the <body> element of the tidied
            document.
            """
            text = tidy.parseString(text,  output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0)
            # parseString's result is converted with str() before the
            # body is extracted.
            return _in_tag(str(text), 'body')

        _tidy = _tidy2

    except ImportError:
        # No validator available.
        _tidy = None
 255
 256
 257 # This is good for debugging.
 258 def _debug(s, level=1):
 259     """Outputs debug information to sys.stderr.
 260
 261     This function outputs debug information if DEBUGLEVEL is
 262     higher than a given treshold.
 263     """
 264     if DEBUGLEVEL >= level: print >> sys.stderr, s
 265
 266
 267 #############################
 268 # Useful regular expressions.
# Building blocks for the attribute patterns in `res` below, where they
# are interpolated with `% parameters`.
# The fragments contain layout whitespace and inline '#' comments, so
# they are presumably meant to be compiled with re.VERBOSE -- the
# compile calls are outside this part of the file; confirm there.
# Each fragment ends with a negative look-ahead so that the attribute
# can appear only once in a run of non-whitespace characters
# ('padding' is the exception and has no such guard).
parameters = {
    # Horizontal alignment.
    'align':    r'''(?:(?:<>|[<>=])                 # Either '<>', '<', '>' or '='
                    (?![^\s]*(?:<>|[<>=])))         # Look-ahead to ensure it happens once
                 ''',

    # Horizontal padding.
    'padding':  r'''(?:[\(\)]+)                     # Any number of '(' and/or ')'
                 ''',

    # Class and/or id.
    # NOTE: unlike the other fragments, the outer parenthesis here is a
    # capturing group.
    'classid':  r'''(                               #
                        (?:\(\#[\w]+\))             # (#id)
                        |                           #
                        (?:\((?:[\w]+(?:\s[\w]+)*)  #
                            (?:\#[\w]+)?\))         # (class1 class2 ... classn#id) or (class1 class2 ... classn)
                    )                               #
                    (?![^\s]*(?:\([\w#]+\)))        # must happen once
                 ''',

    # Language.
    'lang':     r'''(?:\[[\w-]+\])                  # [lang]
                    (?![^\s]*(?:\[.*?\]))           # must happen once
                 ''',

    # Style.
    'style':    r'''(?:{[^\}]+})                    # {style}
                    (?![^\s]*(?:{.*?}))             # must happen once
                 ''',
}
 299
# Named regular-expression fragments shared by the formatter; Textiler
# exposes this dict as `self.res` and interpolates entries into its
# block signatures with `% self.res`.
# Entries built with `% parameters` embed the fragments defined in the
# `parameters` dict. Like those, the patterns contain layout whitespace
# and inline '#' comments, so they presumably require re.VERBOSE when
# compiled -- confirm at the call sites.
res = {
    # Punctuation.
    'punct': r'''[\!"#\$%&'()\*\+,\-\./:;<=>\?@\[\\\]\^_`{\|}\~]''',

    # URL regular expression.
    # Three alternatives for the leading part (protocol://host,
    # e-mail address, bare domain with a known TLD), then an optional
    # port and an optional path.
    'url':   r'''(?=[a-zA-Z0-9./#])                         # Must start correctly
                 (?:                                        # Match the leading part (proto://hostname, or just hostname)
                     (?:ftp|https?|telnet|nntp)             #     protocol
                     ://                                    #     ://
                     (?:                                    #     Optional 'username:password@'
                         \w+                                #         username
                         (?::\w+)?                          #         optional :password
                         @                                  #         @
                     )?                                     #
                     [-\w]+(?:\.\w[-\w]*)+                  #     hostname (sub.example.com)
                 |                                          #
                     (?:mailto:)?                           #     Optional mailto:
                     [-\+\w]+                               #     username
                     \@                                     #     at
                     [-\w]+(?:\.\w[-\w]*)+                  #     hostname
                 |                                          #
                     (?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+ #     domain without protocol
                     (?:com\b                               #     TLD
                     |  edu\b                               #
                     |  biz\b                               #
                     |  gov\b                               #
                     |  in(?:t|fo)\b                        #     .int or .info
                     |  mil\b                               #
                     |  net\b                               #
                     |  org\b                               #
                     |  museum\b                            #
                     |  aero\b                              #
                     |  coop\b                              #
                     |  name\b                              #
                     |  pro\b                               #
                     |  [a-z][a-z]\b                        #     two-letter country codes
                     )                                      #
                 )?                                         #
                 (?::\d+)?                                  # Optional port number
                 (?:                                        # Rest of the URL, optional
                     /?                                     #     Start with '/'
                     [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]*      #     Can't start with these
                     (?:                                    #
                         [.!,?;:]+                          #     One or more of these
                         [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]+  #     Can't finish with these
                         #'"                                #     # or ' or "
                     )*                                     #
                 )?                                         #
              ''',


    # Block attributes.
    'battr': r'''(?P<parameters>                            #
                     (?: %(align)s                          # alignment
                     |   %(classid)s                        # class and/or id
                     |   %(padding)s                        # padding tags
                     |   %(lang)s                           # [lang]
                     |   %(style)s                          # {style}
                     )+                                     #
                 )?                                         #
              ''' % parameters,

    # (Un)ordered list attributes.
    # NOTE: the class/id alternative is written inline here rather than
    # via %(classid)s, so it lacks that fragment's "must happen once"
    # look-ahead.
    'olattr': r'''(?P<olparameters>                         #
                      (?: %(align)s                         # alignment
                      | ((?:\(\#[\w]+\))                    # (#id)
                          |                                 #
                          (?:\((?:[\w]+(?:\s[\w]+)*)        #
                            (?:\#[\w]+)?\))                 # (class1 class2 ... classn#id) or (class1 class2 ... classn)
                      )                                     #
                      |   %(padding)s                       # padding tags
                      |   %(lang)s                          # [lang]
                      |   %(style)s                         # {style}
                      )+                                    #
                  )?                                        #
              ''' % parameters,

    # List item attributes.
    'liattr': r'''(?P<liparameters>                         #
                      (?: %(align)s                         # alignment
                      |   %(classid)s                       # class and/or id
                      |   %(padding)s                       # padding tags
                      |   %(lang)s                          # [lang]
                      |   %(style)s                         # {style}
                      )+                                    #
                  )?                                        #
              ''' % parameters,

    # Qtag attributes.
    'qattr': r'''(?P<parameters>                            #
                     (?: %(classid)s                        # class and/or id
                     |   %(lang)s                           # [lang]
                     |   %(style)s                          # {style}
                     )+                                     #
                 )?                                         #
              ''' % parameters,

    # Link attributes.
    'lattr': r'''(?P<parameters>                            # Links attributes
                     (?: %(align)s                          # alignment
                     |   %(classid)s                        # class and/or id
                     |   %(lang)s                           # [lang]
                     |   %(style)s                          # {style}
                     )+                                     #
                 )?                                         #
              ''' % parameters,

    # Image attributes.
    'iattr': r'''(?P<parameters>                            #
                     (?:                                    #
                     (?: [<>]+                              # horizontal alignment tags
                         (?![^\s]*(?:[<>])))                #     (must happen once)
                     |                                      #
                     (?: [\-\^~]+                           # vertical alignment tags
                         (?![^\s]*(?:[\-\^~])))             #     (must happen once)
                     | %(classid)s                          # class and/or id
                     | %(padding)s                          # padding tags
                     | %(style)s                            # {style}
                     )+                                     #
                 )?                                         #
              ''' % parameters,

    # Resize attributes.
    # NOTE(review): the example comments embedded in the two "w/h"
    # alternatives are swapped relative to the patterns -- the first
    # alternative matches "<n>w <n>h", the second "<n>h <n>w". The
    # patterns themselves are left untouched.
    'resize': r'''(?:                                       #
                      (?:([\d]+%?)x([\d]+%?))               # 20x10
                      |                                     #
                      (?:                                   # or
                          (?:([\d]+)%?w\s([\d]+)%?h)        #     10h 20w
                          |                                 #     or
                          (?:([\d]+)%?h\s([\d]+)%?w)        #     20w 10h
                      )                                     #
                  )?                                        #
               ''',

    # Table attributes.
    'tattr': r'''(?P<parameters>                            #
                     (?:                                    #
                     (?: [\^~]                              # vertical alignment
                         (?![^\s]*(?:[\^~])))               #     (must happen once)
                     |   %(align)s                          # alignment
                     |   %(lang)s                           # [lang]
                     |   %(style)s                          # {style}
                     |   %(classid)s                        # class and/or id
                     |   %(padding)s                        # padding
                     |   _                                  # is this a header row/cell?
                     |   \\\d+                              # colspan
                     |   /\d+                               # rowspan
                     )+                                     #
                 )?                                         #
              ''' % parameters,
}
 451
 452
 453 def preg_replace(pattern, replacement, text):
 454     """Alternative re.sub that handles empty groups.
 455
 456     This acts like re.sub, except it replaces empty groups with ''
 457     instead of raising an exception.
 458     """
 459
 460     def replacement_func(matchobj):
 461         counter = 1
 462         rc = replacement
 463         _debug(matchobj.groups())
 464         for matchitem in matchobj.groups():
 465             if not matchitem:
 466                 matchitem = ''
 467
 468             rc = rc.replace(r'\%s' % counter, matchitem)
 469             counter += 1
 470
 471         return rc
 472
 473     p = re.compile(pattern)
 474     _debug(pattern)
 475
 476     return p.sub(replacement_func, text)
 477
 478
 479 def html_replace(pattern, replacement, text):
 480     """Replacement outside HTML tags.
 481
 482     Does a preg_replace only outside HTML tags.
 483     """
 484     # If there is no html, do a simple search and replace.
 485     if not re.search(r'''<.*>''', text):
 486         return preg_replace(pattern, replacement, text)
 487
 488     else:
 489         lines = []
 490         # Else split the text into an array at <>.
 491         for line in re.split('(<.*?>)', text):
 492             if not re.match('<.*?>', line):
 493                 line = preg_replace(pattern, replacement, line)
 494
 495             lines.append(line)
 496
 497         return ''.join(lines)
 498
 499
 500 # PyTextile can optionally sanitize the generated XHTML,
 501 # which is good for weblog comments. This code is from
 502 # Mark Pilgrim's feedparser.
 503 class _BaseHTMLProcessor(sgmllib.SGMLParser):
 504     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
 505       'img', 'input', 'isindex', 'link', 'meta', 'param']
 506
 507     def __init__(self):
 508         sgmllib.SGMLParser.__init__(self)
 509
 510     def reset(self):
 511         self.pieces = []
 512         sgmllib.SGMLParser.reset(self)
 513
 514     def normalize_attrs(self, attrs):
 515         # utility method to be called by descendants
 516         attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
 517         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
 518         return attrs
 519
 520     def unknown_starttag(self, tag, attrs):
 521         # called for each start tag
 522         # attrs is a list of (attr, value) tuples
 523         # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
 524         strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
 525         if tag in self.elements_no_end_tag:
 526             self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
 527         else:
 528             self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
 529
 530     def unknown_endtag(self, tag):
 531         # called for each end tag, e.g. for </pre>, tag will be "pre"
 532         # Reconstruct the original end tag.
 533         if tag not in self.elements_no_end_tag:
 534             self.pieces.append("</%(tag)s>" % locals())
 535
 536     def handle_charref(self, ref):
 537         # called for each character reference, e.g. for "&#160;", ref will be "160"
 538         # Reconstruct the original character reference.
 539         self.pieces.append("&#%(ref)s;" % locals())
 540
 541     def handle_entityref(self, ref):
 542         # called for each entity reference, e.g. for "&copy;", ref will be "copy"
 543         # Reconstruct the original entity reference.
 544         self.pieces.append("&%(ref)s;" % locals())
 545
 546     def handle_data(self, text):
 547         # called for each block of plain text, i.e. outside of any tag and
 548         # not containing any character or entity references
 549         # Store the original text verbatim.
 550         self.pieces.append(text)
 551
 552     def handle_comment(self, text):
 553         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
 554         # Reconstruct the original comment.
 555         self.pieces.append("<!--%(text)s-->" % locals())
 556
 557     def handle_pi(self, text):
 558         # called for each processing instruction, e.g. <?instruction>
 559         # Reconstruct original processing instruction.
 560         self.pieces.append("<?%(text)s>" % locals())
 561
 562     def handle_decl(self, text):
 563         # called for the DOCTYPE, if present, e.g.
 564         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
 565         #     "http://www.w3.org/TR/html4/loose.dtd">
 566         # Reconstruct original DOCTYPE
 567         self.pieces.append("<!%(text)s>" % locals())
 568
 569     def output(self):
 570         """Return processed HTML as a single string"""
 571         return "".join(self.pieces)
 572
 573
 574 class _HTMLSanitizer(_BaseHTMLProcessor):
 575     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
 576       'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
 577       'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
 578       'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
 579       'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
 580       'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
 581       'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
 582       'thead', 'tr', 'tt', 'u', 'ul', 'var']
 583
 584     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
 585       'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
 586       'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
 587       'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
 588       'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
 589       'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
 590       'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
 591       'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
 592       'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
 593       'usemap', 'valign', 'value', 'vspace', 'width']
 594
 595     unacceptable_elements_with_end_tag = ['script', 'applet']
 596
 597     # This if for MathML.
 598     mathml_elements = ['math', 'mi', 'mn', 'mo', 'mrow', 'msup']
 599     mathml_attributes = ['mode', 'xmlns']
 600
 601     acceptable_elements = acceptable_elements + mathml_elements
 602     acceptable_attributes = acceptable_attributes + mathml_attributes
 603
 604     def reset(self):
 605         _BaseHTMLProcessor.reset(self)
 606         self.unacceptablestack = 0
 607
 608     def unknown_starttag(self, tag, attrs):
 609         if not tag in self.acceptable_elements:
 610             if tag in self.unacceptable_elements_with_end_tag:
 611                 self.unacceptablestack += 1
 612             return
 613         attrs = self.normalize_attrs(attrs)
 614         attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
 615         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
 616
 617     def unknown_endtag(self, tag):
 618         if not tag in self.acceptable_elements:
 619             if tag in self.unacceptable_elements_with_end_tag:
 620                 self.unacceptablestack -= 1
 621             return
 622         _BaseHTMLProcessor.unknown_endtag(self, tag)
 623
 624     def handle_pi(self, text):
 625         pass
 626
 627     def handle_decl(self, text):
 628         pass
 629
 630     def handle_data(self, text):
 631         if not self.unacceptablestack:
 632             _BaseHTMLProcessor.handle_data(self, text)
 633
 634
 635 class Textiler:
 636     """Textile formatter.
 637
 638     This is the base class for the PyTextile text processor.
 639     """
    def __init__(self, text=''):
        """Store the text and set up the tables used while formatting.

        No processing happens here; the text is only stored. This
        method records the shared regular expression fragments, the
        URL templates used by the "smart searches" and the ordered
        list of block signatures.

        text -- the Textile source to be formatted.
        """
        self.text = text

        # Basic regular expressions (named fragments interpolated
        # into the signatures below).
        self.res = res

        # Smart searches: name -> URL template with a single %s
        # placeholder.
        self.searches = {}
        self.searches['imdb']   = 'http://www.imdb.com/Find?for=%s'
        self.searches['google'] = 'http://www.google.com/search?q=%s'
        self.searches['python'] = 'http://www.python.org/doc/current/lib/module-%s.html'
        # The Amazon templates get the associate id appended when one
        # is configured.
        if amazon_associate_id:
            self.searches['isbn']   = ''.join(['http://', AMAZON, '/exec/obidos/ASIN/%s/', amazon_associate_id])
            self.searches['amazon'] = ''.join(['http://', AMAZON, '/exec/obidos/external-search?mode=blended&keyword=%s&tag=', amazon_associate_id])
        else:
            self.searches['isbn']   = ''.join(['http://', AMAZON, '/exec/obidos/ASIN/%s'])
            self.searches['amazon'] = ''.join(['http://', AMAZON, '/exec/obidos/external-search?mode=blended&keyword=%s'])

        # These are the blocks we know. Each entry is a pair
        # (verbose regular expression, handler method). split_text()
        # compiles them with re.VERBOSE | re.DOTALL and tries them in
        # this order, stopping at the first match, so the order is
        # significant. The named groups become the keyword arguments
        # of the handler, except 'dot' and 'extend', which
        # split_text() consumes itself.
        self.signatures = [
                           # Paragraph.
                           (r'''^p                       # Paragraph signature
                                %(battr)s                # Paragraph attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended paragraph denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.paragraph),

                           # Pre-formatted text.
                           (r'''^pre                     # Pre signature
                                %(battr)s                # Pre attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended pre denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.pre),

                           # Block code.
                           (r'''^bc                      # Blockcode signature
                                %(battr)s                # Blockcode attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended blockcode denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.bc),

                           # Blockquote, with an optional ':cite' right
                           # after the dot(s).
                           (r'''^bq                      # Blockquote signature
                                %(battr)s                # Blockquote attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended blockquote denoted by a second dot
                                (:(?P<cite>              # Optional cite attribute
                                (                        #
                                    %(url)s              #     URL
                                |   "[\w]+(?:\s[\w]+)*"  #     "Name inside quotes"
                                ))                       #
                                )?                       #
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.blockquote),

                           # Header. The captured level is a single
                           # digit; split_text() adds head_offset to it.
                           (r'''^h                       # Header signature
                                (?P<header>\d)           # Header number
                                %(battr)s                # Header attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended header denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.header),

                           # Footnote. Unlike the other signatures this
                           # pattern takes no block attributes and is
                           # not interpolated with self.res.
                           (r'''^fn                      # Footnote signature
                                (?P<footnote>[\d]+)      # Footnote number
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended footnote denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''', self.footnote),

                           # Definition list.
                           (r'''^dl                      # Definition list signature
                                %(battr)s                # Definition list attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended definition list denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.dl),

                           # Ordered list (attributes to first <li>).
                           # The dot is optional here.
                           (r'''^%(olattr)s              # Ordered list attributes
                                \#                       # Ordered list signature
                                %(liattr)s               # List item attributes
                                (?P<dot>\.)?             # .
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.ol),

                           # Unordered list (attributes to first <li>).
                           # The dot is optional here.
                           (r'''^%(olattr)s              # Unrdered list attributes
                                \*                       # Unordered list signature
                                %(liattr)s               # Unordered list attributes
                                (?P<dot>\.)?             # .
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.ul),

                           # Escaped text (block starting with '=' or '==').
                           (r'''^==?(?P<text>.*?)(==)?$  # Escaped text
                             ''', self.escape),

                           # Raw XHTML: any block starting with '<' goes
                           # to the same handler as escaped text.
                           (r'''^(?P<text><.*)$          # XHTML tag
                             ''', self.escape),

                           # itex code.
                           (r'''^(?P<text>               # itex code
                                \\\[                     # starts with \[
                                .*?                      # complicated mathematical equations go here
                                \\\])                    # ends with \]
                             ''', self.itex),

                           # Tables.
                           (r'''^table                   # Table signature
                                %(tattr)s                # Table attributes
                                (?P<dot>\.)              # .
                                (?P<extend>\.)?          # Extended blockcode denoted by a second dot
                                \s                       # whitespace
                                (?P<text>.*)             # text
                             ''' % self.res, self.table),

                           # Simple tables (block starting with '|',
                           # no signature).
                           (r'''^(?P<text>
                                \|
                                .*)
                             ''', self.table),

                           # About: the block must be exactly this
                           # sentence.
                           (r'''^(?P<text>tell\sme\sabout\stextile\.)$''', self.about),
                          ]
 785
 786
 787     def preprocess(self):
 788         """Pre-processing of the text.
 789
 790         Remove whitespace, fix carriage returns.
 791         """
 792         # Remove whitespace.
 793         self.text = self.text.strip()
 794
 795         # Zap carriage returns.
 796         self.text = self.text.replace("\r\n", "\n")
 797         self.text = self.text.replace("\r", "\n")
 798
 799         # Minor sanitizing.
 800         self.text = self.sanitize(self.text)
 801
 802
 803     def grab_links(self):
 804         """Grab link lookups.
 805
 806         Check the text for link lookups, store them in a
 807         dictionary, and clean them up.
 808         """
 809         # Grab links like this: '[id]example.com'
 810         links = {}
 811         p = re.compile(r'''(?:^|\n)\[([\w]+?)\](%(url)s)(?:$|\n)''' % self.res, re.VERBOSE)
 812         for key, link in p.findall(self.text):
 813             links[key] = link
 814
 815         # And clear them from the text.
 816         self.text = p.sub('', self.text)
 817
 818         return links
 819
 820
 821     def process(self, head_offset=HEAD_OFFSET, validate=VALIDATE, sanitize=SANITIZE, output=OUTPUT, encoding=ENCODING):
 822         """Process the text.
 823
 824         Here we actually process the text, splitting the text in
 825         blocks and applying the corresponding function to each
 826         one of them.
 827         """
 828         # Basic global changes.
 829         self.preprocess()
 830
 831         # Grab lookup links and clean them from the text.
 832         self._links = self.grab_links()
 833
 834         # Offset for the headers.
 835         self.head_offset = head_offset
 836
 837         # Process each block.
 838         self.blocks = self.split_text()
 839
 840         text = []
 841         for [function, captures] in self.blocks:
 842             text.append(function(**captures))
 843
 844         text = '\n\n'.join(text)
 845
 846         # Add titles to footnotes.
 847         text = self.footnotes(text)
 848
 849         # Convert to desired output.
 850         text = unicode(text, encoding)
 851         text = text.encode(output, 'xmlcharrefreplace')
 852
 853         # Sanitize?
 854         if sanitize:
 855             p = _HTMLSanitizer()
 856             p.feed(text)
 857             text = p.output()
 858
 859         # Validate output.
 860         if _tidy and validate:
 861             text = _tidy(text)
 862
 863         return text
 864
 865
 866     def sanitize(self, text):
 867         """Fix single tags.
 868
 869         Fix tags like <img />, <br /> and <hr />.
 870
 871         ---
 872         h1. Sanitizing
 873
 874         Textile can help you generate valid XHTML(eXtensible HyperText Markup Language).
 875         It will fix any single tags that are not properly closed, like
 876         @<img />@, @<br />@ and @<hr />@.
 877
 878         If you have "mx.Tidy":http://www.egenix.com/files/python/mxTidy.html
 879         and/or "&micro;TidyLib":http://utidylib.sourceforge.net/ installed,
 880         it also can optionally validade the generated code with these wrappers
 881         to ensure 100% valid XHTML(eXtensible HyperText Markup Language).
 882         """
 883         # Fix single tags like <img /> and <br />.
 884         text = preg_replace(r'''<(img|br|hr)(.*?)(?:\s*/?\s*)?>''', r'''<\1\2 />''', text)
 885
 886         # Remove ampersands.
 887         text = preg_replace(r'''&(?!#?[xX]?(?:[0-9a-fA-F]+|\w{1,8});)''', r'''&amp;''', text)
 888
 889         return text
 890
 891
 892     def split_text(self):
 893         """Process the blocks from the text.
 894
 895         Split the blocks according to the signatures, join extended
 896         blocks and associate each one of them with a function to
 897         process them.
 898
 899         ---
 900         h1. Blocks
 901
 902         Textile process your text by dividing it in blocks. Each block
 903         is identified by a signature and separated from other blocks by
 904         an empty line.
 905
 906         All signatures should end with a period followed by a space. A
 907         header @<h1></h1>@ can be done this way:
 908
 909         pre. h1. This is a header 1.
 910
 911         Blocks may continue for multiple paragraphs of text. If you want
 912         a block signature to stay "active", use two periods after the
 913         signature instead of one. For example:
 914
 915         pre.. bq.. This is paragraph one of a block quote.
 916
 917         This is paragraph two of a block quote.
 918
 919         =p. Now we're back to a regular paragraph.
 920
 921         p. Becomes:
 922
 923         pre.. <blockquote>
 924         <p>This is paragraph one of a block quote.</p>
 925
 926         <p>This is paragraph two of a block quote.</p>
 927         </blockquote>
 928
 929         <p>Now we&#8217;re back to a regular paragraph.</p>
 930
 931         p. The blocks can be customised by adding parameters between the
 932         signature and the period. These include:
 933
 934         dl. {style rule}:A CSS(Cascading Style Sheets) style rule.
 935         [ll]:A language identifier (for a "lang" attribute).
 936         (class) or (#id) or (class#id):For CSS(Cascading Style Sheets) class and id attributes.
 937         &gt;, &lt;, =, &lt;&gt;:Modifier characters for alignment. Right-justification, left-justification, centered, and full-justification. The paragraph will also receive the class names "right", "left", "center" and "justify", respectively.
 938         ( (one or more):Adds padding on the left. 1em per "(" character is applied. When combined with the align-left or align-right modifier, it makes the block float.
 939         ) (one or more):Adds padding on the right. 1em per ")" character is applied. When combined with the align-left or align-right modifier, it makes the block float.
 940
 941         Here's an overloaded example:
 942
 943         pre. p(())>(class#id)[en]{color:red}. A simple paragraph.
 944
 945         Becomes:
 946
 947         pre. <p lang="en" style="color:red;padding-left:2em;padding-right:2em;float:right;" class="class right" id="id">A simple paragraph.</p>
 948         """
 949         # Clear signature.
 950         clear_sig = r'''^clear(?P<alignment>[<>])?\.$'''
 951         clear = None
 952
 953         extending  = 0
 954
 955         # We capture the \n's because they are important inside "pre..".
 956         blocks = re.split(r'''((\n\s*){2,})''', self.text)
 957         output = []
 958         for block in blocks:
 959             # Check for the clear signature.
 960             m = re.match(clear_sig, block)
 961             if m:
 962                 clear = m.group('alignment')
 963                 if clear:
 964                     clear = {'<': 'clear:left;', '>': 'clear:right;'}[clear]
 965                 else:
 966                     clear = 'clear:both;'
 967
 968             else:
 969                 # Check each of the code signatures.
 970                 for regexp, function in self.signatures:
 971                     p = re.compile(regexp, (re.VERBOSE | re.DOTALL))
 972                     m = p.match(block)
 973                     if m:
 974                         # Put everything in a dictionary.
 975                         captures = m.groupdict()
 976
 977                         # If we are extending a block, we require a dot to
 978                         # break it, so we can start lines with '#' inside
 979                         # an extended <pre> without matching an ordered list.
 980                         if extending and not captures.get('dot', None):
 981                             output[-1][1]['text'] += block
 982                             break
 983                         elif captures.has_key('dot'):
 984                             del captures['dot']
 985
 986                         # If a signature matches, we are not extending a block.
 987                         extending = 0
 988
 989                         # Check if we should extend this block.
 990                         if captures.has_key('extend'):
 991                             extending = captures['extend']
 992                             del captures['extend']
 993
 994                         # Apply head_offset.
 995                         if captures.has_key('header'):
 996                             captures['header'] = int(captures['header']) + self.head_offset
 997
 998                         # Apply clear.
 999                         if clear:
1000                             captures['clear'] = clear
1001                             clear = None
1002
1003                         # Save the block to be processed later.
1004                         output.append([function, captures])
1005
1006                         break
1007
1008                 else:
1009                     if extending:
1010                         # Append the text to the last block.
1011                         output[-1][1]['text'] += block
1012                     elif block.strip():
1013                         output.append([self.paragraph, {'text': block}])
1014
1015         return output
1016
1017
1018     def parse_params(self, parameters, clear=None, align_type='block'):
1019         """Parse the parameters from a block signature.
1020
1021         This function parses the parameters from a block signature,
1022         splitting the information about class, id, language and
1023         style. The positioning (indentation and alignment) is parsed
1024         and stored in the style.
1025
1026         A paragraph like:
1027
1028             p>(class#id){color:red}[en]. Paragraph.
1029
1030         or:
1031
1032             p{color:red}[en](class#id)>. Paragraph.
1033
1034         will have its parameters parsed to:
1035
1036             output = {'lang' : 'en',
1037                       'class': 'class',
1038                       'id'   : 'id',
1039                       'style': 'color:red;text-align:right;'}
1040
1041         Note that order is not important.
1042         """
1043         if not parameters:
1044             if clear:
1045                 return {'style': clear}
1046             else:
1047                 return {}
1048
1049         output = {}
1050
1051         # Match class from (class) or (class#id).
1052         m = re.search(r'''\((?P<class>[\w]+(\s[\w]+)*)(\#[\w]+)?\)''', parameters)
1053         if m: output['class'] = m.group('class')
1054
1055         # Match id from (#id) or (class#id).
1056         m = re.search(r'''\([\w]*(\s[\w]+)*\#(?P<id>[\w]+)\)''', parameters)
1057         if m: output['id'] = m.group('id')
1058
1059         # Match [language].
1060         m = re.search(r'''\[(?P<lang>[\w-]+)\]''', parameters)
1061         if m: output['lang'] = m.group('lang')
1062
1063         # Match {style}.
1064         m = re.search(r'''{(?P<style>[^\}]+)}''', parameters)
1065         if m:
1066             output['style'] = m.group('style').replace('\n', '')
1067
1068             # If necessary, apppend a semi-comma to the style.
1069             if not output['style'].endswith(';'):
1070                 output['style'] += ';'
1071
1072         # Clear the block?
1073         if clear:
1074             output['style'] = output.get('style', '') + clear
1075
1076         # Remove classes, ids, langs and styles. This makes the
1077         # regular expression for the positioning much easier.
1078         parameters = preg_replace(r'''\([\#\w\s]+\)''', '', parameters)
1079         parameters = preg_replace(r'''\[[\w-]+\]''', '', parameters)
1080         parameters = preg_replace(r'''{[\w:;#%-]+}''', '', parameters)
1081
1082         style = []
1083
1084         # Count the left indentation.
1085         l_indent = parameters.count('(')
1086         if l_indent: style.append('padding-left:%dem;' % l_indent)
1087
1088         # Count the right indentation.
1089         r_indent = parameters.count(')')
1090         if r_indent: style.append('padding-right:%dem;' % r_indent)
1091
1092         # Add alignment.
1093         if align_type == 'image':
1094             align = [('<', 'float:left;', ' left'),
1095                      ('>', 'float:right;', ' right')]
1096
1097             valign = [('^', 'vertical-align:text-top;', ' top'),
1098                       ('-', 'vertical-align:middle;', ' middle'),
1099                       ('~', 'vertical-align:text-bottom;', ' bottom')]
1100
1101             # Images can have both a vertical and a horizontal alignment.
1102             for alignments in [align, valign]:
1103                 for _align, _style, _class in alignments:
1104                     if parameters.count(_align):
1105                         style.append(_style)
1106
1107                         # Append a class name related to the alignment.
1108                         output['class'] = output.get('class', '') + _class
1109                         break
1110
1111         elif align_type == 'table':
1112             align = [('<', 'left'),
1113                      ('>', 'right'),
1114                      ('=', 'center'),
1115                      ('<>', 'justify')]
1116
1117             valign = [('^', 'top'),
1118                       ('~', 'bottom')]
1119
1120             # Horizontal alignment.
1121             for _align, _style, in align:
1122                 if parameters.count(_align):
1123                     output['align'] = _style
1124
1125             # Vertical alignment.
1126             for _align, _style, in valign:
1127                 if parameters.count(_align):
1128                     output['valign'] = _style
1129
1130             # Colspan and rowspan.
1131             m = re.search(r'''\\(\d+)''', parameters)
1132             if m:
1133                 #output['colspan'] = m.groups()
1134                 output['colspan'] = int(m.groups()[0])
1135
1136             m = re.search(r'''/(\d+)''', parameters)
1137             if m:
1138                 output['rowspan'] = int(m.groups()[0])
1139
1140         else:
1141             if l_indent or r_indent:
1142                 alignments = [('<>', 'text-align:justify;', ' justify'),
1143                               ('=', 'text-align:center;', ' center'),
1144                               ('<', 'float:left;', ' left'),
1145                               ('>', 'float:right;', ' right')]
1146             else:
1147                 alignments = [('<>', 'text-align:justify;', ' justify'),
1148                               ('=', 'text-align:center;', ' center'),
1149                               ('<', 'text-align:left;', ' left'),
1150                               ('>', 'text-align:right;', ' right')]
1151
1152             for _align, _style, _class in alignments:
1153                 if parameters.count(_align):
1154                     style.append(_style)
1155
1156                     # Append a class name related to the alignment.
1157                     output['class'] = output.get('class', '') + _class
1158                     break
1159
1160         # Join all the styles.
1161         output['style'] = output.get('style', '') + ''.join(style)
1162
1163         # Remove excess whitespace.
1164         if output.has_key('class'):
1165             output['class'] = output['class'].strip()
1166
1167         return output
1168
1169
1170     def build_open_tag(self, tag, attributes={}, single=0):
1171         """Build the open tag with specified attributes.
1172
1173         This function is used by all block builders to
1174         generate the opening tags with the attributes of
1175         the block.
1176         """
1177         # Open tag.
1178         open_tag = ['<%s' % tag]
1179         for k,v in attributes.items():
1180             # The ALT attribute can be empty.
1181             if k == 'alt' or v: open_tag.append(' %s="%s"' % (k, v))
1182
1183         if single:
1184             open_tag.append(' /')
1185
1186         # Close tag.
1187         open_tag.append('>')
1188
1189         return ''.join(open_tag)
1190
1191
1192     def paragraph(self, text, parameters=None, attributes=None, clear=None):
1193         """Process a paragraph.
1194
1195         This function processes the paragraphs, enclosing the text in a
1196         <p> tag and breaking lines with <br />. Paragraphs are formatted
1197         with all the inline rules.
1198
1199         ---
1200         h1. Paragraph
1201
1202         This is how you write a paragraph:
1203
1204         pre. p. This is a paragraph, although a short one.
1205
1206         Since the paragraph is the default block, you can safely omit its
1207         signature ([@p@]). Simply write:
1208
1209         pre. This is a paragraph, although a short one.
1210
1211         Text in a paragraph block is wrapped in @<p></p>@ tags, and
1212         newlines receive a <br /> tag. In both cases Textile will process
1213         the text to:
1214
1215         pre. <p>This is a paragraph, although a short one.</p>
1216
1217         Text in a paragraph block is processed with all the inline rules.
1218         """
1219         # Split the lines.
1220         lines = re.split('\n{2,}', text)
1221
1222         # Get the attributes.
1223         attributes = attributes or self.parse_params(parameters, clear)
1224
1225         output = []
1226         for line in lines:
1227             if line:
1228                 # Clean the line.
1229                 line = line.strip()
1230
1231                 # Build the tag.
1232                 open_tag = self.build_open_tag('p', attributes)
1233                 close_tag = '</p>'
1234
1235                 # Pop the id because it must be unique.
1236                 if attributes.has_key('id'): del attributes['id']
1237
1238                 # Break lines.
1239                 line = preg_replace(r'(<br />|\n)+', '<br />\n', line)
1240
1241                 # Remove <br /> from inside broken HTML tags.
1242                 line = preg_replace(r'(<[^>]*)<br />\n(.*?>)', r'\1 \2', line)
1243
1244                 # Inline formatting.
1245                 line = self.inline(line)
1246
1247                 output.append(open_tag + line + close_tag)
1248
1249         return '\n\n'.join(output)
1250
1251
1252     def pre(self, text, parameters=None, clear=None):
1253         """Process pre-formatted text.
1254
1255         This function processes pre-formatted text into a <pre> tag.
1256         No HTML is added for the lines, but @<@ and @>@ are translated into
1257         HTML entities.
1258
1259         ---
1260         h1. Pre-formatted text
1261
1262         Pre-formatted text can be specified using the @pre@ signature.
1263         Inside a "pre" block, whitespace is preserved and @<@ and @>@ are
1264         translated into HTML(HyperText Markup Language) entities
1265         automatically.
1266
1267         Text in a "pre" block is _not processed_ with any inline rule.
1268
1269         Here's a simple example:
1270
1271         pre. pre. This text is pre-formatted.
1272         Nothing interesting happens inside here...
1273
1274         Will become:
1275
1276         pre. <pre>
1277         This text is pre-formatted.
1278         Nothing interesting happens inside here...
1279         </pre>
1280         """
1281
1282         # Remove trailing whitespace.
1283         text = text.rstrip()
1284
1285         # Get the attributes.
1286         attributes = self.parse_params(parameters, clear)
1287
1288         # Build the tag.
1289         #open_tag = self.build_open_tag('pre', attributes) + '\n'
1290         open_tag = self.build_open_tag('pre', attributes)
1291         close_tag = '\n</pre>'
1292
1293         # Replace < and >.
1294         text = text.replace('<', '&lt;')
1295         text = text.replace('>', '&gt;')
1296
1297         return open_tag + text + close_tag
1298
1299
1300     def bc(self, text, parameters=None, clear=None):
1301         """Process block code.
1302
1303         This function processes block code into a <code> tag inside a
1304         <pre>. No HTML is added for the lines, but @<@ and @>@ are translated
1305         into HTML entities.
1306
1307         ---
1308         h1. Block code
1309
1310         A block code, specified by the @bc@ signature, is a block of
1311         pre-formatted text which also receives a @<code></code>@ tag. As
1312         with "pre", whitespace is preserved and @<@ and @>@ are translated
1313         into HTML(HyperText Markup Language) entities automatically.
1314
1315         Text in a "bc" code is _not processed_ with the inline rules.
1316
1317         If you have "Twisted":http://www.twistedmatrix.com/ installed,
1318         Textile can automatically colorize your Python code if you
1319         specify its language as "Python":
1320
1321         pre. bc[python]. from twisted.python import htmlizer
1322
1323         This will become:
1324
1325         pre. <pre>
1326         <code lang="python">
1327         <span class="py-src-keyword">from</span> <span class="py-src-variable">twisted</span><span class="py-src-op">.</span><span class="py-src-variable">python</span> <span class="py-src-keyword">import</span> <span class="py-src-variable">htmlizer</span>
1328         </code>
1329         </pre>
1330
1331         The colors can be specified in your CSS(Cascading Style Sheets)
1332         file. If you don't want to install Twisted, you can download just
1333         the @htmlizer@ module "independently":http://dealmeida.net/code/htmlizer.py.txt.
1334         """
1335
1336         # Get the attributes.
1337         attributes = self.parse_params(parameters, clear)
1338
1339         # XHTML <code> can't have the attribute lang.
1340         if attributes.has_key('lang'):
1341             lang = attributes['lang']
1342             del attributes['lang']
1343         else:
1344             lang = None
1345
1346         # Build the tag.
1347         open_tag = '<pre>\n' + self.build_open_tag('code', attributes) + '\n'
1348         close_tag = '\n</code>\n</pre>'
1349
1350         # Colorize Python code?
1351         if htmlizer and lang == 'python':
1352             text = _color(text)
1353         else:
1354             # Replace < and >.
1355             text = text.replace('<', '&lt;')
1356             text = text.replace('>', '&gt;')
1357
1358         return open_tag + text + close_tag
1359
1360
1361     def dl(self, text, parameters=None, clear=None):
1362         """Process definition list.
1363
1364         This function process definition lists. The text inside
1365         the <dt> and <dd> tags is processed for inline formatting.
1366
1367         ---
1368         h1. Definition list
1369
1370         A definition list starts with the signature @dl@, and has
1371         its items separated by a @:@. Here's a simple example:
1372
1373         pre. dl. name:Sir Lancelot of Camelot.
1374         quest:To seek the Holy Grail.
1375         color:Blue.
1376
1377         Becomes:
1378
1379         pre. <dl>
1380         <dt>name</dt>
1381         <dd>Sir Lancelot of Camelot.</dd>
1382         <dt>quest</dt>
1383         <dd>To seek the Holy Grail.</dd>
1384         <dt>color</dt>
1385         <dd>Blue.</dd>
1386         </dl>
1387         """
1388         # Get the attributes.
1389         attributes = self.parse_params(parameters, clear)
1390
1391         # Build the tag.
1392         open_tag = self.build_open_tag('dl', attributes) + '\n'
1393         close_tag = '\n</dl>'
1394
1395         lines = text.split('\n')
1396         output = []
1397         for line in lines:
1398             if line.count(':'):
1399                 [dt, dd] = line.split(':', 1)
1400             else:
1401                 dt,dd = line, ''
1402
1403             if dt: output.append('<dt>%s</dt>\n<dd>%s</dd>' % (dt, dd))
1404
1405         text = '\n'.join(output)
1406
1407         text = self.inline(text)
1408
1409         return open_tag + text + close_tag
1410
1411
1412     def blockquote(self, text, parameters=None, cite=None, clear=None):
1413         """Process block quote.
1414
1415         The block quote is inserted into a <blockquote> tag, and
1416         processed as a paragraph. An optional cite attribute can
1417         be appended on the last line after two dashes (--), or
1418         after the period following ':' for compatibility with the
1419         Perl version.
1420
1421         ---
1422         h1. Blockquote
1423
1424         A blockquote is denoted by the signature @bq@. The text in this
1425         block will be enclosed in @<blockquote></blockquote>@ and @<p></p>@,
1426         receiving the same formatting as a paragraph. For example:
1427
1428         pre. bq. This is a blockquote.
1429
1430         Becomes:
1431
1432         pre. <blockquote>
1433         <p>This is a blockquote.</p>
1434         </blockquote>
1435
1436         You can optionally specify the @cite@ attribute of the blockquote,
1437         using the following syntax:
1438
1439         pre. bq.:http://example.com Some text.
1440
1441         pre. bq.:"John Doe" Some other text.
1442
1443         Becomes:
1444
1445         pre. <blockquote cite="http://example.com">
1446         <p>Some text.</p>
1447         </blockquote>
1448
1449         pre. <blockquote cite="John Doe">
1450         <p>Some other text.</p>
1451         </blockquote>
1452
1453         You can also specify the @cite@ using a pair of dashes on the
1454         last line of the blockquote:
1455
1456         pre. bq. Some text.
1457         -- http://example.com
1458         """
1459
1460         # Get the attributes.
1461         attributes = self.parse_params(parameters, clear)
1462
1463         if cite:
1464             # Remove the quotes?
1465             cite = cite.strip('"')
1466             attributes['cite'] = cite
1467         else:
1468             # The citation should be on the last line.
1469             text = text.split('\n')
1470             if text[-1].startswith('-- '):
1471                 attributes['cite'] = text.pop()[3:]
1472
1473             text = '\n'.join(text)
1474
1475         # Build the tag.
1476         open_tag = self.build_open_tag('blockquote', attributes) + '\n'
1477         close_tag = '\n</blockquote>'
1478
1479         # Process the paragraph, passing the attributes.
1480         # Does it make sense to pass the id, class, etc. to
1481         # the paragraph instead of applying it to the
1482         # blockquote tag?
1483         text = self.paragraph(text)
1484
1485         return open_tag + text + close_tag
1486
1487
1488     def header(self, text, parameters=None, header=1, clear=None):
1489         """Process a header.
1490
1491         The header number is captured by the regular
1492         expression and lives in header. If head_offset is
1493         set, it is adjusted accordingly.
1494
1495         ---
1496         h1. Header
1497
1498         A header is produced by the signature @hn@, where @n@ goes
1499         from 1 to 6. You can adjust the relative output of the headers
1500         passing a @head_offset@ attribute when calling @textile()@.
1501
1502         To make a header:
1503
1504         pre. h1. This is a header.
1505
1506         Becomes:
1507
1508         pre. <h1>This is a header.</h1>
1509         """
1510         # Get the attributes.
1511         attributes = self.parse_params(parameters, clear)
1512
1513         # Get the header number and limit it between 1 and 6.
1514         n = header
1515         n = min(n,6)
1516         n = max(n,1)
1517
1518         # Build the tag.
1519         open_tag = self.build_open_tag('h%d' % n, attributes)
1520         close_tag = '</h%d>' % n
1521
1522         text = self.inline(text)
1523
1524         return open_tag + text + close_tag
1525
1526
1527     def footnote(self, text, parameters=None, footnote=1, clear=None):
1528         """Process a footnote.
1529
1530         A footnote is formatted as a paragraph of class
1531         'footnote' and id 'fn%d', starting with the footnote
1532         number in a <sup> tag. Here we just build the
1533         attributes and pass them directly to self.paragraph().
1534
1535         ---
1536         h1. Footnote
1537
1538         A footnote is produced by the signature @fn@ followed by
1539         a number. Footnotes are paragraphs of a special CSS(Cascading Style Sheets)
1540         class. An example:
1541
1542         pre. fn1. This is footnote number one.
1543
1544         Will produce this:
1545
1546         pre. <p class="footnote" id="fn1"><sup>1</sup> This is footnote number one.</p>
1547
1548         This footnote can be referenced anywhere on the text by the
1549         following way:
1550
1551         pre. This is a reference[1] to footnote number one.
1552
1553         Which becomes:
1554
1555         pre. <p>This is a reference<sup class="footnote"><a href="#fn1" title="This is footnote number one.">1</a></sup> to footnote number 1.</p>
1556
1557         Note that the text from the footnote appears in the @title@ of the
1558         link pointing to it.
1559         """
1560         # Get the number.
1561         n = int(footnote)
1562
1563         # Build the attributes to the paragraph.
1564         attributes = self.parse_params(parameters, clear)
1565         attributes['class'] = 'footnote'
1566         attributes['id']    = 'fn%d' % n
1567
1568         # Build the paragraph text.
1569         text = ('<sup>%d</sup> ' % n) + text
1570
1571         # And return the paragraph.
1572         return self.paragraph(text=text, attributes=attributes)
1573
1574
1575     def build_li(self, items, liattributes):
1576         """Build the list item.
1577
1578         This function build the list item of an (un)ordered list. It
1579         works by peeking at the next list item, and searching for a
1580         multi-list. If a multi-list is found, it is processed and
1581         appended inside the list item tags, as it should be.
1582         """
1583         lines = []
1584         while len(items):
1585             item = items.pop(0)
1586
1587             # Clean the line.
1588             item = item.lstrip()
1589             item = item.replace('\n', '<br />\n')
1590
1591             # Get list item attributes.
1592             p = re.compile(r'''^%(liattr)s\s''' % self.res, re.VERBOSE)
1593             m = p.match(item)
1594             if m:
1595                 c = m.groupdict('')
1596                 liparameters = c['liparameters']
1597                 item = p.sub('', item)
1598             else:
1599                 liparameters = ''
1600
1601             liattributes = liattributes or self.parse_params(liparameters)
1602
1603             # Build the item tag.
1604             open_tag_li = self.build_open_tag('li', liattributes)
1605
1606             # Reset the attributes, which should be applied
1607             # only to the first <li>.
1608             liattributes = {}
1609
1610             # Build the closing tag.
1611             close_tag_li = '</li>'
1612
1613             # Multi-list recursive routine.
1614             # Here we check the _next_ items for a multi-list. If we
1615             # find one, we extract all items of the multi-list and
1616             # process them recursively.
1617             if len(items):
1618                 inlist = []
1619
1620                 # Grab all the items that start with # or *.
1621                 n_item = items.pop(0)
1622
1623                 # Grab the <ol> parameters.
1624                 p = re.compile(r'''^%(olattr)s''' % self.res, re.VERBOSE)
1625                 m = p.match(n_item)
1626                 if m:
1627                     c = m.groupdict('')
1628                     olparameters = c['olparameters']
1629                     tmp = p.sub('', n_item)
1630                 else:
1631                     olparameters = ''
1632
1633                 # Check for an ordered list inside this one.
1634                 if tmp.startswith('#'):
1635                     n_item = tmp
1636                     inlist.append(n_item)
1637                     while len(items):
1638                         # Peek into the next item.
1639                         n_item = items.pop(0)
1640                         if n_item.startswith('#'):
1641                             inlist.append(n_item)
1642                         else:
1643                             items.insert(0, n_item)
1644                             break
1645
1646                     inlist = self.ol('\n'.join(inlist), olparameters=olparameters)
1647                     item = item + '\n' + inlist + '\n'
1648
1649                 # Check for an unordered list inside this one.
1650                 elif tmp.startswith('*'):
1651                     n_item = tmp
1652                     inlist.append(n_item)
1653                     while len(items):
1654                         # Peek into the next item.
1655                         n_item = items.pop(0)
1656                         if n_item.startswith('*'):
1657                             inlist.append(n_item)
1658                         else:
1659                             items.insert(0, n_item)
1660                             break
1661
1662                     inlist = self.ul('\n'.join(inlist), olparameters=olparameters)
1663                     item = item + '\n' + inlist + '\n'
1664
1665                 # Otherwise we just put it back in the list.
1666                 else:
1667                     items.insert(0, n_item)
1668
1669             item = self.inline(item)
1670
1671             item = open_tag_li + item + close_tag_li
1672             lines.append(item)
1673
1674         return '\n'.join(lines)
1675
1676
1677     def ol(self, text, liparameters=None, olparameters=None, clear=None):
1678         """Build an ordered list.
1679
1680         This function basically just sets the <ol></ol> with the
1681         right attributes, and then pass everything inside to
1682         _build_li, which does the real tough recursive job.
1683
1684         ---
1685         h1. Ordered lists
1686
1687         Ordered lists can be constructed this way:
1688
1689         pre. # Item number 1.
1690         # Item number 2.
1691         # Item number 3.
1692
1693         And you get:
1694
1695         pre. <ol>
1696         <li>Item number 1.</li>
1697         <li>Item number 2.</li>
1698         <li>Item number 3.</li>
1699         </ol>
1700
1701         If you want a list to "break" an extended block, you should
1702         add a period after the hash. This is useful for writing
1703         Python code:
1704
1705         pre.. bc[python].. #!/usr/bin/env python
1706
1707         # This is a comment, not an ordered list!
1708         # So this won't break the extended "bc".
1709
1710         p. Lists can be nested:
1711
1712         pre. # Item number 1.
1713         ## Item number 1a.
1714         ## Item number 1b.
1715         # Item number 2.
1716         ## Item number 2a.
1717
1718         Textile will transform this to:
1719
1720         pre. <ol>
1721         <li>Item number 1.
1722         <ol>
1723         <li>Item number 1a.</li>
1724         <li>Item number 1b.</li>
1725         </ol>
1726         </li>
1727         <li>Item number 2.
1728         <ol>
1729         <li>Item number 2a.</li>
1730         </ol>
1731         </li>
1732         </ol>
1733
1734         You can also mix ordered and unordered lists:
1735
1736         pre. * To write well you need:
1737         *# to read every day
1738         *# to write every day
1739         *# and X
1740
1741         You'll get this:
1742
1743         pre. <ul>
1744         <li>To write well you need:
1745         <ol>
1746         <li>to read every day</li>
1747         <li>to write every day</li>
1748         <li>and X</li>
1749         </ol>
1750         </li>
1751         </ul>
1752
1753         To style a list, the parameters should go before the hash if you want
1754         to set the attributes on the @<ol>@ tag:
1755
1756         pre. (class#id)# one
1757         # two
1758         # three
1759
1760         If you want to customize the firsr @<li>@ tag, apply the parameters
1761         after the hash:
1762
1763         pre. #(class#id) one
1764         # two
1765         # three
1766         """
1767         # Get the attributes.
1768         olattributes = self.parse_params(olparameters, clear)
1769         liattributes = self.parse_params(liparameters)
1770
1771         # Remove list depth.
1772         if text.startswith('#'):
1773             text = text[1:]
1774
1775         items = text.split('\n#')
1776
1777         # Build the open tag.
1778         open_tag = self.build_open_tag('ol', olattributes) + '\n'
1779
1780         close_tag = '\n</ol>'
1781
1782         # Build the list items.
1783         text = self.build_li(items, liattributes)
1784
1785         return open_tag + text + close_tag
1786
1787
1788     def ul(self, text, liparameters=None, olparameters=None, clear=None):
1789         """Build an unordered list.
1790
1791         This function basically just sets the <ul></ul> with the
1792         right attributes, and then pass everything inside to
1793         _build_li, which does the real tough recursive job.
1794
1795         ---
1796         h1. Unordered lists
1797
1798         Unordered lists behave exactly like the ordered lists, and are
1799         defined using a star:
1800
1801         pre. * Python
1802         * Perl
1803         * PHP
1804
1805         Becomes:
1806
1807         pre. <ul>
1808         <li>Python</li>
1809         <li>Perl</li>
1810         <li><span class="caps">PHP</span></li>
1811         </ul>
1812         """
1813         # Get the attributes.
1814         olattributes = self.parse_params(olparameters, clear)
1815         liattributes = self.parse_params(liparameters)
1816
1817         # Remove list depth.
1818         if text.startswith('*'):
1819             text = text[1:]
1820
1821         items = text.split('\n*')
1822
1823         # Build the open tag.
1824         open_tag = self.build_open_tag('ul', olattributes) + '\n'
1825
1826         close_tag = '\n</ul>'
1827
1828         # Build the list items.
1829         text = self.build_li(items, liattributes)
1830
1831         return open_tag + text + close_tag
1832
1833
1834     def table(self, text, parameters=None, clear=None):
1835         """Build a table.
1836
1837         To build a table we split the text in lines to get the
1838         rows, and split the rows between '|' to get the individual
1839         cells.
1840
1841         ---
1842         h1. Tables
1843
1844         Making a simple table is as easy as possible:
1845
1846         pre. |a|b|c|
1847         |1|2|3|
1848
1849         Will be processed into:
1850
1851         pre. <table>
1852         <tr>
1853         <td>a</td>
1854         <td>b</td>
1855         <td>c</td>
1856         </tr>
1857         <tr>
1858         <td>1</td>
1859         <td>2</td>
1860         <td>3</td>
1861         </tr>
1862         </table>
1863
1864         If you want to customize the @<table>@ tag, you must use the
1865         @table@ signature:
1866
1867         pre. table(class#id)[en]. |a|b|c|
1868         |1|2|3|
1869
1870         To customize a row, apply the modifier _before_ the first @|@:
1871
1872         pre. table. (class)<>|a|b|c|
1873         |1|2|3|
1874
1875         Individual cells can by customized by adding the parameters _after_
1876         the @|@, proceded by a period and a space:
1877
1878         pre. |(#id). a|b|c|
1879         |1|2|3|
1880
1881         The allowed modifiers are:
1882
1883         dl. {style rule}:A CSS(Cascading Style Sheets) style rule.
1884         (class) or (#id) or (class#id):A CSS(Cascading Style Sheets) class and/or id attribute.
1885         ( (one or more):Adds 1em of padding to the left for each '(' character.
1886         ) (one or more):Adds 1em of padding to the right for each ')' character.
1887         &lt;:Aligns to the left (floats to left for tables if combined with the ')' modifier).
1888         &gt;:Aligns to the right (floats to right for tables if combined with the '(' modifier).
1889         =:Aligns to center (sets left, right margins to 'auto' for tables).
1890         &lt;&gt;:For cells only. Justifies text.
1891         ^:For rows and cells only. Aligns to the top.
1892         ~ (tilde):For rows and cells only. Aligns to the bottom.
1893         _ (underscore):Can be applied to a table row or cell to indicate a header row or cell.
1894         \\2 or \\3 or \\4, etc.:Used within cells to indicate a colspan of 2, 3, 4, etc. columns. When you see "\\", think "push forward".
1895         /2 or /3 or /4, etc.:Used within cells to indicate a rowspan of 2, 3, 4, etc. rows. When you see "/", think "push downward".
1896
1897         When a cell is identified as a header cell and an alignment is
1898         specified, that becomes the default alignment for cells below it.
1899         You can always override this behavior by specifying an alignment
1900         for one of the lower cells.
1901         """
1902         attributes = self.parse_params(parameters, clear, align_type='table')
1903         #attributes['cellspacing'] = '0'
1904
1905         # Build the <table>.
1906         open_tag = self.build_open_tag('table', attributes) + '\n'
1907         close_tag = '</table>'
1908
1909         output = []
1910         default_align = {}
1911         rows = re.split(r'''\n+''', text)
1912         for row in rows:
1913             # Get the columns.
1914             columns = row.split('|')
1915
1916             # Build the <tr>.
1917             parameters = columns.pop(0)
1918
1919             rowattr = self.parse_params(parameters, align_type='table')
1920             open_tr = self.build_open_tag('tr', rowattr) + '\n'
1921             output.append(open_tr)
1922
1923             # Does the row define headers?
1924             if parameters.count('_'):
1925                 td_tag = 'th'
1926             else:
1927                 td_tag = 'td'
1928
1929             col = 0
1930             for cell in columns[:-1]:
1931                 p = re.compile(r'''(?:%(tattr)s\.\s)?(?P<text>.*)''' % self.res, re.VERBOSE)
1932                 m = p.match(cell)
1933                 if m:
1934                     c = m.groupdict('')
1935                     cellattr = self.parse_params(c['parameters'], align_type='table')
1936
1937                     # Get the width of this cell.
1938                     width = cellattr.get('colspan', 1)
1939
1940                     # Is this a header?
1941                     if c['parameters'].count('_'):
1942                         td_tag = 'th'
1943
1944                     # If it is a header, let's set the default alignment.
1945                     if td_tag == 'th':
1946                         # Set the default aligment for all cells below this one.
1947                         # This is a little tricky because this header can have
1948                         # a colspan set.
1949                         for i in range(col, col+width):
1950                             default_align[i] = cellattr.get('align', None)
1951
1952                     else:
1953                         # Apply the default align, if any.
1954                         cellattr['align'] = cellattr.get('align', default_align.get(col, None))
1955
1956                     open_td = self.build_open_tag(td_tag, cellattr)
1957                     close_td = '</%s>\n' % td_tag
1958
1959                     #output.append(open_td + c['text'].strip() + close_td)
1960                     output.append(open_td + self.inline(c['text'].strip()) + close_td)
1961
1962                 col += width
1963
1964             output.append('</tr>\n')
1965
1966         text = open_tag + ''.join(output) + close_tag
1967
1968         return text
1969
1970
1971     def escape(self, text):
1972         """Do nothing.
1973
1974         This is used to match escaped text. Nothing to see here!
1975
1976         ---
1977         h1. Escaping
1978
1979         If you don't want Textile processing a block, you can simply
1980         enclose it inside @==@:
1981
1982         pre. p. Regular paragraph
1983
1984         pre. ==
1985         Escaped portion -- will not be formatted
1986         by Textile at all
1987         ==
1988
1989         pre. p. Back to normal.
1990
1991         This can also be used inline, disabling the formatting temporarily:
1992
1993         pre. p. This is ==*a test*== of escaping.
1994         """
1995         return text
1996
1997
1998     def itex(self, text):
1999         """Convert itex to MathML.
2000
2001         If the itex2mml binary is set, we use it to convert the
2002         itex to MathML. Otherwise, the text is unprocessed and
2003         return as is.
2004
2005         ---
2006         h1. itex
2007
2008         Textile can automatically convert itex code to MathML(Mathematical Markup Language)
2009         for you, if you have the itex2MML binary (you can download it
2010         from the "Movable Type plugin":http://golem.ph.utexas.edu/~distler/blog/files/itexToMML.tar.gz).
2011
2012         Block equations should be enclosed inbetween @\[@ and @\]@:
2013
2014         pre. \[ e^{i\pi} + 1 = 0 \]
2015
2016         Will be translated to:
2017
2018         pre. <math xmlns='http://www.w3.org/1998/Math/MathML' mode='display'>
2019         <msup><mi>e</mi> <mrow><mi>i</mi>
2020         <mi>&amp;pi;</mi></mrow></msup>
2021         <mo>+</mo><mn>1</mn><mo>=</mo><mn>0</mn>
2022         </math>
2023
2024         Equations can also be displayed inline:
2025
2026         pre. Euler's formula, $e^{i\pi}+1=0$, ...
2027
2028         (Note that if you want to display MathML(Mathematical Markup Language)
2029         your content must be served as @application/xhtml+xml@, which is not
2030         accepted by all browsers.)
2031         """
2032         if itex2mml:
2033             try:
2034                 text = os.popen("echo '%s' | %s" % (text, itex2mml)).read()
2035             except:
2036                 pass
2037
2038         return text
2039
2040
2041     def about(self, text=None):
2042         """Show PyTextile's functionalities.
2043
2044         An introduction to PyTextile. Can be called when running the
2045         main script or if you write the following line:
2046
2047             'tell me about textile.'
2048
2049         But keep it a secret!
2050         """
2051
2052         about = []
2053         about.append(textile('h1. This is Textile', head_offset=self.head_offset))
2054         about.append(textile(__doc__.split('---', 1)[1], head_offset=self.head_offset))
2055
2056         functions = [(self.split_text, 1),
2057                      (self.paragraph,  2),
2058                      (self.pre,        2),
2059                      (self.bc,         2),
2060                      (self.blockquote, 2),
2061                      (self.dl,         2),
2062                      (self.header,     2),
2063                      (self.footnote,   2),
2064                      (self.escape,     2),
2065                      (self.itex,       2),
2066                      (self.ol,         2),
2067                      (self.ul,         2),
2068                      (self.table,      2),
2069                      (self.inline,     1),
2070                      (self.qtags,      2),
2071                      (self.glyphs,     2),
2072                      (self.macros,     2),
2073                      (self.acronym,    2),
2074                      (self.images,     1),
2075                      (self.links,      1),
2076                      (self.sanitize,   1),
2077                     ]
2078
2079         for function, offset in functions:
2080             doc = function.__doc__.split('---', 1)[1]
2081             doc = doc.split('\n')
2082             lines = []
2083             for line in doc:
2084                 line = line.strip()
2085                 lines.append(line)
2086
2087             doc = '\n'.join(lines)
2088             about.append(textile(doc, head_offset=self.head_offset+offset))
2089
2090         about = '\n'.join(about)
2091         about = about.replace('<br />', '')
2092
2093         return about
2094
2095
2096     def acronym(self, text):
2097         """Process acronyms.
2098
2099         Acronyms can have letters in upper and lower caps, or even numbers,
2100         provided that the numbers and upper caps are the same in the
2101         abbreviation and in the description. For example:
2102
2103             XHTML(eXtensible HyperText Markup Language)
2104             OPeNDAP(Open source Project for a Network Data Access Protocol)
2105             L94(Levitus 94)
2106
2107         are all valid acronyms.
2108
2109         ---
2110         h1. Acronyms
2111
2112         You can define acronyms in your text the following way:
2113
2114         pre. This is XHTML(eXtensible HyperText Markup Language).
2115
2116         The resulting code is:
2117
2118         pre. <p><acronym title="eXtensible HyperText Markup Language"><span class="caps">XHTML</span></acronym></p>
2119
2120         Acronyms can have letters in upper and lower caps, or even numbers,
2121         provided that the numbers and upper caps are the same in the
2122         abbreviation and in the description. For example:
2123
2124         pre. XHTML(eXtensible HyperText Markup Language)
2125         OPeNDAP(Open source Project for a Network Data Access Protocol)
2126         L94(Levitus 94)
2127
2128         are all valid acronyms.
2129         """
2130         # Find the acronyms.
2131         acronyms = r'''(?P<acronym>[\w]+)\((?P<definition>[^\(\)]+?)\)'''
2132
2133         # Check all acronyms.
2134         for acronym, definition in re.findall(acronyms, text):
2135             caps_acronym = ''.join(re.findall('[A-Z\d]+', acronym))
2136             caps_definition = ''.join(re.findall('[A-Z\d]+', definition))
2137             if caps_acronym and caps_acronym == caps_definition:
2138                 text = text.replace('%s(%s)' % (acronym, definition), '<acronym title="%s">%s</acronym>' % (definition, acronym))
2139
2140         text = html_replace(r'''(^|\s)([A-Z]{3,})\b(?!\()''', r'''\1<span class="caps">\2</span>''', text)
2141
2142         return text
2143
2144
2145     def footnotes(self, text):
2146         """Add titles to footnotes references.
2147
2148         This function searches for footnotes references like this [1], and
2149         adds a title to the link containing the first paragraph of the
2150         footnote.
2151         """
2152         # Search for footnotes.
2153         p = re.compile(r'''<p class="footnote" id="fn(?P<n>\d+)"><sup>(?P=n)</sup>(?P<note>.*)</p>''')
2154         for m in p.finditer(text):
2155             n = m.group('n')
2156             note = m.group('note').strip()
2157
2158             # Strip HTML from note.
2159             note = re.sub('<.*?>', '', note)
2160
2161             # Add the title.
2162             text = text.replace('<a href="#fn%s">' % n, '<a href="#fn%s" title="%s">' % (n, note))
2163
2164         return text
2165
2166
2167     def macros(self, m):
2168         """Quick macros.
2169
2170         This function replaces macros inside brackets using a built-in
2171         dictionary, and also unicode names if the key doesn't exist.
2172
2173         ---
2174         h1. Macros
2175
2176         Textile has support for character macros, which should be enclosed
2177         in curly braces. A few useful ones are:
2178
2179         pre. {C=} or {=C}: euro sign
2180         {+-} or {-+}: plus-minus sign
2181         {L-} or {-L}: pound sign.
2182
2183         You can also make accented characters:
2184
2185         pre. Expos{e'}
2186
2187         Becomes:
2188
2189         pre. <p>Expos&amp;#233;</p>
2190
2191         You can also specify Unicode names like:
2192
2193         pre. {umbrella}
2194         {white smiling face}
2195         """
2196         entity = m.group(1)
2197
2198         macros = {'c|': '&#162;',       # cent sign
2199                   '|c': '&#162;',       # cent sign
2200                   'L-': '&#163;',       # pound sign
2201                   '-L': '&#163;',       # pound sign
2202                   'Y=': '&#165;',       # yen sign
2203                   '=Y': '&#165;',       # yen sign
2204                   '(c)': '&#169;',      # copyright sign
2205                   '<<': '&#171;',       # left-pointing double angle quotation
2206                   '(r)': '&#174;',      # registered sign
2207                   '+_': '&#177;',       # plus-minus sign
2208                   '_+': '&#177;',       # plus-minus sign
2209                   '>>': '&#187;',       # right-pointing double angle quotation
2210                   '1/4': '&#188;',      # vulgar fraction one quarter
2211                   '1/2': '&#189;',      # vulgar fraction one half
2212                   '3/4': '&#190;',      # vulgar fraction three quarters
2213                   'A`': '&#192;',       # latin capital letter a with grave
2214                   '`A': '&#192;',       # latin capital letter a with grave
2215                   'A\'': '&#193;',      # latin capital letter a with acute
2216                   '\'A': '&#193;',      # latin capital letter a with acute
2217                   'A^': '&#194;',       # latin capital letter a with circumflex
2218                   '^A': '&#194;',       # latin capital letter a with circumflex
2219                   'A~': '&#195;',       # latin capital letter a with tilde
2220                   '~A': '&#195;',       # latin capital letter a with tilde
2221                   'A"': '&#196;',       # latin capital letter a with diaeresis
2222                   '"A': '&#196;',       # latin capital letter a with diaeresis
2223                   'Ao': '&#197;',       # latin capital letter a with ring above
2224                   'oA': '&#197;',       # latin capital letter a with ring above
2225                   'AE': '&#198;',       # latin capital letter ae
2226                   'C,': '&#199;',       # latin capital letter c with cedilla
2227                   ',C': '&#199;',       # latin capital letter c with cedilla
2228                   'E`': '&#200;',       # latin capital letter e with grave
2229                   '`E': '&#200;',       # latin capital letter e with grave
2230                   'E\'': '&#201;',      # latin capital letter e with acute
2231                   '\'E': '&#201;',      # latin capital letter e with acute
2232                   'E^': '&#202;',       # latin capital letter e with circumflex
2233                   '^E': '&#202;',       # latin capital letter e with circumflex
2234                   'E"': '&#203;',       # latin capital letter e with diaeresis
2235                   '"E': '&#203;',       # latin capital letter e with diaeresis
2236                   'I`': '&#204;',       # latin capital letter i with grave
2237                   '`I': '&#204;',       # latin capital letter i with grave
2238                   'I\'': '&#205;',      # latin capital letter i with acute
2239                   '\'I': '&#205;',      # latin capital letter i with acute
2240                   'I^': '&#206;',       # latin capital letter i with circumflex
2241                   '^I': '&#206;',       # latin capital letter i with circumflex
2242                   'I"': '&#207;',       # latin capital letter i with diaeresis
2243                   '"I': '&#207;',       # latin capital letter i with diaeresis
2244                   'D-': '&#208;',       # latin capital letter eth
2245                   '-D': '&#208;',       # latin capital letter eth
2246                   'N~': '&#209;',       # latin capital letter n with tilde
2247                   '~N': '&#209;',       # latin capital letter n with tilde
2248                   'O`': '&#210;',       # latin capital letter o with grave
2249                   '`O': '&#210;',       # latin capital letter o with grave
2250                   'O\'': '&#211;',      # latin capital letter o with acute
2251                   '\'O': '&#211;',      # latin capital letter o with acute
2252                   'O^': '&#212;',       # latin capital letter o with circumflex
2253                   '^O': '&#212;',       # latin capital letter o with circumflex
2254                   'O~': '&#213;',       # latin capital letter o with tilde
2255                   '~O': '&#213;',       # latin capital letter o with tilde
2256                   'O"': '&#214;',       # latin capital letter o with diaeresis
2257                   '"O': '&#214;',       # latin capital letter o with diaeresis
2258                   'O/': '&#216;',       # latin capital letter o with stroke
2259                   '/O': '&#216;',       # latin capital letter o with stroke
2260                   'U`':  '&#217;',      # latin capital letter u with grave
2261                   '`U':  '&#217;',      # latin capital letter u with grave
2262                   'U\'': '&#218;',      # latin capital letter u with acute
2263                   '\'U': '&#218;',      # latin capital letter u with acute
2264                   'U^': '&#219;',       # latin capital letter u with circumflex
2265                   '^U': '&#219;',       # latin capital letter u with circumflex
2266                   'U"': '&#220;',       # latin capital letter u with diaeresis
2267                   '"U': '&#220;',       # latin capital letter u with diaeresis
2268                   'Y\'': '&#221;',      # latin capital letter y with acute
2269                   '\'Y': '&#221;',      # latin capital letter y with acute
2270                   'a`': '&#224;',       # latin small letter a with grave
2271                   '`a': '&#224;',       # latin small letter a with grave
2272                   'a\'': '&#225;',      # latin small letter a with acute
2273                   '\'a': '&#225;',      # latin small letter a with acute
2274                   'a^': '&#226;',       # latin small letter a with circumflex
2275                   '^a': '&#226;',       # latin small letter a with circumflex
2276                   'a~': '&#227;',       # latin small letter a with tilde
2277                   '~a': '&#227;',       # latin small letter a with tilde
2278                   'a"': '&#228;',       # latin small letter a with diaeresis
2279                   '"a': '&#228;',       # latin small letter a with diaeresis
2280                   'ao': '&#229;',       # latin small letter a with ring above
2281                   'oa': '&#229;',       # latin small letter a with ring above
2282                   'ae': '&#230;',       # latin small letter ae
2283                   'c,': '&#231;',       # latin small letter c with cedilla
2284                   ',c': '&#231;',       # latin small letter c with cedilla
2285                   'e`': '&#232;',       # latin small letter e with grave
2286                   '`e': '&#232;',       # latin small letter e with grave
2287                   'e\'': '&#233;',      # latin small letter e with acute
2288                   '\'e': '&#233;',      # latin small letter e with acute
2289                   'e^': '&#234;',       # latin small letter e with circumflex
2290                   '^e': '&#234;',       # latin small letter e with circumflex
2291                   'e"': '&#235;',       # latin small letter e with diaeresis
2292                   '"e': '&#235;',       # latin small letter e with diaeresis
2293                   'i`': '&#236;',       # latin small letter i with grave
2294                   '`i': '&#236;',       # latin small letter i with grave
2295                   'i\'': '&#237;',      # latin small letter i with acute
2296                   '\'i': '&#237;',      # latin small letter i with acute
2297                   'i^': '&#238;',       # latin small letter i with circumflex
2298                   '^i': '&#238;',       # latin small letter i with circumflex
2299                   'i"': '&#239;',       # latin small letter i with diaeresis
2300                   '"i': '&#239;',       # latin small letter i with diaeresis
2301                   'n~': '&#241;',       # latin small letter n with tilde
2302                   '~n': '&#241;',       # latin small letter n with tilde
2303                   'o`': '&#242;',       # latin small letter o with grave
2304                   '`o': '&#242;',       # latin small letter o with grave
2305                   'o\'': '&#243;',      # latin small letter o with acute
2306                   '\'o': '&#243;',      # latin small letter o with acute
2307                   'o^': '&#244;',       # latin small letter o with circumflex
2308                   '^o': '&#244;',       # latin small letter o with circumflex
2309                   'o~': '&#245;',       # latin small letter o with tilde
2310                   '~o': '&#245;',       # latin small letter o with tilde
2311                   'o"': '&#246;',       # latin small letter o with diaeresis
2312                   '"o': '&#246;',       # latin small letter o with diaeresis
2313                   ':-': '&#247;',       # division sign
2314                   '-:': '&#247;',       # division sign
2315                   'o/': '&#248;',       # latin small letter o with stroke
2316                   '/o': '&#248;',       # latin small letter o with stroke
2317                   'u`': '&#249;',       # latin small letter u with grave
2318                   '`u': '&#249;',       # latin small letter u with grave
2319                   'u\'': '&#250;',      # latin small letter u with acute
2320                   '\'u': '&#250;',      # latin small letter u with acute
2321                   'u^': '&#251;',       # latin small letter u with circumflex
2322                   '^u': '&#251;',       # latin small letter u with circumflex
2323                   'u"': '&#252;',       # latin small letter u with diaeresis
2324                   '"u': '&#252;',       # latin small letter u with diaeresis
2325                   'y\'': '&#253;',      # latin small letter y with acute
2326                   '\'y': '&#253;',      # latin small letter y with acute
2327                   'y"': '&#255',        # latin small letter y with diaeresis
2328                   '"y': '&#255',        # latin small letter y with diaeresis
2329                   'OE': '&#338;',       # latin capital ligature oe
2330                   'oe': '&#339;',       # latin small ligature oe
2331                   '*': '&#8226;',       # bullet
2332                   'Fr': '&#8355;',      # french franc sign
2333                   'L=': '&#8356;',      # lira sign
2334                   '=L': '&#8356;',      # lira sign
2335                   'Rs': '&#8360;',      # rupee sign
2336                   'C=': '&#8364;',      # euro sign
2337                   '=C': '&#8364;',      # euro sign
2338                   'tm': '&#8482;',      # trade mark sign
2339                   '<-': '&#8592;',      # leftwards arrow
2340                   '->': '&#8594;',      # rightwards arrow
2341                   '<=': '&#8656;',      # leftwards double arrow
2342                   '=>': '&#8658;',      # rightwards double arrow
2343                   '=/': '&#8800;',      # not equal to
2344                   '/=': '&#8800;',      # not equal to
2345                   '<_': '&#8804;',      # less-than or equal to
2346                   '_<': '&#8804;',      # less-than or equal to
2347                   '>_': '&#8805;',      # greater-than or equal to
2348                   '_>': '&#8805;',      # greater-than or equal to
2349                   ':(': '&#9785;',      # white frowning face
2350                   ':)': '&#9786;',      # white smiling face
2351                   'spade': '&#9824;',   # black spade suit
2352                   'club': '&#9827;',    # black club suit
2353                   'heart': '&#9829;',   # black heart suit
2354                   'diamond': '&#9830;', # black diamond suit
2355                  }
2356
2357         try:
2358             # Try the key.
2359             entity = macros[entity]
2360         except KeyError:
2361             try:
2362                 # Try a unicode entity.
2363                 entity = unicodedata.lookup(entity)
2364                 entity = entity.encode('ascii', 'xmlcharrefreplace')
2365             except:
2366                 # Return the unmodified entity.
2367                 entity = '{%s}' % entity
2368
2369         return entity
2370
2371
2372     def glyphs(self, text):
2373         """Glyph formatting.
2374
2375         This function replaces quotations marks, dashes and a few other
2376         symbol for numerical entities. The em/en dashes use definitions
2377         comes from http://alistapart.com/articles/emen/.
2378
2379         ---
2380         h1. Glyphs
2381
2382         Textile replaces some of the characters in your text with their
2383         equivalent numerical entities. These include:
2384
2385         * Replace single and double primes used as quotation marks with HTML(HyperText Markup Language) entities for opening and closing quotation marks in readable text, while leaving untouched the primes required within HTML(HyperText Markup Language) tags.
2386         * Replace double hyphens (==--==) with an em-dash (&#8212;) entity.
2387         * Replace triple hyphens (==---==) with two em-dash (&#8212;&#8212;) entities.
2388         * Replace single hyphens surrounded by spaces with an en-dash (&#8211;) entity.
2389         * Replace triplets of periods (==...==) with an ellipsis (&#8230;) entity.
2390         * Convert many nonstandard characters to browser-safe entities corresponding to keyboard input.
2391         * Convert ==(TM)==, ==(R)==, and  ==(C)== to &#8482;, &#174;, and &#169;.
2392         * Convert the letter x to a dimension sign: 2==x==4 to 2x4 and 8 ==x== 10 to 8x10.
2393         """
2394         glyphs = [(r'''"(?<!\w)\b''', r'''&#8220;'''),                              # double quotes
2395                   (r'''"''', r'''&#8221;'''),                                       # double quotes
2396                   (r"""\b'""", r'''&#8217;'''),                                     # single quotes
2397                   (r"""'(?<!\w)\b""", r'''&#8216;'''),                              # single quotes
2398                   (r"""'""", r'''&#8217;'''),                                       # single single quote
2399                   (r'''(\b|^)( )?\.{3}''', r'''\1&#8230;'''),                       # ellipsis
2400                   (r'''\b---\b''', r'''&#8212;&#8212;'''),                          # double em dash
2401                   (r'''\s?--\s?''', r'''&#8212;'''),                                # em dash
2402                   (r'''(\d+)-(\d+)''', r'''\1&#8211;\2'''),                         # en dash (1954-1999)
2403                   (r'''(\d+)-(\W)''', r'''\1&#8212;\2'''),                          # em dash (1954--)
2404                   (r'''\s-\s''', r''' &#8211; '''),                                 # en dash
2405                   (r'''(\d+) ?x ?(\d+)''', r'''\1&#215;\2'''),                      # dimension sign
2406                   (r'''\b ?(\((tm|TM)\))''', r'''&#8482;'''),                       # trademark
2407                   (r'''\b ?(\([rR]\))''', r'''&#174;'''),                           # registered
2408                   (r'''\b ?(\([cC]\))''', r'''&#169;'''),                           # copyright
2409                   (r'''([^\s])\[(\d+)\]''',                                         #
2410                        r'''\1<sup class="footnote"><a href="#fn\2">\2</a></sup>'''),# footnote
2411                   ]
2412
2413         # Apply macros.
2414         text = re.sub(r'''{([^}]+)}''', self.macros, text)
2415
2416         # LaTeX style quotes.
2417         text = text.replace('\x60\x60', '&#8220;')
2418         text = text.replace('\xb4\xb4', '&#8221;')
2419
2420         # Linkify URL and emails.
2421         url = r'''(?=[a-zA-Z0-9./#])                          # Must start correctly
2422                   ((?:                                        # Match the leading part (proto://hostname, or just hostname)
2423                       (?:ftp|https?|telnet|nntp)              #     protocol
2424                       ://                                     #     ://
2425                       (?:                                     #     Optional 'username:password@'
2426                           \w+                                 #         username
2427                           (?::\w+)?                           #         optional :password
2428                           @                                   #         @
2429                       )?                                      #
2430                       [-\w]+(?:\.\w[-\w]*)+                   #     hostname (sub.example.com)
2431                   )                                           #
2432                   (?::\d+)?                                   # Optional port number
2433                   (?:                                         # Rest of the URL, optional
2434                       /?                                      #     Start with '/'
2435                       [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]*       #     Can't start with these
2436                       (?:                                     #
2437                           [.!,?;:]+                           #     One or more of these
2438                           [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]+   #     Can't finish with these
2439                           #'"                                 #     # or ' or "
2440                       )*                                      #
2441                   )?)                                         #
2442                '''
2443
2444         email = r'''(?:mailto:)?            # Optional mailto:
2445                     ([-\+\w]+               # username
2446                     \@                      # at
2447                     [-\w]+(?:\.\w[-\w]*)+)  # hostname
2448                  '''
2449
2450         # If there is no html, do a simple search and replace.
2451         if not re.search(r'''<.*>''', text):
2452             for glyph_search, glyph_replace in glyphs:
2453                 text = preg_replace(glyph_search, glyph_replace, text)
2454
2455             # Linkify.
2456             text = re.sub(re.compile(url, re.VERBOSE), r'''<a href="\1">\1</a>''', text)
2457             text = re.sub(re.compile(email, re.VERBOSE), r'''<a href="mailto:\1">\1</a>''', text)
2458
2459         else:
2460             lines = []
2461             # Else split the text into an array at <>.
2462             for line in re.split('(<.*?>)', text):
2463                 if not re.match('<.*?>', line):
2464                     for glyph_search, glyph_replace in glyphs:
2465                         line = preg_replace(glyph_search, glyph_replace, line)
2466
2467                     # Linkify.
2468                     line = re.sub(re.compile(url, re.VERBOSE), r'''<a href="\1">\1</a>''', line)
2469                     line = re.sub(re.compile(email, re.VERBOSE), r'''<a href="mailto:\1">\1</a>''', line)
2470
2471                 lines.append(line)
2472
2473             text = ''.join(lines)
2474
2475         return text
2476
2477
2478     def qtags(self, text):
2479         """Quick tags formatting.
2480
2481         This function does the inline formatting of text, like
2482         bold, italic, strong and also itex code.
2483
2484         ---
2485         h1. Quick tags
2486
2487         Quick tags allow you to format your text, making it bold,
2488         emphasized or small, for example. The quick tags operators
2489         include:
2490
2491         dl. ==*strong*==:Translates into @<strong>strong</strong>@.
2492         ==_emphasis_==:Translates into @<em>emphasis</em>@.
2493         ==**bold**==:Translates into @<b>bold</b>@.
2494         ==__italics__==:Translates into @<i>italics</i>@.
2495         ==++bigger++==:Translates into @<big>bigger</big>@.
2496         ==--smaller--==:Translates into: @<small>smaller</small>@.
2497         ==-deleted text-==:Translates into @<del>deleted text</del>@.
2498         ==+inserted text+==:Translates into @<ins>inserted text</ins>@.
2499         ==^superscript^==:Translates into @<sup>superscript</sup>@.
2500         ==~subscript~==:Translates into @<sub>subscript</sub>@.
2501         ==%span%==:Translates into @<span>span</span>@.
2502         ==@code@==:Translates into @<code>code</code>@.
2503
2504         Note that within a "==@==...==@==" section, @<@ and @>@ are
2505         translated into HTML entities automatically.
2506
2507         Inline formatting operators accept the following modifiers:
2508
2509         dl. {style rule}:A CSS(Cascading Style Sheets) style rule.
2510         [ll]:A language identifier (for a "lang" attribute).
2511         (class) or (#id) or (class#id):For CSS(Cascading Style Sheets) class and id attributes.
2512         """
2513         # itex2mml.
2514         text = re.sub('\$(.*?)\$', lambda m: self.itex(m.group()), text)
2515
2516         # Add span tags to upper-case words which don't have a description.
2517         #text = preg_replace(r'''(^|\s)([A-Z]{3,})\b(?!\()''', r'''\1<span class="caps">\2</span>''', text)
2518
2519         # Quick tags.
2520         qtags = [('**', 'b',      {'qf': '(?<!\*)\*\*(?!\*)', 'cls': '\*'}),
2521                  ('__', 'i',      {'qf': '(?<!_)__(?!_)', 'cls': '_'}),
2522                  ('??', 'cite',   {'qf': '\?\?(?!\?)', 'cls': '\?'}),
2523                  ('-',  'del',    {'qf': '(?<!\-)\-(?!\-)', 'cls': '-'}),
2524                  ('+',  'ins',    {'qf': '(?<!\+)\+(?!\+)', 'cls': '\+'}),
2525                  ('*',  'strong', {'qf': '(?<!\*)\*(?!\*)', 'cls': '\*'}),
2526                  ('_',  'em',     {'qf': '(?<!_)_(?!_)', 'cls': '_'}),
2527                  ('++', 'big',    {'qf': '(?<!\+)\+\+(?!\+)', 'cls': '\+\+'}),
2528                  ('--', 'small',  {'qf': '(?<!\-)\-\-(?!\-)', 'cls': '\-\-'}),
2529                  ('~',  'sub',    {'qf': '(?<!\~)\~(?!(\\\/~))', 'cls': '\~'}),
2530                  ('@',  'code',   {'qf': '(?<!@)@(?!@)', 'cls': '@'}),
2531                  ('%',  'span',   {'qf': '(?<!%)%(?!%)', 'cls': '%'}),
2532                 ]
2533
2534         # Superscript.
2535         text = re.sub(r'''(?<!\^)\^(?!\^)(.+?)(?<!\^)\^(?!\^)''', r'''<sup>\1</sup>''', text)
2536
2537         # This is from the perl version of Textile.
2538         for qtag, htmltag, redict in qtags:
2539             self.res.update(redict)
2540             p = re.compile(r'''(?:                          #
2541                                    ^                        # Start of string
2542                                    |                        #
2543                                    (?<=[\s>'"])             # Whitespace, end of tag, quotes
2544                                    |                        #
2545                                    (?P<pre>[{[])            # Surrounded by [ or {
2546                                    |                        #
2547                                    (?<=%(punct)s)           # Punctuation
2548                                )                            #
2549                                %(qf)s                       # opening tag
2550                                %(qattr)s                    # attributes
2551                                (?P<text>[^%(cls)s\s].*?)    # text
2552                                (?<=\S)                      # non-whitespace
2553                                %(qf)s                       #
2554                                (?:                          #
2555                                    $                        # End of string
2556                                    |                        #
2557                                    (?P<post>[\]}])          # Surrounded by ] or }
2558                                    |                        #
2559                                    (?=%(punct)s{1,2}|\s)    # punctuation
2560                                 )                           #
2561                              ''' % self.res, re.VERBOSE)
2562
2563             def _replace(m):
2564                 c = m.groupdict('')
2565
2566                 attributes = self.parse_params(c['parameters'])
2567                 open_tag  = self.build_open_tag(htmltag, attributes)
2568                 close_tag = '</%s>' % htmltag
2569
2570                 # Replace < and > inside <code></code>.
2571                 if htmltag == 'code':
2572                     c['text'] = c['text'].replace('<', '&lt;')
2573                     c['text'] = c['text'].replace('>', '&gt;')
2574
2575                 return open_tag + c['text'] + close_tag
2576
2577             text = p.sub(_replace, text)
2578
2579         return text
2580
2581
2582     def images(self, text):
2583         """Process images.
2584
2585         This function process images tags, with or without links. Images
2586         can have vertical and/or horizontal alignment, and can be resized
2587         unefficiently using width and height tags.
2588
2589         ---
2590         h1. Images
2591
2592         An image is generated by enclosing the image source in @!@:
2593
2594         pre. !/path/to/image!
2595
2596         You may optionally specify an alternative text for the image, which
2597         will also be used as its title:
2598
2599         pre. !image.jpg (Nice picture)!
2600
2601         Becomes:
2602
2603         pre. <p><img src="image.jpg" alt="Nice picture" title="Nice picture" /></p>
2604
2605         If you want to make the image point to a link, simply append a
2606         comma and the URL(Universal Republic of Love) to the image:
2607
2608         pre. !image.jpg!:http://diveintopython.org
2609
2610         Images can also be resized. These are all equivalent:
2611
2612         pre. !image.jpg 10x20!
2613         !image.jpg 10w 20h!
2614         !image.jpg 20h 10w!
2615
2616         The image @image.jpg@ will be resized to width 10 and height 20.
2617
2618         Modifiers to the @<img>@ tag go after the opening @!@:
2619
2620         pre. !(class#id)^image.jpg!
2621
2622         Allowed modifiers include:
2623
2624         dl. &lt;:Align the image to the left (causes the image to float if CSS options are enabled).
2625         &gt;:Align the image to the right (causes the image to float if CSS options are enabled).
2626         - (dash):Aligns the image to the middle.
2627         ^:Aligns the image to the top.
2628         ~ (tilde):Aligns the image to the bottom.
2629         {style rule}:Applies a CSS style rule to the image.
2630         (class) or (#id) or (class#id):Applies a CSS class and/or id to the image.
2631         ( (one or more):Pads 1em on the left for each '(' character.
2632         ) (one or more):Pads 1em on the right for each ')' character.
2633
2634         Images receive the class "top" when using top alignment, "bottom"
2635         for bottom alignment and "middle" for middle alignment.
2636         """
2637         # Compile the beast.
2638         p = re.compile(r'''\!               # Opening !
2639                            %(iattr)s        # Image attributes
2640                            (?P<src>%(url)s) # Image src
2641                            \s?              # Optional whitesapce
2642                            (                #
2643                                \(           #
2644                                (?P<alt>.*?) # Optional (alt) attribute
2645                                \)           #
2646                            )?               #
2647                            \s?              # Optional whitespace
2648                            %(resize)s       # Resize parameters
2649                            \!               # Closing !
2650                            (                # Optional link
2651                                :            #    starts with ':'
2652                                (?P<link>    #
2653                                %(url)s      #    link HREF
2654                                )            #
2655                            )?               #
2656                         ''' % self.res, re.VERBOSE)
2657
2658         for m in p.finditer(text):
2659             c = m.groupdict('')
2660
2661             # Build the parameters for the <img /> tag.
2662             attributes = self.parse_params(c['parameters'], align_type='image')
2663             attributes.update(c)
2664             if attributes['alt']:
2665                 attributes['title'] = attributes['alt']
2666
2667             # Append height and width.
2668             attributes['width'] = m.groups()[5] or m.groups()[7] or m.groups()[10]
2669             attributes['height'] = m.groups()[6] or m.groups()[8] or m.groups()[9]
2670
2671             # Create the image tag.
2672             tag = self.image(attributes)
2673
2674             text = text.replace(m.group(), tag)
2675
2676         return text
2677
2678
2679     def image(self, attributes):
2680         """Process each image.
2681
2682         This method builds the <img> tag for each image in the text. It's
2683         separated from the 'images' method so it can be easily overriden when
2684         subclassing Textiler. Useful if you want to download and/or process
2685         the images, for example.
2686         """
2687         link = attributes['link']
2688         del attributes['link']
2689         del attributes['parameters']
2690
2691         # Build the tag.
2692         tag = self.build_open_tag('img', attributes, single=1)
2693
2694         if link:
2695             href = preg_replace('&(?!(#|amp))', '&amp;', link)
2696             tag = '<a href="%s">%s</a>' % (href, tag)
2697
2698         return tag
2699
2700
2701     def links(self, text):
2702         """Process links.
2703
2704         This function is responsible for processing links. It has
2705         some nice shortcuts to Google, Amazon and IMDB queries.
2706
2707         ---
2708         h1. Links
2709
2710         A links is done the following way:
2711
2712         pre. "This is the text link":http://example.com
2713
2714         The result from this markup is:
2715
2716         pre. <p><a href="http://example.com">This is the text link</a></p>
2717
2718         You can add an optional @title@ attribute:
2719
2720         pre. "This is the text link(This is the title)":http://example.com
2721
2722         The link can be customised as well:
2723
2724         pre. "(nospam)E-mail me please":mailto:someone@example.com
2725
2726         You can use either single or double quotes. They must be enclosed in
2727         whitespace, punctuation or brackets:
2728
2729         pre. You["gotta":http://example.com]seethis!
2730
2731         If you are going to reference the same link a couple of times, you
2732         can define a lookup list anywhere on your document:
2733
2734         pre. [python]http://www.python.org
2735
2736         Links to the Python website can then be defined the following way:
2737
2738         pre. "Check this":python
2739
2740         There are also shortcuts for Amazon, IMDB(Internet Movie DataBase) and
2741         Google queries:
2742
2743         pre. "Has anyone seen this guy?":imdb:Stephen+Fry
2744         "Really nice book":amazon:Goedel+Escher+Bach
2745         "PyBlosxom":google
2746         ["Using Textile and Blosxom with Python":google:python blosxom textile]
2747
2748         Becomes:
2749
2750         pre. <a href="http://www.imdb.com/Find?for=Stephen+Fry">Has anyone seen this guy?</a>
2751         <a href="http://www.amazon.com/exec/obidos/external-search?index=blended&amp;keyword=Goedel+Escher+Bach">Really nice book</a>
2752         <a href="http://www.google.com/search?q=PyBlosxom">PyBlosxom</a>
2753         <a href="http://www.google.com/search?q=python+blosxom+textile">Using Textile and Blosxom with Python</a>
2754         """
2755         linkres = [r'''\[                           # [
2756                        (?P<quote>"|')               # Opening quotes
2757                        %(lattr)s                    # Link attributes
2758                        (?P<text>[^"]+?)             # Link text
2759                        \s?                          # Optional whitespace
2760                        (?:\((?P<title>[^\)]+?)\))?  # Optional (title)
2761                        (?P=quote)                   # Closing quotes
2762                        :                            # :
2763                        (?P<href>[^\]]+)             # HREF
2764                        \]                           # ]
2765                     ''' % self.res,
2766                    r'''(?P<quote>"|')               # Opening quotes
2767                        %(lattr)s                    # Link attributes
2768                        (?P<text>[^"]+?)             # Link text
2769                        \s?                          # Optional whitespace
2770                        (?:\((?P<title>[^\)]+?)\))?  # Optional (title)
2771                        (?P=quote)                   # Closing quotes
2772                        :                            # :
2773                        (?P<href>%(url)s)            # HREF
2774                     ''' % self.res]
2775
2776         for linkre in linkres:
2777             p = re.compile(linkre, re.VERBOSE)
2778             for m in p.finditer(text):
2779                 c = m.groupdict('')
2780
2781                 attributes = self.parse_params(c['parameters'])
2782                 attributes['title'] = c['title'].replace('"', '&quot;')
2783
2784                 # Search lookup list.
2785                 link = self._links.get(c['href'], None) or c['href']
2786
2787                 # Hyperlinks for Amazon, IMDB and Google searches.
2788                 parts = link.split(':', 1)
2789                 proto = parts[0]
2790                 if len(parts) == 2:
2791                     query = parts[1]
2792                 else:
2793                     query = c['text']
2794
2795                 query = query.replace(' ', '+')
2796
2797                 # Look for smart search.
2798                 if self.searches.has_key(proto):
2799                     link = self.searches[proto] % query
2800
2801                 # Fix URL.
2802                 attributes['href'] = preg_replace('&(?!(#|amp))', '&amp;', link)
2803
2804                 open_tag = self.build_open_tag('a', attributes)
2805                 close_tag = '</a>'
2806
2807                 repl = open_tag + c['text'] + close_tag
2808
2809                 text = text.replace(m.group(), repl)
2810
2811         return text
2812
2813
2814     def format(self, text):
2815         """Text formatting.
2816
2817         This function basically defines the order on which the
2818         formatting is applied.
2819         """
2820         text = self.qtags(text)
2821         text = self.images(text)
2822         text = self.links(text)
2823         text = self.acronym(text)
2824         text = self.glyphs(text)
2825
2826         return text
2827
2828
2829     def inline(self, text):
2830         """Inline formatting.
2831
2832         This function calls the formatting on the inline text,
2833         taking care to avoid the escaped parts.
2834
2835         ---
2836         h1. Inline
2837
2838         Inline formatting is applied within a block of text.
2839         """
2840         if not re.search(r'''==(.*?)==''', text):
2841             text = self.format(text)
2842
2843         else:
2844             lines = []
2845             # Else split the text into an array at <>.
2846             for line in re.split('(==.*?==)', text):
2847                 if not re.match('==.*?==', line):
2848                     line = self.format(line)
2849                 else:
2850                     line = line[2:-2]
2851
2852                 lines.append(line)
2853
2854             text = ''.join(lines)
2855
2856         return text
2857
2858
def textile(text, **args):
    """This is Textile.

    Generate XHTML from the simple markup developed by Dean Allen.

    The text is wrapped in a Textiler and every keyword argument is
    forwarded unchanged to its process method, whose result is returned.
    A typical call looks like this:

        textile(text, head_offset=0, validate=0, sanitize=0,
                encoding='latin-1', output='ASCII')
    """
    converter = Textiler(text)
    return converter.process(**args)
2870
2871
if __name__ == '__main__':
    # Demo run: render the phrase that, according to the module docstring,
    # produces an overview of PyTextile's features. The parenthesised
    # single-argument print behaves the same on Python 2 and is valid
    # syntax on Python 3.
    print(textile('tell me about textile.', head_offset=1))