third_party/markdown/__init__.py

   1 # markdown is released under the BSD license
   2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
   3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
   4 # Copyright 2004 Manfred Stienstra (the original version)
   5 #
   6 # All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions are met:
  10 #
  11 # *   Redistributions of source code must retain the above copyright
  12 #     notice, this list of conditions and the following disclaimer.
  13 # *   Redistributions in binary form must reproduce the above copyright
  14 #     notice, this list of conditions and the following disclaimer in the
  15 #     documentation and/or other materials provided with the distribution.
  16 # *   Neither the name of the <organization> nor the
  17 #     names of its contributors may be used to endorse or promote products
  18 #     derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
  21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
  24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 # POSSIBILITY OF SUCH DAMAGE.
  31
  32
  33 """
  34 Python Markdown
  35 ===============
  36
  37 Python Markdown converts Markdown to HTML and can be used as a library or
  38 called from the command line.
  39
  40 ## Basic usage as a module:
  41
  42     import markdown
  43     html = markdown.markdown(your_text_string)
  44
  45 See <http://packages.python.org/Markdown/> for more
  46 information and instructions on how to extend the functionality of
  47 Python Markdown.  Read that before you try modifying this file.
  48
  49 ## Authors and License
  50
  51 Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
  52 maintained  by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
  53 Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
  54
  55 Contact: markdown@freewisdom.org
  56
  57 Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later)
  58 Copyright 200? Django Software Foundation (OrderedDict implementation)
  59 Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
  60 Copyright 2004 Manfred Stienstra (the original version)
  61
  62 License: BSD (see LICENSE for details).
  63 """
  64
  65 from __future__ import absolute_import
  66 from __future__ import unicode_literals
  67 from .__version__ import version, version_info
  68 import re
  69 import codecs
  70 import sys
  71 import logging
  72 from . import util
  73 from .preprocessors import build_preprocessors
  74 from .blockprocessors import build_block_parser
  75 from .treeprocessors import build_treeprocessors
  76 from .inlinepatterns import build_inlinepatterns
  77 from .postprocessors import build_postprocessors
  78 from .extensions import Extension
  79 from .serializers import to_html_string, to_xhtml_string
  80
# Names exported by a star-import of this package: the converter class and
# the two shortcut functions defined at the bottom of this module.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']

# Module-level logger registered under the name 'MARKDOWN'.
logger = logging.getLogger('MARKDOWN')
  84
  85
  86 class Markdown(object):
  87     """Convert Markdown to HTML."""
  88
  89     doc_tag = "div"     # Element used to wrap document - later removed
  90
  91     option_defaults = {
  92         'html_replacement_text' : '[HTML_REMOVED]',
  93         'tab_length'            : 4,
  94         'enable_attributes'     : True,
  95         'smart_emphasis'        : True,
  96         'lazy_ol'               : True,
  97     }
  98
  99     output_formats = {
 100         'html'  : to_html_string,
 101         'html4' : to_html_string,
 102         'html5' : to_html_string,
 103         'xhtml' : to_xhtml_string,
 104         'xhtml1': to_xhtml_string,
 105         'xhtml5': to_xhtml_string,
 106     }
 107
 108     ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
 109                     '(', ')', '>', '#', '+', '-', '.', '!']
 110
 111     def __init__(self, *args, **kwargs):
 112         """
 113         Creates a new Markdown instance.
 114
 115         Keyword arguments:
 116
 117         * extensions: A list of extensions.
 118            If they are of type string, the module mdx_name.py will be loaded.
 119            If they are a subclass of markdown.Extension, they will be used
 120            as-is.
 121         * extension_configs: Configuration settingis for extensions.
 122         * output_format: Format of output. Supported formats are:
 123             * "xhtml1": Outputs XHTML 1.x. Default.
 124             * "xhtml5": Outputs XHTML style tags of HTML 5
 125             * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
 126             * "html4": Outputs HTML 4
 127             * "html5": Outputs HTML style tags of HTML 5
 128             * "html": Outputs latest supported version of HTML (currently HTML 4).
 129             Note that it is suggested that the more specific formats ("xhtml1"
 130             and "html4") be used as "xhtml" or "html" may change in the future
 131             if it makes sense at that time.
 132         * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
 133         * html_replacement_text: Text used when safe_mode is set to "replace".
 134         * tab_length: Length of tabs in the source. Default: 4
 135         * enable_attributes: Enable the conversion of attributes. Default: True
 136         * smart_emphasis: Treat `_connected_words_` intelegently Default: True
 137         * lazy_ol: Ignore number of first item of ordered lists. Default: True
 138
 139         """
 140
 141         # For backward compatibility, loop through old positional args
 142         pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
 143         c = 0
 144         for arg in args:
 145             if pos[c] not in kwargs:
 146                 kwargs[pos[c]] = arg
 147             c += 1
 148             if c == len(pos):
 149                 # ignore any additional args
 150                 break
 151
 152         # Loop through kwargs and assign defaults
 153         for option, default in self.option_defaults.items():
 154             setattr(self, option, kwargs.get(option, default))
 155
 156         self.safeMode = kwargs.get('safe_mode', False)
 157         if self.safeMode and 'enable_attributes' not in kwargs:
 158             # Disable attributes in safeMode when not explicitly set
 159             self.enable_attributes = False
 160
 161         self.registeredExtensions = []
 162         self.docType = ""
 163         self.stripTopLevelTags = True
 164
 165         self.build_parser()
 166
 167         self.references = {}
 168         self.htmlStash = util.HtmlStash()
 169         self.set_output_format(kwargs.get('output_format', 'xhtml1'))
 170         self.registerExtensions(extensions=kwargs.get('extensions', []),
 171                                 configs=kwargs.get('extension_configs', {}))
 172         self.reset()
 173
 174     def build_parser(self):
 175         """ Build the parser from the various parts. """
 176         self.preprocessors = build_preprocessors(self)
 177         self.parser = build_block_parser(self)
 178         self.inlinePatterns = build_inlinepatterns(self)
 179         self.treeprocessors = build_treeprocessors(self)
 180         self.postprocessors = build_postprocessors(self)
 181         return self
 182
 183     def registerExtensions(self, extensions, configs):
 184         """
 185         Register extensions with this instance of Markdown.
 186
 187         Keyword arguments:
 188
 189         * extensions: A list of extensions, which can either
 190            be strings or objects.  See the docstring on Markdown.
 191         * configs: A dictionary mapping module names to config options.
 192
 193         """
 194         for ext in extensions:
 195             if isinstance(ext, util.string_type):
 196                 ext = self.build_extension(ext, configs.get(ext, []))
 197             if isinstance(ext, Extension):
 198                 ext.extendMarkdown(self, globals())
 199             elif ext is not None:
 200                 raise TypeError(
 201                     'Extension "%s.%s" must be of type: "markdown.Extension"'
 202                     % (ext.__class__.__module__, ext.__class__.__name__))
 203
 204         return self
 205
 206     def build_extension(self, ext_name, configs = []):
 207         """Build extension by name, then return the module.
 208
 209         The extension name may contain arguments as part of the string in the
 210         following format: "extname(key1=value1,key2=value2)"
 211
 212         """
 213
 214         # Parse extensions config params (ignore the order)
 215         configs = dict(configs)
 216         pos = ext_name.find("(") # find the first "("
 217         if pos > 0:
 218             ext_args = ext_name[pos+1:-1]
 219             ext_name = ext_name[:pos]
 220             pairs = [x.split("=") for x in ext_args.split(",")]
 221             configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
 222
 223         # Setup the module name
 224         module_name = ext_name
 225         if '.' not in ext_name:
 226             module_name = '.'.join(['third_party.markdown.extensions', ext_name])
 227
 228         # Try loading the extension first from one place, then another
 229         try: # New style (markdown.extensons.<extension>)
 230             module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]])
 231         except ImportError:
 232             module_name_old_style = '_'.join(['mdx', ext_name])
 233             try: # Old style (mdx_<extension>)
 234                 module = __import__(module_name_old_style)
 235             except ImportError as e:
 236                 message = "Failed loading extension '%s' from '%s' or '%s'" \
 237                     % (ext_name, module_name, module_name_old_style)
 238                 e.args = (message,) + e.args[1:]
 239                 raise
 240
 241         # If the module is loaded successfully, we expect it to define a
 242         # function called makeExtension()
 243         try:
 244             return module.makeExtension(configs.items())
 245         except AttributeError as e:
 246             message = e.args[0]
 247             message = "Failed to initiate extension " \
 248                       "'%s': %s" % (ext_name, message)
 249             e.args = (message,) + e.args[1:]
 250             raise
 251
 252     def registerExtension(self, extension):
 253         """ This gets called by the extension """
 254         self.registeredExtensions.append(extension)
 255         return self
 256
 257     def reset(self):
 258         """
 259         Resets all state variables so that we can start with a new text.
 260         """
 261         self.htmlStash.reset()
 262         self.references.clear()
 263
 264         for extension in self.registeredExtensions:
 265             if hasattr(extension, 'reset'):
 266                 extension.reset()
 267
 268         return self
 269
 270     def set_output_format(self, format):
 271         """ Set the output format for the class instance. """
 272         self.output_format = format.lower()
 273         try:
 274             self.serializer = self.output_formats[self.output_format]
 275         except KeyError as e:
 276             valid_formats = list(self.output_formats.keys())
 277             valid_formats.sort()
 278             message = 'Invalid Output Format: "%s". Use one of %s.' \
 279                        % (self.output_format,
 280                           '"' + '", "'.join(valid_formats) + '"')
 281             e.args = (message,) + e.args[1:]
 282             raise
 283         return self
 284
 285     def convert(self, source):
 286         """
 287         Convert markdown to serialized XHTML or HTML.
 288
 289         Keyword arguments:
 290
 291         * source: Source text as a Unicode string.
 292
 293         Markdown processing takes place in five steps:
 294
 295         1. A bunch of "preprocessors" munge the input text.
 296         2. BlockParser() parses the high-level structural elements of the
 297            pre-processed text into an ElementTree.
 298         3. A bunch of "treeprocessors" are run against the ElementTree. One
 299            such treeprocessor runs InlinePatterns against the ElementTree,
 300            detecting inline markup.
 301         4. Some post-processors are run against the text after the ElementTree
 302            has been serialized into text.
 303         5. The output is written to a string.
 304
 305         """
 306
 307         # Fixup the source text
 308         if not source.strip():
 309             return ''  # a blank unicode string
 310
 311         try:
 312             source = util.text_type(source)
 313         except UnicodeDecodeError as e:
 314             # Customise error message while maintaining original trackback
 315             e.reason += '. -- Note: Markdown only accepts unicode input!'
 316             raise
 317
 318         # Split into lines and run the line preprocessors.
 319         self.lines = source.split("\n")
 320         for prep in self.preprocessors.values():
 321             self.lines = prep.run(self.lines)
 322
 323         # Parse the high-level elements.
 324         root = self.parser.parseDocument(self.lines).getroot()
 325
 326         # Run the tree-processors
 327         for treeprocessor in self.treeprocessors.values():
 328             newRoot = treeprocessor.run(root)
 329             if newRoot:
 330                 root = newRoot
 331
 332         # Serialize _properly_.  Strip top-level tags.
 333         output = self.serializer(root)
 334         if self.stripTopLevelTags:
 335             try:
 336                 start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
 337                 end = output.rindex('</%s>'%self.doc_tag)
 338                 output = output[start:end].strip()
 339             except ValueError:
 340                 if output.strip().endswith('<%s />'%self.doc_tag):
 341                     # We have an empty document
 342                     output = ''
 343                 else:
 344                     # We have a serious problem
 345                     raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip())
 346
 347         # Run the text post-processors
 348         for pp in self.postprocessors.values():
 349             output = pp.run(output)
 350
 351         return output.strip()
 352
 353     def convertFile(self, input=None, output=None, encoding=None):
 354         """Converts a markdown file and returns the HTML as a unicode string.
 355
 356         Decodes the file using the provided encoding (defaults to utf-8),
 357         passes the file content to markdown, and outputs the html to either
 358         the provided stream or the file with provided name, using the same
 359         encoding as the source file. The 'xmlcharrefreplace' error handler is
 360         used when encoding the output.
 361
 362         **Note:** This is the only place that decoding and encoding of unicode
 363         takes place in Python-Markdown.  (All other code is unicode-in /
 364         unicode-out.)
 365
 366         Keyword arguments:
 367
 368         * input: File object or path. Reads from stdin if `None`.
 369         * output: File object or path. Writes to stdout if `None`.
 370         * encoding: Encoding of input and output files. Defaults to utf-8.
 371
 372         """
 373
 374         encoding = encoding or "utf-8"
 375
 376         # Read the source
 377         if input:
 378             if isinstance(input, util.string_type):
 379                 input_file = codecs.open(input, mode="r", encoding=encoding)
 380             else:
 381                 input_file = codecs.getreader(encoding)(input)
 382             text = input_file.read()
 383             input_file.close()
 384         else:
 385             text = sys.stdin.read()
 386             if not isinstance(text, util.text_type):
 387                 text = text.decode(encoding)
 388
 389         text = text.lstrip('\ufeff') # remove the byte-order mark
 390
 391         # Convert
 392         html = self.convert(text)
 393
 394         # Write to file or stdout
 395         if output:
 396             if isinstance(output, util.string_type):
 397                 output_file = codecs.open(output, "w",
 398                                           encoding=encoding,
 399                                           errors="xmlcharrefreplace")
 400                 output_file.write(html)
 401                 output_file.close()
 402             else:
 403                 writer = codecs.getwriter(encoding)
 404                 output_file = writer(output, errors="xmlcharrefreplace")
 405                 output_file.write(html)
 406                 # Don't close here. User may want to write more.
 407         else:
 408             # Encode manually and write bytes to stdout.
 409             html = html.encode(encoding, "xmlcharrefreplace")
 410             try:
 411                 # Write bytes directly to buffer (Python 3).
 412                 sys.stdout.buffer.write(html)
 413             except AttributeError:
 414                 # Probably Python 2, which works with bytes by default.
 415                 sys.stdout.write(html)
 416
 417         return self
 418
 419
 420 """
 421 EXPORTED FUNCTIONS
 422 =============================================================================
 423
 424 Those are the two functions we really mean to export: markdown() and
 425 markdownFromFile().
 426 """
 427
 428 def markdown(text, *args, **kwargs):
 429     """Convert a markdown string to HTML and return HTML as a unicode string.
 430
 431     This is a shortcut function for `Markdown` class to cover the most
 432     basic use case.  It initializes an instance of Markdown, loads the
 433     necessary extensions and runs the parser on the given text.
 434
 435     Keyword arguments:
 436
 437     * text: Markdown formatted text as Unicode or ASCII string.
 438     * Any arguments accepted by the Markdown class.
 439
 440     Returns: An HTML document as a string.
 441
 442     """
 443     md = Markdown(*args, **kwargs)
 444     return md.convert(text)
 445
 446
 447 def markdownFromFile(*args, **kwargs):
 448     """Read markdown code from a file and write it to a file or a stream.
 449
 450     This is a shortcut function which initializes an instance of Markdown,
 451     and calls the convertFile method rather than convert.
 452
 453     Keyword arguments:
 454
 455     * input: a file name or readable object.
 456     * output: a file name or writable object.
 457     * encoding: Encoding of input and output.
 458     * Any arguments accepted by the Markdown class.
 459
 460     """
 461     # For backward compatibility loop through positional args
 462     pos = ['input', 'output', 'extensions', 'encoding']
 463     c = 0
 464     for arg in args:
 465         if pos[c] not in kwargs:
 466             kwargs[pos[c]] = arg
 467         c += 1
 468         if c == len(pos):
 469             break
 470
 471     md = Markdown(**kwargs)
 472     md.convertFile(kwargs.get('input', None),
 473                    kwargs.get('output', None),
 474                    kwargs.get('encoding', None))
 475