Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / markdown / __init__.py
blob0aa15a7e8f950a7b8789b8cbf100850ce9074d84
1 # markdown is released under the BSD license
2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4 # Copyright 2004 Manfred Stienstra (the original version)
5 #
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # * Neither the name of the <organization> nor the
17 # names of its contributors may be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
33 """
34 Python Markdown
35 ===============
37 Python Markdown converts Markdown to HTML and can be used as a library or
38 called from the command line.
40 ## Basic usage as a module:
42 import markdown
43 html = markdown.markdown(your_text_string)
45 See <http://packages.python.org/Markdown/> for more
46 information and instructions on how to extend the functionality of
47 Python Markdown. Read that before you try modifying this file.
49 ## Authors and License
51 Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
52 maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
53 Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
55 Contact: markdown@freewisdom.org
57 Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later)
58 Copyright 200? Django Software Foundation (OrderedDict implementation)
59 Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
60 Copyright 2004 Manfred Stienstra (the original version)
62 License: BSD (see LICENSE for details).
63 """
65 from __future__ import absolute_import
66 from __future__ import unicode_literals
67 from .__version__ import version, version_info
68 import re
69 import codecs
70 import sys
71 import logging
72 from . import util
73 from .preprocessors import build_preprocessors
74 from .blockprocessors import build_block_parser
75 from .treeprocessors import build_treeprocessors
76 from .inlinepatterns import build_inlinepatterns
77 from .postprocessors import build_postprocessors
78 from .extensions import Extension
79 from .serializers import to_html_string, to_xhtml_string
81 __all__ = ['Markdown', 'markdown', 'markdownFromFile']
83 logger = logging.getLogger('MARKDOWN')
class Markdown(object):
    """Convert Markdown to HTML.

    An instance bundles the preprocessors, block parser, inline patterns,
    treeprocessors and postprocessors built by the ``build_*`` factories, and
    runs them in sequence from `convert`.

    """

    doc_tag = "div"  # Element used to wrap document - later removed

    # Options copied onto the instance as attributes by __init__, unless
    # overridden by a keyword argument of the same name.
    option_defaults = {
        'html_replacement_text': '[HTML_REMOVED]',
        'tab_length': 4,
        'enable_attributes': True,
        'smart_emphasis': True,
        'lazy_ol': True,
    }

    # Maps each accepted output format name to its serializer function.
    output_formats = {
        'html': to_html_string,
        'html4': to_html_string,
        'html5': to_html_string,
        'xhtml': to_xhtml_string,
        'xhtml1': to_xhtml_string,
        'xhtml5': to_xhtml_string,
    }

    ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
                     '(', ')', '>', '#', '+', '-', '.', '!']

    def __init__(self, *args, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
           If they are of type string, the module mdx_name.py will be loaded.
           If they are a subclass of markdown.Extension, they will be used
           as-is.
        * extension_configs: Configuration settings for extensions.
        * output_format: Format of output. Supported formats are:
            * "xhtml1": Outputs XHTML 1.x. Default.
            * "xhtml5": Outputs XHTML style tags of HTML 5
            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
            * "html4": Outputs HTML 4
            * "html5": Outputs HTML style tags of HTML 5
            * "html": Outputs latest supported version of HTML (currently HTML 4).
            Note that it is suggested that the more specific formats ("xhtml1"
            and "html4") be used as "xhtml" or "html" may change in the future
            if it makes sense at that time.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
        * html_replacement_text: Text used when safe_mode is set to "replace".
        * tab_length: Length of tabs in the source. Default: 4
        * enable_attributes: Enable the conversion of attributes. Default: True
        * smart_emphasis: Treat `_connected_words_` intelligently Default: True
        * lazy_ol: Ignore number of first item of ordered lists. Default: True

        Positional arguments are accepted for backward compatibility and are
        mapped, in order, onto ``extensions``, ``extension_configs``,
        ``safe_mode`` and ``output_format``; a keyword argument of the same
        name takes precedence and any further positional arguments are
        ignored.

        """

        # For backward compatibility, map old positional args onto their
        # keyword names. zip() stops at the shorter sequence, so surplus
        # positional args are ignored; setdefault() lets an explicit keyword
        # win over the positional value.
        pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
        for name, arg in zip(pos, args):
            kwargs.setdefault(name, arg)

        # Loop through kwargs and assign defaults
        for option, default in self.option_defaults.items():
            setattr(self, option, kwargs.get(option, default))

        self.safeMode = kwargs.get('safe_mode', False)
        if self.safeMode and 'enable_attributes' not in kwargs:
            # Disable attributes in safeMode when not explicitly set
            self.enable_attributes = False

        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        self.build_parser()

        self.references = {}
        self.htmlStash = util.HtmlStash()
        self.set_output_format(kwargs.get('output_format', 'xhtml1'))
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.reset()

    def build_parser(self):
        """ Build the parser from the various parts. Returns self. """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        Returns self. Raises TypeError for an entry that is neither a string,
        an Extension instance nor None.

        """
        for ext in extensions:
            if isinstance(ext, util.string_type):
                ext = self.build_extension(ext, configs.get(ext, []))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self, globals())
            elif ext is not None:
                raise TypeError(
                    'Extension "%s.%s" must be of type: "markdown.Extension"'
                    % (ext.__class__.__module__, ext.__class__.__name__))

        return self

    def build_extension(self, ext_name, configs=()):
        """Build extension by name, then return the module.

        The extension name may contain arguments as part of the string in the
        following format: "extname(key1=value1,key2=value2)"

        Keyword arguments:

        * ext_name: Extension name, optionally followed by inline arguments.
        * configs: Iterable of (key, value) pairs (or a mapping) of config
           options; inline arguments in `ext_name` override these.

        Returns whatever the module's makeExtension() returns. Raises
        ImportError if neither module location can be imported and
        AttributeError (with an extended message) if makeExtension() fails
        with one.

        """

        # Parse extensions config params (ignore the order)
        configs = dict(configs)
        pos = ext_name.find("(")  # find the first "("
        if pos > 0:
            ext_args = ext_name[pos+1:-1]
            ext_name = ext_name[:pos]
            # Split each pair on the first "=" only, so that values may
            # themselves contain "="; skip blank entries such as "name()".
            pairs = [x.split("=", 1) for x in ext_args.split(",") if x.strip()]
            configs.update([(x.strip(), y.strip()) for (x, y) in pairs])

        # Setup the module name
        module_name = ext_name
        if '.' not in ext_name:
            module_name = '.'.join(['third_party.markdown.extensions', ext_name])

        # Try loading the extension first from one place, then another
        try:  # New style (markdown.extensions.<extension>)
            module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]])
        except ImportError:
            module_name_old_style = '_'.join(['mdx', ext_name])
            try:  # Old style (mdx_<extension>)
                module = __import__(module_name_old_style)
            except ImportError as e:
                message = "Failed loading extension '%s' from '%s' or '%s'" \
                    % (ext_name, module_name, module_name_old_style)
                e.args = (message,) + e.args[1:]
                raise

        # If the module is loaded successfully, we expect it to define a
        # function called makeExtension()
        try:
            return module.makeExtension(configs.items())
        except AttributeError as e:
            message = e.args[0]
            message = "Failed to initiate extension " \
                      "'%s': %s" % (ext_name, message)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension):
        """ This gets called by the extension. Returns self. """
        self.registeredExtensions.append(extension)
        return self

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the references, then calls reset() on every
        registered extension that defines it. Returns self.

        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format):
        """ Set the output format for the class instance.

        The name is lower-cased and looked up in `output_formats`; an unknown
        name re-raises the KeyError with a message listing the valid names.
        Returns self.

        """
        self.output_format = format.lower()
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                % (self.output_format,
                   '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    def convert(self, source):
        """
        Convert markdown to serialized XHTML or HTML.

        Keyword arguments:

        * source: Source text as a Unicode string.

        Markdown processing takes place in five steps:

        1. A bunch of "preprocessors" munge the input text.
        2. BlockParser() parses the high-level structural elements of the
           pre-processed text into an ElementTree.
        3. A bunch of "treeprocessors" are run against the ElementTree. One
           such treeprocessor runs InlinePatterns against the ElementTree,
           detecting inline markup.
        4. Some post-processors are run against the text after the ElementTree
           has been serialized into text.
        5. The output is written to a string.

        Raises ValueError if the wrapping top-level tag cannot be found in
        the serialized output.

        """

        # Fixup the source text
        if not source.strip():
            return ''  # a blank unicode string

        try:
            source = util.text_type(source)
        except UnicodeDecodeError as e:
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            # Compare against None explicitly: an ElementTree element with
            # no children is falsy, so a plain truth test would discard a
            # legitimately empty replacement root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
                end = output.rindex('</%s>'%self.doc_tag)
                output = output[start:end].strip()
            except ValueError:
                if output.strip().endswith('<%s />'%self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip())

        # Run the text post-processors
        for pp in self.postprocessors.values():
            output = pp.run(output)

        return output.strip()

    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and writes the HTML to a file or stream.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The 'xmlcharrefreplace' error handler is
        used when encoding the output.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: File object or path. Reads from stdin if `None`.
        * output: File object or path. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        Returns self.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, util.string_type):
                input_file = codecs.open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            try:
                text = input_file.read()
            finally:
                # Close even when reading/decoding fails so the handle
                # is not leaked.
                input_file.close()
        else:
            text = sys.stdin.read()
            if not isinstance(text, util.text_type):
                text = text.decode(encoding)

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, util.string_type):
                output_file = codecs.open(output, "w",
                                          encoding=encoding,
                                          errors="xmlcharrefreplace")
                try:
                    output_file.write(html)
                finally:
                    # We opened this file ourselves, so always close it.
                    output_file.close()
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            html = html.encode(encoding, "xmlcharrefreplace")
            try:
                # Write bytes directly to buffer (Python 3).
                sys.stdout.buffer.write(html)
            except AttributeError:
                # Probably Python 2, which works with bytes by default.
                sys.stdout.write(html)

        return self
421 EXPORTED FUNCTIONS
422 =============================================================================
424 These are the two functions we really mean to export: markdown() and
425 markdownFromFile().
def markdown(text, *args, **kwargs):
    """Render a markdown string as HTML in a single call.

    Convenience wrapper around the `Markdown` class for the simplest use
    case: a fresh `Markdown` instance is created from the given arguments
    and its `convert` method is applied to `text`.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * Any arguments accepted by the Markdown class.

    Returns: An HTML document as a string.

    """
    return Markdown(*args, **kwargs).convert(text)
def markdownFromFile(*args, **kwargs):
    """Convert markdown read from a file, writing to a file or a stream.

    Convenience wrapper that creates a `Markdown` instance and calls its
    `convertFile` method instead of `convert`.

    Keyword arguments:

    * input: a file name or readable object.
    * output: a file name or writable object.
    * encoding: Encoding of input and output.
    * Any arguments accepted by the Markdown class.

    Positional arguments are still accepted for backward compatibility and
    are taken, in order, as input, output, extensions and encoding; a keyword
    argument of the same name wins and extra positional arguments are
    ignored.

    """
    # Legacy positional order; zip() drops any surplus positional values and
    # setdefault() leaves explicitly supplied keywords untouched.
    legacy_order = ('input', 'output', 'extensions', 'encoding')
    for key, value in zip(legacy_order, args):
        kwargs.setdefault(key, value)

    converter = Markdown(**kwargs)
    source = kwargs.get('input', None)
    target = kwargs.get('output', None)
    charset = kwargs.get('encoding', None)
    converter.convertFile(source, target, charset)