2 """Extension to python-markdown to support LaTeX (rather than html) output.
4 Authored by Rufus Pollock: <http://www.rufuspollock.org/>
5 Reworked by Julian Wulfheide (ju.wulfheide@gmail.com) and
6 Indico Project (indico-team@cern.ch)
11 1. Command Line. A script entitled markdown2latex.py is automatically
12 installed. For details of usage see help::
14 $ markdown2latex.py -h
16 2. As a python-markdown extension::
19 >>> md = markdown.Markdown(None, extensions=['latex'])
20 >>> # text is input string ...
21 >>> latex_out = md.convert(text)
23 3. Directly as a module (slight inversion of std markdown extension setup)::
27 >>> md = markdown.Markdown()
28 >>> latex_mdx = mdx_latex.LaTeXExtension()
29 >>> latex_mdx.extendMarkdown(md, markdown.__dict__)
30 >>> out = md.convert(text)
35 Version: 1.0 (November 15, 2006)
37 * First working version (compatible with markdown 1.5)
38 * Includes support for tables
40 Version: 1.1 (January 17, 2007)
42 * Support for verbatim and images
44 Version: 1.2 (June 2008)
46 * Refactor as an extension.
47 * Make into a proper python/setuptools package.
48 * Tested with markdown 1.7 but should work with 1.6 and (possibly) 1.5
49 (though pre/post processor stuff not as worked out there)
51 Version 1.3: (July 2008)
52 * Improvements to image output (width)
54 Version 1.3.1: (August 2009)
55 * Tiny bugfix to remove duplicate keyword argument and set zip_safe=False
56 * Add [width=\textwidth] by default for included images
58 Version 2.0: (June 2011)
60 * Major rework since this was broken by new Python-Markdown releases
62 Version 2.1: (August 2013)
63 * Add handler for non locally referenced images, hyperlinks and horizontal rules
64 * Update math delimiters
# do some fancy importing stuff to allow us to override things in this module
# in this file while still importing * for use in our own classes
74 import xml
.dom
.minidom
75 from urlparse
import urlparse
# Typographic-quote detection used by escape_latex_entities.  Raw strings keep
# the regex escapes (\s, \.) away from Python's own string-escape handling.

# An apostrophe at the start of the text, after whitespace or after a ".
start_single_quote_re = re.compile(r"""(^|\s|")'""")
# A double quote at the start of the text or after whitespace, ' or `.
start_double_quote_re = re.compile(r'''(^|\s|'|`)"''')
# A double quote followed by a comma, a full stop, whitespace or end of text.
end_double_quote_re = re.compile(r'"(,|\.|\s|$)')
def unescape_html_entities(text):
    """Turn the four basic HTML character entities back into literal characters.

    Handles ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;``.  ``&amp;`` is decoded
    last so that an escaped entity such as ``&amp;lt;`` yields the text
    ``&lt;`` instead of being decoded twice.

    :param text: string that may contain HTML entities
    :returns: a new string with the entities replaced
    """
    out = text.replace('&lt;', '<')
    out = out.replace('&gt;', '>')
    out = out.replace('&quot;', '"')
    out = out.replace('&amp;', '&')
    return out
95 def latex_escape(text
, ignore_math
=True):
104 "\\": r
"\textbackslash{}",
114 return chars
[x
.group()]
117 math_segments
.append(m
.group(0))
118 return "[*LaTeXmath*]"
121 text
= re
.sub(r
'\$[^\$]+\$|\$\$(^\$)\$\$', math_replace
, text
)
123 pattern
= re
.compile('|'.join(re
.escape(k
) for k
in chars
.keys()))
124 res
= pattern
.sub(substitute
, text
)
127 res
= re
.sub(r
'\[\*LaTeXmath\*\]', lambda _
: "\\protect " + math_segments
.pop(0), res
)
def escape_latex_entities(text):
    """Escape latex reserved characters.

    HTML entities are decoded first, then straight quotes are rewritten as
    TeX-style quotes (` `` and ''), and finally the result is passed through
    latex_escape.

    :param text: the text to escape
    :returns: the escaped text
    """
    out = unescape_html_entities(text)
    # Raw strings: \g<1> is a regex group reference, not a string escape.
    out = start_single_quote_re.sub(r'\g<1>`', out)
    out = start_double_quote_re.sub(r'\g<1>``', out)
    out = end_double_quote_re.sub(r"''\g<1>", out)
    return latex_escape(out)
def unescape_latex_entities(text):
    """Limit ourselves as this is only used for maths stuff.

    Only the escaped ampersand is reverted: every ``\\&`` becomes ``&``.

    :param text: text containing LaTeX-escaped ampersands
    :returns: the text with those ampersands restored
    """
    return text.replace('\\&', '&')
def makeExtension(configs=None):
    """Build the LaTeX extension for the given configuration.

    :param configs: configuration forwarded unchanged to LaTeXExtension
    :returns: a new LaTeXExtension instance
    """
    extension = LaTeXExtension(configs=configs)
    return extension
156 class LaTeXExtension(markdown
.Extension
):
157 def __init__(self
, configs
=None):
160 def extendMarkdown(self
, md
, md_globals
):
163 # remove escape pattern -- \\(.*) -- as this messes up any embedded
164 # math and we don't need to escape stuff any more for html
165 for key
, pat
in self
.md
.inlinePatterns
.iteritems():
166 if pat
.pattern
== markdown
.inlinepatterns
.ESCAPE_RE
:
167 self
.md
.inlinePatterns
.pop(key
)
170 #footnote_extension = FootnoteExtension()
171 #footnote_extension.extendMarkdown(md, md_globals)
173 latex_tp
= LaTeXTreeProcessor()
174 math_pp
= MathTextPostProcessor()
175 table_pp
= TableTextPostProcessor()
176 image_pp
= ImageTextPostProcessor()
177 link_pp
= LinkTextPostProcessor()
178 unescape_html_pp
= UnescapeHtmlTextPostProcessor()
180 md
.treeprocessors
['latex'] = latex_tp
181 md
.postprocessors
['unescape_html'] = unescape_html_pp
182 md
.postprocessors
['math'] = math_pp
183 md
.postprocessors
['image'] = image_pp
184 md
.postprocessors
['table'] = table_pp
185 md
.postprocessors
['link'] = link_pp
191 class LaTeXTreeProcessor(markdown
.treeprocessors
.Treeprocessor
):
193 """Walk the dom converting relevant nodes to text nodes with relevant
195 latex_text
= self
.tolatex(doc
)
197 doc
.text
= latex_text
199 def tolatex(self
, ournode
):
204 subcontent
+= escape_latex_entities(ournode
.text
)
206 if ournode
.getchildren():
207 for child
in ournode
.getchildren():
208 subcontent
+= self
.tolatex(child
)
210 if ournode
.tag
== 'h1':
211 buffer += '\n\n\\section{%s}\n' % subcontent
212 elif ournode
.tag
== 'h2':
213 buffer += '\n\n\\subsection{%s}\n' % subcontent
214 elif ournode
.tag
== 'h3':
215 buffer += '\n\\subsubsection{%s}\n' % subcontent
216 elif ournode
.tag
== 'h4':
217 buffer += '\n\\paragraph{%s}\n' % subcontent
218 elif ournode
.tag
== 'hr':
219 buffer += '\\noindent\makebox[\linewidth]{\\rule{\paperwidth}{0.4pt}}'
220 elif ournode
.tag
== 'ul':
221 # no need for leading \n as one will be provided by li
226 elif ournode
.tag
== 'ol':
227 # no need for leading \n as one will be provided by li
232 elif ournode
.tag
== 'li':
234 \\item %s""" % subcontent
.strip()
235 elif ournode
.tag
== 'blockquote':
236 # use quotation rather than quote as quotation can support multiple
242 """ % subcontent
.strip()
243 # ignore 'code' when inside pre tags
244 # (mkdn produces <pre><code></code></pre>)
245 elif (ournode
.tag
== 'pre' or
246 # TODO: Take a look here
247 (ournode
.tag
== 'pre' and ournode
.parentNode
.tag
!= 'pre')):
252 """ % subcontent
.strip()
253 elif ournode
.tag
== 'q':
254 buffer += "`%s'" % subcontent
.strip()
255 elif ournode
.tag
== 'p':
256 buffer += '\n%s\n' % subcontent
.strip()
257 # Footnote processor inserts all of the footnote in a sup tag
258 elif ournode
.tag
== 'sup':
259 buffer += '\\footnote{%s}' % subcontent
.strip()
260 elif ournode
.tag
== 'strong':
261 buffer += '\\textbf{%s}' % subcontent
.strip()
262 elif ournode
.tag
== 'em':
263 buffer += '\\emph{%s}' % subcontent
.strip()
264 # Keep table strcuture. TableTextPostProcessor will take care.
265 elif ournode
.tag
== 'table':
266 buffer += '\n\n<table>%s</table>\n\n' % subcontent
267 elif ournode
.tag
== 'thead':
268 buffer += '<thead>%s</thead>' % subcontent
269 elif ournode
.tag
== 'tbody':
270 buffer += '<tbody>%s</tbody>' % subcontent
271 elif ournode
.tag
== 'tr':
272 buffer += '<tr>%s</tr>' % subcontent
273 elif ournode
.tag
== 'th':
274 buffer += '<th>%s</th>' % subcontent
275 elif ournode
.tag
== 'td':
276 buffer += '<td>%s</td>' % subcontent
277 elif ournode
.tag
== 'img':
278 buffer += '<img src=\"%s\" alt=\"%s\" />' % (ournode
.get('src'),
280 elif ournode
.tag
== 'a':
281 buffer += '<a href=\"%s\">%s</a>' % (ournode
.get('href'),
287 buffer += escape_latex_entities(ournode
.tail
)
292 class UnescapeHtmlTextPostProcessor(markdown
.postprocessors
.Postprocessor
):
295 return unescape_html_entities(text
)
297 # ========================= MATHS =================================
300 class MathTextPostProcessor(markdown
.postprocessors
.Postprocessor
):
302 def run(self
, instr
):
303 """Convert all math sections in {text} whether latex, asciimathml or
304 latexmathml formatted to latex.
306 This assumes you are using $$ as your mathematics delimiter (*not* the
307 standard asciimathml or latexmathml delimiter).
309 def repl_1(matchobj
):
310 text
= unescape_latex_entities(matchobj
.group(1))
312 if tmp
.startswith('\\[') or tmp
.startswith('\\begin'):
315 return '\\[%s\\]\n' % text
317 def repl_2(matchobj
):
318 text
= unescape_latex_entities(matchobj
.group(1))
319 return '$%s$%s' % (text
, matchobj
.group(2))
322 pat
= re
.compile('^\$\$([^\$]*)\$\$\s*$', re
.MULTILINE
)
323 out
= pat
.sub(repl_1
, instr
)
324 # Jones, $x=3$, is ...
325 pat3
= re
.compile(r
'\$([^\$]+)\$(\s|$)')
326 out
= pat3
.sub(repl_2
, out
)
328 # pat2 = re.compile('([^\$])\$([^\$])')
329 # out = pat2.sub('\g<1>\\$\g<2>', out)
330 # some extras due to asciimathml
331 # out = out.replace('\\lt', '<')
332 # out = out.replace(' * ', ' \\cdot ')
333 # out = out.replace('\\del', '\\partial')
336 # ========================= TABLES =================================
class TableTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that converts HTML table blocks with Table2Latex."""

    def run(self, instr):
        """Convert every table block found in ``instr``.

        This is not very sophisticated and for it to work it is expected
        that:

        1. tables to be in a section on their own (that is at least one
           blank line above and below)
        2. no nesting of tables

        :param instr: the text to process; it is split into blocks on blank
            lines
        :returns: the blocks re-joined with blank lines, table blocks replaced
            by the converter output
        """
        converter = Table2Latex()
        new_blocks = []
        for block in instr.split('\n\n'):
            stripped = block.strip()
            # '<table' without the closing bracket also catches modified
            # versions (e.g. <table class="..">)
            if stripped.startswith('<table') and stripped.endswith('</table>'):
                new_blocks.append(converter.convert(stripped).strip())
            else:
                # not a table: keep the block exactly as it was
                new_blocks.append(block)
        return '\n\n'.join(new_blocks)
364 Convert html tables to Latex.
366 TODO: escape latex entities.
370 # centre align everything by default
371 out
= '|l' * self
.maxcols
+ '|'
374 def get_text(self
, element
):
375 if element
.nodeType
== element
.TEXT_NODE
:
376 return escape_latex_entities(element
.data
)
378 if element
.childNodes
:
379 for child
in element
.childNodes
:
380 text
= self
.get_text(child
)
381 if text
.strip() != '':
385 def process_cell(self
, element
):
386 # works on both td and th
388 subcontent
= self
.get_text(element
)
391 if element
.tagName
== 'th':
392 subcontent
= '\\textbf{%s}' % subcontent
393 if element
.hasAttribute('colspan'):
394 colspan
= int(element
.getAttribute('colspan'))
395 buffer += ' \multicolumn{%s}{|c|}{%s}' % (colspan
, subcontent
)
396 # we don't support rowspan because:
397 # 1. it needs an extra latex package \usepackage{multirow}
398 # 2. it requires us to mess around with the alignment tags in
399 # subsequent rows (i.e. suppose the first col in row A is rowspan 2
400 # then in row B in the latex we will need a leading &)
401 # if element.hasAttribute('rowspan'):
402 # rowspan = int(element.getAttribute('rowspan'))
403 # buffer += ' \multirow{%s}{|c|}{%s}' % (rowspan, subcontent)
405 buffer += ' %s' % subcontent
407 notLast
= (element
.nextSibling
.nextSibling
and
408 element
.nextSibling
.nextSibling
.nodeType
==
409 element
.ELEMENT_NODE
and
410 element
.nextSibling
.nextSibling
.tagName
in ['td', 'th'])
415 self
.numcols
+= colspan
418 def tolatex(self
, element
):
419 if element
.nodeType
== element
.TEXT_NODE
:
424 if element
.childNodes
:
425 for child
in element
.childNodes
:
426 text
= self
.tolatex(child
)
427 if text
.strip() != "":
429 subcontent
= subcontent
.strip()
431 if element
.tagName
== 'thead':
434 elif element
.tagName
== 'tr':
435 self
.maxcols
= max(self
.numcols
, self
.maxcols
)
437 buffer += '\n\\hline\n%s \\\\' % subcontent
439 elif element
.tagName
== 'td' or element
.tagName
== 'th':
440 buffer = self
.process_cell(element
)
445 def convert(self
, instr
):
448 dom
= xml
.dom
.minidom
.parseString(instr
)
449 core
= self
.tolatex(dom
.documentElement
)
451 captionElements
= dom
.documentElement
.getElementsByTagName('caption')
454 caption
= self
.get_text(captionElements
[0])
456 colformatting
= self
.colformat()
467 """ % (colformatting
, core
, caption
)
471 # ========================= IMAGES =================================
class ImageTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that converts ``<img>`` blocks with Img2Latex."""

    def run(self, instr):
        """Process all img tags

        Similar to process_tables this is not very sophisticated and for it
        to work it is expected that img tags are put in a section of their own
        (that is separated by at least one blank line above and below).

        :param instr: the text to process; it is split into blocks on blank
            lines
        :returns: the blocks re-joined with blank lines, image blocks replaced
            by the converter output
        """
        converter = Img2Latex()
        new_blocks = []
        for block in instr.split("\n\n"):
            stripped = block.strip()
            # '<img' without a closing bracket matches the tag whatever
            # attributes follow it
            if stripped.startswith('<img'):
                new_blocks.append(converter.convert(stripped).strip())
            else:
                # not an image: keep the block exactly as it was
                new_blocks.append(block)
        return '\n\n'.join(new_blocks)
495 class Img2Latex(object):
496 def convert(self
, instr
):
497 dom
= xml
.dom
.minidom
.parseString(instr
)
498 img
= dom
.documentElement
499 src
= img
.getAttribute('src')
501 if urlparse(src
).scheme
!= '':
502 src_urlparse
= urlparse(src
)
503 conn
= httplib
.HTTPConnection(src_urlparse
.netloc
)
504 conn
.request('HEAD', src_urlparse
.path
)
505 response
= conn
.getresponse()
507 if response
.status
== 200:
508 filename
= os
.path
.join(tempfile
.mkdtemp(), src
.split('/')[-1])
509 urllib
.urlretrieve(src
, filename
)
512 alt
= img
.getAttribute('alt')
513 # Using graphicx and ajustbox package for *max width*
518 \\includegraphics[max width=\\linewidth]{%s}
525 # ========================== LINKS =================================
class LinkTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that converts HTML hyperlinks with Link2Latex."""

    def run(self, instr):
        """Process all hyperlinks.

        The text is split into blocks on blank lines.  A block containing at
        least one ``<a ...>text</a>`` element is stripped and each such element
        is replaced by the converter output for that element; other blocks are
        kept unchanged.

        :param instr: the text to process
        :returns: the blocks re-joined with blank lines
        """
        converter = Link2Latex()
        # An anchor element whose content holds no nested tag.
        link_re = re.compile(r'<a[^>]*>([^<]+)</a>')

        def to_latex(match):
            # A callable replacement is inserted verbatim, so backslashes in
            # the generated LaTeX are not parsed as regex template escapes.
            # Converting per match also keeps several links in one block
            # distinct instead of repeating the first one.
            return converter.convert(match.group(0)).strip()

        new_blocks = []
        for block in instr.split("\n\n"):
            stripped = block.strip()
            if link_re.search(stripped):
                new_blocks.append(link_re.sub(to_latex, stripped))
            else:
                new_blocks.append(block)
        return '\n\n'.join(new_blocks)
547 class Link2Latex(object):
548 def convert(self
, instr
):
549 dom
= xml
.dom
.minidom
.parseString(instr
)
550 link
= dom
.documentElement
551 href
= link
.getAttribute('href')
553 desc
= re
.search(r
'>([^<]+)', instr
)
557 """ % (href
, desc
.group(0)[1:])
562 ========================= FOOTNOTES =================================
564 LaTeX footnote support.
566 Implemented via modification of original markdown approach (place footnote
567 definition in footnote market <sup> as opposed to putting a reference link).
571 class FootnoteExtension (markdown
.Extension
):
572 DEF_RE
= re
.compile(r
"(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)")
573 SHORT_USE_RE
= re
.compile(r
"\[\^([^\]]*)\]", re
.M
) # [^a]
575 def __init__(self
, configs
=None):
578 def extendMarkdown(self
, md
, md_globals
):
581 # Stateless extensions do not need to be registered
582 md
.registerExtension(self
)
584 # Insert a preprocessor before ReferencePreprocessor
585 #index = md.preprocessors.index(md_globals['REFERENCE_PREPROCESSOR'])
586 #preprocessor = FootnotePreprocessor(self)
587 #preprocessor.md = md
588 #md.preprocessors.insert(index, preprocessor)
589 md
.preprocessors
.add('footnotes', FootnotePreprocessor(self
), '_begin')
591 ## Insert an inline pattern before ImageReferencePattern
592 FOOTNOTE_RE
= r
"\[\^([^\]]*)\]" # blah blah [^1] blah
593 #index = md.inlinePatterns.index(md_globals['IMAGE_REFERENCE_PATTERN'])
594 #md.inlinePatterns.insert(index, FootnotePattern(FOOTNOTE_RE, self))
595 md
.inlinePatterns
.add('footnotes', FootnotePattern(FOOTNOTE_RE
, self
),
599 self
.used_footnotes
= {}
602 def setFootnote(self
, id, text
):
603 self
.footnotes
[id] = text
606 class FootnotePreprocessor
:
607 def __init__(self
, footnotes
):
608 self
.footnotes
= footnotes
610 def run(self
, lines
):
611 self
.blockGuru
= BlockGuru()
612 lines
= self
._handleFootnoteDefinitions
(lines
)
614 # Make a hash of all footnote marks in the text so that we
615 # know in what order they are supposed to appear. (This
616 # function call doesn't really substitute anything - it's just
617 # a way to get a callback for each occurence.
619 text
= "\n".join(lines
)
620 self
.footnotes
.SHORT_USE_RE
.sub(self
.recordFootnoteUse
, text
)
622 return text
.split("\n")
624 def recordFootnoteUse(self
, match
):
627 nextNum
= len(self
.footnotes
.used_footnotes
.keys()) + 1
628 self
.footnotes
.used_footnotes
[id] = nextNum
630 def _handleFootnoteDefinitions(self
, lines
):
631 """Recursively finds all footnote definitions in the lines.
633 @param lines: a list of lines of text
634 @returns: a string representing the text with footnote
635 definitions removed """
637 i
, id, footnote
= self
._findFootnoteDefinition
(lines
)
643 detabbed
, theRest
= self
.blockGuru
.detectTabbed(lines
[i
+ 1:])
645 self
.footnotes
.setFootnote(id,
647 + "\n".join(detabbed
))
649 more_plain
= self
._handleFootnoteDefinitions
(theRest
)
650 return plain
+ [""] + more_plain
655 def _findFootnoteDefinition(self
, lines
):
656 """Finds the first line of a footnote definition.
658 @param lines: a list of lines of text
659 @returns: the index of the line containing a footnote definition.
664 m
= self
.footnotes
.DEF_RE
.match(line
)
666 return counter
, m
.group(2), m
.group(3)
668 return counter
, None, None
671 class FootnotePattern(markdown
.inlinepatterns
.Pattern
):
673 def __init__(self
, pattern
, footnotes
):
674 markdown
.inlinepatterns
.Pattern
.__init
__(self
, pattern
)
675 self
.footnotes
= footnotes
677 def handleMatch(self
, m
, doc
):
678 sup
= doc
.createElement('sup')
680 # stick the footnote text in the sup
681 self
.footnotes
.md
._processSection
(sup
,
682 self
.footnotes
.footnotes
[id].split("\n"))
def template(template_fo, latex_to_insert):
    """Insert converted LaTeX into a template.

    :param template_fo: readable file-like object holding the template text
    :param latex_to_insert: text that replaces every occurrence of the
        ``INSERT-TEXT-HERE`` placeholder
    :returns: the template text with the placeholder substituted
    """
    tmpl = template_fo.read()
    tmpl = tmpl.replace('INSERT-TEXT-HERE', latex_to_insert)
    # Disabled title detection, kept for reference:
    # title_items = [ '\\title', '\\end{abstract}', '\\thanks', '\\author' ]
    # has_title_stuff = False
    # for it in title_items:
    #    has_title_stuff = has_title_stuff or (it in tmpl)
    return tmpl
699 """usage: %prog [options] <in-file-path>
701 Given a file path, process it using markdown2latex and print the result on
704 If using template option template should place text INSERT-TEXT-HERE in the
705 template where text should be inserted.
707 parser
= optparse
.OptionParser(usage
)
708 parser
.add_option('-t', '--template', dest
='template',
710 help='path to latex template file (optional)')
711 (options
, args
) = parser
.parse_args()
712 if not len(args
) > 0:
716 infile
= file(inpath
)
718 md
= markdown
.Markdown()
719 mkdn2latex
= LaTeXExtension()
720 mkdn2latex
.extendMarkdown(md
, markdown
.__dict
__)
721 out
= md
.convert(infile
.read())
724 tmpl_fo
= file(options
.template
)
725 out
= template(tmpl_fo
, out
)
729 if __name__
== '__main__':