Fix day filter
[cds-indico.git] / indico / util / mdx_latex.py
blob9be3ad91619d6322c0b523dec02b35b28398d81c
1 #!/usr/bin/env python2
2 """Extension to python-markdown to support LaTeX (rather than html) output.
4 Authored by Rufus Pollock: <http://www.rufuspollock.org/>
5 Reworked by Julian Wulfheide (ju.wulfheide@gmail.com) and
6 Indico Project (indico-team@cern.ch)
8 Usage:
9 ======
11 1. Command Line. A script entitled markdown2latex.py is automatically
12 installed. For details of usage see help::
14 $ markdown2latex.py -h
16 2. As a python-markdown extension::
18 >>> import markdown
19 >>> md = markdown.Markdown(None, extensions=['latex'])
20 >>> # text is input string ...
21 >>> latex_out = md.convert(text)
23 3. Directly as a module (slight inversion of std markdown extension setup)::
25 >>> import markdown
26 >>> import mdx_latex
27 >>> md = markdown.Markdown()
28 >>> latex_mdx = mdx_latex.LaTeXExtension()
29 >>> latex_mdx.extendMarkdown(md, markdown.__dict__)
30 >>> out = md.convert(text)
32 History
33 =======
35 Version: 1.0 (November 15, 2006)
37 * First working version (compatible with markdown 1.5)
38 * Includes support for tables
40 Version: 1.1 (January 17, 2007)
42 * Support for verbatim and images
44 Version: 1.2 (June 2008)
46 * Refactor as an extension.
47 * Make into a proper python/setuptools package.
48 * Tested with markdown 1.7 but should work with 1.6 and (possibly) 1.5
49 (though pre/post processor stuff not as worked out there)
51 Version 1.3: (July 2008)
52 * Improvements to image output (width)
54 Version 1.3.1: (August 2009)
55 * Tiny bugfix to remove duplicate keyword argument and set zip_safe=False
56 * Add [width=\textwidth] by default for included images
58 Version 2.0: (June 2011)
59 * PEP8 cleanup
60 * Major rework since this was broken by new Python-Markdown releases
62 Version 2.1: (August 2013)
63 * Add handler for non locally referenced images, hyperlinks and horizontal rules
64 * Update math delimiters
65 """
67 __version__ = '2.1'
# do some fancy importing stuff to allow us to override things in this module
# while still importing * for use in our own classes
71 import re
72 import sys
73 import markdown
74 import xml.dom.minidom
75 from urlparse import urlparse
76 import httplib
77 import os
78 import tempfile
79 import urllib
# Patterns used to turn straight ASCII quotes into LaTeX-style quotes.
# Raw strings keep the regex escapes (\s, \.) out of Python's own string
# escape processing; the resulting pattern text is unchanged.

# An apostrophe that opens a word: preceded by start-of-text, whitespace or ".
start_single_quote_re = re.compile(r"""(^|\s|")'""")
# A double quote that opens a phrase: preceded by start-of-text, whitespace,
# an apostrophe or a backtick.
start_double_quote_re = re.compile(r'''(^|\s|'|`)"''')
# A double quote that closes a phrase: followed by a comma, a full stop,
# whitespace or end-of-text.
end_double_quote_re = re.compile(r'"(,|\.|\s|$)')
def unescape_html_entities(text):
    """Decode the four basic HTML entities in ``text``.

    Handles ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;``; any other entity is
    left untouched.

    :param text: string that may contain HTML entities
    :returns: the string with those entities replaced by their characters
    """
    out = text.replace('&lt;', '<')
    out = out.replace('&gt;', '>')
    out = out.replace('&quot;', '"')
    # ``&amp;`` has to be decoded last: doing it first would turn an escaped
    # entity such as ``&amp;lt;`` into ``&lt;`` and then decode it a second
    # time into ``<``.
    return out.replace('&amp;', '&')
def latex_escape(text, ignore_math=True):
    """Escape LaTeX special characters in ``text``.

    :param text: the string to escape
    :param ignore_math: when true, ``$...$`` and ``$$...$$`` segments are
        copied through unescaped and prefixed with ``\\protect ``; when
        false, every special character (including ``$``) is escaped
    :returns: the escaped string
    """
    chars = {
        "#": r"\#",
        "$": r"\$",
        "%": r"\%",
        "&": r"\&",
        "~": r"\~{}",
        "_": r"\_",
        "^": r"\^{}",
        "\\": r"\textbackslash{}",
        "{": r"\{",
        "}": r"\}",
        # form feed / vertical tab are simply dropped
        "\x0c": "",
        "\x0b": ""
    }

    special = '|'.join(re.escape(k) for k in chars)
    if ignore_math:
        # Math segments and special characters are handled in ONE pass, so no
        # placeholder text is ever inserted into (or confused with) the input.
        # ``$$...$$`` is tried before ``$...$`` so display math is kept whole.
        pattern = re.compile(r'(\$\$[^\$]+\$\$|\$[^\$]+\$)|' + special)
    else:
        pattern = re.compile(special)

    def substitute(match):
        # Group 1 only exists (and only participates) for a math segment.
        if match.lastindex:
            return "\\protect " + match.group(1)
        return chars[match.group()]

    return pattern.sub(substitute, text)
def escape_latex_entities(text):
    """Escape latex reserved characters.

    The text is first HTML-unescaped, then straight quotes are rewritten to
    LaTeX quotes (a backtick for an opening single quote, two backticks for
    an opening double quote, ``''`` for a closing double quote), and finally
    the result is passed through ``latex_escape``.

    :param text: text to prepare for inclusion in a LaTeX document
    :returns: the escaped text
    """
    out = unescape_html_entities(text)
    # Raw strings: ``\g<1>`` is a regex group reference, not a Python escape.
    out = start_single_quote_re.sub(r'\g<1>`', out)
    out = start_double_quote_re.sub(r'\g<1>``', out)
    out = end_double_quote_re.sub(r"''\g<1>", out)
    return latex_escape(out)
def unescape_latex_entities(text):
    """Turn escaped ampersands (backslash + ``&``) back into plain ``&``.

    Deliberately limited to ampersands, as this is only used for maths
    content.
    """
    return text.replace('\\&', '&')
def makeExtension(configs=None):
    """Build a ``LaTeXExtension``, forwarding ``configs`` to its constructor.

    Presumably the factory hook python-markdown looks up when the extension
    is requested by name -- confirm against the markdown version in use.
    """
    extension = LaTeXExtension(configs=configs)
    return extension
class LaTeXExtension(markdown.Extension):
    """Markdown extension that installs the LaTeX tree/post processors."""

    def __init__(self, configs=None):
        # ``configs`` is accepted but not used by this extension.
        self.reset()

    def extendMarkdown(self, md, md_globals):
        """Hook the LaTeX processors into the Markdown instance ``md``.

        Drops the backslash-escape inline pattern and registers the tree
        processor plus the text postprocessors. ``md_globals`` is not used.
        """
        self.md = md

        # remove escape pattern -- \\(.*) -- as this messes up any embedded
        # math and we don't need to escape stuff any more for html
        inline_patterns = self.md.inlinePatterns
        escape_key = None
        for key, pat in inline_patterns.iteritems():
            if pat.pattern == markdown.inlinepatterns.ESCAPE_RE:
                escape_key = key
                break
        if escape_key is not None:
            inline_patterns.pop(escape_key)

        # Footnote support is currently switched off:
        #footnote_extension = FootnoteExtension()
        #footnote_extension.extendMarkdown(md, md_globals)

        md.treeprocessors['latex'] = LaTeXTreeProcessor()

        # Registration order is kept exactly as listed here.
        postprocessors = (
            ('unescape_html', UnescapeHtmlTextPostProcessor()),
            ('math', MathTextPostProcessor()),
            ('image', ImageTextPostProcessor()),
            ('table', TableTextPostProcessor()),
            ('link', LinkTextPostProcessor()),
        )
        for name, processor in postprocessors:
            md.postprocessors[name] = processor

    def reset(self):
        """Nothing to reset: the extension keeps no per-document state."""
        pass
class LaTeXTreeProcessor(markdown.treeprocessors.Treeprocessor):
    """Tree processor that collapses the element tree into LaTeX text."""

    # tag -> template that receives the node content as-is
    _RAW_TEMPLATES = {
        'h1': '\n\n\\section{%s}\n',
        'h2': '\n\n\\subsection{%s}\n',
        'h3': '\n\\subsubsection{%s}\n',
        'h4': '\n\\paragraph{%s}\n',
        # no need for leading \n before the content as one is provided by li
        'ul': '\n\\begin{itemize}%s\n\\end{itemize}\n',
        'ol': '\n\\begin{enumerate}%s\n\\end{enumerate}\n',
        # Keep table structure. TableTextPostProcessor will take care.
        'table': '\n\n<table>%s</table>\n\n',
        'thead': '<thead>%s</thead>',
        'tbody': '<tbody>%s</tbody>',
        'tr': '<tr>%s</tr>',
        'th': '<th>%s</th>',
        'td': '<td>%s</td>',
    }

    # tag -> template that receives the node content with surrounding
    # whitespace stripped
    _STRIPPED_TEMPLATES = {
        'li': '\n\\item %s',
        # use quotation rather than quote as quotation can support multiple
        # paragraphs
        'blockquote': '\n\\begin{quotation}\n%s\n\\end{quotation}\n',
        # 'code' inside 'pre' has no template of its own, so
        # <pre><code>..</code></pre> yields a single verbatim environment
        'pre': '\n\\begin{verbatim}\n%s\n\\end{verbatim}\n',
        'q': "`%s'",
        'p': '\n%s\n',
        # Footnote processor inserts all of the footnote in a sup tag
        'sup': '\\footnote{%s}',
        'strong': '\\textbf{%s}',
        'em': '\\emph{%s}',
    }

    _HRULE = r'\noindent\makebox[\linewidth]{\rule{\paperwidth}{0.4pt}}'

    def run(self, doc):
        """Walk the dom converting relevant nodes to text nodes with relevant
        content.

        ``doc`` is emptied and its ``text`` is set to the LaTeX rendering of
        the whole tree.
        """
        latex_text = self.tolatex(doc)
        doc.clear()
        doc.text = latex_text

    def tolatex(self, ournode):
        """Return the LaTeX text for ``ournode`` and everything below it.

        Text and tail content are passed through ``escape_latex_entities``.
        ``table``, ``img`` and ``a`` nodes are re-emitted as HTML-like tags
        for the text postprocessors to convert later. Unknown tags contribute
        only the content of their children.
        """
        subcontent = ""
        if ournode.text:
            subcontent += escape_latex_entities(ournode.text)

        # Iterating an element yields its direct children (``getchildren()``
        # is deprecated).
        for child in ournode:
            subcontent += self.tolatex(child)

        tag = ournode.tag
        if tag in self._RAW_TEMPLATES:
            out = self._RAW_TEMPLATES[tag] % subcontent
        elif tag in self._STRIPPED_TEMPLATES:
            out = self._STRIPPED_TEMPLATES[tag] % subcontent.strip()
        elif tag == 'hr':
            out = self._HRULE
        elif tag == 'img':
            out = '<img src="%s" alt="%s" />' % (ournode.get('src'),
                                                 ournode.get('alt'))
        elif tag == 'a':
            out = '<a href="%s">%s</a>' % (ournode.get('href'), subcontent)
        else:
            out = subcontent

        if ournode.tail:
            out += escape_latex_entities(ournode.tail)

        return out
class UnescapeHtmlTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that decodes basic HTML entities in the output text."""

    def run(self, text):
        """Return ``text`` passed through ``unescape_html_entities``."""
        unescaped = unescape_html_entities(text)
        return unescaped
297 # ========================= MATHS =================================
class MathTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that normalises math sections in the output text."""

    def run(self, instr):
        """Rewrite the math sections found in ``instr`` as LaTeX math.

        A ``$$ ... $$`` section occupying its own line(s) becomes
        ``\\[ ... \\]`` unless its content already starts with ``\\[`` or
        ``\\begin``, in which case the content is emitted without the
        dollar signs. Inline ``$ ... $`` followed by whitespace or
        end-of-text is kept as inline math. In both cases escaped ampersands
        inside the math are unescaped.

        This assumes ``$$`` / ``$`` are used as the mathematics delimiters.
        """
        def as_display_math(match):
            body = unescape_latex_entities(match.group(1))
            if body.strip().startswith(('\\[', '\\begin')):
                return body
            return '\\[%s\\]\n' % body

        def as_inline_math(match):
            body = unescape_latex_entities(match.group(1))
            return '$%s$%s' % (body, match.group(2))

        # $$ ..... $$
        display_re = re.compile(r'^\$\$([^\$]*)\$\$\s*$', re.MULTILINE)
        # Jones, $x=3$, is ...
        inline_re = re.compile(r'\$([^\$]+)\$(\s|$)')

        # The former "$100 million" and asciimathml substitutions are
        # intentionally not applied.
        converted = display_re.sub(as_display_math, instr)
        return inline_re.sub(as_inline_math, converted)
336 # ========================= TABLES =================================
class TableTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that replaces ``<table>`` blocks with LaTeX tables."""

    def run(self, instr):
        """Convert every stand-alone table block in ``instr``.

        This is not very sophisticated; for it to work it is expected that:
          1. each table is in a section of its own (at least one blank line
             above and below)
          2. tables are not nested
        """
        converter = Table2Latex()

        def convert_block(block):
            stripped = block.strip()
            # '<table' (no closing bracket) also catches modified versions,
            # e.g. <table class="..">
            is_table = (stripped.startswith('<table') and
                        stripped.endswith('</table>'))
            if not is_table:
                return block
            return converter.convert(stripped).strip()

        return '\n\n'.join(convert_block(b) for b in instr.split('\n\n'))
class Table2Latex(object):
    """
    Convert html tables to Latex.

    State (``numcols`` / ``maxcols``) is reset by every ``convert`` call.
    """

    def colformat(self):
        """Return the tabular column spec: ``maxcols`` left-aligned columns
        separated by vertical rules."""
        return '|l' * self.maxcols + '|'

    def get_text(self, element):
        """Return the LaTeX-escaped text below ``element``.

        Text pieces consisting only of whitespace are skipped.
        """
        if element.nodeType == element.TEXT_NODE:
            return escape_latex_entities(element.data)
        pieces = (self.get_text(child) for child in element.childNodes)
        return ''.join(piece for piece in pieces if piece.strip())

    def process_cell(self, element):
        """Return the LaTeX for one ``td`` / ``th`` cell.

        Adds the column separator when another cell follows and counts the
        cell (honouring ``colspan``) towards the current row width.
        """
        colspan = 1
        subcontent = self.get_text(element)
        out = ""

        if element.tagName == 'th':
            subcontent = '\\textbf{%s}' % subcontent
        if element.hasAttribute('colspan'):
            colspan = int(element.getAttribute('colspan'))
            out += ' \\multicolumn{%s}{|c|}{%s}' % (colspan, subcontent)
        # we don't support rowspan because:
        #   1. it needs an extra latex package \usepackage{multirow}
        #   2. it requires us to mess around with the alignment tags in
        #   subsequent rows (i.e. suppose the first col in row A is rowspan 2
        #   then in row B in the latex we will need a leading &)
        else:
            out += ' %s' % subcontent

        # Look for the next *element* sibling, skipping text nodes such as
        # whitespace between cells. This works whether or not the cells are
        # separated by whitespace and does not fail on the last cell.
        sibling = element.nextSibling
        while (sibling is not None and
               sibling.nodeType != sibling.ELEMENT_NODE):
            sibling = sibling.nextSibling
        not_last = sibling is not None and sibling.tagName in ('td', 'th')

        if not_last:
            out += ' &'

        self.numcols += colspan
        return out

    def tolatex(self, element):
        """Return the LaTeX table body for ``element`` and its children.

        Only element nodes produce output; text (and any other non-element)
        nodes yield an empty string here -- cell text is collected by
        ``process_cell``.
        """
        if element.nodeType != element.ELEMENT_NODE:
            return ""

        subcontent = ""
        for child in element.childNodes:
            text = self.tolatex(child)
            if text.strip():
                subcontent += text
        subcontent = subcontent.strip()

        tag = element.tagName
        if tag == 'tr':
            # The cells of this row have already been processed (children
            # first), so numcols holds this row's width.
            self.maxcols = max(self.numcols, self.maxcols)
            self.numcols = 0
            return '\n\\hline\n%s \\\\' % subcontent
        if tag in ('td', 'th'):
            return self.process_cell(element)
        # thead and every other container just pass their content through
        return subcontent

    def convert(self, instr):
        """Convert the HTML table markup ``instr`` to a LaTeX table.

        :param instr: a well-formed ``<table>...</table>`` string (it is
            parsed with ``xml.dom.minidom``)
        :returns: the LaTeX ``table`` environment as a string; the caption
            is the text of the first ``<caption>`` element, if any
        """
        self.numcols = 0
        self.maxcols = 0
        dom = xml.dom.minidom.parseString(instr)
        core = self.tolatex(dom.documentElement)

        caption_elements = dom.documentElement.getElementsByTagName('caption')
        caption = self.get_text(caption_elements[0]) if caption_elements else ''

        colformatting = self.colformat()
        table_latex = (
            "\n"
            "\\begin{table}[h]\n"
            "\\begin{tabular}{%s}\n"
            "%s\n"
            "\\hline\n"
            "\\end{tabular}\n"
            "\\\\[5pt]\n"
            "\\caption{%s}\n"
            "\\end{table}\n"
        ) % (colformatting, core, caption)
        return table_latex
471 # ========================= IMAGES =================================
class ImageTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that replaces ``<img>`` blocks with LaTeX figures."""

    def run(self, instr):
        """Process all img tags.

        Similar to the table handling this is not very sophisticated: an img
        tag is only converted when it starts a section of its own (that is,
        it is separated by at least one blank line above and below).
        """
        converter = Img2Latex()

        def convert_block(block):
            stripped = block.strip()
            if not stripped.startswith('<img'):
                return block
            return converter.convert(stripped).strip()

        return '\n\n'.join(convert_block(b) for b in instr.split("\n\n"))
class Img2Latex(object):
    """Convert a single ``<img>`` tag into a LaTeX figure."""

    def convert(self, instr):
        """Return the LaTeX ``figure`` for the img markup ``instr``.

        If ``src`` carries a URL scheme and a HEAD request for it answers
        200, the image is downloaded into a fresh temporary directory and
        that local file is referenced instead. The ``alt`` text becomes the
        caption. Network errors are not caught here and propagate to the
        caller.
        """
        dom = xml.dom.minidom.parseString(instr)
        img = dom.documentElement
        src = img.getAttribute('src')

        parsed = urlparse(src)
        if parsed.scheme != '' and self._is_available(parsed):
            filename = os.path.join(tempfile.mkdtemp(), src.split('/')[-1])
            urllib.urlretrieve(src, filename)
            src = filename

        alt = img.getAttribute('alt')
        # Using graphicx and adjustbox package for *max width*
        out = (
            "\n"
            "\\begin{figure}[H]\n"
            "\\centering\n"
            "\\includegraphics[max width=\\linewidth]{%s}\n"
            "\\caption{%s}\n"
            "\\end{figure}\n"
        ) % (src, alt)
        return out

    def _is_available(self, parsed):
        """Return True when a HEAD request for the parsed URL answers 200."""
        # Ask about the same resource that will be downloaded: keep the query
        # string, and never send an empty request path.
        target = parsed.path or '/'
        if parsed.query:
            target += '?' + parsed.query

        # NOTE(review): a plain HTTP connection is used whatever the scheme,
        # as before; https sources may need HTTPSConnection -- confirm.
        conn = httplib.HTTPConnection(parsed.netloc)
        try:
            conn.request('HEAD', target)
            return conn.getresponse().status == 200
        finally:
            # Release the socket even when the request fails.
            conn.close()
525 # ========================== LINKS =================================
class LinkTextPostProcessor(markdown.postprocessors.Postprocessor):
    """Postprocessor that replaces ``<a>`` tags with LaTeX ``\\href``."""

    def run(self, instr):
        """Convert every hyperlink in ``instr``.

        Blocks (separated by blank lines) that contain at least one link are
        emitted stripped, with each link replaced by its own conversion;
        other blocks are passed through untouched.
        """
        converter = Link2Latex()
        link_re = re.compile(r'<a[^>]*>([^<]+)</a>')

        def to_latex(match):
            # Using a function as the replacement converts each link
            # individually and keeps the backslashes of the LaTeX output from
            # being interpreted as regex replacement escapes.
            return converter.convert(match.group(0)).strip()

        new_blocks = []
        for block in instr.split("\n\n"):
            stripped = block.strip()
            if link_re.search(stripped):
                new_blocks.append(link_re.sub(to_latex, stripped))
            else:
                new_blocks.append(block)
        return '\n\n'.join(new_blocks)
class Link2Latex(object):
    """Convert a single ``<a>`` tag into a LaTeX ``\\href``."""

    def convert(self, instr):
        """Return ``\\href{href}{text}`` (wrapped in newlines) for ``instr``.

        The target comes from the tag's ``href`` attribute; the description
        is the text between the first ``>`` and the following ``<``.
        """
        anchor = xml.dom.minidom.parseString(instr).documentElement
        target = anchor.getAttribute('href')

        label = re.search(r'>([^<]+)', instr).group(1)
        return "\n\\href{%s}{%s}\n" % (target, label)
"""
========================= FOOTNOTES =================================

LaTeX footnote support.

Implemented via modification of the original markdown approach (place the
footnote definition in the footnote marker <sup> as opposed to putting a
reference link).
"""
class FootnoteExtension (markdown.Extension):
    """Markdown extension that collects footnotes for LaTeX output."""

    # "[^id]: text" definition line, with up to three leading spaces
    DEF_RE = re.compile(r"(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)")
    # "[^id]" reference inside running text
    SHORT_USE_RE = re.compile(r"\[\^([^\]]*)\]", re.M)  # [^a]

    def __init__(self, configs=None):
        # ``configs`` is accepted but not used.
        self.reset()

    def extendMarkdown(self, md, md_globals):
        """Install the footnote preprocessor and inline pattern on ``md``.

        Both are added at the '_begin' position. ``md_globals`` is not used.
        """
        self.md = md

        # This extension keeps state (see reset), so it is registered with
        # the Markdown instance; stateless extensions would not need this.
        md.registerExtension(self)

        # Preprocessor that pulls the footnote definitions out of the text
        preprocessor = FootnotePreprocessor(self)
        md.preprocessors.add('footnotes', preprocessor, '_begin')

        # Inline pattern for the references:  blah blah [^1] blah
        FOOTNOTE_RE = r"\[\^([^\]]*)\]"
        inline_pattern = FootnotePattern(FOOTNOTE_RE, self)
        md.inlinePatterns.add('footnotes', inline_pattern, '_begin')

    def reset(self):
        """Forget all recorded footnote definitions and uses."""
        self.used_footnotes = dict()
        self.footnotes = dict()

    def setFootnote(self, id, text):
        """Store ``text`` as the definition of footnote ``id``."""
        self.footnotes[id] = text
class FootnotePreprocessor:
    """Strip footnote definitions from the source lines and record them."""

    def __init__(self, footnotes):
        # ``footnotes`` is the owning extension object (it provides DEF_RE,
        # SHORT_USE_RE, used_footnotes and setFootnote).
        self.footnotes = footnotes

    def run(self, lines):
        """Return ``lines`` with the footnote definitions removed.

        Definitions are stored on the extension and every footnote mark
        found in the remaining text is recorded in order of appearance.
        """
        # NOTE(review): BlockGuru is not defined or imported in the visible
        # part of this file -- confirm where it is expected to come from.
        self.blockGuru = BlockGuru()
        remaining = self._handleFootnoteDefinitions(lines)

        # Visit all footnote marks in the text so that we know in what order
        # they are supposed to appear. Nothing is substituted.
        text = "\n".join(remaining)
        for use in self.footnotes.SHORT_USE_RE.finditer(text):
            self.recordFootnoteUse(use)

        return text.split("\n")

    def recordFootnoteUse(self, match):
        """Assign the next running number to the footnote id in ``match``."""
        ident = match.group(1).strip()
        used = self.footnotes.used_footnotes
        used[ident] = len(used) + 1

    def _handleFootnoteDefinitions(self, lines):
        """Recursively finds all footnote definitions in the lines.

        @param lines: a list of lines of text
        @returns: the list of lines with the footnote definitions removed
            (each definition is replaced by one empty line)
        """
        index, ident, first_line = self._findFootnoteDefinition(lines)
        if not ident:
            return lines

        before = lines[:index]
        # The continuation lines of the definition follow the first line.
        detabbed, rest = self.blockGuru.detectTabbed(lines[index + 1:])
        self.footnotes.setFootnote(ident,
                                   first_line + "\n" + "\n".join(detabbed))
        return before + [""] + self._handleFootnoteDefinitions(rest)

    def _findFootnoteDefinition(self, lines):
        """Finds the first line of a footnote definition.

        @param lines: a list of lines of text
        @returns: a tuple (index, id, text) for the first definition line,
            or (len(lines), None, None) when there is none
        """
        for position, line in enumerate(lines):
            found = self.footnotes.DEF_RE.match(line)
            if found:
                return position, found.group(2), found.group(3)
        return len(lines), None, None
class FootnotePattern(markdown.inlinepatterns.Pattern):
    """Inline pattern that expands a footnote mark into a ``sup`` element."""

    def __init__(self, pattern, footnotes):
        markdown.inlinepatterns.Pattern.__init__(self, pattern)
        # the owning extension; gives access to the stored definitions
        self.footnotes = footnotes

    def handleMatch(self, m, doc):
        """Return a ``sup`` element holding the text of the matched footnote."""
        sup = doc.createElement('sup')
        key = m.group(2)
        extension = self.footnotes
        # stick the footnote text in the sup
        process_section = extension.md._processSection
        process_section(sup, extension.footnotes[key].split("\n"))
        return sup
def template(template_fo, latex_to_insert):
    """Fill a LaTeX template.

    Reads the whole template from the file object ``template_fo`` and returns
    it with every ``INSERT-TEXT-HERE`` marker replaced by ``latex_to_insert``.
    """
    return template_fo.read().replace('INSERT-TEXT-HERE', latex_to_insert)
def main():
    """Command line entry point: convert a markdown file to LaTeX.

    Expects the input path as first positional argument and prints the
    result on stdout. With ``-t/--template`` the result is inserted into the
    given template file first. Prints the help text and exits with status 1
    when no input path is given.
    """
    import optparse
    usage = (
        "usage: %prog [options] <in-file-path>\n"
        "\n"
        "Given a file path, process it using markdown2latex and print the result on\n"
        "stdout.\n"
        "\n"
        "If using template option template should place text INSERT-TEXT-HERE in the\n"
        "template where text should be inserted.\n"
    )
    parser = optparse.OptionParser(usage)
    parser.add_option('-t', '--template', dest='template',
                      default='',
                      help='path to latex template file (optional)')
    (options, args) = parser.parse_args()
    if not args:
        parser.print_help()
        sys.exit(1)
    inpath = args[0]

    # ``with`` makes sure the input file is closed again.
    with open(inpath) as infile:
        source = infile.read()

    md = markdown.Markdown()
    mkdn2latex = LaTeXExtension()
    mkdn2latex.extendMarkdown(md, markdown.__dict__)
    out = md.convert(source)

    if options.template:
        with open(options.template) as tmpl_fo:
            out = template(tmpl_fo, out)

    # Parenthesised form prints a single string identically on Python 2.
    print(out)


if __name__ == '__main__':
    main()