third_party/markdown/preprocessors.py

   1 # markdown is released under the BSD license
   2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
   3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
   4 # Copyright 2004 Manfred Stienstra (the original version)
   5 #
   6 # All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions are met:
  10 #
  11 # *   Redistributions of source code must retain the above copyright
  12 #     notice, this list of conditions and the following disclaimer.
  13 # *   Redistributions in binary form must reproduce the above copyright
  14 #     notice, this list of conditions and the following disclaimer in the
  15 #     documentation and/or other materials provided with the distribution.
  16 # *   Neither the name of the <organization> nor the
  17 #     names of its contributors may be used to endorse or promote products
  18 #     derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
  21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
  24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 # POSSIBILITY OF SUCH DAMAGE.
  31
  32
  33 """
  34 PRE-PROCESSORS
  35 =============================================================================
  36
  37 Preprocessors work on source text before we start doing anything too
  38 complicated.
  39 """
  40
  41 from __future__ import absolute_import
  42 from __future__ import unicode_literals
  43 from . import util
  44 from . import odict
  45 import re
  46
  47
  48 def build_preprocessors(md_instance, **kwargs):
  49     """ Build the default set of preprocessors used by Markdown. """
  50     preprocessors = odict.OrderedDict()
  51     preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
  52     if md_instance.safeMode != 'escape':
  53         preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
  54     preprocessors["reference"] = ReferencePreprocessor(md_instance)
  55     return preprocessors
  56
  57
  58 class Preprocessor(util.Processor):
  59     """
  60     Preprocessors are run after the text is broken into lines.
  61
  62     Each preprocessor implements a "run" method that takes a pointer to a
  63     list of lines of the document, modifies it as necessary and returns
  64     either the same pointer or a pointer to a new list.
  65
  66     Preprocessors must extend markdown.Preprocessor.
  67
  68     """
  69     def run(self, lines):
  70         """
  71         Each subclass of Preprocessor should override the `run` method, which
  72         takes the document as a list of strings split by newlines and returns
  73         the (possibly modified) list of lines.
  74
  75         """
  76         pass
  77
  78
  79 class NormalizeWhitespace(Preprocessor):
  80     """ Normalize whitespace for consistant parsing. """
  81
  82     def run(self, lines):
  83         source = '\n'.join(lines)
  84         source = source.replace(util.STX, "").replace(util.ETX, "")
  85         source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
  86         source = source.expandtabs(self.markdown.tab_length)
  87         source = re.sub(r'(?<=\n) +\n', '\n', source)
  88         return source.split('\n')
  89
  90
class HtmlBlockPreprocessor(Preprocessor):
    """
    Remove html blocks from the text and store them for later retrieval.

    The document is split into blocks on blank lines. Blocks that open with
    a block-level tag (as decided by `util.isBlockLevel`), a comment or
    declaration (`<!`), or a `<?`, `<@`, `<%` construct are handed to
    `self.markdown.htmlStash.store` and whatever that call returns is put
    in their place (presumably a placeholder string -- confirm against
    htmlStash).
    """

    # Format strings used to build the closing-tag text for a given tag
    # name; `_get_right_tag` tries them in this order.
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode regex for a single attribute in one of three forms:
    # quoted value, unquoted value, or a bare attribute name.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    # Opening tag at the start of a block: "<", the tag name, any number of
    # attributes, then optional whitespace, "/" and ">".
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, a block whose opening tag carries a `markdown` attribute
    # has only its opening and closing tags stashed; the content between
    # them stays in the text (see `run`).
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a 3-tuple `(tag, length, attrs)`: the tag name, the number
        of characters of `block` covered by the opening tag, and a dict
        mapping attribute names to their values ("" when an attribute has
        no value).

        When `left_tag_re` does not match, the tag is taken to be the
        lowercased text between the first character and the first ">",
        the length is `len(tag) + 2`, and the attribute dict is empty.
        """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    # Exactly one alternative of attrs_pattern matched;
                    # pick the named groups belonging to it.
                    if ma.group('attr'):
                        # attr="value" / attr='value'
                        if ma.group('value'):
                            attrs[ma.group('attr').strip()] = ma.group('value')
                        else:
                            attrs[ma.group('attr').strip()] = ""
                    elif ma.group('attr1'):
                        # attr=value (unquoted)
                        if ma.group('value1'):
                            attrs[ma.group('attr1').strip()] = ma.group('value1')
                        else:
                            attrs[ma.group('attr1').strip()] = ""
                    elif ma.group('attr2'):
                        # bare attribute without a value
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag)+2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Locate the `rtag` that closes the tag opened before `start_index`.

        Searches `block` from `start_index`. Each further `ltag` met before
        an `rtag` is treated as a nested opening tag whose own closing tag
        is found recursively and skipped.

        Returns the index just past the matching `rtag`, or -1 when no
        matching `rtag` exists.
        """
        while 1:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Find the closing tag for `left_tag` in `block`.

        Each entry of `right_tag_patterns` is formatted with `left_tag`
        and searched for from `left_index` onwards, taking nested opening
        tags into account. On the first hit with an end index greater than
        2, returns `(name, end_index)` where `name` is the closing-tag text
        with leading "<" and trailing ">" removed (e.g. "/div") and
        `end_index` is the index just past the closing tag.

        If nothing is found, returns the lowercased slice
        `block.rstrip()[-left_index:-1]` together with `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            # NOTE(review): the "> 2" threshold also rejects the -1 "not
            # found" result; the reason for 2 specifically is not visible
            # here.
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True when `right_tag` is considered to close `left_tag`.

        That is the case when `left_tag` starts with "?", "@" or "%", when
        `right_tag` equals "/" + `left_tag`, or when both are "--" (the
        name `run` uses for comment blocks). Otherwise returns False.
        """
        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if (right_tag == "--" and left_tag == "--"):
            return True
        # Same test as the "/" + left_tag comparison above, spelled the
        # other way round.
        elif left_tag == right_tag[1:] \
            and right_tag[0] == "/":
            return True
        else:
            return False

    def _is_oneliner(self, tag):
        """Return True when `tag` is 'hr' or 'hr/'."""
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """
        Stash raw HTML blocks found in `lines` and return the new lines.

        The lines are joined and split into blocks on blank lines
        ("\\n\\n"). Blocks are processed one at a time; a block that opens
        an HTML element without closing it switches the loop into
        "in tag" mode, where following blocks are accumulated in `items`
        until the closing tag is seen (or the input ends).

        Stashing is done with `self.markdown.htmlStash.store`; its return
        value takes the place of the stashed text. With `markdown_in_raw`
        set and a `markdown` attribute on the opening tag, only the
        opening tag (minus that attribute) and the closing tag are
        stashed and the content in between is kept as text.

        Returns the resulting blocks joined with blank lines and split
        back into a list of lines.
        """
        text = "\n".join(lines)
        new_blocks = []
        # No maxsplit is given, so rsplit yields the same list as split.
        text = text.rsplit("\n\n")
        # Blocks collected while inside an unclosed tag.
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False # flag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            # A second leading newline, if any, is removed as well.
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs  = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    # Text following the closing tag is pushed back onto
                    # the queue so it gets processed as its own block.
                    if data_index < len(block) \
                        and (util.isBlockLevel(left_tag)
                        or left_tag == '--'):
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    # Neither block-level nor a "<!", "<?", "<@", "<%"
                    # construct: leave the block in the text untouched.
                    if not (util.isBlockLevel(left_tag) \
                        or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    # hr tags are kept in the text (stripped), not stashed.
                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                        and self._equal_tags(left_tag, right_tag):
                        # The block holds a complete element.
                        if self.markdown_in_raw and 'markdown' in attrs.keys():
                            # Stash the opening tag (with the markdown
                            # attribute removed) and the closing tag
                            # separately; keep the content as text.
                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                           '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete

                        # NOTE: "and" binds tighter than "or", so this
                        # reads: block-level tag, OR (comment AND block
                        # does not end with ">").
                        if util.isBlockLevel(left_tag) or left_tag == "--" \
                            and not block.rstrip().endswith(">"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))

                        continue

                # Not an HTML candidate: keep the block as text.
                new_blocks.append(block)

            else:
                # Inside an unclosed tag: accumulate and look for its end.
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    # `attrs` and `left_index` still hold the values set
                    # when the opening tag was parsed.
                    if self.markdown_in_raw and 'markdown' in attrs.keys():
                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                       '', items[0][:left_index])
                        items[0] = items[0][left_index:]
                        end = items[-1][-len(right_tag)-2:]
                        items[-1] = items[-1][:-len(right_tag)-2]
                        new_blocks.append(
                            self.markdown.htmlStash.store(start))
                        new_blocks.extend(items)
                        new_blocks.append(
                            self.markdown.htmlStash.store(end))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        # Input ended while still inside a tag: flush what was collected.
        if items:
            if self.markdown_in_raw and 'markdown' in attrs.keys():
                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                               '', items[0][:left_index])
                items[0] = items[0][left_index:]
                end = items[-1][-len(right_tag)-2:]
                items[-1] = items[-1][:-len(right_tag)-2]
                new_blocks.append(
                    self.markdown.htmlStash.store(start))
                new_blocks.extend(items)
                # Unlike the in-loop case, an all-whitespace tail is not
                # stashed here.
                if end.strip():
                    new_blocks.append(
                        self.markdown.htmlStash.store(end))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
 302
 303
 304 class ReferencePreprocessor(Preprocessor):
 305     """ Remove reference definitions from text and store for later use. """
 306
 307     TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
 308     RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
 309     TITLE_RE = re.compile(r'^%s$' % TITLE)
 310
 311     def run (self, lines):
 312         new_text = [];
 313         while lines:
 314             line = lines.pop(0)
 315             m = self.RE.match(line)
 316             if m:
 317                 id = m.group(1).strip().lower()
 318                 link = m.group(2).lstrip('<').rstrip('>')
 319                 t = m.group(5) or m.group(6) or m.group(7)
 320                 if not t:
 321                     # Check next line for title
 322                     tm = self.TITLE_RE.match(lines[0])
 323                     if tm:
 324                         lines.pop(0)
 325                         t = tm.group(2) or tm.group(3) or tm.group(4)
 326                 self.markdown.references[id] = (link, t)
 327             else:
 328                 new_text.append(line)
 329
 330         return new_text #+ "\n"