Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / markdown / preprocessors.py
blob3f1cfe7777fbacde0ae2a68ac910e343426e9fb7
1 # markdown is released under the BSD license
2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4 # Copyright 2004 Manfred Stienstra (the original version)
5 #
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # * Neither the name of the <organization> nor the
17 # names of its contributors may be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
33 """
34 PRE-PROCESSORS
35 =============================================================================
37 Preprocessors work on source text before we start doing anything too
38 complicated.
39 """
41 from __future__ import absolute_import
42 from __future__ import unicode_literals
43 from . import util
44 from . import odict
45 import re
def build_preprocessors(md_instance, **kwargs):
    """
    Create the ordered collection of preprocessors Markdown uses by default.

    Whitespace normalisation is registered first and reference extraction
    last.  Raw HTML block handling is registered between them unless
    ``md_instance.safeMode`` equals ``'escape'``.

    Any extra keyword arguments are accepted and ignored.

    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)

    escape_mode = md_instance.safeMode == 'escape'
    if not escape_mode:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)

    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
class Preprocessor(util.Processor):
    """
    Base class for every preprocessor.

    A preprocessor works on the document after it has been broken into
    lines.  Its ``run`` method receives the list of lines, changes it as
    needed, and hands back either that same list or a newly built one.

    All preprocessors must derive from markdown.Preprocessor.

    """

    def run(self, lines):
        """
        Process the document; meant to be overridden by each subclass.

        ``lines`` is the document as a list of strings split on newlines.
        Overriding implementations return the (possibly modified) list of
        lines.  This base implementation does nothing and returns None.

        """
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return the document's lines with line endings and blanks normalised.

        The lines are joined, cleaned up as one string and split again, so
        the returned list may have a different length than the input.

        """
        text = '\n'.join(lines)

        # Remove every occurrence of the util.STX / util.ETX markers.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")

        # Convert CRLF first, then any lone CR, to plain LF.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")

        # Make sure the document ends with a blank line.
        text += "\n\n"

        text = text.expandtabs(self.markdown.tab_length)

        # A line made only of spaces (directly after a newline) becomes empty.
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The document is split into blocks on blank lines.  A block that starts
    with ``<`` and whose tag is block level (per ``util.isBlockLevel``), or
    that starts with ``<!``, ``<?``, ``<@`` or ``<%``, is handed to
    ``self.markdown.htmlStash.store`` and whatever ``store`` returns
    (presumably a placeholder string - confirm against htmlStash) is put in
    the output in its place.  All other blocks pass through unchanged.
    """

    # Templates tried in order to build the closing text for a left tag.
    right_tag_patterns = ["</%s>", "%s>"]

    # Verbose-mode pattern for a single attribute.  The closing quotes must
    # stay on their own line: the newline ends the trailing ``# attr``
    # comment, so the text that follows the substitution in
    # ``left_tag_pattern`` is not swallowed by that comment.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
        | # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
        | # OR
        \s+(?P<attr2>[^>"'/= ]+) # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, a block whose opening tag carries a ``markdown`` attribute
    # has only its opening and closing tags stashed; the content in between
    # is left in the output.
    markdown_in_raw = False

    # Matches the ``markdown`` attribute (with optional value) so it can be
    # removed from an opening tag before that tag is stashed.
    _markdown_attr_re = re.compile(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?')

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of ``block``.

        Returns a ``(tag, length, attrs)`` tuple: the tag name, the number
        of characters the opening tag occupies, and a dict of its
        attributes (valueless or empty attributes map to ``""``).
        """
        m = self.left_tag_re.match(block)
        if not m:
            # Fallback: everything between "<" and the first ">".
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):
                    # attr="value"
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    # attr=value
                    attrs[ma.group('attr1').strip()] = ma.group('value1') or ""
                elif ma.group('attr2'):
                    # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the ``rtag`` that closes the tag opened before ``start_index``.

        Nested ``ltag`` openings found before an ``rtag`` are resolved
        recursively.  Returns the index just past the matching ``rtag``, or
        -1 when no match exists.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for ``left_tag`` in ``block``.

        Returns ``(right_tag, data_index)``, where ``data_index`` is the
        index just past the closing tag.  When none of
        ``right_tag_patterns`` is found, returns a slice taken from the end
        of the block and ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when ``right_tag`` is accepted as closing ``left_tag``."""
        # Slicing (not indexing) keeps an empty left_tag from raising.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        # Comment blocks use "--" for both ends.
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that stand alone on one line (``hr``)."""
        return tag in ('hr', 'hr/')

    def _store_items(self, items, attrs, left_index, right_tag, new_blocks,
                     skip_blank_end=False):
        """Stash the collected ``items`` of a multi-block element.

        With ``markdown_in_raw`` set and a ``markdown`` attribute present,
        only the opening and closing tags are stashed and the content
        between them is appended to ``new_blocks`` as is (``items`` is
        modified in place).  Otherwise the items are joined with blank
        lines and stashed as one unit.  ``skip_blank_end`` suppresses
        stashing a closing part that is empty or whitespace only.
        """
        stash = self.markdown.htmlStash
        if self.markdown_in_raw and 'markdown' in attrs:
            start = self._markdown_attr_re.sub('', items[0][:left_index])
            items[0] = items[0][left_index:]
            end = items[-1][-len(right_tag)-2:]
            items[-1] = items[-1][:-len(right_tag)-2]
            new_blocks.append(stash.store(start))
            new_blocks.extend(items)
            if end.strip() or not skip_blank_end:
                new_blocks.append(stash.store(end))
        else:
            new_blocks.append(stash.store('\n\n'.join(items)))

    def run(self, lines):
        """Replace raw html blocks in ``lines`` with stashed values.

        Returns a new list of lines.
        """
        # Kept reversed so the next block to process is at the end of the
        # list: taking it and pushing back a remainder are O(1) operations.
        pending = "\n".join(lines).split("\n\n")
        pending.reverse()

        new_blocks = []
        items = []          # blocks of an element that is still open
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False      # True while inside an unclosed element
        stash = self.markdown.htmlStash

        while pending:
            block = pending.pop()
            # Drop up to two leading newlines left over from the split.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                items.append(block)
                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        pending.append(block[data_index:])

                    in_tag = False
                    self._store_items(items, attrs, left_index, right_tag,
                                      new_blocks)
                    items = []
                continue

            if not (block.startswith("<") and len(block.strip()) > 1):
                new_blocks.append(block)
                continue

            if block[1] == "!":
                # is a comment block
                left_tag, left_index, attrs = "--", 2, {}
            else:
                left_tag, left_index, attrs = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(left_tag,
                                                        left_index,
                                                        block)
            block_level = util.isBlockLevel(left_tag)

            # Text after the closing tag goes back on the queue as its own
            # block.
            if data_index < len(block) and (block_level or left_tag == '--'):
                pending.append(block[data_index:])
                block = block[:data_index]

            if not (block_level or block[1] in ("!", "?", "@", "%")):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            closed = block.rstrip().endswith(">")
            if closed and self._equal_tags(left_tag, right_tag):
                if self.markdown_in_raw and 'markdown' in attrs:
                    start = self._markdown_attr_re.sub('', block[:left_index])
                    end = block[-len(right_tag)-2:]
                    block = block[left_index:-len(right_tag)-2]
                    new_blocks.append(stash.store(start))
                    new_blocks.append(block)
                    new_blocks.append(stash.store(end))
                else:
                    new_blocks.append(stash.store(block.strip()))
            elif block_level or (left_tag == "--" and not closed):
                # Element is not complete yet: keep collecting blocks.  The
                # parentheses spell out the precedence the original
                # unparenthesised ``or``/``and`` expression already had.
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(stash.store(block.strip()))

        if items:
            # Input ended while an element was still open.
            self._store_items(items, attrs, left_index, right_tag,
                              new_blocks, skip_blank_end=True)
            new_blocks.append('\n')

        return "\n\n".join(new_blocks).split("\n")
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "title", 'title' or (title), padded by spaces.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # A definition line: up to three spaces, ``[id]:``, the link, then an
    # optional title on the same line.
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    # A line consisting of a title only.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of ``lines`` and record them.

        Each definition is saved in ``self.markdown.references`` under its
        lower-cased, stripped id as a ``(link, title)`` tuple; ``title`` is
        None when no title is given.  A title may sit on the line that
        follows the definition, in which case that line is removed too.

        ``lines`` is consumed (left empty) by this method.  Returns a new
        list holding the lines that are not part of a definition.

        """
        new_text = []
        while lines:
            line = lines.pop(0)
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title - but only if there is a next line;
            # a definition on the very last line must not raise IndexError.
            if not title and lines:
                tm = self.TITLE_RE.match(lines[0])
                if tm:
                    lines.pop(0)
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text