# -*- python; coding: utf-8 -*-
#
# gtk-doc - GTK DocBook documentation generator.
# Copyright (C) 1998 Damon Chaplin
#               2007-2016 Stefan Sauer
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Markdown to Docbook converter"""
# Helpers that live in the .mkdb module. They are bound to None here and
# replaced by the real functions once they are imported from .mkdb.
ExpandAbbreviations = MakeXRef = MakeHashXRef = tagify = None
32 # Elements to consider non-block items in MarkDown parsing
33 MD_TEXT_LEVEL_ELEMENTS
= {
34 'emphasis', 'envar', 'filename', 'firstterm', 'footnote', 'function', 'literal',
35 'manvolnum', 'option', 'replaceable', 'structfield', 'structname', 'title',
# Characters that may follow a backslash to be emitted literally when span
# elements are parsed.
MD_ESCAPABLE_CHARS = r'\`*_{}[]()>#+-.!'
# Extra escapable characters: '@' and '%' are span markers of their own in
# this converter, so they need an escape as well.
MD_GTK_ESCAPABLE_CHARS = r'@%'
# TODO(ensonic): find a better way to do this
44 global ExpandAbbreviations
, MakeXRef
, MakeHashXRef
, tagify
45 from .mkdb
import ExpandAbbreviations
, MakeXRef
, MakeHashXRef
, tagify
48 def MarkDownParseBlocks(lines
, symbol
, context
):
50 md_block
= {"type": ''}
52 logging
.debug("parsing %s lines", len(lines
))
54 logging
.info("type='%s', int='%s', parsing '%s'", md_block
["type"], md_block
.get('interrupted'), line
)
59 if md_block
["type"] == "markup":
60 if 'closed' not in md_block
:
61 if md_block
["start"] in line
:
62 md_block
["depth"] += 1
64 if md_block
["end"] in line
:
65 if md_block
["depth"] > 0:
66 md_block
["depth"] -= 1
68 logging
.info("closing tag '%s'", line
)
69 md_block
["closed"] = 1
70 # TODO(ensonic): reparse inner text with MarkDownParseLines?
72 md_block
["text"] += "\n" + line
73 logging
.info("add to markup: '%s'", line
)
76 deindented_line
= line
.lstrip()
78 if md_block
["type"] == "heading":
79 # a heading is ended by any level less than or equal
80 if md_block
["level"] == 1:
81 heading_match
= re
.search(r
'^[#][ \t]+(.+?)[ \t]*[#]*[ \t]*(?:{#([^}]+)})?[ \t]*$', line
)
82 if re
.search(r
'^={4,}[ \t]*$', line
):
83 text
= md_block
["lines"].pop()
84 md_block
.pop("interrupted", None)
85 md_blocks
.append(md_block
)
86 md_block
= {'type': "heading",
93 md_block
.pop("interrupted", None)
94 md_blocks
.append(md_block
)
95 md_block
= {'type': "heading",
96 'text': heading_match
.group(1),
100 if heading_match
.group(2):
101 md_block
['id'] = heading_match
.group(2)
104 # push lines into the block until the end is reached
105 md_block
["lines"].append(line
)
109 heading_match
= re
.search(r
'^([#]{1,2})[ \t]+(.+?)[ \t]*[#]*[ \t]*(?:{#([^}]+)})?[ \t]*$', line
)
110 if re
.search(r
'^[=]{4,}[ \t]*$', line
):
111 text
= md_block
["lines"].pop()
112 md_block
.pop("interrupted", None)
113 md_blocks
.append(md_block
)
114 md_block
= {'type': "heading",
120 elif re
.search(r
'^[-]{4,}[ \t]*$', line
):
121 text
= md_block
["lines"].pop()
122 md_block
.pop("interrupted", None)
123 md_blocks
.append(md_block
)
124 md_block
= {'type': "heading",
131 md_block
.pop("interrupted", None)
132 md_blocks
.append(md_block
)
133 md_block
= {'type': "heading",
134 'text': heading_match
.group(2),
136 'level': len(heading_match
.group(1))
138 if heading_match
.group(3):
139 md_block
['id'] = heading_match
.group(3)
142 # push lines into the block until the end is reached
143 md_block
["lines"].append(line
)
145 elif md_block
["type"] == "code":
146 end_of_code_match
= re
.search(r
'^[ \t]*\]\|(.*)', line
)
147 if end_of_code_match
:
148 md_blocks
.append(md_block
)
149 md_block
= {'type': "paragraph",
150 'text': end_of_code_match
.group(1),
154 md_block
["lines"].append(line
)
157 if deindented_line
== '':
158 logging
.info('setting "interrupted" due to empty line')
159 md_block
["interrupted"] = 1
162 if md_block
["type"] == "quote":
163 if 'interrupted' not in md_block
:
164 line
= re
.sub(r
'^[ ]*>[ ]?', '', line
)
165 md_block
["lines"].append(line
)
168 elif md_block
["type"] == "li":
169 marker
= md_block
["marker"]
170 marker_match
= re
.search(r
'^([ ]{0,3})(%s)[ ](.*)' % marker
, line
)
172 indentation
= marker_match
.group(1)
173 if md_block
["indentation"] != indentation
:
174 md_block
["lines"].append(line
)
176 ordered
= md_block
["ordered"]
177 md_block
.pop('last', None)
178 md_blocks
.append(md_block
)
179 md_block
= {'type': "li",
181 'indentation': indentation
,
184 'lines': [re
.sub(r
'^[ ]{0,4}', '', marker_match
.group(3))],
188 if 'interrupted' in md_block
:
189 if first_char
== " ":
190 md_block
["lines"].append('')
191 line
= re
.sub(r
'^[ ]{0,4}', '', line
)
192 md_block
["lines"].append(line
)
193 md_block
.pop("interrupted", None)
196 line
= re
.sub(r
'^[ ]{0,4}', '', line
)
197 md_block
["lines"].append(line
)
200 # indentation sensitive types
201 heading_match
= re
.search(r
'^([#]{1,2})[ \t]+(.+?)[ \t]*[#]*[ \t]*(?:{#([^}]+)})?[ \t]*$', line
)
202 code_match
= re
.search(r
'^[ \t]*\|\[[ ]*(?:<!-- language="([^"]+?)" -->)?', line
)
205 md_blocks
.append(md_block
)
206 md_block
= {'type': "heading",
207 'text': heading_match
.group(2),
209 'level': len(heading_match
.group(1)),
211 if heading_match
.group(3):
212 md_block
['id'] = heading_match
.group(3)
214 elif re
.search(r
'^={4,}[ \t]*$', line
):
215 # setext heading (====)
217 if md_block
["type"] == "paragraph" and "interrupted" in md_block
:
218 md_blocks
.append(md_block
.copy())
219 md_block
["type"] = "heading"
220 md_block
["lines"] = []
221 md_block
["level"] = 1
223 elif re
.search(r
'^-{4,}[ \t]*$', line
):
224 # setext heading (-----)
226 if md_block
["type"] == "paragraph" and "interrupted" in md_block
:
227 md_blocks
.append(md_block
.copy())
228 md_block
["type"] = "heading"
229 md_block
["lines"] = []
230 md_block
["level"] = 2
235 md_block
["interrupted"] = 1
236 md_blocks
.append(md_block
)
237 md_block
= {'type': "code",
240 if code_match
.group(1):
241 md_block
['language'] = code_match
.group(1)
244 # indentation insensitive types
245 markup_match
= re
.search(r
'^[ ]*<\??(\w+)[^>]*([\/\?])?[ \t]*>', line
)
246 li_match
= re
.search(r
'^([ ]*)[*+-][ ](.*)', line
)
247 quote_match
= re
.search(r
'^[ ]*>[ ]?(.*)', line
)
248 if re
.search(r
'^[ ]*<!DOCTYPE/', line
):
249 md_blocks
.append(md_block
)
250 md_block
= {'type': "markup",
251 'text': deindented_line
,
258 # markup, including <?xml version="1.0"?>
259 tag
= markup_match
.group(1)
260 is_self_closing
= markup_match
.group(2) is not None
263 # TODO(ensonic): consider adding more uri schemes (ftp, ...)
264 if re
.search(r
'https?', tag
):
265 logging
.info("skipping link '%s'", tag
)
# for TEXT_LEVEL_ELEMENTS, we want to keep them as-is in the paragraph
# instead of creating a markdown block.
269 scanning_for_end_of_text_level_tag
= (
270 md_block
["type"] == "paragraph" and
271 'start' in md_block
and
272 'closed' not in md_block
)
273 logging
.info("markup found '%s', scanning %s ?", tag
, scanning_for_end_of_text_level_tag
)
274 if tag
not in MD_TEXT_LEVEL_ELEMENTS
and not scanning_for_end_of_text_level_tag
:
275 md_blocks
.append(md_block
)
278 logging
.info("self-closing docbook '%s'", tag
)
279 md_block
= {'type': "self-closing tag",
280 'text': deindented_line
,
285 logging
.info("new markup '%s'", tag
)
286 md_block
= {'type': "markup",
287 'text': deindented_line
,
288 'start': '<' + tag
+ '>',
289 'end': '</' + tag
+ '>',
292 if re
.search(r
'<\/%s>' % tag
, deindented_line
):
293 md_block
["closed"] = 1
297 if tag
in MD_TEXT_LEVEL_ELEMENTS
:
298 logging
.info("text level docbook '%s' in '%s' state", tag
, md_block
["type"])
299 # TODO(ensonic): handle nesting
300 if not scanning_for_end_of_text_level_tag
:
301 if not re
.search(r
'<\/%s>' % tag
, deindented_line
):
302 logging
.info("new text level markup '%s'", tag
)
303 md_block
["start"] = '<' + tag
+ '>'
304 md_block
["end"] = '</' + tag
+ '>'
305 md_block
.pop("closed", None)
306 logging
.info("scanning for end of '%s'", tag
)
309 if md_block
["end"] in deindented_line
:
310 md_block
["closed"] = 1
311 logging
.info("found end of '%s'", tag
)
314 md_blocks
.append(md_block
)
315 indentation
= li_match
.group(1)
316 md_block
= {'type': "li",
318 'indentation': indentation
,
322 'lines': [re
.sub(r
'^[ ]{0,4}', '', li_match
.group(2))],
326 md_blocks
.append(md_block
)
327 md_block
= {'type': "quote",
328 'lines': [quote_match
.group(1)],
333 list_item_match
= re
.search(r
'^([ ]{0,4})\d+[.][ ]+(.*)', line
)
335 md_blocks
.append(md_block
)
336 indentation
= list_item_match
.group(1)
337 md_block
= {'type': "li",
339 'indentation': indentation
,
343 'lines': [re
.sub(r
'^[ ]{0,4}', '', list_item_match
.group(2))],
348 if md_block
["type"] == "paragraph":
349 if "interrupted" in md_block
:
350 md_blocks
.append(md_block
)
351 md_block
= {'type': "paragraph",
354 logging
.info("new paragraph due to interrupted")
356 md_block
["text"] += "\n" + line
357 logging
.info("add to paragraph: '%s'", line
)
360 md_blocks
.append(md_block
)
361 md_block
= {'type': "paragraph",
364 logging
.info("new paragraph due to different block type")
366 md_blocks
.append(md_block
)
372 def MarkDownParseSpanElementsInner(text
, markersref
):
374 markers
= {i
: 1 for i
in markersref
}
378 closest_marker_position
= -1
383 for marker
, use
in markers
.items():
387 marker_position
= text
.find(marker
)
389 if marker_position
< 0:
393 if closest_marker
== '' or marker_position
< closest_marker_position
:
394 closest_marker
= marker
395 closest_marker_position
= marker_position
397 if closest_marker_position
>= 0:
398 text_marker
= text
[closest_marker_position
:]
400 if text_marker
== '':
405 markup
+= text
[:closest_marker_position
]
406 text
= text
[closest_marker_position
:]
407 markers_rest
= {k
: v
for k
, v
in markers
.items() if v
and k
!= closest_marker
}
409 if closest_marker
== '![' or closest_marker
== '[':
410 # 'id-ref' : local id reference
411 # 'title' : link short description/alt-text/tooltip
413 # 'href' : external link
414 # 'is-media': is link to media object
417 # FIXME: '(?R)' is a recursive subpattern
418 # match a [...] block with no ][ inside or this thing again
419 # m = re.search(r'\[((?:[^][]|(?R))*)\]', text)
420 m
= re
.search(r
'\[((?:[^][])*)\]', text
)
421 if ']' in text
and m
:
422 element
= {'is-media': text
[0] == '!',
423 'a': EscapeEntities(m
.group(1)),
426 offset
= len(m
.group(0))
427 if element
['is-media']:
429 logging
.debug("Recursive md-expr match: off=%d, text='%s', match='%s'", offset
, text
, m
.group(1))
431 remaining_text
= text
[offset
:]
433 m2
= re
.search(r
'''^\([ ]*([^)'"]*?)(?:[ ]+['"](.+?)['"])?[ ]*\)''', remaining_text
)
435 m3
= re
.search(r
'^\s*\[([^\]<]*?)\]', remaining_text
)
437 element
['href'] = m2
.group(1)
439 element
['title'] = m2
.group(2)
440 offset
+= len(m2
.group(0))
442 element
['id-ref'] = m3
.group(1)
443 offset
+= len(m3
.group(0))
448 logging
.debug("output link for", element
)
450 if 'href' in element
:
451 element
['href'] = EscapeEntities(element
['href'])
453 if element
['is-media']:
455 markup
+= '<inlinemediaobject><imageobject><imagedata fileref="' + \
456 element
['href'] + '"></imagedata></imageobject>'
459 markup
+= "<textobject><phrase>" + element
['a'] + "</phrase></textobject>"
461 markup
+= "</inlinemediaobject>"
462 elif 'id-ref' in element
:
464 element
['a'] = MarkDownParseSpanElementsInner(element
['a'], markers_rest
)
465 markup
+= '<link linkend="' + element
['id-ref'] + '"'
467 if 'title' in element
:
468 # title attribute not supported
471 markup
+= '>' + element
['a'] + "</link>"
474 element
['a'] = MarkDownParseSpanElementsInner(element
['a'], markers_rest
)
475 markup
+= '<ulink url="' + element
['href'] + '"'
477 if 'title' in element
:
478 # title attribute not supported
481 markup
+= '>' + element
['a'] + "</ulink>"
484 markup
+= closest_marker
485 if closest_marker
== '![':
490 elif closest_marker
== '<':
491 m4
= re
.search(r
'^<(https?:[\/]{2}[^\s]+?)>', text
, flags
=re
.I
)
492 m5
= re
.search(r
'^<([A-Za-z0-9._-]+?@[A-Za-z0-9._-]+?)>', text
)
493 m6
= re
.search(r
'^<[^>]+?>', text
)
495 element_url
= EscapeEntities(m4
.group(1))
497 markup
+= '<ulink url="' + element_url
+ '">' + element_url
+ '</ulink>'
498 offset
= len(m4
.group(0))
500 markup
+= "<ulink url=\"mailto:" + m5
.group(1) + "\">" + m5
.group(1) + "</ulink>"
501 offset
= len(m5
.group(0))
503 markup
+= m6
.group(0)
504 offset
= len(m6
.group(0))
509 elif closest_marker
== "\\":
512 special_char
= text
[1]
513 if special_char
in MD_ESCAPABLE_CHARS
or special_char
in MD_GTK_ESCAPABLE_CHARS
:
514 markup
+= special_char
520 elif closest_marker
== "`":
521 m7
= re
.search(r
'^(`+)([^`]+?)\1(?!`)', text
)
523 element_text
= EscapeEntities(m7
.group(2))
524 markup
+= "<literal>" + element_text
+ "</literal>"
525 offset
= len(m7
.group(0))
530 elif closest_marker
== "@":
532 # FIXME: we could make those also links ($symbol.$2), but that would be less
533 # useful as the link target is a few lines up or down
534 m7
= re
.search(r
'^(\A|[^\\])\@(\w+((\.|->)\w+)*)\s*\(\)', text
)
535 m8
= re
.search(r
'^(\A|[^\\])\@(\w+((\.|->)\w+)*)', text
)
536 m9
= re
.search(r
'^\\\@', text
)
538 markup
+= m7
.group(1) + "<parameter>" + m7
.group(2) + "()</parameter>\n"
539 offset
= len(m7
.group(0))
541 # Convert '@param', but not '\@param'.
542 markup
+= m8
.group(1) + "<parameter>" + m8
.group(2) + "</parameter>\n"
543 offset
= len(m8
.group(0))
546 offset
= len(m9
.group(0))
551 elif closest_marker
== '#':
552 m10
= re
.search(r
'^(\A|[^\\])#([\w\-:\.]+[\w]+)\s*\(\)', text
)
553 m11
= re
.search(r
'^(\A|[^\\])#([\w\-:\.]+[\w]+)', text
)
554 m12
= re
.search(r
'^\\#', text
)
556 # handle #Object.func()
557 markup
+= m10
.group(1) + MakeXRef(m10
.group(2), tagify(m10
.group(2) + "()", "function"))
558 offset
= len(m10
.group(0))
560 # Convert '#symbol', but not '\#symbol'.
561 markup
+= m11
.group(1) + MakeHashXRef(m11
.group(2), "type")
562 offset
= len(m11
.group(0))
565 offset
= len(m12
.group(0))
570 elif closest_marker
== "%":
571 m12
= re
.search(r
'^(\A|[^\\])\%(-?\w+)', text
)
572 m13
= re
.search(r
'^\\%', text
)
574 # Convert '%constant', but not '\%constant'.
575 # Also allow negative numbers, e.g. %-1.
576 markup
+= m12
.group(1) + MakeXRef(m12
.group(2), tagify(m12
.group(2), "literal"))
577 offset
= len(m12
.group(0))
580 offset
= len(m13
.group(0))
def MarkDownParseSpanElements(text):
    """Expand the inline (span level) markers in text.

    The text is first handed to MarkDownParseSpanElementsInner() together
    with the full list of span markers. Afterwards every remaining
    'name()' is replaced by a cross reference built with MakeXRef() and
    tagify().

    Args:
        text: the text of a single block.

    Returns:
        The text with span elements and 'name()' references expanded.
    """
    markers = ["\\", '<', '![', '[', "`", '%', '#', '@']

    text = MarkDownParseSpanElementsInner(text, markers)

    # Convert 'function()' or 'macro()'.
    # if there is abc_*_def() we don't want to make a link to _def()
    # FIXME: also handle abc(def(....)) : but that would need to be done recursively :/
    def f(m):
        # group(1) is the single character in front of the name; it is kept
        # as-is, only the name itself becomes the cross reference.
        return m.group(1) + MakeXRef(m.group(2), tagify(m.group(2) + "()", "function"))
    text = re.sub(r'([^\*.\w])(\w+)\s*\(\)', f, text)
    return text
def EscapeEntities(text):
    """Escape the characters '&', '<' and '>' as XML entities.

    The ampersand is replaced first so that the '&' introduced by the
    '&lt;' and '&gt;' replacements is not escaped a second time.

    Args:
        text: the string to escape.

    Returns:
        A copy of text with '&', '<' and '>' replaced by '&amp;', '&lt;'
        and '&gt;'.
    """
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
609 def ReplaceEntities(text
):
610 entities
= [["<", '<'],
619 ["&", '&'], # Do this last, or the others get messed up.
623 text
= re
.sub(i
[0], i
[1], text
)
627 def MarkDownOutputDocBook(blocksref
, symbol
, context
):
632 # $output += "\n<!-- beg type='" . $block->{"type"} . "'-->\n"
634 if block
["type"] == "paragraph":
635 text
= MarkDownParseSpanElements(block
["text"])
636 if context
== "li" and output
== '':
637 if 'interrupted' in block
:
638 output
+= "\n<para>%s</para>\n" % text
640 output
+= "<para>%s</para>" % text
644 output
+= "<para>%s</para>\n" % text
646 elif block
["type"] == "heading":
648 title
= MarkDownParseSpanElements(block
["text"])
650 if block
["level"] == 1:
655 text
= MarkDownParseLines(block
["lines"], symbol
, "heading")
657 output
+= "<%s id=\"%s\">" % (tag
, block
["id"])
659 output
+= "<%s>" % tag
661 output
+= "<title>%s</title>%s</%s>\n" % (title
, text
, tag
)
662 elif block
["type"] == "li":
668 output
+= "<%s>\n" % tag
670 if "interrupted" in block
:
671 block
["lines"].append('')
673 text
= MarkDownParseLines(block
["lines"], symbol
, "li")
674 output
+= "<listitem>" + text
+ "</listitem>\n"
678 output
+= "</%s>\n" % tag
680 elif block
["type"] == "quote":
681 text
= MarkDownParseLines(block
["lines"], symbol
, "quote")
682 output
+= "<blockquote>\n%s</blockquote>\n" % text
683 elif block
["type"] == "code":
684 tag
= "programlisting"
686 if "language" in block
:
687 if block
["language"] == "plain":
688 output
+= "<informalexample><screen><![CDATA[\n"
691 output
+= "<informalexample><programlisting role=\"example\" language=\"%s\"><![CDATA[\n" % block
['language']
693 output
+= "<informalexample><programlisting role=\"example\"><![CDATA[\n"
695 logging
.debug('listing for %s: [%s]', symbol
, '\n'.join(block
['lines']))
696 for line
in block
["lines"]:
697 output
+= ReplaceEntities(line
) + "\n"
699 output
+= "]]></%s></informalexample>\n" % tag
700 elif block
["type"] == "markup":
701 text
= ExpandAbbreviations(symbol
, block
["text"])
702 output
+= text
+ "\n"
704 output
+= block
["text"] + "\n"
706 # $output += "\n<!-- end type='" . $block->{"type"} . "'-->\n"
def MarkDownParseLines(lines, symbol, context):
    """Parse a list of lines into blocks and render the blocks.

    Args:
        lines: list of strings, one per input line.
        symbol: passed through to MarkDownParseBlocks() and
            MarkDownOutputDocBook().
        context: name of the enclosing block type ('' at the top level;
            callers in this file also pass "heading", "li" and "quote").

    Returns:
        The result of MarkDownOutputDocBook() for the parsed blocks.
    """
    logging.info('md parse: ctx=%s, [%s]', context, '\n'.join(lines))
    blocks = MarkDownParseBlocks(lines, symbol, context)
    output = MarkDownOutputDocBook(blocks, symbol, context)
    # Callers concatenate the result into their own output, so it must be
    # handed back explicitly.
    return output
def MarkDownParse(text, symbol):
    """Converts mark down syntax to the respective docbook.

    http://de.wikipedia.org/wiki/Markdown
    Inspired by the design of ParseDown
    http://parsedown.org/
    Copyright (c) 2013 Emanuil Rusev, erusev.com

    Ordered (unnested) Lists
    ------------------------

    1. item 1

    1. item 2 with loooong
       description

    Note: we require a blank line above the list items

    Args:
        text: the markdown text; it is split into lines before parsing.
        symbol: passed through unchanged to MarkDownParseLines().

    Returns:
        The result of MarkDownParseLines() for the lines of text, parsed
        with an empty context.
    """
    # TODO(ensonic): it would be nice to add id parameters to the refsect2 elements

    return MarkDownParseLines(text.splitlines(), symbol, '')