db2html: some cleanup and planning
[gtk-doc.git] / tools / db2html.py
blobc8c9121a2974c88785fd0a6d98d7370282822eec
1 #!/usr/bin/env python3
2 # -*- python; coding: utf-8 -*-
4 # gtk-doc - GTK DocBook documentation generator.
5 # Copyright (C) 2017 Stefan Sauer
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 """Prototype for builtin docbook processing
24 The tool loads the main xml document (<module>-docs.xml) and chunks it
25 like the xsl-stylesheets would do. For that it resolves all the xml-includes.
27 TODO: convert the docbook-xml to html
28 - more templates or maybe don't use jinja2 at all
29 - refentry/index nav headers
30 - check each docbook tag if it can contain #PCDATA, if not don't check for
31 xml.text
32 - integrate syntax-highlighting from fixxref
33 - maybe handle the combination <informalexample><programlisting> directly
34 - switch to http://pygments.org/docs/quickstart/?
35 - integrate MakeXRef from fixxref
36 - first create devhelp2 output
38 OPTIONAL:
39 - minify html: https://pypi.python.org/pypi/htmlmin/
41 Requirements:
42 sudo pip3 install anytree jinja2 lxml
44 Examples:
45 python3 tools/db2html.py tests/gobject/docs/tester-docs.xml
46 ll tests/gobject/docs/db2html
48 python3 tools/db2html.py tests/bugs/docs/tester-docs.xml
49 ll tests/bugs/docs/db2html
50 cp tests/bugs/docs/html/*.{css,png} tests/bugs/docs/db2html/
51 xdg-open tests/bugs/docs/db2html/index.html
52 meld tests/bugs/docs/{html,db2html}
54 Benchmarking:
55 (cd tests/bugs/docs/; rm html-build.stamp; time make html-build.stamp)
56 """
58 import argparse
59 import errno
60 import logging
61 import os
62 import sys
64 from anytree import Node, PreOrderIter
65 from jinja2 import Environment, FileSystemLoader
66 from lxml import etree
68 # TODO(ensonic): requires gtk-doc to be installed, rewrite later
69 sys.path.append('/usr/share/gtk-doc/python')
70 from gtkdoc.fixxref import NoLinks
# Docbook tags for which chunk() creates a node (and thus an output file)
# in the files tree. See http://www.sagehill.net/docbookxsl/Chunking.html
CHUNK_TAGS = [
    'appendix',
    'article',
    'bibliography',  # in article or book
    'book',
    'chapter',
    'colophon',
    'glossary',  # in article or book
    'index',  # in article or book
    'part',
    'preface',
    'refentry',
    'reference',
    'sect1',  # except first
    'section',  # if equivalent to sect1
    'set',
    'setindex',
]
class ChunkParams(object):
    """Naming parameters used to generate chunk file names for one tag."""

    def __init__(self, prefix, parent=None):
        """Set up the naming parameters.

        Args:
          prefix: short string that starts every generated name for the tag
          parent: tag name of the enclosing chunk type, or None
        """
        # prefix of the generated names, e.g. 'ch' for chapters
        self.prefix = prefix
        # Store the argument; it used to be dropped by assigning None here.
        self.parent = parent
        # number of names handed out so far for this tag
        self.count = 0
# Per-tag naming parameters consumed by gen_chunk_name().
# TODO: look up the abbrevs and hierarchy for other tags
# http://www.sagehill.net/docbookxsl/Chunking.html#GeneratedFilenames
CHUNK_PARAMS = {
    'book': ChunkParams(prefix='bk'),
    'chapter': ChunkParams(prefix='ch', parent='book'),
    'index': ChunkParams(prefix='ix', parent='book'),
    'sect1': ChunkParams(prefix='s', parent='chapter'),
    'section': ChunkParams(prefix='s', parent='chapter'),
}
# Precompiled XPath expressions that select the title text of a chunk,
# keyed by the tag of the chunk element.
TITLE_XPATH = {
    tag: etree.XPath(path)
    for tag, path in (
        ('book', './bookinfo/title/text()'),
        ('chapter', './title/text()'),
        ('index', './title/text()'),
        ('refentry', './refmeta/refentrytitle/text()'),
    )
}
# Jinja2 templates are loaded from the 'templates' directory next to this
# file.
TOOL_PATH = os.path.dirname(os.path.abspath(__file__))
_TEMPLATE_DIR = os.path.join(TOOL_PATH, 'templates')
# Alternatives that were considered but are not enabled:
#   loader=PackageLoader('gtkdoc', 'templates'),
#   autoescape=select_autoescape(['html', 'xml'])
#   extensions=['jinja2.ext.do'],
TEMPLATE_ENV = Environment(
    loader=FileSystemLoader(_TEMPLATE_DIR),
    autoescape=False,
    lstrip_blocks=True,
    trim_blocks=True,
)
# Templates keyed by the name of the chunk node they render.
TEMPLATES = {
    name: TEMPLATE_ENV.get_template(template_file)
    for name, template_file in (
        ('book', 'book.html'),
        ('index', 'index.html'),
        ('refentry', 'refentry.html'),
    )
}
def gen_chunk_name(node):
    """Return the base name (without extension) for the chunk of `node`.

    The 'id' attribute is used when the node has one. Otherwise the name is
    built from the prefix registered in CHUNK_PARAMS for the tag plus a
    running two-digit counter. Tags that are not registered yet get an entry
    with the first two letters of the tag as prefix and a warning is logged.
    """
    chunk_id = node.attrib.get('id')
    if chunk_id is not None:
        return chunk_id

    tag = node.tag
    if tag not in CHUNK_PARAMS:
        CHUNK_PARAMS[tag] = ChunkParams(tag[:2])
        logging.warning('Add CHUNK_PARAMS for "%s"', tag)

    params = CHUNK_PARAMS[tag]
    params.count += 1
    # TODO: handle parents to make names of nested tags unique: walk
    # params.parent through CHUNK_PARAMS and prepend each parent's
    # '<prefix><count>'. We only need to prepend the parent if there are > 1
    # of them in the xml.
    return '%s%02d' % (params.prefix, params.count)
def get_chunk_title(node):
    """Return the title text of the chunk `node`.

    The title is looked up with the XPath expression registered in
    TITLE_XPATH for the tag of the node. An empty string is returned (and a
    warning logged) if no expression is registered for the tag or if the
    expression does not match anything in the node.
    """
    tag = node.tag
    xpath = TITLE_XPATH.get(tag)
    if xpath is None:
        logging.warning('Add TITLE_XPATH for "%s"', tag)
        return ''

    # NOTE(review): on a compiled etree.XPath, keyword arguments look like
    # they are passed as XPath variables, so smart_strings=False may have no
    # effect here - confirm against the lxml documentation.
    titles = xpath(node, smart_strings=False)
    if not titles:
        # A chunk without a title used to raise an IndexError here.
        logging.warning('No title found for "%s"', tag)
        return ''
    return titles[0]
def chunk(xml_node, parent=None):
    """Chunk the tree.

    Walks `xml_node` recursively. For every element whose tag is listed in
    CHUNK_TAGS a new files-tree node is created below `parent`; the children
    of that element are then chunked below the new node.

    The first time, we're called with parent=None and in that case we return
    the new_node as the root of the tree.
    """
    # print('<%s %s>' % (xml_node.tag, xml_node.attrib))
    tag = xml_node.tag
    if tag in CHUNK_TAGS:
        # TODO: do we need to remove the xml-node from the parent?
        # We generate the toc from the files tree. Options would be a
        # deepcopy() of xml_node followed by removing it from its parent, or
        # etree.ElementTree(xml_node).getroot().
        filename = gen_chunk_name(xml_node) + '.html'
        title = get_chunk_title(xml_node)
        parent = Node(tag, parent=parent, xml=xml_node, filename=filename,
                      title=title)

    for child in xml_node:
        chunk(child, parent)

    return parent
194 # conversion helpers
def escape_entities(text):
    """Return `text` with '&', '<' and '>' replaced by their html entities."""
    entities = str.maketrans({'&': '&amp;', '<': '&lt;', '>': '&gt;'})
    return text.translate(entities)
def convert_inner(xml, result):
    """Convert all children of `xml` and add their output to `result`.

    Each child is handled by the converter registered for its tag in
    convert_tags; convert__unknown is used for tags without a converter.
    """
    for child in xml:
        converter = convert_tags.get(child.tag, convert__unknown)
        result.extend(converter(child))
def convert_ignore(xml):
    """Converter for tags that produce no output; `xml` is not looked at."""
    nothing = ''
    return [nothing]
# Tags that we already warned about in convert__unknown().
missing_tags = {}


def convert__unknown(xml):
    """Fallback converter for tags that have no entry in convert_tags.

    The children are converted and wrapped into html comments that name the
    tag. A warning is logged the first time a tag is seen.
    """
    tag = xml.tag
    # warn only once
    if tag not in missing_tags:
        logging.warning('Add tag converter for "%s"', tag)
        missing_tags[tag] = True

    output = ['<!-- ' + tag + '-->\n']
    convert_inner(xml, output)
    output.append('<!-- /' + tag + '-->\n')
    return output
def convert_refsect(xml, h_tag, inner_func=convert_inner):
    """Convert a refsect element into a <div> with a heading.

    Args:
      xml: the refsect element; its <title> child is removed once converted
      h_tag: name of the html heading tag used for the title
      inner_func: called as inner_func(xml, result) to convert the children

    Returns:
      list of html fragments
    """
    output = ['<div class="%s">\n' % xml.tag]

    title = xml.find('title')
    if title is not None:
        attrs = xml.attrib
        if 'id' in attrs:
            output.append('<a name="%s"></a>' % attrs['id'])
        output.append('<{0}>{1}</{0}>'.format(h_tag, title.text))
        # the title is done, keep it away from inner_func
        xml.remove(title)

    text = xml.text
    if text:
        output.append(text)
    inner_func(xml, output)
    output.append('</div>')

    tail = xml.tail
    if tail:
        output.append(tail)
    return output
241 # docbook tags
def convert_colspec(xml):
    """Convert a colspec element into a html <col> tag.

    The 'colname' attribute becomes the class and 'colwidth' the width of
    the column; both are optional.
    """
    attrs = xml.attrib
    output = ['<col']
    for attr_name, html_fmt in (('colname', ' class="%s"'),
                                ('colwidth', ' width="%s"')):
        if attr_name in attrs:
            output.append(html_fmt % attrs[attr_name])
    output.append('>\n')
    # is in tgroup and there can be no 'text'
    return output
def convert_div(xml):
    """Convert an element into a <div> that has the tag name as class."""
    output = ['<div class="%s">\n' % xml.tag]
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</div>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_em_class(xml):
    """Convert an element into <em><code> with the tag name as class."""
    output = ['<em class="%s"><code>' % xml.tag]
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</code></em>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_entry(xml):
    """Convert a table entry into a <td>, using the 'role' as class."""
    role = xml.attrib.get('role')
    output = ['<td']
    output.append('>' if role is None else ' class="%s">' % role)
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</td>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_informaltable(xml):
    """Convert an informaltable into a <table> wrapped in a <div>.

    pgwide="1" makes the table 100% wide and frame="none" removes the
    border.
    """
    attrs = xml.attrib
    output = ['<div class="informaltable"><table class="informaltable"']
    if attrs.get('pgwide') == '1':
        output.append(' width="100%"')
    if attrs.get('frame') == 'none':
        output.append(' border="0"')
    output.append('>\n')
    convert_inner(xml, output)
    output.append('</table></div>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_itemizedlist(xml):
    """Convert an itemizedlist into a <ul> wrapped in a <div>."""
    output = [
        '<div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; ">'
    ]
    convert_inner(xml, output)
    output.append('</ul></div>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_link(xml):
    """Convert a link into its content wrapped into GTKDOCLINK comments.

    The comments are left out if the 'linkend' is empty or listed in
    NoLinks.
    """
    # TODO: inline more fixxref functionality
    # TODO: need to build an 'id' map and resolve against internal links too
    linkend = xml.attrib['linkend']
    has_link = linkend not in NoLinks and bool(linkend)

    output = ['<!-- GTKDOCLINK HREF="%s" -->' % linkend] if has_link else []
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    if has_link:
        output.append('<!-- /GTKDOCLINK -->')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_listitem(xml):
    """Convert a listitem into a <li>; only the children are converted."""
    output = ['<li class="listitem">']
    convert_inner(xml, output)
    # is in itemizedlist and there can be no 'text'
    output.append('</li>')
    return output
def convert_literal(xml):
    """Convert an element into a <code> that has the tag name as class."""
    output = ['<code class="%s">' % xml.tag]
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</code>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_para(xml):
    """Convert an element into a <p>.

    For tags other than 'para' the tag name is set as the class.
    """
    tag = xml.tag
    opening = '<p>' if tag == 'para' else '<p class="%s">' % tag
    output = [opening]
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</p>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_phrase(xml):
    """Convert a phrase into a <span>, using the 'role' as class."""
    role = xml.attrib.get('role')
    output = ['<span']
    output.append('>' if role is None else ' class="%s">' % role)
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</span>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_programlisting(xml):
    """Convert a programlisting into a <pre>.

    The leading text of the element is passed through escape_entities().
    """
    output = ['<pre class="programlisting">']
    text = xml.text
    if text:
        output.append(escape_entities(text))
    convert_inner(xml, output)
    output.append('</pre>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_refsect1(xml):
    """Convert a refsect1 into a <div> with a <h2> heading."""

    def convert_inner_with_rules(section, output):
        # Add a divider between two consecutive refsect2
        previous = None
        for child in section:
            tag = child.tag
            if tag == 'refsect2' and previous is not None and previous.tag == tag:
                output.append('<hr>\n')
            converter = convert_tags.get(tag, convert__unknown)
            output.extend(converter(child))
            previous = child

    return convert_refsect(xml, 'h2', convert_inner_with_rules)
def convert_refsect2(xml):
    """Convert a refsect2 into a <div> with a <h3> heading."""
    heading = 'h3'
    return convert_refsect(xml, heading)
def convert_refsect3(xml):
    """Convert a refsect3 into a <div> with a <h4> heading."""
    heading = 'h4'
    return convert_refsect(xml, heading)
def convert_row(xml):
    """Convert a table row into a <tr>; only the children are converted."""
    output = ['<tr>\n']
    convert_inner(xml, output)
    output.append('</tr>\n')
    return output
def convert_span(xml):
    """Convert an element into a <span> that has the tag name as class."""
    output = ['<span class="%s">' % xml.tag]
    text = xml.text
    if text:
        output.append(text)
    convert_inner(xml, output)
    output.append('</span>')
    tail = xml.tail
    if tail:
        output.append(tail)
    return output
def convert_tbody(xml):
    """Convert a tbody into a html <tbody>; only children are converted."""
    output = ['<tbody>']
    convert_inner(xml, output)
    # is in tgroup and there can be no 'text'
    output.append('</tbody>')
    return output
def convert_tgroup(xml):
    """Convert a tgroup.

    tgroup does not expand to anything, but the nested colspecs need to
    be put into a colgroup. The colspec children are removed from `xml`
    after they have been converted.
    """
    output = []
    colspecs = xml.findall('colspec')
    if colspecs:
        output.append('<colgroup>\n')
        for colspec in colspecs:
            output.extend(convert_colspec(colspec))
            # handled here, don't convert it again below
            xml.remove(colspec)
        output.append('</colgroup>\n')
    convert_inner(xml, output)
    # is in informaltable and there can be no 'text'
    return output
def convert_ulink(xml):
    """Convert a ulink into a html <a> that has the tag name as class.

    The 'url' attribute becomes the href. The leading text of the element is
    used as the link text; if there is none, the url itself is shown instead
    of the string 'None'. Child elements are not converted.

    Raises:
      KeyError: if the element has no 'url' attribute
    """
    url = xml.attrib['url']
    # xml.text is None for an empty element (or one that starts with a child)
    label = xml.text or url
    result = ['<a class="%s" href="%s">%s</a>' % (xml.tag, url, label)]
    if xml.tail:
        result.append(xml.tail)
    return result
# Maps docbook tag names to the function that converts them to html. Each
# converter takes the xml element and returns a list of html fragments.
convert_tags = {
    'colspec': convert_colspec,
    'entry': convert_entry,
    'function': convert_span,
    'indexterm': convert_ignore,
    'informalexample': convert_div,
    'informaltable': convert_informaltable,
    'itemizedlist': convert_itemizedlist,
    'link': convert_link,
    'listitem': convert_listitem,
    'literal': convert_literal,
    'para': convert_para,
    'parameter': convert_em_class,
    'phrase': convert_phrase,
    'programlisting': convert_programlisting,
    'releaseinfo': convert_para,
    'refsect1': convert_refsect1,
    'refsect2': convert_refsect2,
    'refsect3': convert_refsect3,
    'returnvalue': convert_span,
    'row': convert_row,
    'structfield': convert_em_class,
    'tbody': convert_tbody,
    'tgroup': convert_tgroup,
    'type': convert_span,
    'ulink': convert_ulink,
    'warning': convert_div,
}
def convert(out_dir, files, node):
    """Convert the docbook chunks to a html file.

    Args:
      out_dir: directory the html file is written to
      files: list of all chunk nodes, used to find the previous/next chunk
      node: the chunk node to convert

    The output file is created even if there is no template for the node; in
    that case a warning is logged and the file stays empty.
    """

    def jinja_convert(xml):
        converter = convert_tags.get(xml.tag, convert__unknown)
        return ''.join(converter(xml))

    logging.info('Writing: %s', node.filename)
    out_path = os.path.join(out_dir, node.filename)
    with open(out_path, 'wt') as html:
        if node.name not in TEMPLATES:
            logging.warning('Add template for "%s"', node.name)
            return

        # TODO: ideally precompile common xpath exprs once:
        #   func = etree.XPath('//b')
        #   func(xml_node)[0]
        # Helpers for the templates are not needed, they can call the lxml
        # api directly (xml.xpath(expr) or
        # xml.xpath(expr, smart_strings=False)[0]).
        template = TEMPLATES[node.name]
        template.globals['convert_block'] = jinja_convert

        params = {
            'xml': node.xml,
            'title': node.title,
            'nav_home': node.root,
        }
        xml_attrs = node.xml.attrib
        if 'id' in xml_attrs:
            params['id'] = xml_attrs['id']
        else:
            # TODO: generate?
            logging.warning('No top-level "id" for "%s"', node.xml.tag)

        # nav params: up, prev, next
        if node.parent:
            params['nav_up'] = node.parent
        ix = files.index(node)
        last_ix = len(files) - 1
        if ix > 0:
            params['nav_prev'] = files[ix - 1]
        if ix < last_ix:
            params['nav_next'] = files[ix + 1]

        # TODO: call a top-level python converter instead
        #   generate_{book,chapter,index,refentry}(files, node)
        # xml is node.xml
        # We need to rewrite all other converters to take
        #   (xml, files, node) or (xml, params)
        # where params is sort of like what we have above
        html.write(template.render(**params))
def main(index_file):
    """Chunk the docbook document `index_file` and write the html files.

    The output goes into a 'db2html' directory next to `index_file`; the
    directory is created if it does not exist yet.
    """
    tree = etree.parse(index_file)
    tree.xinclude()

    dir_name = os.path.dirname(index_file)

    # for testing: dump to output file
    # out_file = os.path.join(dir_name, 'db2html.xml')
    # tree.write(out_file)

    # TODO: rename to 'html' later on
    out_dir = os.path.join(dir_name, 'db2html')
    try:
        os.mkdir(out_dir)
    except FileExistsError:
        # fine, we reuse the existing directory
        pass

    # We need multiple passes:
    # 1) recursively walk the tree and chunk it into a python tree so that we
    #    can generate navigation and link tags.
    #    also collect all 'id' attributes on the way and build map of
    #    id:rel-link (in fixxref it is Links[])
    files_root = chunk(tree.getroot())
    # 2) iterate the tree and output files
    # TODO: use multiprocessing
    all_files = list(PreOrderIter(files_root))
    for file_node in all_files:
        convert(out_dir, all_files, file_node)
    # 3) create a devhelp2.xsl
    # - toc under 'chapter'
    # - keywords under 'functions' from all refsect2 and refsect3
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='db2html - chunk docbook')
    parser.add_argument('sources', nargs='*')
    sources = parser.parse_args().sources
    if len(sources) != 1:
        sys.exit('Expect one source file argument.')

    # Logging is only configured if GTKDOC_TRACE is set; an empty value
    # selects 'INFO'.
    trace = os.environ.get('GTKDOC_TRACE')
    if trace is not None:
        level_name = trace or 'INFO'
        logging.basicConfig(
            stream=sys.stdout,
            level=logging.getLevelName(level_name.upper()),
            format='%(asctime)s:%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s')

    sys.exit(main(sources[0]))