tools/db2html.py

   1 #!/usr/bin/env python3
   2 # -*- python; coding: utf-8 -*-
   3 #
   4 # gtk-doc - GTK DocBook documentation generator.
   5 # Copyright (C) 2017  Stefan Sauer
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20 #
  21
  22 """Prototype for builtin docbook processing
  23
  24 The tool loads the main xml document (<module>-docs.xml) and chunks it
  25 like the xsl-stylesheets would do. For that it resolves all the xml-includes.
  26
  27 TODO: convert the docbook-xml to html
  28 - more templates or maybe don't use jinja2 at all
  29 - refentry/index nav headers
  30 - check each docbook tag if it can contain #PCDATA, if not don't check for
  31   xml.text
  32 - integrate syntax-highlighting from fixxref
  33   - maybe handle the combination <informalexample><programlisting> directly
  34   - switch to http://pygments.org/docs/quickstart/?
  35 - integrate MakeXRef from fixxref
  36   - first create devhelp2 output
  37
  38 OPTIONAL:
  39 - minify html: https://pypi.python.org/pypi/htmlmin/
  40
  41 Requirements:
  42 sudo pip3 install anytree jinja2 lxml
  43
  44 Examples:
  45 python3 tools/db2html.py tests/gobject/docs/tester-docs.xml
  46 ll tests/gobject/docs/db2html
  47
  48 python3 tools/db2html.py tests/bugs/docs/tester-docs.xml
  49 ll tests/bugs/docs/db2html
  50 cp tests/bugs/docs/html/*.{css,png} tests/bugs/docs/db2html/
  51 xdg-open tests/bugs/docs/db2html/index.html
  52 meld tests/bugs/docs/{html,db2html}
  53
  54 Benchmarking:
  55 (cd tests/bugs/docs/; rm html-build.stamp; time make html-build.stamp)
  56 """
  57
  58 import argparse
  59 import errno
  60 import logging
  61 import os
  62 import sys
  63
  64 from anytree import Node, PreOrderIter
  65 from jinja2 import Environment, FileSystemLoader
  66 from lxml import etree
  67
  68 # TODO(ensonic): requires gtk-doc to be installed, rewrite later
  69 sys.path.append('/usr/share/gtk-doc/python')
  70 from gtkdoc.fixxref import NoLinks
  71
  72
# Docbook tags that start a new chunk. chunk() creates one tree node (and
# thereby one output file name) for every element whose tag is listed here.
# http://www.sagehill.net/docbookxsl/Chunking.html
CHUNK_TAGS = [
    'appendix',
    'article',
    'bibliography',  # in article or book
    'book',
    'chapter',
    'colophon',
    'glossary',      # in article or book
    'index',         # in article or book
    'part',
    'preface',
    'refentry',
    'reference',
    'sect1',         # except first
    'section',       # if equivalent to sect1
    'set',
    'setindex',
]
  92
  93
  94 class ChunkParams(object):
  95     def __init__(self, prefix, parent=None):
  96         self.prefix = prefix
  97         self.parent = None
  98         self.count = 0
  99
 100
# Naming parameters per chunk tag. gen_chunk_name() uses them for elements
# that have no 'id' attribute and adds entries for unknown tags on the fly.
# Every entry has its own counter.
# TODO: look up the abbrevs and hierarchy for other tags
# http://www.sagehill.net/docbookxsl/Chunking.html#GeneratedFilenames
CHUNK_PARAMS = {
    'book': ChunkParams('bk'),
    'chapter': ChunkParams('ch', 'book'),
    'index': ChunkParams('ix', 'book'),
    'sect1': ChunkParams('s', 'chapter'),
    'section': ChunkParams('s', 'chapter'),
}
 110
# Precompiled XPath expressions that select the title text of a chunk
# element, keyed by tag name. Looked up by get_chunk_title().
TITLE_XPATH = {
    'book': etree.XPath('./bookinfo/title/text()'),
    'chapter': etree.XPath('./title/text()'),
    'index': etree.XPath('./title/text()'),
    'refentry': etree.XPath('./refmeta/refentrytitle/text()'),
}
 117
# Jinja2 templates
# They are loaded from the 'templates' directory next to this script.
TOOL_PATH = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_ENV = Environment(
    # loader=PackageLoader('gtkdoc', 'templates'),
    # autoescape=select_autoescape(['html', 'xml'])
    loader=FileSystemLoader(os.path.join(TOOL_PATH, 'templates')),
    # extensions=['jinja2.ext.do'],
    # NOTE(review): autoescape is off, presumably because the converters hand
    # ready-made html markup to the templates - confirm before enabling it.
    autoescape=False,
    lstrip_blocks=True,
    trim_blocks=True,
)
 129
# Page template per chunk tag. convert() renders a chunk only if its tag has
# an entry here and logs a warning otherwise.
TEMPLATES = {
    'book': TEMPLATE_ENV.get_template('book.html'),
    'index': TEMPLATE_ENV.get_template('index.html'),
    'refentry': TEMPLATE_ENV.get_template('refentry.html'),
}
 135
 136
 137 def gen_chunk_name(node):
 138     if 'id' in node.attrib:
 139         return node.attrib['id']
 140
 141     tag = node.tag
 142     if tag not in CHUNK_PARAMS:
 143         CHUNK_PARAMS[tag] = ChunkParams(node.tag[:2])
 144         logging.warning('Add CHUNK_PARAMS for "%s"', tag)
 145
 146     naming = CHUNK_PARAMS[tag]
 147     naming.count += 1
 148     name = ('%s%02d' % (naming.prefix, naming.count))
 149     # handle parents to make names of nested tags unique
 150     # TODO: we only need to prepend the parent if there are > 1 of them in the
 151     #       xml
 152     # while naming.parent:
 153     #     parent = naming.parent
 154     #     if parent not in CHUNK_PARAMS:
 155     #         break;
 156     #     naming = CHUNK_PARAMS[parent]
 157     #     name = ('%s%02d' % (naming.prefix, naming.count)) + name
 158     return name
 159
 160
 161 def get_chunk_title(node):
 162     tag = node.tag
 163     if tag not in TITLE_XPATH:
 164         logging.warning('Add TITLE_XPATH for "%s"', tag)
 165         return ''
 166
 167     xpath = TITLE_XPATH[tag]
 168     return xpath(node, smart_strings=False)[0]
 169
 170
 171 def chunk(xml_node, parent=None):
 172     """Chunk the tree.
 173
 174     The first time, we're called with parent=None and in that case we return
 175     the new_node as the root of the tree
 176     """
 177     # print('<%s %s>' % (xml_node.tag, xml_node.attrib))
 178     if xml_node.tag in CHUNK_TAGS:
 179         # TODO: do we need to remove the xml-node from the parent?
 180         #       we generate toc from the files tree
 181         # from copy import deepcopy
 182         # sub_tree = deepcopy(xml_node)
 183         # xml_node.getparent().remove(xml_node)
 184         # # or:
 185         # sub_tree = etree.ElementTree(xml_node).getroot()
 186         parent = Node(xml_node.tag, parent=parent, xml=xml_node,
 187                       filename=gen_chunk_name(xml_node) + '.html',
 188                       title=get_chunk_title(xml_node))
 189     for child in xml_node:
 190         chunk(child, parent)
 191
 192     return parent
 193
 194 # conversion helpers
 195
 196
 197 def escape_entities(text):
 198     return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
 199
 200
 201 def convert_inner(xml, result):
 202     for child in xml:
 203         result.extend(convert_tags.get(child.tag, convert__unknown)(child))
 204
 205
 206 def convert_ignore(xml):
 207     return ['']
 208
 209
 210 missing_tags = {}
 211
 212
 213 def convert__unknown(xml):
 214     # warn only once
 215     if xml.tag not in missing_tags:
 216         logging.warning('Add tag converter for "%s"', xml.tag)
 217         missing_tags[xml.tag] = True
 218     result = ['<!-- ' + xml.tag + '-->\n']
 219     convert_inner(xml, result)
 220     result.append('<!-- /' + xml.tag + '-->\n')
 221     return result
 222
 223
 224 def convert_refsect(xml, h_tag, inner_func=convert_inner):
 225     result = ['<div class="%s">\n' % xml.tag]
 226     title = xml.find('title')
 227     if title is not None:
 228         if 'id' in xml.attrib:
 229             result.append('<a name="%s"></a>' % xml.attrib['id'])
 230         result.append('<%s>%s</%s>' % (h_tag, title.text, h_tag))
 231         xml.remove(title)
 232     if xml.text:
 233         result.append(xml.text)
 234     inner_func(xml, result)
 235     result.append('</div>')
 236     if xml.tail:
 237         result.append(xml.tail)
 238     return result
 239
 240
 241 # docbook tags
 242
 243
 244 def convert_colspec(xml):
 245     result = ['<col']
 246     a = xml.attrib
 247     if 'colname' in a:
 248         result.append(' class="%s"' % a['colname'])
 249     if 'colwidth' in a:
 250         result.append(' width="%s"' % a['colwidth'])
 251     result.append('>\n')
 252     # is in tgroup and there can be no 'text'
 253     return result
 254
 255
 256 def convert_div(xml):
 257     result = ['<div class="%s">\n' % xml.tag]
 258     if xml.text:
 259         result.append(xml.text)
 260     convert_inner(xml, result)
 261     result.append('</div>')
 262     if xml.tail:
 263         result.append(xml.tail)
 264     return result
 265
 266
 267 def convert_em_class(xml):
 268     result = ['<em class="%s"><code>' % xml.tag]
 269     if xml.text:
 270         result.append(xml.text)
 271     convert_inner(xml, result)
 272     result.append('</code></em>')
 273     if xml.tail:
 274         result.append(xml.tail)
 275     return result
 276
 277
 278 def convert_entry(xml):
 279     result = ['<td']
 280     if 'role' in xml.attrib:
 281         result.append(' class="%s">' % xml.attrib['role'])
 282     else:
 283         result.append('>')
 284     if xml.text:
 285         result.append(xml.text)
 286     convert_inner(xml, result)
 287     result.append('</td>')
 288     if xml.tail:
 289         result.append(xml.tail)
 290     return result
 291
 292
 293 def convert_informaltable(xml):
 294     result = ['<div class="informaltable"><table class="informaltable"']
 295     a = xml.attrib
 296     if 'pgwide' in a and a['pgwide'] == '1':
 297         result.append(' width="100%"')
 298     if 'frame' in a and a['frame'] == 'none':
 299         result.append(' border="0"')
 300     result.append('>\n')
 301     convert_inner(xml, result)
 302     result.append('</table></div>')
 303     if xml.tail:
 304         result.append(xml.tail)
 305     return result
 306
 307
 308 def convert_itemizedlist(xml):
 309     result = ['<div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; ">']
 310     convert_inner(xml, result)
 311     result.append('</ul></div>')
 312     if xml.tail:
 313         result.append(xml.tail)
 314     return result
 315
 316
 317 def convert_link(xml):
 318     # TODO: inline more fixxref functionality
 319     # TODO: need to build an 'id' map and resolve against internal links too
 320     linkend = xml.attrib['linkend']
 321     if linkend in NoLinks:
 322         linkend = None
 323     result = []
 324     if linkend:
 325         result = ['<!-- GTKDOCLINK HREF="%s" -->' % linkend]
 326     if xml.text:
 327         result.append(xml.text)
 328     convert_inner(xml, result)
 329     if linkend:
 330         result.append('<!-- /GTKDOCLINK -->')
 331     if xml.tail:
 332         result.append(xml.tail)
 333     return result
 334
 335
 336 def convert_listitem(xml):
 337     result = ['<li class="listitem">']
 338     convert_inner(xml, result)
 339     result.append('</li>')
 340     # is in itemizedlist and there can be no 'text'
 341     return result
 342
 343
 344 def convert_literal(xml):
 345     result = ['<code class="%s">' % xml.tag]
 346     if xml.text:
 347         result.append(xml.text)
 348     convert_inner(xml, result)
 349     result.append('</code>')
 350     if xml.tail:
 351         result.append(xml.tail)
 352     return result
 353
 354
 355 def convert_para(xml):
 356     result = ['<p>']
 357     if xml.tag != 'para':
 358         result = ['<p class="%s">' % xml.tag]
 359     if xml.text:
 360         result.append(xml.text)
 361     convert_inner(xml, result)
 362     result.append('</p>')
 363     if xml.tail:
 364         result.append(xml.tail)
 365     return result
 366
 367
 368 def convert_phrase(xml):
 369     result = ['<span']
 370     if 'role' in xml.attrib:
 371         result.append(' class="%s">' % xml.attrib['role'])
 372     else:
 373         result.append('>')
 374     if xml.text:
 375         result.append(xml.text)
 376     convert_inner(xml, result)
 377     result.append('</span>')
 378     if xml.tail:
 379         result.append(xml.tail)
 380     return result
 381
 382
 383 def convert_programlisting(xml):
 384     result = ['<pre class="programlisting">']
 385     if xml.text:
 386         result.append(escape_entities(xml.text))
 387     convert_inner(xml, result)
 388     result.append('</pre>')
 389     if xml.tail:
 390         result.append(xml.tail)
 391     return result
 392
 393
 394 def convert_refsect1(xml):
 395     # Add a divider between two consequitive refsect2
 396     def convert_inner(xml, result):
 397         prev = None
 398         for child in xml:
 399             if child.tag == 'refsect2' and prev is not None and prev.tag == child.tag:
 400                 result.append('<hr>\n')
 401             result.extend(convert_tags.get(child.tag, convert__unknown)(child))
 402             prev = child
 403     return convert_refsect(xml, 'h2', convert_inner)
 404
 405
 406 def convert_refsect2(xml):
 407     return convert_refsect(xml, 'h3')
 408
 409
 410 def convert_refsect3(xml):
 411     return convert_refsect(xml, 'h4')
 412
 413
 414 def convert_row(xml):
 415     result = ['<tr>\n']
 416     convert_inner(xml, result)
 417     result.append('</tr>\n')
 418     return result
 419
 420
 421 def convert_span(xml):
 422     result = ['<span class="%s">' % xml.tag]
 423     if xml.text:
 424         result.append(xml.text)
 425     convert_inner(xml, result)
 426     result.append('</span>')
 427     if xml.tail:
 428         result.append(xml.tail)
 429     return result
 430
 431
 432 def convert_tbody(xml):
 433     result = ['<tbody>']
 434     convert_inner(xml, result)
 435     result.append('</tbody>')
 436     # is in tgroup and there can be no 'text'
 437     return result
 438
 439
 440 def convert_tgroup(xml):
 441     # tgroup does not expand to anything, but the nested colspecs need to
 442     # be put into a colgroup
 443     cols = xml.findall('colspec')
 444     result = []
 445     if cols:
 446         result.append('<colgroup>\n')
 447         for col in cols:
 448             result.extend(convert_colspec(col))
 449             xml.remove(col)
 450         result.append('</colgroup>\n')
 451     convert_inner(xml, result)
 452     # is in informaltable and there can be no 'text'
 453     return result
 454
 455
 456 def convert_ulink(xml):
 457     result = ['<a class="%s" href="%s">%s</a>' % (xml.tag, xml.attrib['url'], xml.text)]
 458     if xml.tail:
 459         result.append(xml.tail)
 460     return result
 461
 462
# Dispatch table: docbook tag name -> converter function. Each converter
# takes the xml element and returns a list of html strings. Tags without an
# entry are handled by convert__unknown().
convert_tags = {
    'colspec': convert_colspec,
    'entry': convert_entry,
    'function': convert_span,
    'indexterm': convert_ignore,
    'informalexample': convert_div,
    'informaltable': convert_informaltable,
    'itemizedlist': convert_itemizedlist,
    'link': convert_link,
    'listitem': convert_listitem,
    'literal': convert_literal,
    'para': convert_para,
    'parameter': convert_em_class,
    'phrase': convert_phrase,
    'programlisting': convert_programlisting,
    'releaseinfo': convert_para,
    'refsect1': convert_refsect1,
    'refsect2': convert_refsect2,
    'refsect3': convert_refsect3,
    'returnvalue': convert_span,
    'row': convert_row,
    'structfield': convert_em_class,
    'tbody': convert_tbody,
    'tgroup': convert_tgroup,
    'type': convert_span,
    'ulink': convert_ulink,
    'warning': convert_div,
}
 491
 492
 493 def convert(out_dir, files, node):
 494     """Convert the docbook chunks to a html file."""
 495
 496     def jinja_convert(xml):
 497         return ''.join(convert_tags.get(xml.tag, convert__unknown)(xml))
 498
 499     logging.info('Writing: %s', node.filename)
 500     with open(os.path.join(out_dir, node.filename), 'wt') as html:
 501         if node.name in TEMPLATES:
 502             # TODO: ideally precompile common xpath exprs once:
 503             #   func = etree.XPath('//b')
 504             #   func(xml_node)[0]
 505             # unused, we can call api :)
 506             # def lxml_xpath_str0(xml, expr):
 507             #     return xml.xpath(expr, smart_strings=False)[0]
 508             #
 509             # def lxml_xpath(xml, expr):
 510             #     return xml.xpath(expr)
 511
 512             template = TEMPLATES[node.name]
 513             template.globals['convert_block'] = jinja_convert
 514             params = {
 515                 'xml': node.xml,
 516                 'title': node.title,
 517                 'nav_home': node.root,
 518             }
 519             if 'id' in node.xml.attrib:
 520                 params['id'] = node.xml.attrib['id']
 521             else:
 522                 # TODO: generate?
 523                 logging.warning('No top-level "id" for "%s"', node.xml.tag)
 524             # nav params: up, prev, next
 525             if node.parent:
 526                 params['nav_up'] = node.parent
 527             ix = files.index(node)
 528             if ix > 0:
 529                 params['nav_prev'] = files[ix - 1]
 530             if ix < len(files) - 1:
 531                 params['nav_next'] = files[ix + 1]
 532
 533             # TODO: call a top-level python converter instead
 534             # generate_{book,chapter,index,refentry}(files, node)
 535             # xml is node.xml
 536             # We need to rewrite all other converters to take
 537             # (xml, files, node) or (xml, params)
 538             # where params is sort of like what we have above
 539
 540             html.write(template.render(**params))
 541         else:
 542             logging.warning('Add template for "%s"', node.name)
 543
 544
 545 def main(index_file):
 546     tree = etree.parse(index_file)
 547     tree.xinclude()
 548
 549     dir_name = os.path.dirname(index_file)
 550
 551     # for testing: dump to output file
 552     # out_file = os.path.join(dir_name, 'db2html.xml')
 553     # tree.write(out_file)
 554
 555     # TODO: rename to 'html' later on
 556     out_dir = os.path.join(dir_name, 'db2html')
 557     try:
 558         os.mkdir(out_dir)
 559     except OSError as e:
 560         if e.errno != errno.EEXIST:
 561             raise
 562
 563     # We need multiple passes:
 564     # 1) recursively walk the tree and chunk it into a python tree so that we
 565     #   can generate navigation and link tags.
 566     #   also collect all 'id' attributes on the way and build map of
 567     #   id:rel-link (in fixxref is is Links[])
 568     files = chunk(tree.getroot())
 569     # 2) iterate the tree and output files
 570     # TODO: use multiprocessing
 571     files = list(PreOrderIter(files))
 572     for node in files:
 573         convert(out_dir, files, node)
 574     # 3) create a devhelp2.xsl
 575     # - toc under 'chapter'
 576     # - keywords under 'functions' from all refsect2 and refsect3
 577
 578
 579 if __name__ == '__main__':
 580     parser = argparse.ArgumentParser(
 581         description='db2html - chunk docbook')
 582     parser.add_argument('sources', nargs='*')
 583     options = parser.parse_args()
 584     if len(options.sources) != 1:
 585         sys.exit('Expect one source file argument.')
 586
 587     log_level = os.environ.get('GTKDOC_TRACE')
 588     if log_level == '':
 589         log_level = 'INFO'
 590     if log_level:
 591         logging.basicConfig(stream=sys.stdout,
 592                             level=logging.getLevelName(log_level.upper()),
 593                             format='%(asctime)s:%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s')
 594
 595     sys.exit(main(options.sources[0]))