tools/db2html.py

   1 #!/usr/bin/env python3
   2 # -*- python; coding: utf-8 -*-
   3 #
   4 # gtk-doc - GTK DocBook documentation generator.
   5 # Copyright (C) 2017  Stefan Sauer
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20 #
  21
  22 """Prototype for builtin docbook processing
  23
  24 The tool loads the main xml document (<module>-docs.xml) and chunks it
  25 like the xsl-stylesheets would do. For that it resolves all the xml-includes.
  26
  27 TODO: convert the docbook-xml to html
  28 - more templates
  29 - refentry/index nav headers
  30 - for refsect, we need a 'long-title' that also contains refpurpose
  31 - figure how to deal with all the possible docbook
  32   - how can we report 'unhandled' data
  33 - we need a generic transform for everything in a para (and others like
  34   releaseinfo)
  35   - this will walk the tree and replace nodes to convert from docbook to html
  36   - we can start with 1:1, but most likely each transform will be a function
  37     that mangles the sub tree and recurses for certain children (kind of what
  38     xslt does)
  39
  40 OPTIONAL:
  41 - minify html: https://pypi.python.org/pypi/htmlmin/
  42
  43 Requirements:
  44 sudo pip3 install anytree jinja2 lxml
  45
  46 Examples:
  47 python3 tools/db2html.py tests/gobject/docs/tester-docs.xml
  48 ll tests/gobject/docs/db2html
  49
  50 python3 tools/db2html.py tests/bugs/docs/tester-docs.xml
  51 ll tests/bugs/docs/db2html
  52 cp tests/bugs/docs/html/*.{css,png} tests/bugs/docs/db2html/
  53 xdg-open tests/bugs/docs/db2html/index.html
  54 meld tests/bugs/docs/{html,db2html}
  55
  56 Benchmarking:
  57 (cd tests/bugs/docs/; rm html-build.stamp; time make html-build.stamp)
  58 """
  59
  60 import argparse
  61 import errno
  62 import logging
  63 import os
  64 import sys
  65
  66 from anytree import Node, PreOrderIter
  67 from jinja2 import Environment, FileSystemLoader
  68 from lxml import etree
  69
  70
  71 # http://www.sagehill.net/docbookxsl/Chunking.html
  72 CHUNK_TAGS = [
  73     'appendix',
  74     'article',
  75     'bibliography',  # in article or book
  76     'book',
  77     'chapter',
  78     'colophon',
  79     'glossary',      # in article or book
  80     'index',         # in article or book
  81     'part',
  82     'preface',
  83     'refentry',
  84     'reference',
  85     'sect1',         # except first
  86     'section',       # if equivalent to sect1
  87     'set',
  88     'setindex',
  89 ]
  90
  91
  92 class ChunkParams(object):
  93     def __init__(self, prefix, parent=None):
  94         self.prefix = prefix
  95         self.parent = None
  96         self.count = 0
  97
  98
  99 # TODO: look up the abbrevs and hierarchy for other tags
 100 # http://www.sagehill.net/docbookxsl/Chunking.html#GeneratedFilenames
 101 CHUNK_PARAMS = {
 102     'book': ChunkParams('bk'),
 103     'chapter': ChunkParams('ch', 'book'),
 104     'index': ChunkParams('ix', 'book'),
 105     'sect1': ChunkParams('s', 'chapter'),
 106     'section': ChunkParams('s', 'chapter'),
 107 }
 108
 109 TITLE_XPATH = {
 110     'book': etree.XPath('./bookinfo/title/text()'),
 111     'chapter': etree.XPath('./title/text()'),
 112     'index': etree.XPath('./title/text()'),
 113     'refentry': etree.XPath('./refmeta/refentrytitle/text()'),
 114 }
 115
 116 # Jinja2 templates
 117 TOOL_PATH = os.path.dirname(os.path.abspath(__file__))
 118 TEMPLATE_ENV = Environment(
 119     # loader=PackageLoader('gtkdoc', 'templates'),
 120     # autoescape=select_autoescape(['html', 'xml'])
 121     loader=FileSystemLoader(os.path.join(TOOL_PATH, 'templates')),
 122     autoescape=False,
 123     trim_blocks=True,
 124 )
 125
 126 TEMPLATES = {
 127     'book': TEMPLATE_ENV.get_template('book.html'),
 128     'index': TEMPLATE_ENV.get_template('index.html'),
 129     'refentry': TEMPLATE_ENV.get_template('refentry.html'),
 130 }
 131
 132
 133 def gen_chunk_name(node):
 134     if 'id' in node.attrib:
 135         return node.attrib['id']
 136
 137     tag = node.tag
 138     if tag not in CHUNK_PARAMS:
 139         CHUNK_PARAMS[tag] = ChunkParams(node.tag[:2])
 140         logging.warning('Add CHUNK_PARAMS for "%s"', tag)
 141
 142     naming = CHUNK_PARAMS[tag]
 143     naming.count += 1
 144     name = ('%s%02d' % (naming.prefix, naming.count))
 145     # handle parents to make names of nested tags unique
 146     # TODO: we only need to prepend the parent if there are > 1 of them in the
 147     #       xml
 148     # while naming.parent:
 149     #     parent = naming.parent
 150     #     if parent not in CHUNK_PARAMS:
 151     #         break;
 152     #     naming = CHUNK_PARAMS[parent]
 153     #     name = ('%s%02d' % (naming.prefix, naming.count)) + name
 154     return name
 155
 156
 157 def get_chunk_title(node):
 158     tag = node.tag
 159     if tag not in TITLE_XPATH:
 160         logging.warning('Add TITLE_XPATH for "%s"', tag)
 161         return ''
 162
 163     xpath = TITLE_XPATH[tag]
 164     return xpath(node, smart_strings=False)[0]
 165
 166
 167 def chunk(xml_node, parent=None):
 168     """Chunk the tree.
 169
 170     The first time, we're called with parent=None and in that case we return
 171     the new_node as the root of the tree
 172     """
 173     # print('<%s %s>' % (xml_node.tag, xml_node.attrib))
 174     if xml_node.tag in CHUNK_TAGS:
 175         # TODO: do we need to remove the xml-node from the parent?
 176         #       we generate toc from the files tree
 177         # from copy import deepcopy
 178         # sub_tree = deepcopy(xml_node)
 179         # xml_node.getparent().remove(xml_node)
 180         # # or:
 181         # sub_tree = etree.ElementTree(xml_node).getroot()
 182         parent = Node(xml_node.tag, parent=parent, xml=xml_node,
 183                       filename=gen_chunk_name(xml_node) + '.html',
 184                       title=get_chunk_title(xml_node))
 185     for child in xml_node:
 186         chunk(child, parent)
 187
 188     return parent
 189
 190
 191 def convert__inner(xml):
 192     result = ''
 193     for child in xml:
 194         result += convert_tags.get(child.tag)(child)
 195     return result
 196
 197
 198 def convert__unknown(xml):
 199     logging.warning('Add tag converter for "%s"', xml.tag)
 200     return '<!-- ' + xml.tag + '-->\n'
 201
 202
 203 def convert_para(xml):
 204     result = '<p>'
 205     if xml.tag != 'para':
 206         result = '<p class="%s">' % xml.tag
 207     if xml.text:
 208         result += xml.text
 209     result += convert__inner(xml)
 210     result += '\n</p>'
 211     if xml.tail:
 212         result += xml.tail
 213     return result
 214
 215
 216 def convert_ulink(xml):
 217     url = xml.text
 218     result = '<a class="%s" href="%s">%s</a>' % (xml.tag, url, url)
 219     return result
 220
 221
 222 convert_tags = {
 223     'para': convert_para,
 224     'ulink': convert_ulink,
 225 }
 226
 227
 228 def convert(out_dir, files, node):
 229     """Convert the docbook chunks to a html file."""
 230
 231     logging.info('Writing: %s', node.filename)
 232     with open(os.path.join(out_dir, node.filename), 'wt') as html:
 233         if node.name in TEMPLATES:
 234             # TODO: ideally precomiple common xpath exprs once:
 235             #   func = etree.XPath('//b')
 236             #   func(xml_node)[0]
 237             # unused, we can call api :)
 238             # def lxml_xpath_str0(xml, expr):
 239             #     return xml.xpath(expr, smart_strings=False)[0]
 240             #
 241             # def lxml_xpath(xml, expr):
 242             #     return xml.xpath(expr)
 243
 244             template = TEMPLATES[node.name]
 245             template.globals['convert_para'] = convert_para
 246             params = {
 247                 'xml': node.xml,
 248                 'title': node.title,
 249                 'nav_home': node.root,
 250             }
 251             if 'id' in node.xml.attrib:
 252                 params['id'] = node.xml.attrib['id']
 253             else:
 254                 # TODO: generate?
 255                 logging.warning('No top-level "id" for "%s"', node.xml.tag)
 256             # nav params: up, prev, next
 257             if node.parent:
 258                 params['nav_up'] = node.parent
 259             ix = files.index(node)
 260             if ix > 0:
 261                 params['nav_prev'] = files[ix - 1]
 262             if ix < len(files) - 1:
 263                 params['nav_next'] = files[ix + 1]
 264
 265             # page specific vars
 266             # TODO: extract into functions?
 267             if node.name == 'book':
 268                 params['toc'] = node.root
 269             elif node.name == 'refsect':
 270                 # TODO: toc params from xml
 271                 # all refsect1 + refsect1/title/text() from xml
 272                 pass
 273
 274             html.write(template.render(**params))
 275         else:
 276             logging.warning('Add template for "%s"', node.name)
 277
 278
 279 def main(index_file):
 280     tree = etree.parse(index_file)
 281     tree.xinclude()
 282
 283     dir_name = os.path.dirname(index_file)
 284
 285     # for testing: dump to output file
 286     # out_file = os.path.join(dir_name, 'db2html.xml')
 287     # tree.write(out_file)
 288
 289     # TODO: rename to 'html' later on
 290     out_dir = os.path.join(dir_name, 'db2html')
 291     try:
 292         os.mkdir(out_dir)
 293     except OSError as e:
 294         if e.errno != errno.EEXIST:
 295             raise
 296
 297     # We need two passes:
 298     # 1) recursively walk the tree and chunk it into a python tree so that we
 299     #   can generate navigation and link tags
 300     files = chunk(tree.getroot())
 301     # 2) iterate the tree and output files
 302     # TODO: use multiprocessing
 303     files = list(PreOrderIter(files))
 304     for node in files:
 305         convert(out_dir, files, node)
 306
 307
 308 if __name__ == '__main__':
 309     parser = argparse.ArgumentParser(
 310         description='db2html - chunk docbook')
 311     parser.add_argument('sources', nargs='*')
 312     options = parser.parse_args()
 313     if len(options.sources) != 1:
 314         sys.exit('Expect one source file argument.')
 315
 316     log_level = os.environ.get('GTKDOC_TRACE')
 317     if log_level == '':
 318         log_level = 'INFO'
 319     if log_level:
 320         logging.basicConfig(stream=sys.stdout,
 321                             level=logging.getLevelName(log_level.upper()),
 322                             format='%(asctime)s:%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s')
 323
 324     sys.exit(main(options.sources[0]))