doc/xml2po-modes/docbook.py

   1 # -*- coding: utf-8 -*-
   2 # Copyright (c) 2004 Danilo Segan <danilo@kvota.net>.
   3 #
   4 # This file is part of xml2po.
   5 #
   6 # xml2po is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # xml2po is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with xml2po; if not, write to the Free Software Foundation, Inc.,
  18 # 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19 #
  20
  21 # This implements special instructions for handling DocBook XML documents
  22 # in a better way.
  23 #
  24 #  This means:
  25 #   — better handling of nested complicated tags (i.e. definitions of
  26 #     ignored-tags and final-tags)
  27 #   — support for merging translator-credits back into DocBook articles
  28 #   — support for setting a language
  29 #
  30
# NOTE: the class in this file is named "docbookXmlMode"; an earlier form of this
#  comment described a single "currentXmlMode" class name shared by all modes
#  -- presumably the loader now looks the class up by mode name; verify against the
#     mode-loading code in xml2po before renaming the class
#
  35 #
  36 import re
  37 import libxml2
  38 import os
  39 import md5
  40 import sys
  41
  42 class docbookXmlMode:
  43     """Class for special handling of DocBook document types.
  44
  45     It sets lang attribute on article elements, and adds translators
  46     to articleinfo/copyright."""
  47     def __init__(self):
  48         self.lists = ['itemizedlist', 'orderedlist', 'variablelist',
  49                       'segmentedlist', 'simplelist', 'calloutlist', 'varlistentry' ]
  50         self.objects = [ 'table', 'figure', 'textobject', 'imageobject', 'mediaobject',
  51                          'screenshot' ]
  52
  53     def getIgnoredTags(self):
  54         "Returns array of tags to be ignored."
  55         return  self.objects + self.lists
  56
  57     def getFinalTags(self):
  58         "Returns array of tags to be considered 'final'."
  59         return ['para', 'formalpara', 'simpara',
  60                 'releaseinfo', 'revnumber', 'title',
  61                 'date', 'term', 'programlisting'] + self.objects + self.lists
  62
  63     def getSpacePreserveTags(self):
  64         "Returns array of tags in which spaces are to be preserved."
  65         return [
  66             'classsynopsisinfo',
  67             'computeroutput',
  68             'funcsynopsisinfo',
  69             'literallayout',
  70             'programlisting',
  71             'screen',
  72             'synopsis',
  73             'userinput'
  74             ]
  75
  76     def getStringForTranslators(self):
  77         """Returns string which will be used to credit translators."""
  78         return "translator-credits"
  79
  80     def getCommentForTranslators(self):
  81         """Returns a comment to be added next to string for crediting translators."""
  82         return """Put one translator per line, in the form of NAME <EMAIL>."""
  83
  84     def getStringForTranslation(self):
  85         """Returns translation of 'translation'."""
  86         return "translator-translation"
  87
  88     def getCommentForTranslation(self):
  89         """Returns a string that explains how 'translation' is to be translated."""
  90         return """Place the translation of 'translation' here."""
  91
  92     def _find_articleinfo(self, node):
  93         if node.name == 'articleinfo' or node.name == 'bookinfo':
  94             return node
  95         child = node.children
  96         while child:
  97             ret = self._find_articleinfo(child)
  98             if ret:
  99                 return ret
 100             child = child.next
 101         return None
 102
 103     def _find_lastcopyright(self, node):
 104         if not node.children:
 105             return None
 106         last = node.lastChild()
 107         tmp = last
 108         while tmp:
 109             if tmp.name == "copyright":
 110                 last = tmp
 111                 break
 112             tmp = tmp.prev
 113         return last
 114
 115     def _md5_for_file(self, filename):
 116         hash = md5.new()
 117         input = open(filename, "rb")
 118         read = input.read(4096)
 119         while read:
 120             hash.update(read)
 121             read = input.read(4096)
 122         input.close()
 123         return hash.hexdigest()
 124
 125     def _output_images(self, node, msg):
 126         if node and node.type=='element' and node.name=='imagedata':
 127             # Use .fileref to construct new message
 128             attr = node.prop("fileref")
 129             if attr:
 130                 dir = os.path.dirname(msg.filename)
 131                 fullpath = os.path.join(dir, attr)
 132                 if os.path.exists(fullpath):
 133                     hash = self._md5_for_file(fullpath)
 134                 else:
 135                     hash = "THIS FILE DOESN'T EXIST"
 136                     print >>sys.stderr, "Warning: image file '%s' not found." % fullpath
 137
 138                 msg.outputMessage("@@image: '%s'; md5=%s" % (attr, hash), node.lineNo(),
 139                                   "When image changes, this message will be marked fuzzy or untranslated for you.\n"+
 140                                   "It doesn't matter what you translate it to: it's not used at all.")
 141         elif node and node.children:
 142             child = node.children
 143             while child:
 144                 self._output_images(child,msg)
 145                 child = child.next
 146
 147
 148     def preProcessXml(self, doc, msg):
 149         """Add additional messages of interest here."""
 150         root = doc.getRootElement()
 151         self._output_images(root,msg)
 152
 153     def postProcessXmlTranslation(self, doc, language, translators, translation):
 154         """Sets a language and translators in "doc" tree.
 155
 156         "translators" is a string consisted of "Name <email>" pairs
 157         of each translator, separated by newlines."""
 158
 159         root = doc.getRootElement()
 160         # DocBook documents can be something other than article, handle that as well in the future
 161         while root and root.name != 'article' and root.name != 'book':
 162             root = root.next
 163         if root and (root.name == 'article' or root.name == 'book'):
 164             root.setProp('lang', language)
 165         else:
 166             return
 167
 168         if translators == self.getStringForTranslators():
 169             return
 170         else:
 171             # Now, lets find 'articleinfo' (it can be something else, but this goes along with 'article')
 172             ai = self._find_articleinfo(root)
 173             if not ai:
 174                 return
 175
 176             # Now, lets do one translator at a time
 177             transgroup = libxml2.newNode("authorgroup")
 178             lines = translators.split("\n")
 179             for line in lines:
 180                 line = line.strip()
 181                 match = re.match(r"^([^<,]+)\s*(?:<([^>,]+)>)?$", line)
 182                 if match:
 183                     last = self._find_lastcopyright(ai)
 184                     copy = libxml2.newNode("othercredit")
 185                     if last:
 186                         copy = last.addNextSibling(copy)
 187                     else:
 188                         transgroup.addChild(copy)
 189                         ai.addChild(transgroup)
 190                     copy.newChild(None, "contrib", translation.encode('utf-8'))
 191                     if match.group(1) and match.group(2):
 192                         holder = match.group(1)+"(%s)" % match.group(2)
 193                     elif match.group(1):
 194                         holder = match.group(1)
 195                     elif match.group(2):
 196                         holder = match.group(2)
 197                     else:
 198                         holder = "???"
 199                     copy.newChild(None, "othername", holder.encode('utf-8'))
 200
 201 # Perform some tests when ran standalone
 202 if __name__ == '__main__':
 203     test = docbookXmlMode()
 204     print "Ignored tags       : " + repr(test.getIgnoredTags())
 205     print "Final tags         : " + repr(test.getFinalTags())
 206     print "Space-preserve tags: " + repr(test.getSpacePreserveTags())
 207
 208     print "Credits from string: '%s'" % test.getStringForTranslators()
 209     print "Explanation for credits:\n\t'%s'" % test.getCommentForTranslators()
 210
 211     print "String for translation: '%s'" % test.getStringForTranslation()
 212     print "Explanation for translation:\n\t'%s'" % test.getCommentForTranslation()
 213