convert/po2dtd.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """script that converts a .po file to a UTF-8 encoded .dtd file as used by mozilla
  23 either done using a template or just using the .po file"""
  24
  25 from translate.storage import dtd
  26 from translate.storage import po
  27 from translate.misc import quote
  28 import warnings
  29
  30 # labelsuffixes and accesskeysuffixes are combined to accelerator notation
  31 labelsuffixes = (".label", ".title")
  32 accesskeysuffixes = (".accesskey", ".accessKey", ".akey")
  33
  34 def getlabel(unquotedstr):
  35     """retrieve the label from a mixed label+accesskey entity"""
  36     if isinstance(unquotedstr, str):
  37         unquotedstr = unquotedstr.decode("UTF-8")
  38     # mixed labels just need the & taken out
  39     # except that &entity; needs to be avoided...
  40     amppos = 0
  41     while amppos >= 0:
  42         amppos = unquotedstr.find("&", amppos)
  43         if amppos != -1:
  44             amppos += 1
  45             semipos = unquotedstr.find(";", amppos)
  46             if semipos != -1:
  47                 if unquotedstr[amppos:semipos].isalnum():
  48                     continue
  49             # otherwise, cut it out... only the first one need be changed
  50             # (see below to see how the accesskey is done)
  51             unquotedstr = unquotedstr[:amppos-1] + unquotedstr[amppos:]
  52             break
  53     return unquotedstr.encode("UTF-8")
  54
  55 def getaccesskey(unquotedstr):
  56     """retrieve the access key from a mixed label+accesskey entity"""
  57     if isinstance(unquotedstr, str):
  58         unquotedstr = unquotedstr.decode("UTF-8")
  59     # mixed access keys need the key extracted from after the &
  60     # but we must avoid proper entities i.e. &gt; etc...
  61     amppos = 0
  62     while amppos >= 0:
  63         amppos = unquotedstr.find("&", amppos)
  64         if amppos != -1:
  65             amppos += 1
  66             semipos = unquotedstr.find(";", amppos)
  67             if semipos != -1:
  68                 if unquotedstr[amppos:semipos].isalnum():
  69                     # what we have found is an entity, not a shortcut key...
  70                     continue
  71             # otherwise, we found the shortcut key
  72             return unquotedstr[amppos].encode("UTF-8")
  73     # if we didn't find the shortcut key, return an empty string rather than the original string
  74     # this will come out as "don't have a translation for this" because the string is not changed...
  75     # so the string from the original dtd will be used instead
  76     return ""
  77
  78 def removeinvalidamps(entity, unquotedstr):
  79     """find ampersands that aren't part of an entity definition..."""
  80     amppos = 0
  81     invalidamps = []
  82     while amppos >= 0:
  83         amppos = unquotedstr.find("&", amppos)
  84         if amppos != -1:
  85             amppos += 1
  86             semipos = unquotedstr.find(";", amppos)
  87             if semipos != -1:
  88                 checkentity = unquotedstr[amppos:semipos]
  89                 if checkentity.replace('.', '').isalnum():
  90                     # what we have found is an entity, not a problem...
  91                     continue
  92                 elif checkentity[0] == '#' and checkentity[1:].isalnum():
  93                     # what we have found is an entity, not a problem...
  94                     continue
  95             # otherwise, we found a problem
  96             invalidamps.append(amppos-1)
  97     if len(invalidamps) > 0:
  98         warnings.warn("invalid ampersands in dtd entity %s" % (entity))
  99         comp = 0
 100         for amppos in invalidamps:
 101             unquotedstr = unquotedstr[:amppos-comp] + unquotedstr[amppos-comp+1:]
 102             comp += 1
 103     return unquotedstr
 104
 105 def getmixedentities(entities):
 106     """returns a list of mixed .label and .accesskey entities from a list of entities"""
 107     mixedentities = []    # those entities which have a .label and .accesskey combined
 108     # search for mixed entities...
 109     for entity in entities:
 110         for labelsuffix in labelsuffixes:
 111             if entity.endswith(labelsuffix):
 112                 entitybase = entity[:entity.rfind(labelsuffix)]
 113                 # see if there is a matching accesskey, making this a mixed entity
 114                 for akeytype in accesskeysuffixes:
 115                     if entitybase + akeytype in entities:
 116                         # add both versions to the list of mixed entities
 117                         mixedentities += [entity, entitybase+akeytype]
 118     return mixedentities
 119
 120 def applytranslation(entity, dtdunit, inputunit, mixedentities):
 121     """applies the translation for entity in the po unit to the dtd unit"""
 122     # this converts the po-style string to a dtd-style string
 123     unquotedstr = inputunit.target
 124     # check there aren't missing entities...
 125     if len(unquotedstr.strip()) == 0:
 126         return
 127     # handle mixed entities
 128     for labelsuffix in labelsuffixes:
 129         if entity.endswith(labelsuffix):
 130             if entity in mixedentities:
 131                 unquotedstr = getlabel(unquotedstr)
 132                 break
 133     else:
 134         for akeytype in accesskeysuffixes:
 135             if entity.endswith(akeytype):
 136                 if entity in mixedentities:
 137                     unquotedstr = getaccesskey(unquotedstr)
 138                     if not unquotedstr:
 139                         warnings.warn("Could not find accesskey for %s" % entity)
 140                     else:
 141                         original = dtd.unquotefromdtd(dtdunit.definition)
 142                         if original.isupper() and unquotedstr.islower():
 143                             unquotedstr = unquotedstr.upper()
 144                         elif original.islower() and unquotedstr.isupper():
 145                             unquotedstr = unquotedstr.lower()
 146     # handle invalid left-over ampersands (usually unneeded access key shortcuts)
 147     unquotedstr = removeinvalidamps(entity, unquotedstr)
 148     # finally set the new definition in the dtd, but not if its empty
 149     if len(unquotedstr) > 0:
 150         dtdunit.definition = dtd.quotefordtd(unquotedstr)
 151
 152 class redtd:
 153     """this is a convertor class that creates a new dtd based on a template using translations in a po"""
 154     def __init__(self, dtdfile):
 155         self.dtdfile = dtdfile
 156
 157     def convertstore(self, inputstore, includefuzzy=False):
 158         # translate the strings
 159         for inunit in inputstore.units:
 160             # there may be more than one entity due to msguniq merge
 161             if includefuzzy or not inunit.isfuzzy():
 162                 self.handleinunit(inunit)
 163         return self.dtdfile
 164
 165     def handleinunit(self, inunit):
 166         entities = inunit.getlocations()
 167         mixedentities = getmixedentities(entities)
 168         for entity in entities:
 169             if self.dtdfile.index.has_key(entity):
 170                 # now we need to replace the definition of entity with msgstr
 171                 dtdunit = self.dtdfile.index[entity] # find the dtd
 172                 applytranslation(entity, dtdunit, inunit, mixedentities)
 173
 174 class po2dtd:
 175     """this is a convertor class that creates a new dtd file based on a po file without a template"""
 176     def convertcomments(self, inputunit, dtdunit):
 177         entities = inputunit.getlocations()
 178         if len(entities) > 1:
 179             # don't yet handle multiple entities
 180             dtdunit.comments.append(("conversionnote",'<!-- CONVERSION NOTE - multiple entities -->\n'))
 181             dtdunit.entity = entities[0]
 182         elif len(entities) == 1:
 183             dtdunit.entity = entities[0]
 184         else:
 185             # this produces a blank entity, which doesn't write anything out
 186             dtdunit.entity = ""
 187
 188         if inputunit.isfuzzy():
 189             dtdunit.comments.append(("potype", "fuzzy\n"))
 190         for note in inputunit.getnotes("translator").split("\n"):
 191             if not note:
 192                 continue
 193             note = quote.unstripcomment(note)
 194             if (note.find('LOCALIZATION NOTE') == -1) or (note.find('GROUP') == -1):
 195                 dtdunit.comments.append(("comment", note))
 196         # msgidcomments are special - they're actually localization notes
 197         msgidcomment = inputunit._extract_msgidcomments()
 198         if msgidcomment:
 199             locnote = quote.unstripcomment("LOCALIZATION NOTE ("+dtdunit.entity+"): "+msgidcomment)
 200             dtdunit.comments.append(("locnote", locnote))
 201
 202
 203     def convertstrings(self, inputunit, dtdunit):
 204         if inputunit.istranslated():
 205             unquoted = inputunit.target
 206         else:
 207             unquoted = inputunit.source
 208         unquoted = removeinvalidamps(dtdunit.entity, unquoted)
 209         dtdunit.definition = dtd.quotefordtd(unquoted)
 210
 211     def convertunit(self, inputunit):
 212         dtdunit = dtd.dtdunit()
 213         self.convertcomments(inputunit, dtdunit)
 214         self.convertstrings(inputunit, dtdunit)
 215         return dtdunit
 216
 217     def convertstore(self, inputstore, includefuzzy=False):
 218         outputstore = dtd.dtdfile()
 219         self.currentgroups = []
 220         for inputunit in inputstore.units:
 221             if includefuzzy or not inputunit.isfuzzy():
 222                 dtdunit = self.convertunit(inputunit)
 223                 if dtdunit is not None:
 224                     outputstore.addunit(dtdunit)
 225         return outputstore
 226
 227 def convertdtd(inputfile, outputfile, templatefile, includefuzzy=False):
 228     inputstore = po.pofile(inputfile)
 229     if templatefile is None:
 230         convertor = po2dtd()
 231     else:
 232         templatestore = dtd.dtdfile(templatefile)
 233         convertor = redtd(templatestore)
 234     outputstore = convertor.convertstore(inputstore, includefuzzy)
 235     outputfile.write(str(outputstore))
 236     return 1
 237
 238 def main(argv=None):
 239     # handle command line options
 240     from translate.convert import convert
 241     formats = {"po": ("dtd", convertdtd), ("po", "dtd"): ("dtd", convertdtd)}
 242     parser = convert.ConvertOptionParser(formats, usetemplates=True, description=__doc__)
 243     parser.add_fuzzy_option()
 244     parser.run(argv)
 245
 246 if __name__ == '__main__':
 247     main()
 248