2 # Copyright (c) 2004 Danilo Segan <danilo@kvota.net>.
4 # This file is part of xml2po.
6 # xml2po is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # xml2po is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with xml2po; if not, write to the Free Software Foundation, Inc.,
18 # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 # slightly modified to work on Windows for TortoiseSVN.
23 # xml2po -- translate XML documents
26 # Versioning system (I have used this for a long time, so let's explain it to
27 # those Linux-versioning-scheme addicts):
28 # 1.0.* are unstable, development versions
29 # 1.1 will be first stable release (release 1), and 1.1.* bugfix releases
30 # 2.0.* will be unstable-feature-development stage (milestone 1)
31 # 2.1.* unstable development betas (milestone 2)
32 # 2.2 second stable release (release 2), and 2.2.* bugfix releases
42 def __init__(self
, with_translations
= 0):
48 self
.translations
= []
49 self
.do_translations
= with_translations
50 self
.output_msgstr
= 0 # this is msgid mode for outputMessage; 1 is for msgstr mode
def translationsFollow(self):
    """Switch this collector into msgstr mode.

    Sets the ``output_msgstr`` flag to 1.  outputMessage() checks that flag
    and, when it is set, appends the string it is given to
    ``self.translations``.
    """
    self.output_msgstr = 1
def setFilename(self, filename):
    """Remember *filename* as the file currently being processed.

    outputMessage() stores ``self.filename`` in every (filename, tag, lineno)
    reference it records, so this must be called before messages from a new
    input file are added.
    """
    self.filename = filename
59 def outputMessage(self
, text
, lineno
= 0, comment
= None, spacepreserve
= 0, tag
= None):
60 """Adds a string to the list of messages."""
61 if (text
.strip() != ''):
62 t
= escapePoString(normalizeString(text
, not spacepreserve
))
63 if self
.output_msgstr
:
64 self
.translations
.append(t
)
67 if self
.do_translations
or (not t
in self
.messages
):
68 self
.messages
.append(t
)
71 if t
in self
.linenos
.keys():
72 self
.linenos
[t
].append((self
.filename
, tag
, lineno
))
74 self
.linenos
[t
] = [ (self
.filename
, tag
, lineno
) ]
75 if (not self
.do_translations
) and comment
and not t
in self
.comments
:
76 self
.comments
[t
] = comment
78 if t
in self
.linenos
.keys():
79 self
.linenos
[t
].append((self
.filename
, tag
, lineno
))
81 self
.linenos
[t
] = [ (self
.filename
, tag
, lineno
) ]
82 if comment
and not t
in self
.comments
:
83 self
.comments
[t
] = comment
85 def outputHeader(self
, out
):
86 from time
import gmtime
, strftime
87 tstamp
= strftime("%Y-%m-%d %H:%M +0000", gmtime())
90 "Project-Id-Version: PACKAGE VERSION\\n"
91 "POT-Creation-Date: %s\\n"
92 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
93 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
94 "Language-Team: LANGUAGE <LL@li.org>\\n"
95 "MIME-Version: 1.0\\n"
96 "Content-Type: text/plain; charset=UTF-8\\n"
97 "Content-Transfer-Encoding: 8bit\\n"
101 out
.write(tmp
.encode('utf-8'))
103 def outputAll(self
, out
):
104 self
.outputHeader(out
)
106 for k
in self
.messages
:
107 if k
in self
.comments
:
108 out
.write("#. %s\n" % (self
.comments
[k
].replace("\n","\n#. ")))
110 for reference
in self
.linenos
[k
]:
111 references
+= "#: %s:%d\n#.(%s)\n" % (reference
[0], reference
[2], reference
[1])
112 out
.write("%s" % (references
))
113 if k
in self
.nowrap
and self
.nowrap
[k
]:
114 out
.write("#, no-wrap\n")
115 out
.write("msgid \"%s\"\n" % (k
))
117 if self
.do_translations
:
118 if len(self
.translations
)>0:
119 translation
= self
.translations
.pop(0)
120 out
.write("msgstr \"%s\"\n\n" % (translation
))
123 def normalizeNode(node
):
126 elif isSpacePreserveNode(node
):
129 if node
.isBlankNode():
132 node
.setContent(re
.sub('\s+',' ', node
.content
))
134 elif node
.children
and node
.type == 'element':
135 child
= node
.children
140 def normalizeString(text
, ignorewhitespace
= 1):
141 """Normalizes string to be used as key for gettext lookup.
143 Removes all unnecessary whitespace."""
144 if not ignorewhitespace
:
147 # Lets add document DTD so entities are resolved
148 dtd
= doc
.intSubset()
149 tmp
= dtd
.serialize('utf-8')
150 tmp
= tmp
+ '<norm>%s</norm>' % text
152 tmp
= '<norm>%s</norm>' % text
155 ctxt
= libxml2
.createDocParserCtxt(tmp
)
157 ctxt
.replaceEntities(1)
160 newnode
= tree
.getRootElement()
162 print >> sys
.stderr
, """Error while normalizing string as XML:\n"%s"\n""" % (text
)
165 normalizeNode(newnode
)
168 child
= newnode
.children
170 result
+= child
.serialize('utf-8')
173 result
= re
.sub('^ ','', result
)
174 result
= re
.sub(' $','', result
)
178 def stringForEntity(node
):
179 """Replaces entities in the node."""
180 text
= node
.serialize('utf-8')
182 # Lets add document DTD so entities are resolved
183 dtd
= node
.doc
.intSubset()
184 tmp
= dtd
.serialize('utf-8') + '<norm>%s</norm>' % text
187 tmp
= '<norm>%s</norm>' % text
190 ctxt
= libxml2
.createDocParserCtxt(tmp
)
192 ctxt
.replaceEntities(1)
196 newnode
= tree
.children
.next
198 newnode
= tree
.children
201 child
= newnode
.children
203 result
+= child
.serialize('utf-8')
def escapePoString(text):
    """Return *text* escaped for use inside a double-quoted PO string.

    Backslash, double quote, newline and tab are replaced by their
    backslash escape sequences.
    """
    # Order matters: backslashes are doubled first so that the backslashes
    # introduced by the later substitutions are not escaped a second time.
    substitutions = (
        ('\\', '\\\\'),
        ('"', "\\\""),
        ("\n", "\\n"),
        ("\t", "\\t"),
    )
    escaped = text
    for raw, replacement in substitutions:
        escaped = escaped.replace(raw, replacement)
    return escaped
def unEscapePoString(text):
    """Reverse escapePoString(): turn PO escape sequences back into raw text.

    Handles the four sequences escapePoString() produces: ``\\\\`` (backslash),
    ``\\"`` (double quote), ``\\n`` (newline) and ``\\t`` (tab).  Any other
    backslash sequence, and a lone trailing backslash, is returned unchanged.

    The string is decoded in a single left-to-right pass.  Chained
    ``str.replace`` calls cannot do this correctly, because an escaped
    backslash followed by the letter ``n`` must become backslash + ``n``,
    not a newline.
    """
    unescapes = {'\\': '\\', '"': '"', 'n': "\n", 't': "\t"}

    def restore(match):
        # Leave escapes we do not know about exactly as they were written.
        return unescapes.get(match.group(1), match.group(0))

    return re.sub(r'\\(.)', restore, text)
215 def getTranslation(text
, spacepreserve
= 0):
216 """Returns a translation via gettext for specified snippet.
218 text should be a string to look for, spacepreserve set to 1
219 when spaces should be preserved.
221 text
= normalizeString(text
, not spacepreserve
)
222 if (text
.strip() == ''):
224 file = open(mofile
, "rb")
226 gt
= gettext
.GNUTranslations(file)
228 return gt
.ugettext(text
.decode('utf-8'))
231 def startTagForNode(node
):
238 for p
in node
.properties
:
239 if p
.type == 'attribute':
240 # FIXME: This part sucks
241 params
+= p
.serialize('utf-8')
244 def endTagForNode(node
):
251 def isFinalNode(node
):
253 auto
= autoNodeIsFinal(node
)
254 # Check if any of the parents is also autoNodeIsFinal,
255 # and if it is, don't consider this node a final one
257 while parent
and auto
:
258 auto
= not autoNodeIsFinal(parent
)
259 parent
= parent
.parent
261 #node.type =='text' or not node.children or
262 if node
.type == 'element' and node
.name
in ultimate_tags
:
266 child
= node
.children
267 while child
and final_children
:
268 if not isFinalNode(child
):
275 def ignoreNode(node
):
277 if node
.type in ('dtd', 'comment'):
282 if isFinalNode(node
):
284 if node
.name
in ignored_tags
or node
.type in ('dtd', 'comment'):
288 def isSpacePreserveNode(node
):
289 pres
= node
.getSpacePreserve()
293 if CurrentXmlMode
and (node
.name
in CurrentXmlMode
.getSpacePreserveTags()):
298 def getCommentForNode(node
):
299 """Walk through previous siblings until a comment is found, or other element.
301 Only whitespace is allowed between comment and current node."""
303 while prev
and prev
.type == 'text' and prev
.content
.strip() == '':
305 if prev
and prev
.type == 'comment':
306 return prev
.content
.strip()
311 def replaceNodeContentsWithText(node
,text
):
312 """Replaces all subnodes of a node with contents of text treated as XML."""
314 starttag
= node
.name
#startTagForNode(node)
315 endtag
= endTagForNode(node
)
317 # Lets add document DTD so entities are resolved
318 dtd
= doc
.intSubset()
320 if expand_entities
: # FIXME: we get a "Segmentation fault" in libxml2.parseMemory() when we include DTD otherwise
321 tmp
= dtd
.serialize('utf-8')
322 tmp
= tmp
+ '<%s>%s</%s>' % (starttag
, text
, endtag
)
324 tmp
= '<%s>%s</%s>' % (starttag
, text
, endtag
)
327 ctxt
= libxml2
.createDocParserCtxt(tmp
.encode('utf-8'))
328 ctxt
.replaceEntities(0)
332 print >> sys
.stderr
, """Error while parsing translation as XML:\n"%s"\n""" % (text
.encode('utf-8'))
335 newelem
= newnode
.getRootElement()
336 if newelem
and newelem
.children
:
343 node
.addChildList(newelem
.children
)
345 # In practice, this happens with tags such as "<para> </para>" (only whitespace in between)
348 node
.setContent(text
)
350 def autoNodeIsFinal(node
):
351 """Returns 1 if node is text node, contains non-whitespace text nodes or entities."""
353 if node
.isText() and node
.content
.strip()!='':
355 child
= node
.children
357 if child
.type in ['text'] and child
.content
.strip()!='':
365 def worthOutputting(node
):
366 """Returns 1 if node is "worth outputting", otherwise 0.
368 Node is "worth outputting", if none of the parents
369 isFinalNode, and it contains non-blank text and entities.
373 final
= isFinalNode(node
) and node
.name
not in ignored_tags
374 while not final
and parent
:
375 if isFinalNode(parent
):
376 final
= 1 # reset if we've got to one final tag
377 if final
and (parent
.name
not in ignored_tags
) and worthOutputting(parent
):
380 parent
= parent
.parent
384 return autoNodeIsFinal(node
)
386 def processElementTag(node
, replacements
, restart
= 0):
387 """Process node with node.type == 'element'."""
388 if node
.type == 'element':
393 myrepl
= replacements
397 child
= node
.children
399 if (isFinalNode(child
)) or (child
.type == 'element' and worthOutputting(child
)):
400 myrepl
.append(processElementTag(child
, myrepl
, 1))
401 outtxt
+= '<placeholder-%d/>' % (len(myrepl
))
403 if child
.type == 'element':
404 (starttag
, content
, endtag
, translation
) = processElementTag(child
, myrepl
, 0)
405 outtxt
+= '<%s>%s</%s>' % (starttag
, content
, endtag
)
407 outtxt
+= doSerialize(child
)
412 translation
= getTranslation(outtxt
, isSpacePreserveNode(node
))
415 starttag
= startTagForNode(node
)
416 endtag
= endTagForNode(node
)
418 if restart
or worthOutputting(node
):
420 while i
< len(myrepl
):
421 replacement
= '<%s>%s</%s>' % (myrepl
[i
][0], myrepl
[i
][3], myrepl
[i
][2])
423 translation
= translation
.replace('<placeholder-%d/>' % (i
), replacement
)
425 if worthOutputting(node
):
427 replaceNodeContentsWithText(node
, translation
)
429 msg
.outputMessage(outtxt
, node
.lineNo(), getCommentForNode(node
), isSpacePreserveNode(node
), tag
= node
.name
)
431 return (starttag
, outtxt
, endtag
, translation
)
433 raise Exception("You must pass node with node.type=='element'.")
436 def isExternalGeneralParsedEntity(node
):
437 if (node
and node
.type=='entity_ref'):
439 # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
440 tmp
= file(".xml2po-entitychecking","w+")
441 node
.debugDumpNode(tmp
,0)
445 os
.remove(".xml2po-entitychecking")
447 # We fail silently, and replace all entities if we cannot
448 # write .xml2po-entitychecking
449 # !!! This is not very nice thing to do, but I don't know if
450 # raising an exception is any better
452 if tmpstr
.find('EXTERNAL_GENERAL_PARSED_ENTITY') != -1:
459 def doSerialize(node
):
460 """Serializes a node and its children, emitting PO messages along the way.
462 node is the node to serialize, first indicates whether surrounding
463 tags should be emitted as well.
468 elif not node
.children
:
469 return node
.serialize("utf-8")
470 elif node
.type == 'entity_ref':
471 if isExternalGeneralParsedEntity(node
):
472 return node
.serialize('utf-8')
474 return stringForEntity(node
) #content #content #serialize("utf-8")
475 elif node
.type == 'entity_decl':
476 return node
.serialize('utf-8') #'<%s>%s</%s>' % (startTagForNode(node), node.content, node.name)
477 elif node
.type == 'text':
478 return node
.serialize('utf-8')
479 elif node
.type == 'element':
481 (starttag
, content
, endtag
, translation
) = processElementTag(node
, repl
, 1)
482 return '<%s>%s</%s>' % (starttag
, content
, endtag
)
484 child
= node
.children
487 outtxt
+= doSerialize(child
)
492 def read_finaltags(filelist
):
494 return CurrentXmlMode
.getFinalTags()
496 defaults
= ['para', 'title', 'releaseinfo', 'revnumber',
497 'date', 'itemizedlist', 'orderedlist',
498 'variablelist', 'varlistentry', 'term' ]
501 def read_ignoredtags(filelist
):
503 return CurrentXmlMode
.getIgnoredTags()
505 defaults
= ['itemizedlist', 'orderedlist', 'variablelist',
509 def tryToUpdate(allargs
, lang
):
510 # Remove "-u" and "--update-translation"
513 opts
, args
= getopt
.getopt(args
, 'avhmket:o:p:u:',
514 ['automatic-tags','version', 'help', 'keep-entities', 'extract-all-entities', 'merge', 'translation=',
515 'output=', 'po-file=', 'update-translation=' ])
516 for opt
, arg
in opts
:
517 if opt
in ('-a', '--automatic-tags'):
519 elif opt
in ('-k', '--keep-entities'):
521 elif opt
in ('-e', '--extract-all-entities'):
523 elif opt
in ('-m', '--mode'):
524 command
+= " -m %s" % arg
525 elif opt
in ('-o', '--output'):
526 sys
.stderr
.write("Error: Option '-o' is not yet supported when updating translations directly.\n")
528 elif opt
in ('-v', '--version'):
531 elif opt
in ('-h', '--help'):
532 sys
.stderr
.write("Error: If you want help, please use `%s --help' without '-u' option.\n" % (allargs
[0]))
534 elif opt
in ('-u', '--update-translation'):
537 sys
.stderr
.write("Error: Option `%s' is not supported with option `-u'.\n" % (opt
))
541 command
+= " " + args
.pop()
545 sys
.stderr
.write("Merging translations for %s: " % (lang
))
546 result
= os
.system("%s | msgmerge -o .tmp.%s.po %s -" % (command
, lang
, file))
550 result
= os
.system("mv .tmp.%s.po %s" % (lang
, file))
552 sys
.stderr
.write("Error: cannot rename file.\n")
555 os
.system("msgfmt -cv -o NUL %s" % (file))
558 def load_mode(modename
):
560 #found = imp.find_module(modename, submodes_path)
561 #module = imp.load_module(modename, found[0], found[1], found[2])
563 sys
.path
.append(submodes_path
)
564 module
= __import__(modename
)
565 modeModule
= '%sXmlMode' % modename
566 return getattr(module
, modeModule
)
570 def xml_error_handler(arg
, ctxt
):
573 libxml2
.registerErrorHandler(xml_error_handler
, None)
577 if __name__
!= '__main__': raise NotImplementedError
580 submodes_path
= "xml2po-modes"
581 default_mode
= 'docbook'
590 mode
= 'pot' # 'pot' or 'merge'
593 expand_all_entities
= 0
595 output
= '-' # this means to stdout
597 import getopt
, fileinput
599 def usage (with_help
= False):
600 print >> sys
.stderr
, "Usage: %s [OPTIONS] [XMLFILE]..." % (sys
.argv
[0])
602 print >> sys
.stderr
, """
603 OPTIONS may be some of:
604 -a --automatic-tags Automatically decides if tags are to be considered
606 -k --keep-entities Don't expand entities
607 -e --expand-all-entities Expand ALL entities (including SYSTEM ones)
608 -m --mode=TYPE Treat tags as type TYPE (default: docbook)
609 -o --output=FILE Print resulting text (XML or POT) to FILE
610 -p --po-file=FILE Specify PO file containing translation, and merge
611 Overwrites temporary file .xml2po.mo.
612 -r --reuse=FILE Specify translated XML file with the same structure
613 -t --translation=FILE Specify MO file containing translation, and merge
614 -u --update-translation=LANG.po Updates a PO file using msgmerge program
615 -v --version Output version of the xml2po program
617 -h --help Output this message
620 To create a POTemplate book.pot from input files chapter1.xml and
621 chapter2.xml, run the following:
622 %s -o book.pot chapter1.xml chapter2.xml
624 After translating book.pot into de.po, merge the translations back,
625 using -p option for each XML file:
626 %s -p de.po chapter1.xml > chapter1.de.xml
627 %s -p de.po chapter2.xml > chapter2.de.xml
628 """ % (sys
.argv
[0], sys
.argv
[0], sys
.argv
[0])
631 if len(sys
.argv
) < 2: usage()
634 try: opts
, args
= getopt
.getopt(args
, 'avhkem:t:o:p:u:r:',
635 ['automatic-tags','version', 'help', 'keep-entities', 'expand-all-entities', 'mode=', 'translation=',
636 'output=', 'po-file=', 'update-translation=', 'reuse=' ])
637 except getopt
.GetoptError
: usage(True)
639 for opt
, arg
in opts
:
640 if opt
in ('-m', '--mode'):
642 if opt
in ('-a', '--automatic-tags'):
644 elif opt
in ('-k', '--keep-entities'):
646 elif opt
in ('-e', '--expand-all-entities'):
647 expand_all_entities
= 1
648 elif opt
in ('-t', '--translation'):
651 translationlanguage
= os
.path
.splitext(mofile
)[0]
652 elif opt
in ('-r', '--reuse'):
654 elif opt
in ('-u', '--update-translation'):
655 tryToUpdate(sys
.argv
, arg
)
656 elif opt
in ('-p', '--po-file'):
657 mofile
= ".xml2po.mo"
659 translationlanguage
= os
.path
.splitext(pofile
)[0]
660 os
.system("msgfmt -o %s %s >NUL" % (mofile
, pofile
)) and sys
.exit(7)
662 elif opt
in ('-o', '--output'):
664 elif opt
in ('-v', '--version'):
667 elif opt
in ('-h', '--help'):
670 # Treat remaining arguments as XML files
672 filenames
.append(args
.pop())
674 if len(filenames
) > 1 and mode
=='merge':
675 print >> sys
.stderr
, "Error: You can merge translations with only one XML file at a time."
679 CurrentXmlMode
= load_mode(default_mode
)()
681 CurrentXmlMode
= None
682 print >> sys
.stderr
, "Warning: cannot load module '%s', using automatic detection (-a)." % (default_mode
)
685 if mode
=='merge' and mofile
=='':
686 print >> sys
.stderr
, "Error: You must specify MO file when merging translations."
689 ultimate_tags
= read_finaltags(ultimate
)
690 ignored_tags
= read_ignoredtags(ignored
)
692 # I'm not particularly happy about making any of these global,
693 # but I don't want to bother too much with it right now
697 msg
= MessageOutput()
699 filenames
.append(origxml
)
700 msg
= MessageOutput(1)
702 for filename
in filenames
:
704 if filename
== origxml
:
705 msg
.translationsFollow()
706 ctxt
= libxml2
.createFileParserCtxt(filename
)
708 if expand_all_entities
:
709 ctxt
.replaceEntities(1)
713 print >> sys
.stderr
, "Error: cannot open file '%s'." % (filename
)
716 msg
.setFilename(filename
)
717 if CurrentXmlMode
and origxml
=='':
718 CurrentXmlMode
.preProcessXml(doc
,msg
)
725 out
= file(output
, 'w')
727 print >> sys
.stderr
, "Error: cannot open file %s for writing." % (output
)
732 tcmsg
= CurrentXmlMode
.getStringForTranslators()
733 tccom
= CurrentXmlMode
.getCommentForTranslators()
735 msg
.outputMessage(tcmsg
, 0, tccom
)
737 tcmsg
= CurrentXmlMode
.getStringForTranslation()
738 tccom
= CurrentXmlMode
.getCommentForTranslation()
740 msg
.outputMessage(tcmsg
, 0, tccom
)
745 tcmsg
= CurrentXmlMode
.getStringForTranslators()
747 tnames
= getTranslation(tcmsg
)
750 tcmsg
= CurrentXmlMode
.getStringForTranslation()
752 tstring
= getTranslation(tcmsg
)
756 CurrentXmlMode
.postProcessXmlTranslation(doc
, translationlanguage
, tnames
, tstring
)
757 out
.write(doc
.serialize('utf-8', 1))