sandbox/xml2rst/xml2rst.py

   1 #! /usr/bin/env python
   2 # -*- coding: iso-8859-1 -*-
   3
   4 # Based on sample.py,v 4.1.2.6 2006/04/14 13:59:26 cvs Exp
   5
   6 # Copyright (C) 2009 Stefan Merten
   7
   8 # xml2rst.py is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published
  10 # by the Free Software Foundation; either version 2 of the License,
  11 # or (at your option) any later version.
  12
  13 # This program is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16 # General Public License for more details.
  17
  18 # You should have received a copy of the GNU General Public License
  19 # along with this program; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  21 # 02111-1307, USA.
  22
  23 """
  24 Convert a docutils XML file to reStructuredText syntax.
  25
  26 Do
  27
  28         perldoc xml2rst.py
  29
  30 for a man page.
  31 """
  32
  33 """
  34 =head1 NAME
  35
  36 xml2rst.py -- convert a docutils XML file to reStructuredText syntax
  37
  38 =head1 SYNOPSIS
  39
  40 B<xml2rst.py> [B<-v>] I<xml> [I<reST>]
  41
  42 B<xml2rst.py> B<--help>
  43
  44 =head1 DESCRIPTION
  45
  46 Converts a docutils XML input file to reStructuredText source.
  47
  48 This can be used to transform another format to reStructuredText given you have
  49 a transformation to docutils XML.
  50
  51 =cut
  52 """
  53
  54 ###############################################################################
  55 ###############################################################################
  56 # Import
  57
  58 import sys
  59 import os.path
  60 import re
  61
  62 from optparse import OptionParser, OptionGroup, OptionValueError, Option
  63 from copy import copy
  64
  65 try:
  66     from lxml import etree
  67 except ImportError:
  68     errorExit(2, ( "Python package 'lxml' is not available",
  69                    "You may try to use 'xml2rst.xsl' with a standalone XSLT processor like 'xalan' or 'xsltproc'", ))
  70
  71 ###############################################################################
  72 ###############################################################################
  73 # Constants
  74
  75 """
  76 @var MainXsltNm: Name of the main XSLT source file
  77 @type MainXsltNm: str
  78 """
  79 MainXsltNm = "xml2rst.xsl"
  80
  81 """
  82 @var ScriptNm: Name of the script
  83 @type ScriptNm: str
  84 """
  85 ScriptNm = sys.argv[0]
  86
  87 ###############################################################################
  88 ###############################################################################
  89 # Variables
  90
  91 """
  92 @var options: Options given on the command line
  93 @type options: optparse.Values
  94 """
  95 global options
  96
  97 ###############################################################################
  98 ###############################################################################
  99 # General functions
 100
 101 def pod2Head(pod):
 102     """
 103     @param pod: Snippet in POD format to be analyzed.
 104     @type pod: str
 105
 106     @return: String of first `=headX' entry in POD snippet or empty string if
 107              none found.
 108     @rtype: str
 109     """
 110     for line in pod.split("\n"):
 111         if line.startswith("=head"):
 112             return line[len("=headX"):].strip()
 113     return ""
 114
 115 ###############################################################################
 116
 117 def pod2Description(pod):
 118     """
 119     @param pod: Snippet in POD format to be analyzed.
 120     @type pod: str
 121
 122     @return: Stripped text from all lines not being a POD line command.
 123     @rtype: str
 124     """
 125     result = ""
 126     for line in pod.split("\n"):
 127         if not line.startswith("="):
 128             result = result.strip() + " " + line.strip()
 129     return result.strip()
 130
 131 ###############################################################################
 132
 133 def pod2OptionList(pod):
 134     """
 135     Return option names found in POD snippet. Option names are recognized in
 136     `=item B<option>' constructs.
 137
 138     @param pod: Snippet in POD format to be analyzed.
 139     @type pod: str
 140
 141     @return: All option names contained in POD snippet as a list.
 142     @rtype: [ str, ..., ]
 143     """
 144     result = [ ]
 145     for line in pod.split("\n"):
 146         found = re.search("^=item\s*B<(-[^>]+)>", line)
 147         if found:
 148             result.append(found.group(1))
 149     return result
 150
 151 ###############################################################################
 152
 153 def pod2OptionKeywords(pod):
 154     """
 155     Return a dict mapping `OptionParser.add_option' keywords to values found in
 156     POD snippet.
 157
 158     @param pod: Snippet in POD format to be analyzed.
 159     @type pod: str
 160
 161     @return: Mapping for all values found. Currently `help' and `dest' are
 162              filled.
 163     @rtype: { keyword: value, ..., }
 164     """
 165     result = { 'help': "", }
 166     for line in pod.split("\n"):
 167         if line.startswith("=cut"):
 168             break
 169         found = re.search("^=item\s*B<--?([^>]+)>(?:=|\s*)", line)
 170         if found:
 171             result['help'] = ""
 172             optionName = found.group(1)
 173             found = re.search("I<([^>]+)>", line)
 174             if found:
 175                 result['dest'] = found.group(1)
 176             elif len(optionName) > 1:
 177                 result['dest'] = optionName
 178         else:
 179             result['help'] += line + "\n"
 180     result['help'] = result['help'].strip()
 181     if result.has_key('dest'):
 182         result['dest'] = result['dest'].replace("-", "_")
 183     else:
 184         errorExit(1, ( "Internal error: Missing `dest' in documentation string:",
 185                        pod, ))
 186     return result
 187
 188 ###############################################################################
 189
 190 def pod2Argument(pod):
 191     """
 192     Return a list of two strings for `OptionGroup.__init__' describing the
 193     argument found in POD snippet.
 194
 195     @param pod: Snippet in POD format to be analyzed.
 196     @type pod: str
 197
 198     @return: Name of the argument and its description.
 199     @rtype: [ argument, description, ]
 200     """
 201     argument = ""
 202     description = ""
 203     for line in pod.split("\n"):
 204         if line.startswith("=cut"):
 205             break
 206         found = re.search("^=item\s*I<([^>]+)>", line)
 207         if found:
 208             description = ""
 209             argument = found.group(1)
 210         else:
 211             description += line + "\n"
 212     description = description.strip()
 213     return [ argument, description, ]
 214
 215 ###############################################################################
 216
 217 def parseOptions():
 218     """
 219     Sets options and returns arguments.
 220
 221     @return: Name of input file and optionally of output file.
 222     @rtype: ( str, [str,] )
 223     """
 224     global options
 225     pod = """
 226
 227 =head1 OPTIONS
 228
 229 =cut
 230     """
 231     optionParser = OptionParser("usage: %prog [option]... <xml> [<rst>]")
 232
 233     pod = """
 234
 235 =head2 General options
 236
 237 =over 4
 238
 239 =cut
 240     """
 241     generalGroup = OptionGroup(optionParser, pod2Head(pod),
 242                                pod2Description(pod))
 243
 244     pod = """
 245
 246 =item B<-a> I<adornment>
 247
 248 =item B<--adornment>=I<adornment>
 249
 250 Configures title markup to use so different styles can be requested
 251 easily.
 252
 253 The value of the parameter must be a string made up of a sequence of
 254 character pairs. The first character of a pair is C<o> (overline) or
 255 C<u> (underline) and the second character is the character to use for
 256 the markup.
 257
 258 The first and the second character pair is used for document title and
 259 subtitle, the following pairs are used for section titles where the
 260 third pair is used for the top level section title.
 261
 262 Defaults to C<o=o-u=u-u~u:u.u`>.
 263
 264 =cut
 265     """
 266     generalGroup.add_option(default=None, *pod2OptionList(pod),
 267                             **pod2OptionKeywords(pod))
 268
 269     pod = """
 270
 271 =item B<-f> I<fold>
 272
 273 =item B<--fold>=I<fold>
 274
 275 Configures whether long text lines in paragraphs should be folded and
 276 to which length. This option is for input not coming from reST which
 277 may have no internal line feeds in plain text strings.
 278
 279 If folding is enabled text strings not in a line feed preserving
 280 context are first white-space normalized and then broken according to
 281 the folding rules. Folding rules put out the first word and continue
 282 to do so with the following words unless the next word would cross
 283 the folding boundary. Words are delimited by white-space.
 284
 285 Defaults to C<0>, i.e. no folding.
 286
 287 =cut
 288     """
 289     generalGroup.add_option(type="int", default=None,
 290                             *pod2OptionList(pod), **pod2OptionKeywords(pod))
 291
 292     pod = """
 293
 294 =item B<-v>
 295
 296 =item B<--verbose>
 297
 298 Operate verbose.
 299
 300 =cut
 301     """
 302     generalGroup.add_option(action="store_true",
 303                             *pod2OptionList(pod), **pod2OptionKeywords(pod))
 304     optionParser.add_option_group(generalGroup)
 305
 306     pod = """
 307
 308 =back
 309
 310 =head2 Arguments
 311
 312 =over 4
 313
 314 =cut
 315     """
 316     argumentGroup = OptionGroup(optionParser, pod2Head(pod),
 317                                 pod2Description(pod))
 318     optionParser.add_option_group(argumentGroup)
 319
 320     pod = """
 321
 322 =item I<xml>
 323
 324 The XML input file containing docutils XML.
 325
 326 =cut
 327     """
 328
 329     argument1Group = OptionGroup(optionParser, *pod2Argument(pod))
 330     optionParser.add_option_group(argument1Group)
 331
 332     pod = """
 333
 334 =item I<rst>
 335
 336 The optional output file containing reStructuredText.
 337
 338 If not given output is put to C<STDOUT>.
 339
 340 =cut
 341     """
 342     argument2Group = OptionGroup(optionParser, *pod2Argument(pod))
 343     optionParser.add_option_group(argument2Group)
 344
 345     pod = """
 346
 347 =back
 348
 349 =cut
 350     """
 351     ( options, args, ) = optionParser.parse_args()
 352
 353     if len(args) < 1:
 354         optionParser.error("An input file is required")
 355     if len(args) > 2:
 356         optionParser.error("At most two arguments are allowed")
 357     if (options.adornment is not None
 358         and re.search('^([ou][]!"#$%&\'()*+,\-./:;<=>?@[\\^_`{|}~])+$',
 359                       options.adornment) is None):
 360         optionParser.error("Invalid adornment string given")
 361
 362     return args
 363
 364 ###############################################################################
 365
 366 def errorOut(lines):
 367     """
 368     Outputs messages as error.
 369
 370     @param lines: Messages to be output as single lines.
 371     @type lines: ( str, ..., )
 372
 373     @return: 0
 374     @rtype: int
 375     """
 376     scriptName = os.path.basename(sys.argv[0])
 377     for line in lines:
 378         print >>sys.stderr, ("%s: %s" % ( scriptName, line, ))
 379     return 0
 380
 381 ###############################################################################
 382
 383 def verboseOut(lines):
 384     """
 385     Outputs messages as a verbose message.
 386
 387     @param lines: Messages to be output as single lines.
 388     @type lines: ( str, ..., )
 389
 390     @return: 0
 391     @rtype: int
 392     """
 393     if options.verbose:
 394         errorOut([ "## " + line
 395                    for line in lines ])
 396     return 0
 397
 398 ###############################################################################
 399
 400 def errorExit(code, lines):
 401     """
 402     Exit program with an error message.
 403
 404     @param code: Exit Code to use.
 405     @type code: int
 406
 407     @param lines: Strings to output as error message.
 408     @type lines: ( str, ..., )
 409
 410     @return: Does not return.
 411     """
 412     errorOut(lines)
 413     sys.exit(code)
 414
 415 ###############################################################################
 416 ###############################################################################
 417 # Specialized functions
 418
 419 def convert(inNm, outNm):
 420     """
 421     Do the conversion.
 422
 423     @param inNm: Filename of input file.
 424     @type inNm: str
 425
 426     @param outNm: Filename of output file or None.
 427     @type outNm: str | None
 428     """
 429     try:
 430         inF = open(inNm)
 431     except IOError:
 432         errorExit(1, ( "Can't open input file %r" % ( inNm, ), ))
 433
 434     scriptP = os.path.dirname(os.path.realpath(ScriptNm))
 435     mainXsltNm = os.path.join(scriptP, MainXsltNm)
 436     try:
 437         mainXsltF = open(mainXsltNm)
 438     except IOError:
 439         errorExit(1, ( "Can't open main XSLT file %r" % ( mainXsltNm, ), ))
 440
 441     xsltParser = etree.XMLParser()
 442     mainXsltDoc = etree.parse(mainXsltF, xsltParser)
 443     mainXsltF.close()
 444     mainXslt = etree.XSLT(mainXsltDoc)
 445
 446     inParser = etree.XMLParser()
 447     try:
 448         inDoc = etree.parse(inF, inParser)
 449     except Exception, e:
 450         errorExit(1, ( "Error parsing input file %r: %s" % ( inNm, e, ), ))
 451     inF.close()
 452
 453     xsltParams = { }
 454     if options.fold is not None:
 455         xsltParams['fold'] = str(options.fold)
 456     if options.adornment is not None:
 457         xsltParams['adornment'] = "'" + options.adornment + "'"
 458     try:
 459         result = mainXslt(inDoc, **xsltParams)
 460     except Exception, e:
 461         errorExit(1, ( "Error transforming input file %r: %s" % ( inNm, e, ), ))
 462     # Chop off trailing linefeed - added somehow
 463     outS = str(result)[:-1]
 464     if outNm:
 465         try:
 466             outF = open(outNm, "w")
 467         except IOError:
 468             errorExit(1, ( "Can't open output file %r" % ( outNm, ), ))
 469         outF.write(outS)
 470         outF.close()
 471     else:
 472         print(outS)
 473
 474 ###############################################################################
 475 ###############################################################################
 476 # Classes
 477
 478 ########################################################################
 479 ##############################################################################
 480 # Now work
 481
 482 if __name__ == '__main__':
 483     arguments = parseOptions()
 484     inF = arguments[0]
 485     if len(arguments) > 1:
 486         outF = arguments[1]
 487     else:
 488         outF = None
 489     convert(inF, outF)
 490
 491 ##############################################################################
 492 ##############################################################################
 493
 494 # TODO Accept additional XSLT sheets to create a transformation pipeline
 495
 496 # TODO Move from XSLT to Python implementation step by step by replacing
 497 #      XSLT-code by Python code through extensions and other means
 498