src/pylit.py

   1 #!/usr/bin/env python
   2 # -*- coding: iso-8859-1 -*-
   3
   4 # ===============================================================
   5 # pylit.py: Literate programming with Python and reStructuredText
   6 # ===============================================================
   7 #
   8 # :Date:      2007-01-31
   9 # :Copyright: 2005, 2007 Guenter Milde.
  10 #             Released under the terms of the GNU General Public License
  11 #             (v. 2 or later)
  12 #
  13 # .. sectnum::
  14 # .. contents::
  15 #
  16 # Frontmatter
  17 # ===========
  18 #
  19 # Changelog
  20 # ---------
  21 #
  22 # :2005-06-29: Initial version
  23 # :2005-06-30: first literate version of the script
  24 # :2005-07-01: object orientated script using generators
  25 # :2005-07-10: Two state machine (later added 'header' state)
  26 # :2006-12-04: Start of work on version 0.2 (code restructuring)
  27 # :2007-01-23: 0.2   published at http://pylit.berlios.de
  28 # :2007-01-25: 0.2.1 Outsourced non-core documentation to the PyLit pages.
  29 # :2007-01-26: 0.2.2 new behaviour of `diff` function
  30 # :2007-01-29: 0.2.3 new `header` methods after suggestion by Riccardo Murri
  31 # :2007-01-31: 0.2.4 raise Error if code indent is too small
  32 # :2007-02-05: 0.2.5 new command line option --comment-string
  33 # :2007-02-09: 0.2.6 add section with open questions,
  34 #                    Code2Text: let only blank lines (no comment str)
  35 #                    separate text and code,
  36 #                    fix `Code2Text.header`
  37 # :2007-02-19: 0.2.7 simplify `Code2Text.header,`
  38 #                    new `iter_strip` method replacing a lot of ``if``-s
  39 # :2007-02-22: 0.2.8 set `mtime` of outfile to the one of infile
  40 # :2007-02-27: 0.3   new `Code2Text` converter after an idea by Riccardo Murri
  41 #
  42 # ::
  43
  44 """pylit: Literate programming with Python and reStructuredText
  45
  46    PyLit is a bidirectional converter between
  47
  48    * a (reStructured) text source with embedded code, and
  49    * a code source with embedded text blocks (comments)
  50 """
  51
  52 __docformat__ = 'restructuredtext'
  53
  54
  55 # Requirements
  56 # ------------
  57 #
  58 # * library modules
  59 #
  60 # ::
  61
  62 import re
  63 import os
  64 import sys
  65 import optparse
  66
  67 # * non-standard extensions
  68 #
  69 # ::
  70
  71 from simplestates import SimpleStates  # generic state machine
  72
  73
  74 # Classes
  75 # =======
  76 #
  77 # PushIterator
  78 # ------------
  79 #
  80 # The PushIterator is a minimal implementation of an iterator with
  81 # backtracking from the `Effective Python Programming`_ OSCON 2005 tutorial by
  82 # Anthony Baxter. As the definition is small, it is inlined now. For the full
  83 # reasoning and documentation see `iterqueue.py`_.
  84 #
  85 # .. _`Effective Python Programming`:
  86 #    http://www.interlink.com.au/anthony/tech/talks/OSCON2005/effective_r27.pdf
  87 #
  88 # .. _iterqueue.py: iterqueue.py.html
  89 #
  90 # ::
  91
  92 class PushIterator(object):
  93     def __init__(self, iterable):
  94         self.it = iter(iterable)
  95         self.cache = []
  96     def __iter__(self):
  97         """Return `self`, as this is already an iterator"""
  98         return self
  99     def next(self):
 100         return (self.cache and self.cache.pop()) or self.it.next()
 101     def push(self, value):
 102         self.cache.append(value)
 103
 104 # Converter
 105 # ---------
 106 #
 107 # The converter classes implement a simple `state machine` to separate and
 108 # transform text and code blocks. For this task, only a very limited parsing
 109 # is needed.  Using the full blown docutils_ rst parser would introduce a
 110 # large overhead and slow down the conversion.
 111 #
 112 # PyLit's simple parser assumes:
 113 #
 114 # * indented literal blocks in a text source are code blocks.
 115 #
 116 # * comment lines that start with a matching comment string in a code source
 117 #   are text blocks.
 118 #
 119 # .. _docutils: http://docutils.sourceforge.net/
 120 #
 121 # The actual converter classes are derived from `PyLitConverter`:
 122 # `Text2Code`_ converts a text source to executable code, while `Code2Text`_
 123 # does the opposite: converting commented code to a text source.
 124 #
 125 # The `PyLitConverter` class inherits the state machine framework
# (initialisation, scheduler, iterator interface, ...) from `SimpleStates`,
 127 # overrides the ``__init__`` method, and adds auxiliary methods and
 128 # configuration attributes (options). ::
 129
 130 class PyLitConverter(SimpleStates):
 131     """parent class for `Text2Code` and `Code2Text`, the state machines
 132     converting between text source and code source of a literal program.
 133     """
 134
 135 # Data attributes
 136 # ~~~~~~~~~~~~~~~
 137 #
 138 # The data attributes are class default values. They will be overridden by
 139 # matching keyword arguments during class instantiation.
 140 #
 141 # `get_converter`_ and `main`_ pass on unused keyword arguments to
 142 # the instantiation of a converter class. This way, keyword arguments
 143 # to these functions can be used to customize the converter.
 144
 145 # Default language and language specific defaults::
 146
 147     language =        "python"
 148     comment_strings = {"python": '# ',
 149                        "slang": '% ',
 150                        "c++": '// '}
 151
 152 # Number of spaces to indent code blocks in the code -> text conversion.[#]_
 153 #
 154 # .. [#] For the text -> code conversion, the codeindent is determined by the
 155 #        first recognized code line (leading comment or first indented literal
 156 #        block of the text source).
 157 #
 158 # ::
 159
 160     codeindent =  2
 161
 162 # Marker string for the first code block. (Should be a valid rst directive
 163 # that accepts code on the same line, e.g. ``'.. admonition::'``.)  No
 164 # trailing whitespace needed as indented code follows. Default is a comment
 165 # marker::
 166
 167     header_string = '..'
 168
 169 # Export to the output format stripping text or code blocks::
 170
 171     strip =           False
 172
 173 # Initial state::
 174
 175     state = 'header'
 176
 177
 178 # Instantiation
 179 # ~~~~~~~~~~~~~
 180 #
 181 # Initializing sets up the `data` attribute, an iterable object yielding
 182 # lines of the source to convert.[1]_   ::
 183
 184     def __init__(self, data, **keyw):
 185         """data   --  iterable data object
 186                       (list, file, generator, string, ...)
 187            **keyw --  all remaining keyword arguments are
 188                       stored as class attributes
 189         """
 190
 191 # As the state handlers need backtracking, the data is wrapped in a
 192 # `PushIterator`_ if it doesnot already have a `push` method::
 193
 194         if hasattr(data, 'push'):
 195             self.data = data
 196         else:
 197             self.data = PushIterator(data)
 198         self._textindent = 0
 199
 200 # Additional keyword arguments are stored as data attributes, overwriting the
 201 # class defaults::
 202
 203         self.__dict__.update(keyw)
 204
 205 # The comment string is set to the languages comment string if not given in
 206 # the keyword arguments::
 207
 208         if not hasattr(self, "comment_string") or not self.comment_string:
 209             self.comment_string = self.comment_strings[self.language]
 210
 211 # .. [1] The most common choice of data is a `file` object with the text
 212 #        or code source.
 213 #
 214 #        To convert a string into a suitable object, use its splitlines method
 215 #        with the optional `keepends` argument set to True.
 216 #
 217 # Converter.__str__
 218 # ~~~~~~~~~~~~~~~~~
 219 #
 220 # Return converted data as string::
 221
 222     def __str__(self):
 223         blocks = ["".join(block) for block in self()]
 224         return "".join(blocks)
 225
 226 # Converter.get_indent
 227 # ~~~~~~~~~~~~~~~~~~~~
 228 #
 229 # Return the number of leading spaces in `string` after expanding tabs ::
 230
 231     def get_indent(self, string):
 232         """Return the indentation of `string`.
 233         """
 234         line = string.expandtabs()
 235         return len(line) - len(line.lstrip())
 236
 237 # Converter.ensure_trailing_blank_line
 238 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 239 #
 240 # Ensure there is a blank line as last element of the list `lines`::
 241
 242     def ensure_trailing_blank_line(self, lines, next_line):
 243         if not lines:
 244             return
 245         if lines[-1].lstrip():
 246             sys.stderr.write("\nWarning: inserted blank line between\n %s %s"
 247                              %(lines[-1], next_line))
 248             lines.append("\n")
 249
 250
 251 # Converter.collect_blocks
 252
 253 # ::
 254
 255     def collect_blocks(self):
 256         """collect lines in a list
 257
 258         return list for each block of lines (paragraph) seperated by a
 259         blank line (whitespace only)
 260         """
 261         block = []
 262         for line in self.data:
 263             block.append(line)
 264             if not line.rstrip():
 265                 yield block
 266                 block = []
 267         yield block
 268
 269
 270 # Text2Code
 271 # ---------
 272 #
 273 # The `Text2Code` class separates code blocks (indented literal blocks) from
 274 # reStructured text. Code blocks are unindented, text is commented (or
# filtered, if the ``strip`` option is True).
 276 #
 277 # Only `indented literal blocks` are extracted. `quoted literal blocks` and
 278 # `pydoc blocks` are treated as text. This allows the easy inclusion of
 279 # examples: [#]_
 280 #
 281 #    >>> 23 + 3
 282 #    26
 283 #
 284 # .. [#] Mark that there is no double colon before the doctest block in
 285 #        the text source.
 286 #
 287 # The state handlers are implemented as generators. Iterating over a
 288 # `Text2Code` instance initializes them to generate iterators for
 289 # the respective states (see ``simplestates.py``).
 290 #
 291 # ::
 292
 293 class Text2Code(PyLitConverter):
 294     """Convert a (reStructured) text source to code source
 295     """
 296
 297 # INIT: call the parent classes init method.
 298 #
# If the `strip` argument is true, replace the `__iter__` method
 300 # with a special one that drops "spurious" blocks::
 301
 302     def __init__(self, data, **keyw):
 303         PyLitConverter.__init__(self, data, **keyw)
 304         if getattr(self, "strip", False):
 305             self.__iter__ = self.iter_strip
 306
 307 # Text2Code.header
 308 # ~~~~~~~~~~~~~~~~
 309 #
 310 # Convert the header (leading rst comment block) to code::
 311
 312     def header(self):
 313         """Convert header (comment) to code"""
 314         line = self.data_iterator.next()
 315
 316 # Test first line for rst comment: (We need to do this explicitely here, as
 317 # the code handler will only recognize the start of a text block if a line
 318 # starting with "matching comment" is preceded by an empty line. However, we
 319 # have to care for the case of the first line beeing a "text line".
 320 #
 321 # Which variant is better?
 322 #
 323 # 1. starts with comment marker and has
 324 #    something behind the comment on the first line::
 325
 326         # if line.startswith("..") and len(line.rstrip()) > 2:
 327
 328 # 2. Convert any leading comment to code::
 329
 330         if line.startswith(self.header_string):
 331
 332 # Strip leading comment string (typically added by `Code2Text.header`) and
 333 # return the result of processing the data with the code handler::
 334
 335             self.data_iterator.push(line.replace(self.header_string, "", 1))
 336             return self.code()
 337
 338 # No header code found: Push back first non-header line and set state to
 339 # "text"::
 340
 341         self.data_iterator.push(line)
 342         self.state = 'text'
 343         return []
 344
 345 # Text2Code.text_handler_generator
 346 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 347 #
 348 # The 'text' handler processes everything that is not an indented literal
 349 # comment. Text is quoted with `self.comment_string` or filtered (with
 350 # strip=True).
 351 #
 352 # It is implemented as a generator function that acts on the `data` iterator
 353 # and yields text blocks.
 354 #
 355 # Declaration and initialization::
 356
 357     def text_handler_generator(self):
 358         """Convert text blocks from rst to comment
 359         """
 360         lines = []
 361
 362 # Iterate over the data_iterator (which yields the data lines)::
 363
 364         for line in self.data_iterator:
 365             # print "Text: '%s'"%line
 366
 367 # Default action: add comment string and collect in `lines` list::
 368
 369             lines.append(self.comment_string + line)
 370
 371 # Test for the end of the text block: a line that ends with `::` but is neither
 372 # a comment nor a directive::
 373
 374             if (line.rstrip().endswith("::")
 375                 and not line.lstrip().startswith("..")):
 376
 377 # End of text block is detected, now:
 378 #
 379 # set the current text indent level (needed by the code handler to find the
 380 # end of code block) and set the state to "code" (i.e. the next call of
 381 # `self.next` goes to the code handler)::
 382
 383                 self._textindent = self.get_indent(line)
 384                 self.state = 'code'
 385
 386 # Ensure a trailing blank line (which is the paragraph separator in
 387 # reStructured Text. Look at the next line, if it is blank -- OK, if it is
 388 # not blank, push it back (it should be code) and add a line by calling the
 389 # `ensure_trailing_blank_line` method (which also issues a warning)::
 390
 391                 line = self.data_iterator.next()
 392                 if line.lstrip():
 393                     self.data_iterator.push(line) # push back
 394                     self.ensure_trailing_blank_line(lines, line)
 395                 else:
 396                     lines.append(line)
 397
 398 # Now yield and reset the lines. (There was a function call to remove a
 399 # literal marker (if on a line on itself) to shorten the comment. However,
 400 # this behaviour was removed as the resulting difference in line numbers leads
 401 # to misleading error messages in doctests)::
 402
 403                 #remove_literal_marker(lines)
 404                 yield lines
 405                 lines = []
 406
 407 # End of data: if we "fall of" the iteration loop, just join and return the
 408 # lines::
 409
 410         yield lines
 411
 412
 413 # Text2Code.code_handler_generator
 414 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 415 #
# The `code` handler is called when a literal block marker is encountered. It
 417 # returns a code block (indented literal block), removing leading whitespace
 418 # up to the indentation of the first code line in the file (this deviation
 419 # from docutils behaviour allows indented blocks of Python code).
 420 #
 421 # As the code handler detects the switch to "text" state by looking at
 422 # the line indents, it needs to push back the last probed data token. I.e.
 423 # the  data_iterator must support a `push` method. (This is the
 424 # reason for the use of the PushIterator class in `__init__`.) ::
 425
 426     def code_handler_generator(self):
 427         """Convert indented literal blocks to source code
 428         """
 429         lines = []
 430         codeindent = None  # indent of first non-blank code line, set below
 431         indent_string = "" # leading whitespace chars ...
 432
 433 # Iterate over the lines in the input data::
 434
 435         for line in self.data_iterator:
 436             # print "Code: '%s'"%line
 437
 438 # Pass on blank lines (no test for end of code block needed|possible)::
 439
 440             if not line.rstrip():
 441                 lines.append(line.replace(indent_string, "", 1))
 442                 continue
 443
 444 # Test for end of code block:
 445 #
 446 # A literal block ends with the first less indented, nonblank line.
 447 # `self._textindent` is set by the text handler to the indent of the
 448 # preceding paragraph.
 449 #
 450 # To prevent problems with different tabulator settings, hard tabs in code
 451 # lines  are expanded with the `expandtabs` string method when calculating the
 452 # indentation (i.e. replaced by 8 spaces, by default).
 453 #
 454 # ::
 455
 456             if self.get_indent(line) <= self._textindent:
 457                 # push back line
 458                 self.data_iterator.push(line)
 459                 self.state = 'text'
 460                 # append blank line (if not already present)
 461                 self.ensure_trailing_blank_line(lines, line)
 462                 yield lines
 463                 # reset list of lines
 464                 lines = []
 465                 continue
 466
 467 # OK, we are sure now that the current line is neither blank nor a text line.
 468 #
 469 # If still unset, determine the code indentation from first non-blank code
 470 # line::
 471
 472             if codeindent is None and line.lstrip():
 473                 codeindent = self.get_indent(line)
 474                 indent_string = line[:codeindent]
 475
 476 # Append unindented line to lines cache (but check if we can safely unindent
 477 # first)::
 478
 479             if not line.startswith(indent_string):
 480                 raise ValueError, "cannot unindent line %r,\n"%line \
 481                 + "  doesnot start with code indent string %r"%indent_string
 482
 483             lines.append(line[codeindent:])
 484
 485 # No more lines in the input data: just return what we have::
 486
 487         yield lines
 488
 489
# Text2Code.remove_literal_marker
 491 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 492 #
 493 # Remove literal marker (::) in "expanded form" i.e. in a paragraph on its own.
 494 #
 495 # While cleaning up the code source, it leads to confusion for doctest and
 496 # searches (e.g. grep) as line-numbers between text and code source will
 497 # differ. ::
 498
 499     def remove_literal_marker(list):
 500         try:
 501             # print lines[-3:]
 502             if (lines[-3].strip() == self.comment_string.strip()
 503                 and lines[-2].strip() == self.comment_string + '::'):
 504                 del(lines[-3:-1])
 505         except IndexError:
 506             pass
 507
 508 # Text2Code.iter_strip
 509 # ~~~~~~~~~~~~~~~~~~~~
 510 #
 511 # Modification of the `simplestates.__iter__` method that will replace it when
 512 # the `strip` keyword argument is `True` during class instantiation:
 513 #
 514 # Iterate over class instances dropping text blocks::
 515
 516     def iter_strip(self):
 517         """Generate and return an iterator dropping text blocks
 518         """
 519         self.data_iterator = self.data
 520         self._initialize_state_generators()
 521         while True:
 522             yield getattr(self, self.state)()
 523             getattr(self, self.state)() # drop text block
 524
 525
 526
 527 # Code2Text
 528 # ---------
 529 #
 530 # The `Code2Text` class does the opposite of `Text2Code`_ -- it processes
 531 # valid source code, extracts comments, and puts non-commented code in literal
 532 # blocks.
 533 #
 534 # Only lines starting with a comment string matching the one in the
 535 # `comment_string` data attribute are considered text lines.
 536 #
 537 # The class is derived from the PyLitConverter state machine and adds handlers
 538 # for the three states "header", "text", and "code". ::
 539
 540 class Code2Text(PyLitConverter):
 541     """Convert code source to text source
 542     """
 543
 544 # Code2Text.__iter__
 545
 546     def __iter__(self):
 547
 548 # If the last text block doesnot end with a code marker (by default, the
 549 # literal-block marker ``::``), the `text` method will set `code marker` to
 550 # a paragraph that will start the next code block. It is yielded if non-empty
 551 # at a text-code transition. If there is no preceding text block, `code_marker`
 552 # contains the  `header_string`::
 553
 554         if self.strip:
 555             self.code_marker = []
 556         else:
 557             self.code_marker = [self.header_string]
 558
 559         for block in self.collect_blocks():
 560
 561 # Test the state of the block, return it processed with the right handler::
 562
 563             if self.block_is_text(block):
 564                 self.state = "text"
 565             else:
 566                 if self.state != "code" and self.code_marker:
 567                     yield self.code_marker
 568                 self.state = "code"
 569             yield getattr(self, self.state)(block)
 570
 571
 572 # A paragraph is a text block, if every non-blank line starts with a matching
 573 # comment string  (test includes whitespace except for commented blank lines!)
 574 # ::
 575
 576     def block_is_text(self, block):
 577         for line in block:
 578             if (line.rstrip()
 579                 and not line.startswith(self.comment_string)
 580                 and line.rstrip() != self.comment_string.rstrip()):
 581                 return False
 582         return True
 583
 584 # "header" state
 585 # ~~~~~~~~~~~~~~~~
 586 #
 587 # Sometimes code needs to remain on the first line(s) of the document to be
 588 # valid. The most common example is the "shebang" line that tells a POSIX
 589 # shell how to process an executable file::
 590
 591 #!/usr/bin/env python
 592
# In Python, the ``# -*- coding: iso-8859-1 -*-`` line must occur before any
 594 # other comment or code.
 595 #
 596 # If we want to keep the line numbers in sync for text and code source, the
 597 # reStructured Text markup for these header lines must start at the same line
# as the first header line. Therefore, header lines could not be marked as
 599 # literal block (this would require the "::" and an empty line above the code).
 600 #
 601 # OTOH, a comment may start at the same line as the comment marker and it
 602 # includes subsequent indented lines. Comments are visible in the reStructured
 603 # Text source but hidden in the pretty-printed output.
 604 #
 605 # With a header converted to comment in the text source, everything before the
 606 # first text block (i.e. before the first paragraph using the matching comment
 607 # string) will be hidden away (in HTML or PDF output).
 608 #
 609 # This seems a good compromise, the advantages
 610 #
 611 # * line numbers are kept
# * the "normal" code conversion rules (indent/unindent by `codeindent`) apply
 613 # * greater flexibility: you can hide a repeating header in a project
 614 #   consisting of many source files.
 615 #
 616 # set off the disadvantages
 617 #
 618 # - it may come as surprise if a part of the file is not "printed",
 619 # - one more syntax element to learn for rst newbees to start with pylit,
 620 #   (however, starting from the code source, this will be auto-generated)
 621 #
 622 # In the case that there is no matching comment at all, the complete code
 623 # source will become a comment -- however, in this case it is not very likely
 624 # the source is a literate document anyway.
 625 #
 626 # If needed for the documentation, it is possible to repeat the header in (or
 627 # after) the first text block, e.g. with a `line block` in a `block quote`:
 628 #
 629 #   |  ``#!/usr/bin/env python``
 630 #   |  ``# -*- coding: iso-8859-1 -*-``
 631 #
 632 # The current implementation represents the header state by the setting of
 633 # `code_marker` to ``[self.header_string]``. The first non-empty text block
 634 # will overwrite this setting.
 635
 636 # Code2Text.text
 637 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 638 #
 639 # The *text state handler* converts a comment to a text block
 640 # Strip the leading comment string::
 641
 642     def text(self, lines):
 643         """Uncomment text blocks in source code
 644         """
 645
 646         lines = [line.replace(self.comment_string, "", 1) for line in lines]
 647
 648         lines = [re.sub("^"+self.comment_string.rstrip(), "", line)
 649                  for line in lines]
 650
 651         if self.strip:
 652             self.strip_literal_marker(lines)
 653             self.code_marker = []
 654
 655 # Check for code block marker (double colon) at the end of the text block
 656 # Update the `code_marker` argument. The current `code marker` is 'prepended'
 657 # to the next code block by `Code2Text.code`_ ::
 658
 659         elif len(lines)>1:
 660             if lines[-2].rstrip().endswith("::"):
 661                 self.code_marker = []
 662             else:
 663                 self.code_marker = ["::\n", "\n"]
 664
 665 # Return the text block to the calling function::
 666
 667         return lines
 668
 669
 670 # Code2Text.code
 671 # ~~~~~~~~~~~~~~
 672 #
 673 # The `code` method is called on non-commented code. Code is returned as
 674 # indented literal block (or filtered, if ``self.strip == True``). The amount
# of the code indentation is controlled by `self.codeindent` (default 2).
 676 # ::
 677
 678     def code(self, lines):
 679         """Indent lines or strip if `strip` == `True`
 680         """
 681         if self.strip == True:
 682             return []
 683
 684         return [" "*self.codeindent + line for line in lines]
 685
 686 # Code2Text.strip_literal_marker
 687 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 688 #
 689 # If the code block is stripped, the literal marker would lead to an error
 690 # when the text is converted with docutils. Replace it with the equivalent of
 691 # docutils replace rules
 692 #
 693 # * strip `::`-line (and preceding blank line) if on a line on its own
 694 # * strip `::` if it is preceded by whitespace.
 695 # * convert `::` to a single colon if preceded by text
 696 #
 697 # `lines` should be list of text lines (with a trailing blank line).
 698 # It is modified in-place::
 699
 700     def strip_literal_marker(self, lines):
 701         try:
 702             line = lines[-2]
 703         except IndexError:  # len(lines < 2)
 704             return
 705
 706         # split at rightmost '::'
 707         try:
 708             (head, tail) = line.rsplit('::', 1)
 709         except ValueError:  # only one part (no '::')
 710             return
 711
 712         # '::' on an extra line
 713         if not head.strip():
 714             del(lines[-2])
 715             # delete preceding line if it is blank
 716             if len(lines) >= 2 and not lines[-2].lstrip():
 717                 del(lines[-2])
 718         # '::' follows whitespace
 719         elif head.rstrip() < head:
 720             head = head.rstrip()
 721             lines[-2] = "".join((head, tail))
 722         # '::' follows text
 723         else:
 724             lines[-2] = ":".join((head, tail))
 725
 726
 727
 728 # Command line use
 729 # ================
 730 #
 731 # Using this script from the command line will convert a file according to its
 732 # extension. This default can be overridden by a couple of options.
 733 #
 734 # Dual source handling
 735 # --------------------
 736 #
 737 # How to determine which source is up-to-date?
 738 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 739 #
# - set modification date of `outfile` to the one of `infile`
 741 #
 742 #   Points out that the source files are 'synchronized'.
 743 #
 744 #   * Are there problems to expect from "backdating" a file? Which?
 745 #
 746 #     Looking at http://www.unix.com/showthread.php?t=20526, it seems
 747 #     perfectly legal to set `mtime` (while leaving `ctime`) as `mtime` is a
 748 #     description of the "actuality" of the data in the file.
 749 #
 750 #   * Should this become a default or an option?
 751 #
 752 # - alternatively move input file to a backup copy (with option: `--replace`)
 753 #
 754 # - check modification date before overwriting
 755 #   (with option: `--overwrite=update`)
 756 #
 757 # - check modification date before editing (implemented as `Jed editor`_
 758 #   function `pylit_check()` in `pylit.sl`_)
 759 #
 760 # .. _Jed editor: http://www.jedsoft.org/jed/
 761 # .. _pylit.sl: http://jedmodes.sourceforge.net/mode/pylit/
 762 #
 763 # Recognised Filename Extensions
 764 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 765 #
 766 # Finding an easy to remember, unused filename extension is not easy.
 767 #
 768 # .py.txt
 769 #   a double extension (similar to .tar.gz, say) seems most appropriate
 770 #   (at least on UNIX). However, it fails on FAT16 filesystems.
 771 #   The same scheme can be used for c.txt, p.txt and the like.
 772 #
 773 # .pytxt
 774 #   is recognised as extension by os.path.splitext but also fails on FAT16
 775 #
 776 # .pyt
 777 #   (PYthon Text) is used by the Python test interpreter
#   `pytest <http://www.zetadev.com/software/pytest/>`__
 779 #
 780 # .pyl
 781 #   was even mentioned as extension for "literate Python" files in an
 782 #   email exchange (http://www.python.org/tim_one/000115.html) but
 783 #   subsequently used for Python libraries.
 784 #
 785 # .lpy
 786 #   seems to be free (as by a Google search, "lpy" is the name of a python
 787 #   code pretty printer but this should not pose a problem).
 788 #
 789 # .tpy
 790 #   seems to be free as well.
 791 #
# Instead of defining a new extension for "pylit" literate programs,
 793 # by default ``.txt`` will be appended for literate code and stripped by
 794 # the conversion to executable code. i.e. for a program foo:
 795 #
 796 # * the literate source is called ``foo.py.txt``
 797 # * the html rendering is called ``foo.py.html``
 798 # * the python source is called ``foo.py``
 799 #
 800 #
 801 #
 802 # OptionValues
 803 # ------------
 804 #
 805 # For use as keyword arguments, it is handy to have the options
 806 # in a dictionary. The following class adds an `as_dict` method
 807 # to  `optparse.Values`::
 808
 809 class OptionValues(optparse.Values):
 810     def as_dict(self):
 811         """Return options as dictionary object"""
 812         return dict([(option, getattr(self, option)) for option in dir(self)
 813                      if option not in dir(OptionValues)
 814                      and option is not None
 815                     ])
 816
 817 # PylitOptions
 818 # ------------
 819 #
 820 # Options are stored in the values attribute of the `PylitOptions` class.
 821 # It is initialized with default values and parsed command line options (and
 822 # arguments)  This scheme allows easy customization by code importing the
 823 # `pylit` module. ::
 824
 825 class PylitOptions(object):
 826     """Storage and handling of program options
 827     """
 828
 829 # Recognized file extensions for text and code versions of the source::
 830
 831     code_languages  = {".py": "python",
 832                        ".sl": "slang",
 833                        ".c": "c++"}
 834     code_extensions = code_languages.keys()
 835     text_extensions = [".txt"]
 836
 837 # Instantiation
 838 # ~~~~~~~~~~~~~
 839 #
 840 # Instantiation sets up an OptionParser and initializes it with pylit's
 841 # command line options and `default_values`. It then updates the values based
 842 # on command line options and sensible defaults::
 843
 844     def __init__(self, args=sys.argv[1:], **default_values):
 845         """Set up an `OptionParser` instance and parse and complete arguments
 846         """
 847         p = optparse.OptionParser(usage=main.__doc__, version="0.2")
 848         # set defaults
 849         p.set_defaults(**default_values)
 850         # add the options
 851         p.add_option("-c", "--code2txt", dest="txt2code", action="store_false",
 852                      help="convert code to reStructured text")
 853         p.add_option("--comment-string", dest="comment_string",
 854                      help="text block marker (default '# ' (for Python))" )
 855         p.add_option("-d", "--diff", action="store_true",
 856                      help="test for differences to existing file")
 857         p.add_option("--doctest", action="store_true",
 858                      help="run doctest.testfile() on the text version")
 859         p.add_option("-e", "--execute", action="store_true",
 860                      help="execute code (Python only)")
 861         p.add_option("-f", "--infile",
 862                      help="input file name ('-' for stdout)" )
 863         p.add_option("--overwrite", action="store",
 864                      choices = ["yes", "update", "no"],
 865                      help="overwrite output file (default 'update')")
 866         p.add_option("-o", "--outfile",
 867                      help="output file name ('-' for stdout)" )
 868         p.add_option("--replace", action="store_true",
 869                      help="move infile to a backup copy (appending '~')")
 870         p.add_option("-s", "--strip", action="store_true",
 871                      help="export by stripping text or code")
 872         p.add_option("-t", "--txt2code", action="store_true",
 873                      help="convert reStructured text to code")
 874         self.parser = p
 875
 876         # parse to fill a self.Values instance
 877         self.values = self.parse_args(args)
 878         # complete with context-sensitive defaults
 879         self.values = self.complete_values(self.values)
 880
 881 # Calling
 882 # ~~~~~~~
 883 #
 884 # "Calling" an instance updates the option values based on command line
 885 # arguments and default values and does a completion of the options based on
 886 # "context-sensitive defaults"::
 887
 888     def __call__(self, args=sys.argv[1:], **default_values):
 889         """parse and complete command line args
 890         """
 891         values = self.parse_args(args, **default_values)
 892         return self.complete_values(values)
 893
 894
 895 # PylitOptions.parse_args
 896 # ~~~~~~~~~~~~~~~~~~~~~~~
 897 #
 898 # The `parse_args` method calls the `optparse.OptionParser` on command
 899 # line or provided args and returns the result as `PylitOptions.Values`
 900 # instance.  Defaults can be provided as keyword arguments::
 901
 902     def parse_args(self, args=sys.argv[1:], **default_values):
 903         """parse command line arguments using `optparse.OptionParser`
 904
 905            args           --  list of command line arguments.
 906            default_values --  dictionary of option defaults
 907         """
 908         # update defaults
 909         defaults = self.parser.defaults.copy()
 910         defaults.update(default_values)
 911         # parse arguments
 912         (values, args) = self.parser.parse_args(args, OptionValues(defaults))
 913         # Convert FILE and OUTFILE positional args to option values
 914         # (other positional arguments are ignored)
 915         try:
 916             values.infile = args[0]
 917             values.outfile = args[1]
 918         except IndexError:
 919             pass
 920         return values
 921
 922 # PylitOptions.complete_values
 923 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 924 #
 925 # The `complete` method uses context information to set missing option values
 926 # to sensible defaults (if possible).
 927 #
 928 # ::
 929
 930     def complete_values(self, values):
 931         """complete option values with context sensible defaults
 932         """
 933         values.ensure_value("infile", "")
 934         # Guess conversion direction from infile filename
 935         if values.ensure_value("txt2code", None) is None:
 936             in_extension = os.path.splitext(values.infile)[1]
 937             if in_extension in self.text_extensions:
 938                 values.txt2code = True
 939             elif in_extension in self.code_extensions:
 940                 values.txt2code = False
 941         # Auto-determine the output file name
 942         values.ensure_value("outfile", self.get_outfile_name(values.infile,
 943                                                              values.txt2code))
 944         # Guess conversion direction from outfile filename or set to default
 945         if values.txt2code is None:
 946             out_extension = os.path.splitext(values.outfile)[1]
 947             values.txt2code = not (out_extension in self.text_extensions)
 948
 949         # Set the language of the code (default "python")
 950         if values.txt2code is True:
 951             code_extension = os.path.splitext(values.outfile)[1]
 952         elif values.txt2code is False:
 953             code_extension = os.path.splitext(values.infile)[1]
 954         values.ensure_value("language",
 955                             self.code_languages.get(code_extension, "python"))
 956
 957         # Set the default overwrite mode
 958         values.ensure_value("overwrite", 'update')
 959
 960         return values
 961
 962 # PylitOptions.get_outfile_name
 963 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 964 #
 965 # Construct a matching filename for the output file. The output filename is
 966 # constructed from `infile` by the following rules:
 967 #
 968 # * '-' (stdin) results in '-' (stdout)
 969 # * strip the `txt_extension` or add the `code_extension` (txt2code)
 970 # * add a `txt_ extension` (code2txt)
 971 # * fallback: if no guess can be made, add ".out"
 972 #
 973 # ::
 974
 975     def get_outfile_name(self, infile, txt2code=None):
 976         """Return a matching output filename for `infile`
 977         """
 978         # if input is stdin, default output is stdout
 979         if infile == '-':
 980             return '-'
 981         # Modify `infile`
 982         (base, ext) = os.path.splitext(infile)
 983         # TODO: should get_outfile_name() use self.values.outfile_extension
 984         #       if it exists?
 985
 986         # strip text extension
 987         if ext in self.text_extensions:
 988             return base
 989         # add (first) text extension for code files
 990         if ext in self.code_extensions or txt2code == False:
 991             return infile + self.text_extensions[0]
 992         # give up
 993         return infile + ".out"
 994
 995
 996
 997 # Helper functions
 998 # ----------------
 999 #
1000 # open_streams
1001 # ~~~~~~~~~~~~
1002 #
1003 # Return file objects for in- and output. If the input path is missing,
1004 # write usage and abort. (An alternative would be to use stdin as default.
1005 # However,  this leaves the uninitiated user with a non-responding application
1006 # if (s)he just tries the script without any arguments) ::
1007
def open_streams(infile = '-', outfile = '-', overwrite='update', **keyw):
    """Open and return the input and output stream

    open_streams(infile, outfile) -> (in_stream, out_stream)

    in_stream   --  open(infile) or sys.stdin (if infile == '-')
    out_stream  --  open(outfile, 'w') or sys.stdout (if outfile == '-')
    overwrite   --  ['yes', 'update', 'no']
                    if 'no', do not open an existing output file,
                    if 'update', do not open the output file if it is
                    newer than the input file.
                    Irrelevant if outfile == '-'.
    **keyw      --  additional keyword arguments are accepted and ignored

    Raise IOError if `infile` is empty, if `overwrite` forbids writing to
    `outfile`, or if opening a file fails.
    """
    if not infile:
        strerror = "Missing input file name ('-' for stdin; -h for help)"
        raise IOError(2, strerror, infile)
    if infile == '-':
        in_stream = sys.stdin
    else:
        in_stream = open(infile, 'r')
    try:
        if outfile == '-':
            out_stream = sys.stdout
        elif overwrite == 'no' and os.path.exists(outfile):
            raise IOError(1, "Output file exists!", outfile)
        elif overwrite == 'update' and is_newer(outfile, infile):
            raise IOError(1, "Output file is newer than input file!", outfile)
        else:
            out_stream = open(outfile, 'w')
    except IOError:
        # No output stream can be provided: do not leave the already
        # opened input file behind.
        if in_stream is not sys.stdin:
            in_stream.close()
        raise
    return (in_stream, out_stream)
1036
1037 # is_newer
1038 # ~~~~~~~~
1039 #
1040 # ::
1041
def is_newer(path1, path2):
    """Tell whether the file at `path1` was modified later than `path2`

    The modification times (mtime) of both paths are compared.

    A path whose mtime cannot be determined (e.g. a non-existing file)
    counts as oldest: the result is False if only `path1` is missing and
    True if only `path2` is missing.

    Return None if both modification times are equal. (None is false in a
    boolean context but can be told apart from False.)
    """
    timestamps = []
    for path in (path1, path2):
        try:
            timestamps.append(os.path.getmtime(path))
        except OSError:
            # placeholder that is older than any real modification time
            timestamps.append(-1)
    (first, second) = timestamps

    if first != second:
        return first > second
    return None
1067
1068
1069 # get_converter
1070 # ~~~~~~~~~~~~~
1071 #
1072 # Get an instance of the converter state machine::
1073
def get_converter(data, txt2code=True, **keyw):
    """Return a converter instance for `data`

    data      --  passed on to the converter as first argument
    txt2code  --  select `Text2Code` if true, `Code2Text` otherwise
    **keyw    --  passed on to the converter
    """
    if not txt2code:
        return Code2Text(data, **keyw)
    return Text2Code(data, **keyw)
1079
1080
1081 # Use cases
1082 # ---------
1083 #
1084 # run_doctest
1085 # ~~~~~~~~~~~
1086 #
1087 # ::
1088
def run_doctest(infile="-", txt2code=True,
                globs=None, verbose=False, optionflags=0, **keyw):
    """run doctest on the text source

    infile       --  name of the source file ('-' for stdin)
    txt2code     --  False, if `infile` is a code source (it is converted
                     to text before testing)
    globs        --  dictionary of globals for the tests (default: empty),
                     the tests work on a copy
    verbose      --  passed on to `doctest.DocTestRunner`
    optionflags  --  passed on to `doctest.DocTestRunner`
    **keyw       --  passed on to the `Code2Text` converter

    Return the tuple (number of failures, number of tests).
    """
    from doctest import DocTestParser, DocTestRunner
    (data, out_stream) = open_streams(infile, "-")

# If source is code, convert to text, as tests in comments are not found by
# doctest::

    try:
        if txt2code is False:
            converter = Code2Text(data, **keyw)
            docstring = str(converter)
        else:
            docstring = data.read()
    finally:
        # the text is available as string now, release the input file
        if data is not sys.stdin:
            data.close()

# Use the doctest Advanced API to do all doctests in a given string::

    if globs is None:
        globs = {}
    # Pass a copy: by default, `DocTestRunner.run` clears the globals of the
    # test after running it.
    test = DocTestParser().get_doctest(docstring, globs=dict(globs), name="",
                                       filename=infile, lineno=0)
    runner = DocTestRunner(verbose=verbose, optionflags=optionflags)
    runner.run(test)
    # `summarize` must be called to print the summary
    runner.summarize()
    if not runner.failures:
        print("%d failures in %d tests" % (runner.failures, runner.tries))
    return runner.failures, runner.tries
1115
1116
1117 # diff
1118 # ~~~~
1119 #
1120 # ::
1121
def diff(infile='-', outfile='-', txt2code=True, **keyw):
    """Report differences between converted infile and existing outfile

    If outfile is '-', do a round-trip conversion and report differences

    infile    --  name of the input file ('-' for stdin)
    outfile   --  name of the file to compare the conversion with
    txt2code  --  direction of the conversion
    **keyw    --  passed on to the converter

    Print a unified diff and return True if there are differences,
    print a short notice and return False otherwise.
    """

    import difflib

    # for diffing, we need a copy of the data as list::
    if infile == '-':
        data = sys.stdin.readlines()
    else:
        instream = open(infile)
        try:
            data = instream.readlines()
        finally:
            instream.close()
    # convert
    converter = get_converter(data, txt2code, **keyw)
    new = str(converter).splitlines(True)

    if outfile != '-':
        outstream = open(outfile)
        try:
            old = outstream.readlines()
        finally:
            outstream.close()
        oldname = outfile
        newname = "<conversion of %s>"%infile
    else:
        old = data
        oldname = infile
        # back-convert the output data
        # NOTE(review): `keyw` is not passed to the back-conversion, so it
        # runs with the converter defaults -- confirm that this is intended.
        converter = get_converter(new, not txt2code)
        new = str(converter).splitlines(True)
        newname = "<round-conversion of %s>"%infile

    # find and print the differences
    delta = list(difflib.unified_diff(old, new, fromfile=oldname,
                                      tofile=newname))
    if not delta:
        print(oldname)
        print(newname)
        print("no differences found")
        return False
    print("".join(delta))
    return True
1160
1161 # main
1162 # ----
1163 #
1164 # If this script is called from the command line, the `main` function will
1165 # convert the input (file or stdin) between text and code formats.
1166
1167 # Customization
1168 # ~~~~~~~~~~~~~
1169 #
# Option defaults for the conversion can be given as keyword arguments to
# `main`_.  The option defaults will be updated by command line options and
# extended with "intelligent guesses" by `PylitOptions` and passed on to
# helper functions and the converter instantiation.
1174
# This allows easy customization for programmatic use -- just call `main`
# with the appropriate keyword options (or with an `option_defaults`
# dictionary), e.g.:
1178
1179 # >>> option_defaults = {'language': "c++",
1180 # ...                    'codeindent': 4,
1181 # ...                    'header_string': '..admonition::'
1182 # ...                   }
1183 #
1184 # >>> main(**option_defaults)
1185 #
1186 # ::
1187
def main(args=sys.argv[1:], **option_defaults):
    """%prog [options] FILE [OUTFILE]

    Convert between reStructured Text with embedded code, and
    Source code with embedded text comment blocks"""
    # The docstring is used as usage message of the option parser
    # (see `PylitOptions.__init__`), further documentation goes here:
    #
    # args             --  list of command line arguments
    # option_defaults  --  option defaults, passed on to `PylitOptions`

# Parse and complete the options::

    options = PylitOptions(args, **option_defaults).values

# Run doctests if ``--doctest`` option is set::

    if options.ensure_value("doctest", None):
        return run_doctest(**options.as_dict())

# Do a round-trip and report differences if the ``--diff`` option is set::

    if options.ensure_value("diff", None):
        return diff(**options.as_dict())

# Open in- and output streams::

    try:
        (data, out_stream) = open_streams(**options.as_dict())
    except IOError:
        # fetch the exception in a way that works with all Python versions
        ex = sys.exc_info()[1]
        print("IOError: %s %s" % (ex.filename, ex.strerror))
        sys.exit(ex.errno)

# Get a converter instance::

    converter = get_converter(data, **options.as_dict())

# Execute if the ``--execute`` option is set::

    if options.ensure_value("execute", None):
        print("executing " + options.infile)
        if options.txt2code:
            code = str(converter)
        else:
            code = data
        exec(code)
        return

# Default action: Convert and write to out_stream::

    out_stream.write(str(converter))

    if out_stream is not sys.stdout:
        print("extract written to %s" % out_stream.name)
        out_stream.close()

    # The input is converted, release the input file (it must not be
    # open any more if it is renamed below).
    if data is not sys.stdin:
        data.close()

# Rename the infile to a backup copy if ``--replace`` is set::

    if options.ensure_value("replace", None):
        os.rename(options.infile, options.infile + "~")

# If not (and input and output are from files), set the modification time
# (`mtime`) of the output file to the one of the input file to indicate that
# the contained information is equal.[#]_ ::

    else:
        try:
            os.utime(options.outfile, (os.path.getatime(options.outfile),
                                       os.path.getmtime(options.infile))
                    )
        except OSError:
            # best effort: e.g. no files to stat if '-' is used
            pass
1255
1256     ## print "mtime", os.path.getmtime(options.infile),  options.infile
1257     ## print "mtime", os.path.getmtime(options.outfile), options.outfile
1258
1259
# .. [#] Make sure the corresponding file object (here `out_stream`) is
#        closed, as otherwise the change will be overwritten when `close` is
#        called afterwards (either explicitly or at program exit).
1263 #
1264 # Run main, if called from the command line::
1265
if __name__ == '__main__':
    # script use: convert according to the command line (default `args`)
    main()
1268
1269
1270 # Open questions
1271 # ==============
1272 #
1273 # Open questions and ideas for further development
1274 #
1275 # Options
1276 # -------
1277 #
1278 # * Collect option defaults in a dictionary (on module level)
1279 #
1280 #   Facilitates the setting of options in programmatic use
1281 #
1282 #   Use templates for the "intelligent guesses" (with Python syntax for string
1283 #   replacement with dicts: ``"hello %(what)s" % {'what': 'world'}``)
1284 #
1285 # * Is it sensible to offer the `header_string` option also as command line
1286 #   option?
1287 #
1288 # * Configurable
1289 #
1290 # Parsing Problems
1291 # ----------------------
1292 #
1293 # * How can I include a literal block that should not be in the
1294 #   executable code (e.g. an example, an earlier version or variant)?
1295 #
1296 #   Workaround:
#     Use a `quoted literal block` (with a quotation different from
#     the comment string used for text blocks) to keep it commented over
#     the code-text round-trips.
1300 #
1301 #     Python `pydoc` examples can also use the special pydoc block syntax (no
1302 #     double colon!).
1303 #
1304 #   Alternative:
1305 #     use a special "code block" directive or a special "no code
1306 #     block" directive.
1307 #
1308 # * ignore "matching comments" in literal strings?
1309 #
#   (would need a specific detection algorithm for every language that
#   supports multi-line literal strings (C++, PHP, Python))
1312 #
1313 # * Warn if a comment in code will become text after round-trip?
1314 #
1315 # code syntax highlight
1316 # ---------------------
1317 #
1318 # use `listing` package in LaTeX->PDF
1319 #
1320 # in html, see
1321 #
1322 # * the syntax highlight support in rest2web
1323 #   (uses the Moin-Moin Python colorizer, see a version at
1324 #   http://www.standards-schmandards.com/2005/fangs-093/)
1325 # * Pygments (pure Python, many languages, rst integration recipe):
1326 #   http://pygments.org/docs/rstdirective/
1327 # * Silvercity, enscript, ...
1328 #
1329 # Some plug-ins require a special "code block" directive instead of the
1330 # `::`-literal block. TODO: make this an option
1331 #
1332 # Ask at docutils users|developers
1333 #
1334 # * How to handle docstrings in code blocks? (it would be nice to convert them
1335 #   to rst-text if ``__docformat__ == restructuredtext``)
1336 #