examples/pylit.py

   1 #!/usr/bin/env python
   2 # -*- coding: iso-8859-1 -*-
   3
   4 # ===============================================================
   5 # pylit.py: Literate programming with Python and reStructuredText
   6 # ===============================================================
   7 #
   8 # :Date:      2007-01-31
   9 # :Copyright: 2005, 2007 Guenter Milde.
  10 #             Released under the terms of the GNU General Public License
  11 #             (v. 2 or later)
  12 #
  13 # .. sectnum::
  14 # .. contents::
  15 #
  16 # Frontmatter
  17 # ===========
  18 #
  19 # Changelog
  20 # ---------
  21 #
  22 # :2005-06-29: Initial version
  23 # :2005-06-30: first literate version of the script
  24 # :2005-07-01: object orientated script using generators
  25 # :2005-07-10: Two state machine (later added 'header' state)
  26 # :2006-12-04: Start of work on version 0.2 (code restructuring)
  27 # :2007-01-23: 0.2   published at http://pylit.berlios.de
  28 # :2007-01-25: 0.2.1 Outsourced non-core documentation to the PyLit pages.
  29 # :2007-01-26: 0.2.2 new behaviour of `diff` function
  30 # :2007-01-29: 0.2.3 new `header` methods after suggestion by Riccardo Murri
  31 # :2007-01-31: 0.2.4 raise Error if code indent is too small
  32 # :2007-02-05: 0.2.5 new command line option --comment-string
  33 # :2007-02-09: 0.2.6 add section with open questions,
  34 #                    Code2Text: let only blank lines (no comment str)
  35 #                    separate text and code,
  36 #                    fix `Code2Text.header`
  37 # :2007-02-19: 0.2.7 simplify `Code2Text.header,`
  38 #                    new `iter_strip` method replacing a lot of ``if``-s
  39 # :2007-02-22: 0.2.8 set `mtime` of outfile to the one of infile
  40 #
  41 # ::
  42
  43 """pylit: Literate programming with Python and reStructuredText
  44
  45    PyLit is a bidirectional converter between
  46
  47    * a (reStructured) text source with embedded code, and
  48    * a code source with embedded text blocks (comments)
  49 """
  50
  51 __docformat__ = 'restructuredtext'
  52
  53
  54 # Requirements
  55 # ------------
  56 #
  57 # * library modules
  58 #
  59 # ::
  60
  61 import os
  62 import sys
  63 import optparse
  64
  65 # * non-standard extensions
  66 #
  67 # ::
  68
  69 from simplestates import SimpleStates  # generic state machine
  70
  71
  72 # Option defaults
  73 # ===============
  74 #
  75 # Module-level option defaults for the conversion can be stored in a
  76 # dictionary that is passed as keyword arguments to `main`_.
  77
  78 # `option_defaults` will be updated by command line options and extended with
  79 # "intelligent guesses" by `PylitOptions` and passed on to helper functions
  80 # and the converter instantiation.
  81
  82 # This allows easy customization for programmatic use -- just overwrite the
  83 # defaults after importing `pylit` but before calling `main` (or call `main`
  84 # with a custom `option_values` dictionary.)
  85 #
  86 # ::
  87
  88 option_defaults = {}
  89
  90 # Default language and language specific defaults::
  91
  92 option_defaults['language'] = "python"
  93 option_defaults['comment_strings'] = {"python": '# ',
  94                                       "slang": '% ',
  95                                       "c++": '// '}
  96 option_defaults['code_languages']  = {".py": "python",
  97                                       ".sl": "slang",
  98                                       ".c": "c++"}
  99 option_defaults['code_extensions'] = option_defaults['code_languages'].keys()
 100 option_defaults['text_extensions'] = [".txt"]
 101
 102 # Number of spaces to indent code blocks in the code -> text conversion.[#]_
 103 #
 104 # .. [#] For the text -> code conversion, the codeindent is determined by the
 105 #        first recognized code line (leading comment or first indented literal
 106 #        block).
 107 #
 108 # ::
 109
 110 option_defaults['codeindent'] = 2
 111
 112 # Marker string for the first code block. (Should be a valid rst directive
 113 # that accepts code on the same line, e.g. ``'.. admonition::'``.)  No
 114 # trailing whitespace needed as indented code follows. ::
 115
 116 option_defaults['header_string'] = '..'
 117
 118 # Export to the output format stripping text or code blocks::
 119
 120 option_defaults['strip'] = False
 121
 122 # Execute code (Python only)")::
 123
 124 option_defaults['execute'] = False
 125
 126
 127
 128
 129 # Classes
 130 # =======
 131 #
 132 # PushIterator
 133 # ------------
 134 #
 135 # The PushIterator is a minimal implementation of an iterator with
 136 # backtracking from the `Effective Python Programming`_ OSCON 2005 tutorial by
 137 # Anthony Baxter. As the definition is small, it is inlined now. For the full
 138 # reasoning and documentation see `iterqueue.py`_.
 139 #
 140 # .. _`Effective Python Programming`:
 141 #    http://www.interlink.com.au/anthony/tech/talks/OSCON2005/effective_r27.pdf
 142 #
 143 # .. _iterqueue.py: iterqueue.py.html
 144 #
 145 # ::
 146
 147 class PushIterator:
 148     def __init__(self, iterable):
 149         self.it = iter(iterable)
 150         self.cache = []
 151     def __iter__(self):
 152         """Return `self`, as this is already an iterator"""
 153         return self
 154     def next(self):
 155         return (self.cache and self.cache.pop()) or self.it.next()
 156     def push(self, value):
 157         self.cache.append(value)
 158
 159 # Converter
 160 # ---------
 161 #
 162 # The converter classes implement a simple `state machine` to separate and
 163 # transform text and code blocks. For this task, only a very limited parsing
 164 # is needed.  Using the full blown docutils_ rst parser would introduce a
 165 # large overhead and slow down the conversion.
 166 #
 167 # PyLit's simple parser assumes:
 168 #
 169 # * indented literal blocks in a text source are code blocks.
 170 #
 171 # * comment lines that start with a matching comment string in a code source
 172 #   are text blocks.
 173 #
 174 # .. _docutils: http://docutils.sourceforge.net/
 175 #
 176 # The actual converter classes are derived from `PyLitConverter`:
 177 # `Text2Code`_ converts a text source to executable code, while `Code2Text`_
 178 # does the opposite: converting commented code to a text source.
 179 #
 180 # The `PyLitConverter` class inherits the state machine framework
  181 # (initialisation, scheduler, iterator interface, ...) from `SimpleStates`,
 182 # overrides the ``__init__`` method, and adds auxiliary methods and
 183 # configuration attributes (options). ::
 184
class PyLitConverter(SimpleStates):
    """parent class for `Text2Code` and `Code2Text`, the state machines
    converting between text source and code source of a literate program.

    The class attributes below hold the option defaults. Keyword arguments
    given at instantiation are stored as instance attributes and thus
    override them.
    """

# Data attributes
# ~~~~~~~~~~~~~~~
#
# The data attributes are class default values. They will be overridden by
# matching keyword arguments during class instantiation.
#
# `get_converter`_ and `main`_ pass on unused keyword arguments to
# the instantiation of a converter class. This way, keyword arguments
# to these functions can be used to customize the converter. ::

    # defaults copied from the module-level `option_defaults` dictionary
    language =        option_defaults['language']
    comment_strings = option_defaults['comment_strings']
    strip =           option_defaults['strip']
    codeindent =      option_defaults['codeindent']
    header_string =   option_defaults['header_string']

    # name of the state the conversion starts in
    state = 'header' # initial state
 207
 208 # Instantiation
 209 # ~~~~~~~~~~~~~
 210 #
 211 # Initializing sets up the `data` attribute, an iterable object yielding
 212 # lines of the source to convert.[1]_   ::
 213
 214     def __init__(self, data, **keyw):
 215         """data   --  iterable data object
 216                       (list, file, generator, string, ...)
 217            **keyw --  all remaining keyword arguments are
 218                       stored as class attributes
 219         """
 220
 221 # As the state handlers need backtracking, the data is wrapped in a
 222 # `PushIterator`_ if it doesnot already have a `push` method::
 223
 224         if hasattr(data, 'push'):
 225             self.data = data
 226         else:
 227             self.data = PushIterator(data)
 228         self._textindent = 0
 229
 230 # Additional keyword arguments are stored as data attributes, overwriting the
 231 # class defaults::
 232
 233         self.__dict__.update(keyw)
 234
 235 # The comment string is set to the languages comment string if not given in
 236 # the keyword arguments::
 237
 238         if not hasattr(self, "comment_string") or not self.comment_string:
 239             self.comment_string = self.comment_strings[self.language]
 240
 241 # If the `strip` argument is true, replace the `__iter_` method
 242 # with a special one that drops "spurious" blocks::
 243
 244         if getattr(self, "strip", False):
 245             self.__iter__ = self.iter_strip
 246
 247 # .. [1] The most common choice of data is a `file` object with the text
 248 #        or code source.
 249 #
 250 #        To convert a string into a suitable object, use its splitlines method
 251 #        with the optional `keepends` argument set to True.
 252 #
 253 # Converter.__str__
 254 # ~~~~~~~~~~~~~~~~~
 255 #
 256 # Return converted data as string::
 257
 258     def __str__(self):
 259         blocks = ["".join(block) for block in self()]
 260         return "".join(blocks)
 261
 262 # Converter.get_indent
 263 # ~~~~~~~~~~~~~~~~~~~~
 264 #
 265 # Return the number of leading spaces in `string` after expanding tabs ::
 266
 267     def get_indent(self, string):
 268         """Return the indentation of `string`.
 269         """
 270         line = string.expandtabs()
 271         return len(line) - len(line.lstrip())
 272
 273 # Converter.ensure_trailing_blank_line
 274 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 275 #
 276 # Ensure there is a blank line as last element of the list `lines`::
 277
 278     def ensure_trailing_blank_line(self, lines, next_line):
 279         if not lines:
 280             return
 281         if lines[-1].lstrip():
 282             sys.stderr.write("\nWarning: inserted blank line between\n %s %s"
 283                              %(lines[-1], next_line))
 284             lines.append("\n")
 285
 286
 287 # Text2Code
 288 # ---------
 289 #
 290 # The `Text2Code` class separates code blocks (indented literal blocks) from
 291 # reStructured text. Code blocks are unindented, text is commented (or
  292 # filtered, if the ``strip`` option is True).
 293 #
 294 # Only `indented literal blocks` are extracted. `quoted literal blocks` and
 295 # `pydoc blocks` are treated as text. This allows the easy inclusion of
 296 # examples: [#]_
 297 #
 298 #    >>> 23 + 3
 299 #    26
 300 #
 301 # .. [#] Mark that there is no double colon before the doctest block in
 302 #        the text source.
 303 #
 304 # The state handlers are implemented as generators. Iterating over a
 305 # `Text2Code` instance initializes them to generate iterators for
 306 # the respective states (see ``simplestates.py``).
 307 #
 308 # ::
 309
class Text2Code(PyLitConverter):
    """Convert a (reStructured) text source to code source

    Indented literal blocks are unindented and passed on as code, all
    other lines are prefixed with the comment string.
    """
 313
 314 # Text2Code.header
 315 # ~~~~~~~~~~~~~~~~
 316 #
 317 # Convert the header (leading rst comment block) to code::
 318
 319     def header(self):
 320         """Convert header (comment) to code"""
 321         line = self.data_iterator.next()
 322
 323 # Test first line for rst comment: (We need to do this explicitely here, as
 324 # the code handler will only recognize the start of a text block if a line
 325 # starting with "matching comment" is preceded by an empty line. However, we
 326 # have to care for the case of the first line beeing a "text line".
 327 #
 328 # Which variant is better?
 329 #
 330 # 1. starts with comment marker and has
 331 #    something behind the comment on the first line::
 332
 333         # if line.startswith("..") and len(line.rstrip()) > 2:
 334
 335 # 2. Convert any leading comment to code::
 336
 337         if line.startswith(self.header_string):
 338
 339 # Strip leading comment string (typically added by `Code2Text.header`) and
 340 # return the result of processing the data with the code handler::
 341
 342             self.data_iterator.push(line.replace(self.header_string, "", 1))
 343             return self.code()
 344
 345 # No header code found: Push back first non-header line and set state to
 346 # "text"::
 347
 348         self.data_iterator.push(line)
 349         self.state = 'text'
 350         return []
 351
 352 # Text2Code.text_handler_generator
 353 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 354 #
 355 # The 'text' handler processes everything that is not an indented literal
 356 # comment. Text is quoted with `self.comment_string` or filtered (with
 357 # strip=True).
 358 #
 359 # It is implemented as a generator function that acts on the `data` iterator
 360 # and yields text blocks.
 361 #
 362 # Declaration and initialization::
 363
 364     def text_handler_generator(self):
 365         """Convert text blocks from rst to comment
 366         """
 367         lines = []
 368
 369 # Iterate over the data_iterator (which yields the data lines)::
 370
 371         for line in self.data_iterator:
 372             # print "Text: '%s'"%line
 373
 374 # Default action: add comment string and collect in `lines` list::
 375
 376             lines.append(self.comment_string + line)
 377
 378 # Test for the end of the text block: a line that ends with `::` but is neither
 379 # a comment nor a directive::
 380
 381             if (line.rstrip().endswith("::")
 382                 and not line.lstrip().startswith("..")):
 383
 384 # End of text block is detected, now:
 385 #
 386 # set the current text indent level (needed by the code handler to find the
 387 # end of code block) and set the state to "code" (i.e. the next call of
 388 # `self.next` goes to the code handler)::
 389
 390                 self._textindent = self.get_indent(line)
 391                 self.state = 'code'
 392
 393 # Ensure a trailing blank line (which is the paragraph separator in
 394 # reStructured Text. Look at the next line, if it is blank -- OK, if it is
 395 # not blank, push it back (it should be code) and add a line by calling the
 396 # `ensure_trailing_blank_line` method (which also issues a warning)::
 397
 398                 line = self.data_iterator.next()
 399                 if line.lstrip():
 400                     self.data_iterator.push(line) # push back
 401                     self.ensure_trailing_blank_line(lines, line)
 402                 else:
 403                     lines.append(line)
 404
 405 # Now yield and reset the lines. (There was a function call to remove a
 406 # literal marker (if on a line on itself) to shorten the comment. However,
 407 # this behaviour was removed as the resulting difference in line numbers leads
 408 # to misleading error messages in doctests)::
 409
 410                 #remove_literal_marker(lines)
 411                 yield lines
 412                 lines = []
 413
 414 # End of data: if we "fall of" the iteration loop, just join and return the
 415 # lines::
 416
 417         yield lines
 418
 419
 420 # Text2Code.code_handler_generator
 421 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 422 #
  423 # The `code` handler is called when a literal block marker is encountered. It
 424 # returns a code block (indented literal block), removing leading whitespace
 425 # up to the indentation of the first code line in the file (this deviation
 426 # from docutils behaviour allows indented blocks of Python code).
 427 #
 428 # As the code handler detects the switch to "text" state by looking at
 429 # the line indents, it needs to push back the last probed data token. I.e.
 430 # the  data_iterator must support a `push` method. (This is the
 431 # reason for the use of the PushIterator class in `__init__`.) ::
 432
 433     def code_handler_generator(self):
 434         """Convert indented literal blocks to source code
 435         """
 436         lines = []
 437         codeindent = None  # indent of first non-blank code line, set below
 438         indent_string = "" # leading whitespace chars ...
 439
 440 # Iterate over the lines in the input data::
 441
 442         for line in self.data_iterator:
 443             # print "Code: '%s'"%line
 444
 445 # Pass on blank lines (no test for end of code block needed|possible)::
 446
 447             if not line.rstrip():
 448                 lines.append(line.replace(indent_string, "", 1))
 449                 continue
 450
 451 # Test for end of code block:
 452 #
 453 # A literal block ends with the first less indented, nonblank line.
 454 # `self._textindent` is set by the text handler to the indent of the
 455 # preceding paragraph.
 456 #
 457 # To prevent problems with different tabulator settings, hard tabs in code
 458 # lines  are expanded with the `expandtabs` string method when calculating the
 459 # indentation (i.e. replaced by 8 spaces, by default).
 460 #
 461 # ::
 462
 463             if self.get_indent(line) <= self._textindent:
 464                 # push back line
 465                 self.data_iterator.push(line)
 466                 self.state = 'text'
 467                 # append blank line (if not already present)
 468                 self.ensure_trailing_blank_line(lines, line)
 469                 yield lines
 470                 # reset list of lines
 471                 lines = []
 472                 continue
 473
 474 # OK, we are sure now that the current line is neither blank nor a text line.
 475 #
 476 # If still unset, determine the code indentation from first non-blank code
 477 # line::
 478
 479             if codeindent is None and line.lstrip():
 480                 codeindent = self.get_indent(line)
 481                 indent_string = line[:codeindent]
 482
 483 # Append unindented line to lines cache (but check if we can safely unindent
 484 # first)::
 485
 486             if not line.startswith(indent_string):
 487                 raise ValueError, "cannot unindent line %r,\n"%line \
 488                 + "  doesnot start with code indent string %r"%indent_string
 489
 490             lines.append(line[codeindent:])
 491
 492 # No more lines in the input data: just return what we have::
 493
 494         yield lines
 495
 496
  497 # Text2Code.remove_literal_marker
  498 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 499 #
 500 # Remove literal marker (::) in "expanded form" i.e. in a paragraph on its own.
 501 #
 502 # While cleaning up the code source, it leads to confusion for doctest and
 503 # searches (e.g. grep) as line-numbers between text and code source will
 504 # differ. ::
 505
 506     def remove_literal_marker(list):
 507         try:
 508             # print lines[-3:]
 509             if (lines[-3].strip() == self.comment_string.strip()
 510                 and lines[-2].strip() == self.comment_string + '::'):
 511                 del(lines[-3:-1])
 512         except IndexError:
 513             pass
 514
 515 # Text2Code.iter_strip
 516 # ~~~~~~~~~~~~~~~~~~~~
 517 #
 518 # Modification of the `simplestates.__iter__` method that will replace it when
 519 # the `strip` keyword argument is `True` during class instantiation:
 520 #
 521 # Iterate over class instances dropping text blocks::
 522
 523     def iter_strip(self):
 524         """Generate and return an iterator dropping text blocks
 525         """
 526         self.data_iterator = self.data
 527         self._initialize_state_generators()
 528         while True:
 529             yield getattr(self, self.state)()
 530             getattr(self, self.state)() # drop text block
 531
 532
 533
 534 # Code2Text
 535 # ---------
 536 #
 537 # The `Code2Text` class does the opposite of `Text2Code`_ -- it processes
 538 # valid source code, extracts comments, and puts non-commented code in literal
 539 # blocks.
 540 #
 541 # Only lines starting with a comment string matching the one in the
 542 # `comment_string` data attribute are considered text lines.
 543 #
 544 # The class is derived from the PyLitConverter state machine and adds handlers
 545 # for the three states "header", "text", and "code". ::
 546
class Code2Text(PyLitConverter):
    """Convert code source to text source

    Comment blocks starting with the matching comment string are
    uncommented and become text, the remaining lines are indented and
    passed on as literal blocks.
    """
 550
 551 # Code2Text.header
 552 # ~~~~~~~~~~~~~~~~
 553 #
 554 # Sometimes code needs to remain on the first line(s) of the document to be
 555 # valid. The most common example is the "shebang" line that tells a POSIX
 556 # shell how to process an executable file::
 557
 558 #!/usr/bin/env python
 559
  560 # In Python, the ``# -*- coding: iso-8859-1 -*-`` line must occur before any
 561 # other comment or code.
 562 #
 563 # If we want to keep the line numbers in sync for text and code source, the
 564 # reStructured Text markup for these header lines must start at the same line
  565 # as the first header line. Therefore, header lines could not be marked as
 566 # literal block (this would require the "::" and an empty line above the code).
 567 #
 568 # OTOH, a comment may start at the same line as the comment marker and it
 569 # includes subsequent indented lines. Comments are visible in the reStructured
 570 # Text source but hidden in the pretty-printed output.
 571 #
 572 # With a header converted to comment in the text source, everything before the
 573 # first text block (i.e. before the first paragraph using the matching comment
 574 # string) will be hidden away (in HTML or PDF output).
 575 #
 576 # This seems a good compromise, the advantages
 577 #
 578 # * line numbers are kept
  579 # * the "normal" code conversion rules (indent/unindent by `codeindent`) apply
 580 # * greater flexibility: you can hide a repeating header in a project
 581 #   consisting of many source files.
 582 #
 583 # set off the disadvantages
 584 #
 585 # - it may come as surprise if a part of the file is not "printed",
 586 # - one more syntax element to learn for rst newbees to start with pylit,
 587 #   (however, starting from the code source, this will be auto-generated)
 588 #
 589 # In the case that there is no matching comment at all, the complete code
 590 # source will become a comment -- however, in this case it is not very likely
 591 # the source is a literate document anyway.
 592 #
 593 # If needed for the documentation, it is possible to repeat the header in (or
 594 # after) the first text block, e.g. with a `line block` in a `block quote`:
 595 #
 596 #   |  ``#!/usr/bin/env python``
 597 #   |  ``# -*- coding: iso-8859-1 -*-``
 598 #
 599 # ::
 600
 601     def header(self):
 602         """Convert leading code to rst comment"""
 603
 604 # Parse with the `text` method. If there is no leading text, return the
 605 # `header_string` (by default the rst comment marker)::
 606
 607         lines = self.text()
 608         if lines:
 609             return lines
 610         return [self.header_string]
 611
 612
 613 # Code2Text.text_handler_generator
 614 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 615 #
 616 # The text handler converts a comment to a text block if it matches the
 617 # following requirements:
 618 #
 619 # * every line starts with a matching comment string (test includes whitespace!)
 620 # * comment is separated from code by a blank line (the paragraph separator in
 621 #   reStructuredText)
 622 #
 623 # It is implemented as a generator function that acts on the `data` iterator
 624 # and yields text blocks.
 625 #
 626 # Text is uncommented. A literal block marker is appended, if not already
 627 # present ::
 628
 629     def text_handler_generator(self):
 630         """Uncomment text blocks in source code
 631         """
 632
 633 # Set up an output cache and iterate over the data lines (remember, code lines
 634 # are processed by the code handler and not seen here). ::
 635
 636         lines = []
 637         for line in self.data_iterator:
 638               # print "Text: " + line
 639
 640 # Pass on blank lines. Strip comment string from otherwise blank lines
 641 # Continue with the next line, as there is no need to test blank lines
 642 # for the end of text. ::
 643
 644             if not line.lstrip():
 645                 lines.append(line)
 646                 continue
 647
 648 # Test for end of text block: the first line that doesnot start with a
 649 # matching comment string. This tests also whitespace that is part of the
 650 # comment string! ::
 651
 652             if not line.startswith(self.comment_string):
 653
 654 # Missing whitespace in the `comment_string` is not significant for otherwise
 655 # blank lines. Add the whitespace and continue::
 656
 657                 if line.rstrip() == self.comment_string.rstrip():
 658                     lines.append(line.replace(self.comment_string.rstrip(),
 659                                               self.comment_string, 1))
 660                     continue
 661
 662 # End of text block: Push back the line and let the "code" handler handle it
 663 # (and subsequent lines)::
 664
 665                 self.state = 'code'
 666                 self.data_iterator.push(line)
 667
 668 # Also restore and push back lines that precede the next code line without a
 669 # blank line (paragraph separator) inbetween::
 670
 671                 while lines and lines[-1].lstrip():
 672                     self.data_iterator.push(lines.pop())
 673
 674 # Strip the leading comment string::
 675
 676                 lines = [line.replace(self.comment_string, "", 1)
 677                          for line in lines]
 678
 679 # Ensure literal block marker (double colon) at the end of the text block::
 680
 681                 if len(lines)>1 and not lines[-2].rstrip().endswith("::"):
 682                     lines.extend(["::\n", "\n"])
 683
 684 # Yield the text block (process following lines with `code_handler`.
 685 # When the state is again set to "text", reset the cache and continue with
 686 # next text line ::
 687
 688                 yield lines
 689                 lines = []
 690                 continue
 691
 692 # Test passed: It's text line. Append to the `lines` cache::
 693
 694             lines.append(line)
 695
 696 # No more lines: Just return the remaining lines::
 697
 698         yield [line.replace(self.comment_string, "", 1) for line in lines]
 699
 700
 701 # Code2Text.code_handler_generator
 702 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 703 #
 704 # The `code` method is called on non-commented code. Code is returned as
 705 # indented literal block (or filtered, if ``strip=True``). The amount of the
  706 # code indentation is controlled by `self.codeindent` (default 2).
 707 #
 708 # ::
 709
 710     def code_handler_generator(self):
 711         """Convert source code to indented literal blocks.
 712         """
 713         lines = []
 714         for line in self.data_iterator:
 715             # yield "Code: " + line
 716             # pass on empty lines (only newline)
 717             if line == "\n":
 718                 lines.append(line)
 719                 continue
 720             # # strip comment string from blank lines
 721             # if line.rstrip() == self.comment_string.rstrip():
 722             #     lines.append("\n")
 723             #     continue
 724
 725 # Test for end of code block:
 726 #
 727 # * matching comment string at begin of line,
 728 # * following a blank line.
 729 #
 730 # The test includes whitespace in `self.comment_string` normally, but ignores
 731 # trailing whitespace if the line after the comment string is blank. ::
 732
 733             if (line.startswith(self.comment_string) or
 734                 line.rstrip() == self.comment_string.rstrip()
 735                ) and lines and not lines[-1].strip():
 736
 737                 self.data_iterator.push(line)
 738                 self.state = 'text'
 739                 # self.ensure_trailing_blank_line(lines, line)
 740                 yield lines
 741                 # reset
 742                 lines = []
 743                 continue
 744
 745 # default action: indent by codeindent and append to lines cache::
 746
 747             lines.append(" "*self.codeindent + line)
 748
 749 # no more lines in data_iterator -- return collected lines::
 750
 751         yield lines
 752
 753
 754 # Code2Text.iter_strip
 755 # ~~~~~~~~~~~~~~~~~~~~
 756 #
 757 # Modification of the `simplestates.__iter__` method that will replace it when
 758 # the `strip` keyword argument is `True` during class instantiation:
 759 #
 760 # Iterate over class instances dropping the header block and code blocks::
 761
 762     def iter_strip(self):
 763         """Generate and return an iterator dropping code|text blocks
 764         """
 765         self.data_iterator = self.data
 766         self._initialize_state_generators()
 767         textblock = self.header() # drop the header
 768         if textblock != [self.header_string]:
 769             self.strip_literal_marker(textblock)
 770             yield textblock
 771         while True:
 772             getattr(self, self.state)() # drop code blocks
 773             textblock = getattr(self, self.state)()
 774             self.strip_literal_marker(textblock)
 775             yield textblock
 776
 777
 778 # Code2Text.strip_literal_marker
 779 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 780 #
 781 # If the code block is stripped, the literal marker would lead to an error
 782 # when the text is converted with docutils. Replace it with the equivalent of
 783 # docutils replace rules
 784 #
 785 # * strip `::`-line as well as preceding blank line if on a line on its own
 786 # * strip `::` if it is preceded by whitespace.
 787 # * convert `::` to a single colon if preceded by text
 788 #
 789 # `lines` should be list of text lines (with a trailing blank line).
 790 # It is modified in-place::
 791
 792     def strip_literal_marker(self, lines):
 793         if len(lines) < 2:
 794             return
 795         parts = lines[-2].rsplit('::', 1)
 796         if lines[-2].strip() == '::':
 797             del(lines[-2])
 798             if len(lines) >= 2 and not lines[-2].lstrip():
 799                 del(lines[-2])
 800         elif parts[0].rstrip() < parts[0]:
 801             parts[0] = parts[0].rstrip()
 802             lines[-2] = "".join(parts)
 803         else:
 804             lines[-2] = ":".join(parts)
 805
 806
 807
 808 # Command line use
 809 # ================
 810 #
 811 # Using this script from the command line will convert a file according to its
 812 # extension. This default can be overridden by a couple of options.
 813 #
 814 # Dual source handling
 815 # --------------------
 816 #
 817 # How to determine which source is up-to-date?
 818 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 819 #
  820 # - set modification date of `outfile` to the one of `infile`
 821 #
 822 #   Points out that the source files are 'synchronized'.
 823 #
 824 #   * Are there problems to expect from "backdating" a file? Which?
 825 #
 826 #     Looking at http://www.unix.com/showthread.php?t=20526, it seems
 827 #     perfectly legal to set `mtime` (while leaving `ctime`) as `mtime` is a
 828 #     description of the "actuality" of the data in the file.
 829 #
 830 #   * Should this become a default or an option?
 831 #
 832 # - alternatively move input file to a backup copy (with option: `--replace`)
 833 #
 834 # - check modification date before overwriting
 835 #   (with option: `--overwrite=update`)
 836 #
 837 # - check modification date before editing (implemented as `Jed editor`_
 838 #   function `pylit_check()` in `pylit.sl`_)
 839 #
 840 # .. _Jed editor: http://www.jedsoft.org/jed/
 841 # .. _pylit.sl: http://jedmodes.sourceforge.net/mode/pylit/
 842 #
 843 # Recognised Filename Extensions
 844 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 845 #
 846 # Finding an easy to remember, unused filename extension is not easy.
 847 #
 848 # .py.txt
 849 #   a double extension (similar to .tar.gz, say) seems most appropriate
 850 #   (at least on UNIX). However, it fails on FAT16 filesystems.
 851 #   The same scheme can be used for c.txt, p.txt and the like.
 852 #
 853 # .pytxt
 854 #   is recognised as extension by os.path.splitext but also fails on FAT16
 855 #
 856 # .pyt
 857 #   (PYthon Text) is used by the Python test interpreter
 858 #   `pytest <http://www.zetadev.com/software/pytest/>`__
 859 #
 860 # .pyl
 861 #   was even mentioned as extension for "literate Python" files in an
 862 #   email exchange (http://www.python.org/tim_one/000115.html) but
 863 #   subsequently used for Python libraries.
 864 #
 865 # .lpy
 866 #   seems to be free (as by a Google search, "lpy" is the name of a python
 867 #   code pretty printer but this should not pose a problem).
 868 #
 869 # .tpy
 870 #   seems to be free as well.
 871 #
 872 # Instead of defining a new extension for "pylit" literate programs,
 873 # by default ``.txt`` will be appended for literate code and stripped by
 874 # the conversion to executable code. i.e. for a program foo:
 875 #
 876 # * the literate source is called ``foo.py.txt``
 877 # * the html rendering is called ``foo.py.html``
 878 # * the python source is called ``foo.py``
 879 #
 880 #
 881 #
 882 # OptionValues
 883 # ------------
 884 #
 885 # For use as keyword arguments, it is handy to have the options
 886 # in a dictionary. The following class adds an `as_dict` method
 887 # to  `optparse.Values`::
 888
 889 class OptionValues(optparse.Values):
 890     def as_dict(self):
 891         """Return options as dictionary object"""
 892         return dict([(option, getattr(self, option)) for option in dir(self)
 893                      if option not in dir(OptionValues)
 894                      and option is not None
 895                     ])
 896
 897 # PylitOptions
 898 # ------------
 899 #
 900 # Options are stored in the values attribute of the `PylitOptions` class.
 901 # It is initialized with default values and parsed command line options (and
 902 # arguments). This scheme allows easy customization by code importing the
 903 # `pylit` module. ::
 904
 905 class PylitOptions(object):
 906     """Storage and handling of program options
 907     """
 908
 909 # Recognized file extensions for text and code versions of the source::
 910
 911     code_languages =  option_defaults['code_languages']
 912     code_extensions = option_defaults['code_extensions']
 913     text_extensions = option_defaults['text_extensions']
 914
 915 # Instantiation
 916 # ~~~~~~~~~~~~~
 917 #
 918 # Instantiation sets up an OptionParser and initializes it with pylit's
 919 # command line options and `default_values`. It then updates the values based
 920 # on command line options and sensible defaults::
 921
 922     def __init__(self, args=sys.argv[1:], **default_values):
 923         """Set up an `OptionParser` instance and parse and complete arguments
 924         """
 925         p = optparse.OptionParser(usage=main.__doc__, version="0.2")
 926         # set defaults
 927         p.set_defaults(**default_values)
 928         # add the options
 929         p.add_option("-c", "--code2txt", dest="txt2code", action="store_false",
 930                      help="convert code to reStructured text")
 931         p.add_option("--comment-string", dest="comment_string",
 932                      help="text block marker (default '# ' (for Python))" )
 933         p.add_option("-d", "--diff", action="store_true",
 934                      help="test for differences to existing file")
 935         p.add_option("--doctest", action="store_true",
 936                      help="run doctest.testfile() on the text version")
 937         p.add_option("-e", "--execute", action="store_true",
 938                      help="execute code (Python only)")
 939         p.add_option("-f", "--infile",
 940                      help="input file name ('-' for stdout)" )
 941         p.add_option("--overwrite", action="store",
 942                      choices = ["yes", "update", "no"],
 943                      help="overwrite output file (default 'update')")
 944         p.add_option("-o", "--outfile",
 945                      help="output file name ('-' for stdout)" )
 946         p.add_option("--replace", action="store_true",
 947                      help="move infile to a backup copy (appending '~')")
 948         p.add_option("-s", "--strip", action="store_true",
 949                      help="export by stripping text or code")
 950         p.add_option("-t", "--txt2code", action="store_true",
 951                      help="convert reStructured text to code")
 952         self.parser = p
 953
 954         # parse to fill a self.Values instance
 955         self.values = self.parse_args(args)
 956         # complete with context-sensitive defaults
 957         self.values = self.complete_values(self.values)
 958
 959 # Calling
 960 # ~~~~~~~
 961 #
 962 # "Calling" an instance updates the option values based on command line
 963 # arguments and default values and does a completion of the options based on
 964 # "context-sensitive defaults"::
 965
 966     def __call__(self, args=sys.argv[1:], **default_values):
 967         """parse and complete command line args
 968         """
 969         values = self.parse_args(args, **default_values)
 970         return self.complete_values(values)
 971
 972
 973 # PylitOptions.parse_args
 974 # ~~~~~~~~~~~~~~~~~~~~~~~
 975 #
 976 # The `parse_args` method calls the `optparse.OptionParser` on command
 977 # line or provided args and returns the result as `PylitOptions.Values`
 978 # instance.  Defaults can be provided as keyword arguments::
 979
 980     def parse_args(self, args=sys.argv[1:], **default_values):
 981         """parse command line arguments using `optparse.OptionParser`
 982
 983            args           --  list of command line arguments.
 984            default_values --  dictionary of option defaults
 985         """
 986         # update defaults
 987         defaults = self.parser.defaults.copy()
 988         defaults.update(default_values)
 989         # parse arguments
 990         (values, args) = self.parser.parse_args(args, OptionValues(defaults))
 991         # Convert FILE and OUTFILE positional args to option values
 992         # (other positional arguments are ignored)
 993         try:
 994             values.infile = args[0]
 995             values.outfile = args[1]
 996         except IndexError:
 997             pass
 998         return values
 999
1000 # PylitOptions.complete_values
1001 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1002 #
1003 # The `complete` method uses context information to set missing option values
1004 # to sensible defaults (if possible).
1005 #
1006 # ::
1007
1008     def complete_values(self, values):
1009         """complete option values with context sensible defaults
1010         """
1011         values.ensure_value("infile", "")
1012         # Guess conversion direction from infile filename
1013         if values.ensure_value("txt2code", None) is None:
1014             in_extension = os.path.splitext(values.infile)[1]
1015             if in_extension in self.text_extensions:
1016                 values.txt2code = True
1017             elif in_extension in self.code_extensions:
1018                 values.txt2code = False
1019         # Auto-determine the output file name
1020         values.ensure_value("outfile", self.get_outfile_name(values.infile,
1021                                                              values.txt2code))
1022         # Guess conversion direction from outfile filename or set to default
1023         if values.txt2code is None:
1024             out_extension = os.path.splitext(values.outfile)[1]
1025             values.txt2code = not (out_extension in self.text_extensions)
1026
1027         # Set the language of the code (default "python")
1028         if values.txt2code is True:
1029             code_extension = os.path.splitext(values.outfile)[1]
1030         elif values.txt2code is False:
1031             code_extension = os.path.splitext(values.infile)[1]
1032         values.ensure_value("language",
1033                             self.code_languages.get(code_extension, "python"))
1034
1035         # Set the default overwrite mode
1036         values.ensure_value("overwrite", 'update')
1037
1038         return values
1039
1040 # PylitOptions.get_outfile_name
1041 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1042 #
1043 # Construct a matching filename for the output file. The output filename is
1044 # constructed from `infile` by the following rules:
1045 #
1046 # * '-' (stdin) results in '-' (stdout)
1047 # * strip the `txt_extension` or add the `code_extension` (txt2code)
1048 # * add a `txt_ extension` (code2txt)
1049 # * fallback: if no guess can be made, add ".out"
1050 #
1051 # ::
1052
1053     def get_outfile_name(self, infile, txt2code=None):
1054         """Return a matching output filename for `infile`
1055         """
1056         # if input is stdin, default output is stdout
1057         if infile == '-':
1058             return '-'
1059         # Modify `infile`
1060         (base, ext) = os.path.splitext(infile)
1061         # TODO: should get_outfile_name() use self.values.outfile_extension
1062         #       if it exists?
1063
1064         # strip text extension
1065         if ext in self.text_extensions:
1066             return base
1067         # add (first) text extension for code files
1068         if ext in self.code_extensions or txt2code == False:
1069             return infile + self.text_extensions[0]
1070         # give up
1071         return infile + ".out"
1072
1073
1074
1075 # Helper functions
1076 # ----------------
1077 #
1078 # open_streams
1079 # ~~~~~~~~~~~~
1080 #
1081 # Return file objects for in- and output. If the input path is missing,
1082 # write usage and abort. (An alternative would be to use stdin as default.
1083 # However,  this leaves the uninitiated user with a non-responding application
1084 # if (s)he just tries the script without any arguments) ::
1085
def open_streams(infile = '-', outfile = '-', overwrite='update', **keyw):
    """Open and return the input and output stream

    open_streams(infile, outfile) -> (in_stream, out_stream)

    in_stream   --  open(infile) or sys.stdin (if infile is '-')
    out_stream  --  open(outfile, 'w') or sys.stdout (if outfile is '-')
    overwrite   --  ['yes', 'update', 'no']
                    if 'no', refuse to open an existing output file,
                    if 'update', refuse to open an output file that
                    `is_newer` reports as newer than the input file.
                    Irrelevant if outfile == '-'.
    keyw        --  further keyword arguments are accepted and ignored

    Raise IOError if `infile` is empty, if a file cannot be opened, or if
    the `overwrite` mode forbids writing to `outfile`.
    """
    if not infile:
        strerror = "Missing input file name ('-' for stdin; -h for help)"
        raise IOError(2, strerror, infile)
    if infile == '-':
        in_stream = sys.stdin
    else:
        in_stream = open(infile, 'r')
    try:
        if outfile == '-':
            out_stream = sys.stdout
        elif overwrite == 'no' and os.path.exists(outfile):
            raise IOError(1, "Output file exists!", outfile)
        elif overwrite == 'update' and is_newer(outfile, infile):
            raise IOError(1, "Output file is newer than input file!", outfile)
        else:
            out_stream = open(outfile, 'w')
    except (IOError, OSError):
        # do not leak the already opened input file if the output side fails
        if in_stream is not sys.stdin:
            in_stream.close()
        raise
    return (in_stream, out_stream)
1114
1115 # is_newer
1116 # ~~~~~~~~
1117 #
1118 # ::
1119
def is_newer(path1, path2):
    """Tell whether the file at `path1` was modified after the one at `path2`

    The modification times (mtime) of both paths are compared. A path whose
    mtime cannot be read (`OSError`, e.g. a non-existing file) is given the
    mtime -1.

    Return True if the mtime of `path1` is greater, False if it is smaller
    and None if both are equal. (None is false in a boolean context but can
    be told apart from False by an identity test.)
    """
    mtimes = []
    for path in (path1, path2):
        try:
            mtimes.append(os.path.getmtime(path))
        except OSError:
            # unreadable or missing
            mtimes.append(-1)
    (mtime1, mtime2) = mtimes
    if mtime1 != mtime2:
        return mtime1 > mtime2
    return None
1145
1146
1147 # get_converter
1148 # ~~~~~~~~~~~~~
1149 #
1150 # Get an instance of the converter state machine::
1151
def get_converter(data, txt2code=True, **keyw):
    """Return a converter instance for `data`

    data      --  passed on to the converter as first argument
    txt2code  --  true:  return a `Text2Code` instance,
                  false: return a `Code2Text` instance
    keyw      --  passed on to the converter as keyword arguments
    """
    if not txt2code:
        return Code2Text(data, **keyw)
    return Text2Code(data, **keyw)
1157
1158
1159 # Use cases
1160 # ---------
1161 #
1162 # run_doctest
1163 # ~~~~~~~~~~~
1164 #
1165 # ::
1166
def run_doctest(infile="-", txt2code=True,
                globs=None, verbose=False, optionflags=0, **keyw):
    """run doctest on the text source

    infile       --  name of the source file ('-' for stdin)
    txt2code     --  False if `infile` is a code source (it is converted to
                     text first), otherwise `infile` is read as text source
    globs        --  dictionary of global names for the examples
                     (default: an empty dictionary)
    verbose      --  passed to `doctest.DocTestRunner`
    optionflags  --  passed to `doctest.DocTestRunner`
    keyw         --  passed to `Code2Text` if the source is code

    Return the tuple (number of failures, number of tests tried).
    """
    from doctest import DocTestParser, DocTestRunner
    if globs is None:
        globs = {}
    data = open_streams(infile, "-")[0]

# If source is code, convert to text, as tests in comments are not found by
# doctest::

    try:
        if txt2code is False:
            converter = Code2Text(data, **keyw)
            docstring = str(converter)
        else:
            docstring = data.read()
    finally:
        # the complete source is in `docstring` now
        if data is not sys.stdin:
            data.close()

# Use the doctest Advanced API to do all doctests in a given string::

    test = DocTestParser().get_doctest(docstring, globs, name="",
                                       filename=infile, lineno=0)
    runner = DocTestRunner(verbose=verbose, optionflags=optionflags)
    runner.run(test)
    # let doctest print its summary (the original referenced the method
    # without calling it, which did nothing)
    runner.summarize()
    # give feedback also if no failures occurred
    if not runner.failures:
        print("%d failures in %d tests"%(runner.failures, runner.tries))
    return runner.failures, runner.tries
1193
1194
1195 # diff
1196 # ~~~~
1197 #
1198 # ::
1199
def diff(infile='-', outfile='-', txt2code=True, **keyw):
    """Report differences between converted infile and existing outfile

    If outfile is '-', do a round-trip conversion and report differences

    infile    --  name of the input file ('-' for stdin)
    outfile   --  name of an existing file to compare with, or '-'
    txt2code  --  conversion direction for `infile`
    keyw      --  passed to the converter (for both directions of a
                  round-trip)

    A unified diff (or the notice "no differences found") is printed.
    Return True if differences were found, otherwise False.
    """

    import difflib

    # for diffing, we need a copy of the data as list::
    if infile == '-':
        data = sys.stdin.readlines()
    else:
        instream = open(infile)
        try:
            data = instream.readlines()
        finally:
            instream.close()
    # convert
    converter = get_converter(data, txt2code, **keyw)
    new = str(converter).splitlines(True)

    if outfile != '-':
        outstream = open(outfile)
        try:
            old = outstream.readlines()
        finally:
            outstream.close()
        oldname = outfile
        newname = "<conversion of %s>"%infile
    else:
        old = data
        oldname = infile
        # back-convert the output data, using the same options (comment
        # string etc.) as the forward conversion
        converter = get_converter(new, not txt2code, **keyw)
        new = str(converter).splitlines(True)
        newname = "<round-conversion of %s>"%infile

    # find and print the differences
    delta = list(difflib.unified_diff(old, new, fromfile=oldname,
                                      tofile=newname))
    if not delta:
        print(oldname)
        print(newname)
        print("no differences found")
        return False
    print("".join(delta))
    return True
1238
1239 # main
1240 # ----
1241 #
1242 # If this script is called from the command line, the `main` function will
1243 # convert the input (file or stdin) between text and code formats::
1244
def main(args=sys.argv[1:], **option_defaults):
    """%prog [options] FILE [OUTFILE]

    Convert between reStructured Text with embedded code, and
    Source code with embedded text comment blocks"""

    # NOTE: the docstring above is used as usage message of the option
    # parser (`usage=main.__doc__` in `PylitOptions.__init__`), so it is
    # user-facing text. Developer documentation goes into comments:
    #
    # args             --  list of command line arguments
    # option_defaults  --  option defaults, passed to `PylitOptions`
    #
    # Returns the result of `run_doctest` or `diff` if the corresponding
    # option is set, otherwise None. Exits (via `sys.exit`) with the error
    # number if the streams cannot be opened.

# Parse and complete the options::

    options = PylitOptions(args, **option_defaults).values

# Run doctests if ``--doctest`` option is set::

    if options.ensure_value("doctest", None):
        return run_doctest(**options.as_dict())

# Do a round-trip and report differences if the ``--diff`` option is set::

    if options.ensure_value("diff", None):
        return diff(**options.as_dict())

# Open in- and output streams. With ``--execute`` nothing is written, so the
# output file must not be opened (opening it for writing would truncate it)::

    keyw = options.as_dict()
    execute = options.ensure_value("execute", None)
    stream_keyw = keyw.copy()
    if execute:
        stream_keyw["outfile"] = "-"
    try:
        (data, out_stream) = open_streams(**stream_keyw)
    except IOError:
        ex = sys.exc_info()[1]
        print("IOError: %s %s" % (ex.filename, ex.strerror))
        sys.exit(ex.errno)

# Get a converter instance::

    converter = get_converter(data, **keyw)

# Execute if the ``--execute`` option is set::

    if execute:
        print("executing " + options.infile)
        if options.txt2code:
            code = str(converter)
        else:
            code = data.read()
        if data is not sys.stdin:
            data.close()
        # NOTE(review): this runs the content of the input file with the
        # privileges of the caller -- only use on trusted sources.
        exec(code)
        return

# Default action: Convert and write to out_stream::

    out_stream.write(str(converter))

    if data is not sys.stdin:
        data.close()
    if out_stream is not sys.stdout:
        print("extract written to " + out_stream.name)
        out_stream.close()

# Rename the infile to a backup copy if ``--replace`` is set::

    if options.ensure_value("replace", None):
        os.rename(options.infile, options.infile + "~")

# If not (and input and output are from files), set the modification time
# (`mtime`) of the output file to the one of the input file to indicate that
# the contained information is equal.[#]_ ::

    else:
        try:
            os.utime(options.outfile, (os.path.getatime(options.outfile),
                                       os.path.getmtime(options.infile))
                    )
        except OSError:
            # best effort only: e.g. '-' (stdin/stdout) has no file times
            pass
1312
1313     ## print "mtime", os.path.getmtime(options.infile),  options.infile
1314     ## print "mtime", os.path.getmtime(options.outfile), options.outfile
1315
1316
1317 # .. [#] Make sure the corresponding file object (here `out_stream`) is
1318 #        closed, as otherwise the change will be overwritten when `close` is
1319 #        called afterwards (either explicitly or at program exit).
1320 #
1321 # Run main, if called from the command line::
1322
if __name__ == '__main__':
    # `main` parses `sys.argv[1:]` by default. Its return value is
    # discarded here (it is not used as exit status).
    main()
1325
1326
1327 # Open questions
1328 # ==============
1329 #
1330 # Open questions and ideas for further development
1331 #
1332 # Options
1333 # -------
1334 #
1335 # * Collect option defaults in a dictionary (on module level)
1336 #
1337 #   Facilitates the setting of options in programmatic use
1338 #
1339 #   Use templates for the "intelligent guesses" (with Python syntax for string
1340 #   replacement with dicts: ``"hello %(what)s" % {'what': 'world'}``)
1341 #
1342 # * Is it sensible to offer the `header_string` option also as command line
1343 #   option?
1344 #
1345 # * Configurable
1346 #
1347 # Parsing Problems
1348 # ----------------------
1349 #
1350 # * How can I include a literal block that should not be in the
1351 #   executable code (e.g. an example, an earlier version or variant)?
1352 #
1353 #   Workaround:
1354 #     Use a `quoted literal block` (with a quotation different from
1355 #     the comment string used for text blocks) to keep it commented over
1356 #     the code-text round-trips.
1357 #
1358 #     Python `pydoc` examples can also use the special pydoc block syntax (no
1359 #     double colon!).
1360 #
1361 #   Alternative:
1362 #     use a special "code block" directive or a special "no code
1363 #     block" directive.
1364 #
1365 # * ignore "matching comments" in literal strings?
1366 #
1367 #   (would need a specific detection algorithm for every language that
1368 #   supports multi-line literal strings (C++, PHP, Python))
1369 #
1370 # * Warn if a comment in code will become text after round-trip?
1371 #
1372 # code syntax highlight
1373 # ---------------------
1374 #
1375 # use `listing` package in LaTeX->PDF
1376 #
1377 # in html, see
1378 #
1379 # * the syntax highlight support in rest2web
1380 #   (uses the Moin-Moin Python colorizer, see a version at
1381 #   http://www.standards-schmandards.com/2005/fangs-093/)
1382 # * Pygments (pure Python, many languages, rst integration recipe):
1383 #   http://pygments.org/docs/rstdirective/
1384 # * Silvercity, enscript, ...
1385 #
1386 # Some plug-ins require a special "code block" directive instead of the
1387 # `::`-literal block. TODO: make this an option
1388 #
1389 # Ask at docutils users|developers
1390 #
1391 # * How to handle docstrings in code blocks? (it would be nice to convert them
1392 #   to rst-text if ``__docformat__ == restructuredtext``)
1393 #