new `Code2Text` converter after an idea by Riccardo Murri
[pylit.git] / src / pylit.py
blobbd5291de26a1ac6d0c9cb9b8eb69581e5efcf40f
1 #!/usr/bin/env python
2 # -*- coding: iso-8859-1 -*-
4 # ===============================================================
5 # pylit.py: Literate programming with Python and reStructuredText
6 # ===============================================================
7 #
8 # :Date: 2007-01-31
9 # :Copyright: 2005, 2007 Guenter Milde.
10 # Released under the terms of the GNU General Public License
11 # (v. 2 or later)
13 # .. sectnum::
14 # .. contents::
16 # Frontmatter
17 # ===========
19 # Changelog
20 # ---------
22 # :2005-06-29: Initial version
23 # :2005-06-30: first literate version of the script
24 # :2005-07-01: object orientated script using generators
25 # :2005-07-10: Two state machine (later added 'header' state)
26 # :2006-12-04: Start of work on version 0.2 (code restructuring)
27 # :2007-01-23: 0.2 published at http://pylit.berlios.de
28 # :2007-01-25: 0.2.1 Outsourced non-core documentation to the PyLit pages.
29 # :2007-01-26: 0.2.2 new behaviour of `diff` function
30 # :2007-01-29: 0.2.3 new `header` methods after suggestion by Riccardo Murri
31 # :2007-01-31: 0.2.4 raise Error if code indent is too small
32 # :2007-02-05: 0.2.5 new command line option --comment-string
33 # :2007-02-09: 0.2.6 add section with open questions,
34 # Code2Text: let only blank lines (no comment str)
35 # separate text and code,
36 # fix `Code2Text.header`
37 # :2007-02-19: 0.2.7 simplify `Code2Text.header,`
38 # new `iter_strip` method replacing a lot of ``if``-s
39 # :2007-02-22: 0.2.8 set `mtime` of outfile to the one of infile
40 # :2007-02-27: 0.3 new `Code2Text` converter after an idea by Riccardo Murri
42 # ::
44 """pylit: Literate programming with Python and reStructuredText
46 PyLit is a bidirectional converter between
48 * a (reStructured) text source with embedded code, and
49 * a code source with embedded text blocks (comments)
50 """
52 __docformat__ = 'restructuredtext'
55 # Requirements
56 # ------------
58 # * library modules
60 # ::
62 import re
63 import os
64 import sys
65 import optparse
67 # * non-standard extensions
69 # ::
71 from simplestates import SimpleStates # generic state machine
74 # Classes
75 # =======
77 # PushIterator
78 # ------------
80 # The PushIterator is a minimal implementation of an iterator with
81 # backtracking from the `Effective Python Programming`_ OSCON 2005 tutorial by
82 # Anthony Baxter. As the definition is small, it is inlined now. For the full
83 # reasoning and documentation see `iterqueue.py`_.
85 # .. _`Effective Python Programming`:
86 # http://www.interlink.com.au/anthony/tech/talks/OSCON2005/effective_r27.pdf
88 # .. _iterqueue.py: iterqueue.py.html
90 # ::
92 class PushIterator(object):
93 def __init__(self, iterable):
94 self.it = iter(iterable)
95 self.cache = []
96 def __iter__(self):
97 """Return `self`, as this is already an iterator"""
98 return self
99 def next(self):
100 return (self.cache and self.cache.pop()) or self.it.next()
101 def push(self, value):
102 self.cache.append(value)
104 # Converter
105 # ---------
107 # The converter classes implement a simple `state machine` to separate and
108 # transform text and code blocks. For this task, only a very limited parsing
109 # is needed. Using the full blown docutils_ rst parser would introduce a
110 # large overhead and slow down the conversion.
112 # PyLit's simple parser assumes:
114 # * indented literal blocks in a text source are code blocks.
116 # * comment lines that start with a matching comment string in a code source
117 # are text blocks.
119 # .. _docutils: http://docutils.sourceforge.net/
121 # The actual converter classes are derived from `PyLitConverter`:
122 # `Text2Code`_ converts a text source to executable code, while `Code2Text`_
123 # does the opposite: converting commented code to a text source.
125 # The `PyLitConverter` class inherits the state machine framework
126 # (initialisation, scheduler, iterator interface, ...) from `SimpleStates`,
127 # overrides the ``__init__`` method, and adds auxiliary methods and
128 # configuration attributes (options). ::
130 class PyLitConverter(SimpleStates):
131 """parent class for `Text2Code` and `Code2Text`, the state machines
132 converting between text source and code source of a literal program.
135 # Data attributes
136 # ~~~~~~~~~~~~~~~
138 # The data attributes are class default values. They will be overridden by
139 # matching keyword arguments during class instantiation.
141 # `get_converter`_ and `main`_ pass on unused keyword arguments to
142 # the instantiation of a converter class. This way, keyword arguments
143 # to these functions can be used to customize the converter.
145 # Default language and language specific defaults::
147 language = "python"
148 comment_strings = {"python": '# ',
149 "slang": '% ',
150 "c++": '// '}
152 # Number of spaces to indent code blocks in the code -> text conversion.[#]_
154 # .. [#] For the text -> code conversion, the codeindent is determined by the
155 # first recognized code line (leading comment or first indented literal
156 # block of the text source).
158 # ::
160 codeindent = 2
162 # Marker string for the first code block. (Should be a valid rst directive
163 # that accepts code on the same line, e.g. ``'.. admonition::'``.) No
164 # trailing whitespace needed as indented code follows. Default is a comment
165 # marker::
167 header_string = '..'
169 # Export to the output format stripping text or code blocks::
171 strip = False
173 # Initial state::
175 state = 'header'
178 # Instantiation
179 # ~~~~~~~~~~~~~
181 # Initializing sets up the `data` attribute, an iterable object yielding
182 # lines of the source to convert.[1]_ ::
def __init__(self, data, **keyw):
    """Set up the converter for one source.

    data   -- iterable data object yielding the source lines
              (list, file, generator, ...)
    **keyw -- all remaining keyword arguments are stored as attributes
              of the instance (shadowing the class defaults)
    """

# The state handlers need backtracking, so `data` is wrapped in a
# `PushIterator`_ unless it already provides a `push` method::

    if not hasattr(data, 'push'):
        self.data = PushIterator(data)
    else:
        self.data = data
    self._textindent = 0

# Additional keyword arguments become data attributes, overriding the
# class defaults::

    self.__dict__.update(keyw)

# Fall back to the comment string of the configured language if none (or an
# empty one) was passed in the keyword arguments::

    if not getattr(self, "comment_string", None):
        self.comment_string = self.comment_strings[self.language]
211 # .. [1] The most common choice of data is a `file` object with the text
212 # or code source.
214 # To convert a string into a suitable object, use its splitlines method
215 # with the optional `keepends` argument set to True.
217 # Converter.__str__
218 # ~~~~~~~~~~~~~~~~~
220 # Return converted data as string::
def __str__(self):
    """Return the converted data as a single string.

    Calls the instance to obtain the converted blocks (lists of lines)
    and concatenates all of them.
    """
    return "".join(["".join(block) for block in self()])
226 # Converter.get_indent
227 # ~~~~~~~~~~~~~~~~~~~~
229 # Return the number of leading spaces in `string` after expanding tabs ::
def get_indent(self, string):
    """Return the width of the leading whitespace of `string`.

    Tabs are expanded (`str.expandtabs` default) before counting, so the
    result is a column count, not a character count.
    """
    width = 0
    for char in string.expandtabs():
        if not char.isspace():
            break
        width += 1
    return width
237 # Converter.ensure_trailing_blank_line
238 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
240 # Ensure there is a blank line as last element of the list `lines`::
def ensure_trailing_blank_line(self, lines, next_line):
    """Make sure the list `lines` ends with a blank line.

    lines     -- list of lines, modified in place (an empty list is
                 left untouched)
    next_line -- the line that will follow; only used in the warning

    If a blank line has to be inserted, a warning naming the two
    adjacent lines is written to stderr.
    """
    if lines and lines[-1].lstrip():
        warning = "\nWarning: inserted blank line between\n %s %s"
        sys.stderr.write(warning % (lines[-1], next_line))
        lines.append("\n")
251 # Converter.collect_blocks
253 # ::
def collect_blocks(self):
    """Split `self.data` into paragraphs.

    Generator yielding one list of lines per block. A block ends with
    (and includes) the first blank (whitespace only) line. Whatever is
    left when the data is exhausted is yielded as final block -- this
    may be an empty list.
    """
    paragraph = []
    for line in self.data:
        paragraph.append(line)
        if line.rstrip():
            continue
        # blank line: the paragraph is complete
        yield paragraph
        paragraph = []
    yield paragraph
270 # Text2Code
271 # ---------
273 # The `Text2Code` class separates code blocks (indented literal blocks) from
274 # reStructured text. Code blocks are unindented, text is commented (or
275 # filtered, if the ``strip`` option is True).
277 # Only `indented literal blocks` are extracted. `quoted literal blocks` and
278 # `pydoc blocks` are treated as text. This allows the easy inclusion of
279 # examples: [#]_
281 # >>> 23 + 3
282 # 26
284 # .. [#] Mark that there is no double colon before the doctest block in
285 # the text source.
287 # The state handlers are implemented as generators. Iterating over a
288 # `Text2Code` instance initializes them to generate iterators for
289 # the respective states (see ``simplestates.py``).
291 # ::
293 class Text2Code(PyLitConverter):
294 """Convert a (reStructured) text source to code source
297 # INIT: call the parent classes init method.
299 # If the `strip` argument is true, replace the `__iter__` method
300 # with a special one that drops "spurious" blocks::
def __init__(self, data, **keyw):
    """Initialize via `PyLitConverter.__init__`; if the `strip` attribute
    is true afterwards, bind `iter_strip` as the instance's `__iter__`.
    """
    PyLitConverter.__init__(self, data, **keyw)
    strip_requested = getattr(self, "strip", False)
    if strip_requested:
        # NOTE(review): an instance-level `__iter__` is only honoured by
        # ``iter()``/``for`` on old-style classes -- confirm against the
        # base class `SimpleStates`.
        self.__iter__ = self.iter_strip
307 # Text2Code.header
308 # ~~~~~~~~~~~~~~~~
310 # Convert the header (leading rst comment block) to code::
def header(self):
    """Convert header (leading rst comment) to code.

    Returns the result of the `code` handler if the first line starts
    with `header_string`; otherwise returns an empty list and switches
    the state to 'text'. In both cases the (possibly stripped) first
    line is pushed back to the data iterator.
    """
    first = self.data_iterator.next()
    marker = self.header_string

# The first line must be tested explicitly here: the code handler only
# recognizes the start of a text block if a line starting with a "matching
# comment" is preceded by an empty line, but the very first line may already
# be a "text line".
#
# Open question -- which variant is better?
#
# 1. only convert if there is something behind the comment marker on the
#    first line, or
# 2. convert any leading comment to code (current behaviour).
#
# No header code found: push back the first non-header line and set the
# state to "text"::

    if not first.startswith(marker):
        self.data_iterator.push(first)
        self.state = 'text'
        return []

# Strip the leading comment string (typically added by `Code2Text.header`) and
# return the result of processing the data with the code handler::

    self.data_iterator.push(first[len(marker):])
    return self.code()
345 # Text2Code.text_handler_generator
346 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
348 # The 'text' handler processes everything that is not an indented literal
349 # comment. Text is quoted with `self.comment_string` or filtered (with
350 # strip=True).
352 # It is implemented as a generator function that acts on the `data` iterator
353 # and yields text blocks.
355 # Declaration and initialization::
def text_handler_generator(self):
    """Convert text blocks from rst to comment.

    Generator acting on `self.data_iterator`. Every line gets
    `self.comment_string` prepended. A block is yielded (as list of
    lines) when a line ending with ``::`` that is not an rst comment or
    directive is found; `self._textindent` and `self.state` ('code') are
    set before yielding. Remaining lines are yielded when the data is
    exhausted.
    """
    lines = []

# Iterate over the data_iterator (which yields the data lines)::

    for line in self.data_iterator:

# Default action: add comment string and collect in `lines` list::

        lines.append(self.comment_string + line)

# Test for the end of the text block: a line that ends with `::` but is neither
# a comment nor a directive::

        if (line.rstrip().endswith("::")
            and not line.lstrip().startswith("..")):

# End of text block is detected, now:
#
# set the current text indent level (needed by the code handler to find the
# end of code block) and set the state to "code" (i.e. the next call of
# `self.next` goes to the code handler)::

            self._textindent = self.get_indent(line)
            self.state = 'code'

# Ensure a trailing blank line (which is the paragraph separator in
# reStructured Text). Look at the next line: if it is blank -- OK; if it is
# not blank, push it back (it should be code) and add a line by calling the
# `ensure_trailing_blank_line` method (which also issues a warning).
#
# If the literal marker is on the very last line of the data, there is no
# next line. The resulting `StopIteration` must not escape from the generator
# body: it would end the generator before the collected lines are yielded
# (and is turned into a `RuntimeError` by interpreters implementing PEP 479).
# So yield what we have and stop::

            try:
                line = self.data_iterator.next()
            except StopIteration:
                yield lines
                return
            if line.lstrip():
                self.data_iterator.push(line) # push back
                self.ensure_trailing_blank_line(lines, line)
            else:
                lines.append(line)

# Now yield and reset the lines. (A literal marker on a line of its own is
# deliberately kept: removing it changes the line numbers, which leads to
# misleading error messages in doctests)::

            yield lines
            lines = []

# End of data: if we "fall off" the iteration loop, just yield the
# remaining lines::

    yield lines
413 # Text2Code.code_handler_generator
414 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
416 # The `code` handler is called when a literal block marker is encountered. It
417 # returns a code block (indented literal block), removing leading whitespace
418 # up to the indentation of the first code line in the file (this deviation
419 # from docutils behaviour allows indented blocks of Python code).
421 # As the code handler detects the switch to "text" state by looking at
422 # the line indents, it needs to push back the last probed data token. I.e.
423 # the data_iterator must support a `push` method. (This is the
424 # reason for the use of the PushIterator class in `__init__`.) ::
def code_handler_generator(self):
    """Convert indented literal blocks to source code.

    Generator acting on `self.data_iterator`. Code lines are unindented
    by the leading whitespace of the first code line and collected; the
    collected block is yielded (and `self.state` set to 'text') when a
    non-blank line indented not more than `self._textindent` is found.
    That line is pushed back to the data iterator.

    Raises ValueError if a code line does not start with the leading
    whitespace of the first code line.
    """
    lines = []
    codeindent = None  # number of indent *characters*, set below
    indent_string = "" # leading whitespace chars of the first code line

# Iterate over the lines in the input data::

    for line in self.data_iterator:

# Pass on blank lines (no test for end of code block needed|possible)::

        if not line.rstrip():
            lines.append(line.replace(indent_string, "", 1))
            continue

# Test for end of code block:
#
# A literal block ends with the first less indented, nonblank line.
# `self._textindent` is set by the text handler to the indent of the
# preceding paragraph.
#
# To prevent problems with different tabulator settings, hard tabs in code
# lines are expanded with the `expandtabs` string method when calculating the
# indentation (i.e. replaced by 8 spaces, by default).
#
# ::

        if self.get_indent(line) <= self._textindent:
            # push back line
            self.data_iterator.push(line)
            self.state = 'text'
            # append blank line (if not already present)
            self.ensure_trailing_blank_line(lines, line)
            yield lines
            # reset list of lines
            lines = []
            continue

# OK, we are sure now that the current line is neither blank nor a text line.
#
# If still unset, determine the code indentation from the first non-blank code
# line. The indent string is taken from the raw line (not from the
# tab-expanded width returned by `get_indent`): slicing the raw line with the
# expanded width would cut into the code if the line is indented with tabs::

        if codeindent is None:
            indent_string = line[:len(line) - len(line.lstrip())]
            codeindent = len(indent_string)

# Append unindented line to lines cache (but check if we can safely unindent
# first)::

        if not line.startswith(indent_string):
            raise ValueError("cannot unindent line %r,\n"%line
                + " doesnot start with code indent string %r"%indent_string)

        lines.append(line[codeindent:])

# No more lines in the input data: just return what we have::

    yield lines
490 # Txt2Code.remove_literal_marker
491 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
493 # Remove literal marker (::) in "expanded form" i.e. in a paragraph on its own.
495 # While cleaning up the code source, it leads to confusion for doctest and
496 # searches (e.g. grep) as line-numbers between text and code source will
497 # differ. ::
def remove_literal_marker(list, comment_string='# '):
    """Remove a literal marker (``::``) in "expanded form" from a text block.

    list           -- list of (commented) text lines, modified in place.
                      (The parameter name shadows the builtin; it is kept
                      for compatibility with existing callers.)
    comment_string -- comment string the text lines are quoted with

    If the second last line consists of the commented marker only and is
    preceded by a commented blank line, both lines are deleted. Lists
    with less than three lines are left untouched.
    """
    # The body used to reference the undefined names `lines` and `self`,
    # so every call failed with a NameError.
    lines = list
    try:
        if (lines[-3].strip() == comment_string.strip()
            and lines[-2].strip() == comment_string + '::'):
            del(lines[-3:-1])
    except IndexError:
        pass
508 # Text2Code.iter_strip
509 # ~~~~~~~~~~~~~~~~~~~~
511 # Modification of the `simplestates.__iter__` method that will replace it when
512 # the `strip` keyword argument is `True` during class instantiation:
514 # Iterate over class instances dropping text blocks::
def iter_strip(self):
    """Generate and return an iterator dropping text blocks.

    Sets up `data_iterator` and the state generators, then alternates
    between yielding the block returned by the current state handler and
    calling the (then current) state handler once more, discarding its
    result.
    """
    self.data_iterator = self.data
    self._initialize_state_generators()
    while True:
        keep = getattr(self, self.state)
        yield keep()
        # the state is looked up again: the previous handler may have
        # changed it
        drop = getattr(self, self.state)
        drop() # drop text block
527 # Code2Text
528 # ---------
530 # The `Code2Text` class does the opposite of `Text2Code`_ -- it processes
531 # valid source code, extracts comments, and puts non-commented code in literal
532 # blocks.
534 # Only lines starting with a comment string matching the one in the
535 # `comment_string` data attribute are considered text lines.
537 # The class is derived from the PyLitConverter state machine and adds handlers
538 # for the three states "header", "text", and "code". ::
540 class Code2Text(PyLitConverter):
541 """Convert code source to text source
544 # Code2Text.__iter__
def __iter__(self):
    """Iterate over the converted blocks of the code source.

    Generator: classifies every block from `collect_blocks` as text or
    code, sets `self.state` accordingly and yields the block processed by
    the handler method named like the state. A non-empty `code_marker`
    is yielded before a code block that does not follow another code
    block.
    """

# If the last text block does not end with a code marker (by default, the
# literal-block marker ``::``), the `text` method will set `code_marker` to
# a paragraph that will start the next code block. It is yielded if non-empty
# at a text-code transition. If there is no preceding text block, `code_marker`
# contains the `header_string`::

    if not self.strip:
        self.code_marker = [self.header_string]
    else:
        self.code_marker = []

    for block in self.collect_blocks():

# Test the state of the block, return it processed with the right handler::

        is_text = self.block_is_text(block)
        if not is_text and self.state != "code" and self.code_marker:
            yield self.code_marker
        if is_text:
            self.state = "text"
        else:
            self.state = "code"
        handler = getattr(self, self.state)
        yield handler(block)
572 # A paragraph is a text block, if every non-blank line starts with a matching
573 # comment string (test includes whitespace except for commented blank lines!)
574 # ::
def block_is_text(self, block):
    """Return True if `block` (a list of lines) is a text block.

    A block is text if every non-blank line either starts with
    `self.comment_string` or consists of the comment string without its
    trailing whitespace only (commented blank line).
    """
    marker = self.comment_string
    bare_marker = marker.rstrip()
    for line in block:
        content = line.rstrip()
        if not content:
            continue            # blank line
        if line.startswith(marker):
            continue            # commented text
        if content == bare_marker:
            continue            # commented blank line
        return False
    return True
584 # "header" state
585 # ~~~~~~~~~~~~~~~~
587 # Sometimes code needs to remain on the first line(s) of the document to be
588 # valid. The most common example is the "shebang" line that tells a POSIX
589 # shell how to process an executable file::
591 #!/usr/bin/env python
593 # In Python, the ``# -*- coding: iso-8859-1 -*-`` line must occur before any
594 # other comment or code.
596 # If we want to keep the line numbers in sync for text and code source, the
597 # reStructured Text markup for these header lines must start at the same line
598 # as the first header line. Therefore, header lines could not be marked as
599 # literal block (this would require the "::" and an empty line above the code).
601 # OTOH, a comment may start at the same line as the comment marker and it
602 # includes subsequent indented lines. Comments are visible in the reStructured
603 # Text source but hidden in the pretty-printed output.
605 # With a header converted to comment in the text source, everything before the
606 # first text block (i.e. before the first paragraph using the matching comment
607 # string) will be hidden away (in HTML or PDF output).
609 # This seems a good compromise, the advantages
611 # * line numbers are kept
612 # * the "normal" code conversion rules (indent/unindent by `codeindent`) apply
613 # * greater flexibility: you can hide a repeating header in a project
614 # consisting of many source files.
616 # set off the disadvantages
618 # - it may come as surprise if a part of the file is not "printed",
619 # - one more syntax element to learn for rst newbies to start with pylit,
620 # (however, starting from the code source, this will be auto-generated)
622 # In the case that there is no matching comment at all, the complete code
623 # source will become a comment -- however, in this case it is not very likely
624 # the source is a literate document anyway.
626 # If needed for the documentation, it is possible to repeat the header in (or
627 # after) the first text block, e.g. with a `line block` in a `block quote`:
629 # | ``#!/usr/bin/env python``
630 # | ``# -*- coding: iso-8859-1 -*-``
632 # The current implementation represents the header state by the setting of
633 # `code_marker` to ``[self.header_string]``. The first non-empty text block
634 # will overwrite this setting.
636 # Code2Text.text
637 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
639 # The *text state handler* converts a comment to a text block
640 # Strip the leading comment string::
def text(self, lines):
    """Uncomment text blocks in source code.

    lines -- list of lines of a text block (see `block_is_text`)

    Returns the list of uncommented lines and updates `self.code_marker`,
    the paragraph that is put in front of the next code block.
    """

# Strip the leading comment string -- once per line and only at the start of
# the line. A commented blank line may lack the trailing whitespace of the
# comment string; lines that do not match are passed on unchanged.
#
# (Plain prefix tests are used instead of ``replace`` followed by a regular
# expression substitution: that combination stripped the comment marker twice
# from lines like ``# # text`` and interpreted the comment string as a
# regular expression)::

    marker = self.comment_string
    bare_marker = marker.rstrip()

    def uncomment(line):
        if line.startswith(marker):
            return line[len(marker):]
        if line.rstrip() == bare_marker:
            return line[len(bare_marker):]
        return line

    lines = [uncomment(line) for line in lines]

    if self.strip:
        self.strip_literal_marker(lines)
        self.code_marker = []

# Check for code block marker (double colon) at the end of the text block
# Update the `code_marker` argument. The current `code marker` is 'prepended'
# to the next code block by `Code2Text.code`_ ::

    elif len(lines)>1:
        if lines[-2].rstrip().endswith("::"):
            self.code_marker = []
        else:
            self.code_marker = ["::\n", "\n"]

# Return the text block to the calling function::

    return lines
670 # Code2Text.code
671 # ~~~~~~~~~~~~~~
673 # The `code` method is called on non-commented code. Code is returned as
674 # indented literal block (or filtered, if ``self.strip == True``). The amount
675 # of the code indentation is controlled by `self.codeindent` (default 2).
676 # ::
def code(self, lines):
    """Return `lines` as indented literal block.

    Every line (blank ones included) is prefixed with `self.codeindent`
    spaces. An empty list is returned if `self.strip` equals `True`.
    """
    if self.strip == True:
        return []
    prefix = " " * self.codeindent
    indented = []
    for line in lines:
        indented.append(prefix + line)
    return indented
686 # Code2Text.strip_literal_marker
687 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
689 # If the code block is stripped, the literal marker would lead to an error
690 # when the text is converted with docutils. Replace it with the equivalent of
691 # docutils replace rules
693 # * strip `::`-line (and preceding blank line) if on a line on its own
694 # * strip `::` if it is preceded by whitespace.
695 # * convert `::` to a single colon if preceded by text
697 # `lines` should be list of text lines (with a trailing blank line).
698 # It is modified in-place::
def strip_literal_marker(self, lines):
    """Replace the literal marker at the end of a text block.

    lines -- list of text lines with a trailing blank line,
             modified in place

    The second last line is examined. If it ends with ``::``

    * the line (and a preceding blank line) is deleted if the marker is
      on a line of its own,
    * the marker is removed if it is preceded by whitespace,
    * the marker is converted to a single colon if preceded by text.

    Lists with less than two lines and lines not ending with the marker
    are left untouched.
    """
    try:
        line = lines[-2]
    except IndexError:  # len(lines) < 2
        return

    # Only a marker at the end of the line is a literal marker; a double
    # colon inside the line (e.g. ``std::vector``) must be left alone.
    if not line.rstrip().endswith('::'):
        return

    # split at rightmost '::'
    (head, tail) = line.rsplit('::', 1)

    # '::' on an extra line
    if not head.strip():
        del(lines[-2])
        # delete preceding line if it is blank
        if len(lines) >= 2 and not lines[-2].lstrip():
            del(lines[-2])
    # '::' follows whitespace
    elif head.rstrip() != head:
        lines[-2] = "".join((head.rstrip(), tail))
    # '::' follows text
    else:
        lines[-2] = ":".join((head, tail))
728 # Command line use
729 # ================
731 # Using this script from the command line will convert a file according to its
732 # extension. This default can be overridden by a couple of options.
734 # Dual source handling
735 # --------------------
737 # How to determine which source is up-to-date?
738 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
740 # - set modification date of `outfile` to the one of `infile`
742 # Points out that the source files are 'synchronized'.
744 # * Are there problems to expect from "backdating" a file? Which?
746 # Looking at http://www.unix.com/showthread.php?t=20526, it seems
747 # perfectly legal to set `mtime` (while leaving `ctime`) as `mtime` is a
748 # description of the "actuality" of the data in the file.
750 # * Should this become a default or an option?
752 # - alternatively move input file to a backup copy (with option: `--replace`)
754 # - check modification date before overwriting
755 # (with option: `--overwrite=update`)
757 # - check modification date before editing (implemented as `Jed editor`_
758 # function `pylit_check()` in `pylit.sl`_)
760 # .. _Jed editor: http://www.jedsoft.org/jed/
761 # .. _pylit.sl: http://jedmodes.sourceforge.net/mode/pylit/
763 # Recognised Filename Extensions
764 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
766 # Finding an easy to remember, unused filename extension is not easy.
768 # .py.txt
769 # a double extension (similar to .tar.gz, say) seems most appropriate
770 # (at least on UNIX). However, it fails on FAT16 filesystems.
771 # The same scheme can be used for c.txt, p.txt and the like.
773 # .pytxt
774 # is recognised as extension by os.path.splitext but also fails on FAT16
776 # .pyt
777 # (PYthon Text) is used by the Python test interpreter
778 # `pytest <http://www.zetadev.com/software/pytest/>`__
780 # .pyl
781 # was even mentioned as extension for "literate Python" files in an
782 # email exchange (http://www.python.org/tim_one/000115.html) but
783 # subsequently used for Python libraries.
785 # .lpy
786 # seems to be free (as by a Google search, "lpy" is the name of a python
787 # code pretty printer but this should not pose a problem).
789 # .tpy
790 # seems to be free as well.
792 # Instead of defining a new extension for "pylit" literate programs,
793 # by default ``.txt`` will be appended for literate code and stripped by
794 # the conversion to executable code. i.e. for a program foo:
796 # * the literate source is called ``foo.py.txt``
797 # * the html rendering is called ``foo.py.html``
798 # * the python source is called ``foo.py``
802 # OptionValues
803 # ------------
805 # For use as keyword arguments, it is handy to have the options
806 # in a dictionary. The following class adds an `as_dict` method
807 # to `optparse.Values`::
class OptionValues(optparse.Values):
    """`optparse.Values` with an additional `as_dict` method."""

    def as_dict(self):
        """Return options as dictionary object.

        The dictionary contains every attribute listed by ``dir(self)``
        that is not listed by ``dir(OptionValues)``, i.e. the option
        values but not the methods and attributes of the class.
        """
        # NOTE(review): the list comprehension used here before also
        # tested ``option is not None``. Attribute names are never None,
        # so the test had no effect and options with value None are
        # included -- confirm whether filtering None *values* was meant.
        class_names = dir(OptionValues)
        options = {}
        for name in dir(self):
            if name not in class_names:
                options[name] = getattr(self, name)
        return options
817 # PylitOptions
818 # ------------
820 # Options are stored in the values attribute of the `PylitOptions` class.
821 # It is initialized with default values and parsed command line options (and
822 # arguments) This scheme allows easy customization by code importing the
823 # `pylit` module. ::
825 class PylitOptions(object):
826 """Storage and handling of program options
829 # Recognized file extensions for text and code versions of the source::
831 code_languages = {".py": "python",
832 ".sl": "slang",
833 ".c": "c++"}
834 code_extensions = code_languages.keys()
835 text_extensions = [".txt"]
837 # Instantiation
838 # ~~~~~~~~~~~~~
840 # Instantiation sets up an OptionParser and initializes it with pylit's
841 # command line options and `default_values`. It then updates the values based
842 # on command line options and sensible defaults::
def __init__(self, args=sys.argv[1:], **default_values):
    """Set up an `OptionParser` instance and parse and complete arguments

    args           -- list of command line arguments
                      (default: ``sys.argv[1:]`` as found when this
                      function is defined)
    default_values -- option defaults, passed to the parser's
                      `set_defaults`

    The parser is stored in `self.parser`, the parsed and completed
    option values in `self.values`.
    """
    # NOTE(review): the changelog of this file mentions version 0.3 --
    # confirm whether the version string should be updated.
    p = optparse.OptionParser(usage=main.__doc__, version="0.2")
    # set defaults
    p.set_defaults(**default_values)
    # add the options
    p.add_option("-c", "--code2txt", dest="txt2code", action="store_false",
                 help="convert code to reStructured text")
    p.add_option("--comment-string", dest="comment_string",
                 help="text block marker (default '# ' (for Python))" )
    p.add_option("-d", "--diff", action="store_true",
                 help="test for differences to existing file")
    p.add_option("--doctest", action="store_true",
                 help="run doctest.testfile() on the text version")
    p.add_option("-e", "--execute", action="store_true",
                 help="execute code (Python only)")
    # '-' as *input* file name stands for stdin (see `open_streams`);
    # the help text used to say "stdout" here.
    p.add_option("-f", "--infile",
                 help="input file name ('-' for stdin)" )
    p.add_option("--overwrite", action="store",
                 choices = ["yes", "update", "no"],
                 help="overwrite output file (default 'update')")
    p.add_option("-o", "--outfile",
                 help="output file name ('-' for stdout)" )
    p.add_option("--replace", action="store_true",
                 help="move infile to a backup copy (appending '~')")
    p.add_option("-s", "--strip", action="store_true",
                 help="export by stripping text or code")
    p.add_option("-t", "--txt2code", action="store_true",
                 help="convert reStructured text to code")
    self.parser = p

    # parse to fill a self.Values instance
    self.values = self.parse_args(args)
    # complete with context-sensitive defaults
    self.values = self.complete_values(self.values)
881 # Calling
882 # ~~~~~~~
884 # "Calling" an instance updates the option values based on command line
885 # arguments and default values and does a completion of the options based on
886 # "context-sensitive defaults"::
def __call__(self, args=sys.argv[1:], **default_values):
    """Parse and complete command line args.

    Returns the option values obtained by passing `args` and
    `default_values` to `parse_args` and the result to
    `complete_values`.
    """
    return self.complete_values(self.parse_args(args, **default_values))
895 # PylitOptions.parse_args
896 # ~~~~~~~~~~~~~~~~~~~~~~~
898 # The `parse_args` method calls the `optparse.OptionParser` on command
899 # line or provided args and returns the result as `PylitOptions.Values`
900 # instance. Defaults can be provided as keyword arguments::
def parse_args(self, args=sys.argv[1:], **default_values):
    """Parse command line arguments using `optparse.OptionParser`

    args           -- list of command line arguments.
    default_values -- dictionary of option defaults

    Returns an `OptionValues` instance. The first two positional
    arguments (if present) are stored as `infile` and `outfile`.
    """
    # merge the parser defaults with the ones given in the call
    merged_defaults = self.parser.defaults.copy()
    merged_defaults.update(default_values)
    # parse arguments
    values, positional = self.parser.parse_args(args,
                                                OptionValues(merged_defaults))
    # Convert FILE and OUTFILE positional args to option values
    # (other positional arguments are ignored)
    if len(positional) > 0:
        values.infile = positional[0]
    if len(positional) > 1:
        values.outfile = positional[1]
    return values
922 # PylitOptions.complete_values
923 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
925 # The `complete_values` method uses context information to set missing option values
926 # to sensible defaults (if possible).
928 # ::
def complete_values(self, values):
    """Complete option values with context sensitive defaults

    values -- `optparse.Values` instance, modified in place

    Sets (if missing or None) `infile`, `txt2code`, `outfile`, `language`
    and `overwrite` and returns `values`.
    """
    values.ensure_value("infile", "")
    # Guess conversion direction from infile filename
    if values.ensure_value("txt2code", None) is None:
        in_extension = os.path.splitext(values.infile)[1]
        if in_extension in self.text_extensions:
            values.txt2code = True
        elif in_extension in self.code_extensions:
            values.txt2code = False
    # Auto-determine the output file name
    values.ensure_value("outfile", self.get_outfile_name(values.infile,
                                                         values.txt2code))
    # Guess conversion direction from outfile filename or set to default
    if values.txt2code is None:
        out_extension = os.path.splitext(values.outfile)[1]
        values.txt2code = not (out_extension in self.text_extensions)

    # Set the language of the code (default "python").
    # `txt2code` is not None at this point. It is tested for truth (not
    # for identity with True/False) so that a value like 1 or 0 passed as
    # default does not leave the code file name undefined.
    if values.txt2code:
        code_file = values.outfile
    else:
        code_file = values.infile
    code_extension = os.path.splitext(code_file)[1]
    values.ensure_value("language",
                        self.code_languages.get(code_extension, "python"))

    # Set the default overwrite mode
    values.ensure_value("overwrite", 'update')

    return values
962 # PylitOptions.get_outfile_name
963 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
965 # Construct a matching filename for the output file. The output filename is
966 # constructed from `infile` by the following rules:
968 # * '-' (stdin) results in '-' (stdout)
969 # * strip the `txt_extension` or add the `code_extension` (txt2code)
970 # * add a `txt_extension` (code2txt)
971 # * fallback: if no guess can be made, add ".out"
973 # ::
def get_outfile_name(self, infile, txt2code=None):
    """Return a matching output filename for `infile`

    infile   -- name of the input file ('-' stands for stdin)
    txt2code -- conversion direction; only a value equal to False
                influences the result

    '-' is returned for '-'. A text extension is stripped; the first text
    extension is appended to names with a code extension (or if
    `txt2code` equals False); in all other cases ".out" is appended.
    """
    # if input is stdin, default output is stdout
    if infile == '-':
        return '-'
    stem, suffix = os.path.splitext(infile)
    # TODO: should get_outfile_name() use self.values.outfile_extension
    #       if it exists?
    if suffix in self.text_extensions:
        # strip text extension
        outfile = stem
    elif suffix in self.code_extensions or txt2code == False:
        # add (first) text extension for code files
        outfile = infile + self.text_extensions[0]
    else:
        # give up
        outfile = infile + ".out"
    return outfile
997 # Helper functions
998 # ----------------
1000 # open_streams
1001 # ~~~~~~~~~~~~
1003 # Return file objects for in- and output. If the input path is missing,
1004 # write usage and abort. (An alternative would be to use stdin as default.
1005 # However, this leaves the uninitiated user with a non-responding application
1006 # if (s)he just tries the script without any arguments) ::
def open_streams(infile='-', outfile='-', overwrite='update', **keyw):
    """Open and return the input and output stream

    open_streams(infile, outfile) -> (in_stream, out_stream)

    in_stream   --  open(infile) or sys.stdin
    out_stream  --  open(outfile, 'w') or sys.stdout
    overwrite   --  ['yes', 'update', 'no']
                    if 'no', refuse to open an existing output file,
                    if 'update', only open output file if it is older than
                    the input stream.
                    Irrelevant if outfile == '-'.

    Additional keyword arguments are accepted and ignored, so that a
    complete options dictionary can be passed.

    Raise IOError (errno 2) if `infile` is empty and IOError (errno 1) if
    the `overwrite` mode forbids writing to `outfile`.
    """
    if not infile:
        strerror = "Missing input file name ('-' for stdin; -h for help)"
        raise IOError(2, strerror, infile)
    if infile == '-':
        in_stream = sys.stdin
    else:
        in_stream = open(infile, 'r')
    try:
        if outfile == '-':
            out_stream = sys.stdout
        elif overwrite == 'no' and os.path.exists(outfile):
            raise IOError(1, "Output file exists!", outfile)
        elif overwrite == 'update' and is_newer(outfile, infile):
            raise IOError(1, "Output file is newer than input file!", outfile)
        else:
            out_stream = open(outfile, 'w')
    except Exception:
        # Do not leave the input file open when no output stream can be
        # handed out; the caller never gets a handle to close it.
        if in_stream is not sys.stdin:
            in_stream.close()
        raise
    return (in_stream, out_stream)
1037 # is_newer
1038 # ~~~~~~~~
1040 # ::
def is_newer(path1, path2):
    """Check if `path1` is newer than `path2` (using mtime)

    The modification times of the files at `path1` and `path2` are
    compared.  A file that cannot be stat'ed (e.g. because it does not
    exist) counts as older than any existing file: the result is False if
    only `path1` is missing and True if only `path2` is missing.

    Return None for equal modification times.  (None is false in a boolean
    context but still allows the caller to test for equality.)
    """
    stamps = []
    for path in (path1, path2):
        try:
            stamps.append(os.path.getmtime(path))
        except OSError:
            # missing file: use a time stamp before any real one
            stamps.append(-1)
    first, second = stamps
    if first == second:
        return None
    return first > second
1069 # get_converter
1070 # ~~~~~~~~~~~~~
1072 # Get an instance of the converter state machine::
def get_converter(data, txt2code=True, **keyw):
    """Return a converter instance for `data`.

    A false `txt2code` selects `Code2Text`, anything else `Text2Code`.
    Additional keyword arguments are passed on to the converter class.
    """
    if not txt2code:
        return Code2Text(data, **keyw)
    return Text2Code(data, **keyw)
1081 # Use cases
1082 # ---------
1084 # run_doctest
1085 # ~~~~~~~~~~~
1087 # ::
def run_doctest(infile="-", txt2code=True,
                globs=None, verbose=False, optionflags=0, **keyw):
    """Run doctest on the text source

    infile      -- path of the source ('-' for stdin)
    txt2code    -- if False, `infile` is a code source and is converted to
                   text first (remaining keyword arguments go to
                   `Code2Text`)
    globs       -- dictionary of globals for the tests (default: empty)
    verbose, optionflags -- passed on to `doctest.DocTestRunner`

    Return the tuple (failures, tries).
    """
    from doctest import DocTestParser, DocTestRunner
    if globs is None:
        globs = {}
    (data, out_stream) = open_streams(infile, "-")

    # If source is code, convert to text, as tests in comments are not found
    # by doctest::
    try:
        if txt2code is False:
            converter = Code2Text(data, **keyw)
            docstring = str(converter)
        else:
            docstring = data.read()
    finally:
        # the complete source is in `docstring` now (or reading failed)
        if data is not sys.stdin:
            data.close()

    # Use the doctest Advanced API to do all doctests in a given string::
    test = DocTestParser().get_doctest(docstring, globs=globs, name="",
                                       filename=infile, lineno=0)
    runner = DocTestRunner(verbose=verbose, optionflags=optionflags)
    runner.run(test)
    # `summarize` must be called to have any effect
    runner.summarize()
    if not runner.failures:
        print("%d failures in %d tests" % (runner.failures, runner.tries))
    return runner.failures, runner.tries
1117 # diff
1118 # ~~~~
1120 # ::
def diff(infile='-', outfile='-', txt2code=True, **keyw):
    """Report differences between converted infile and existing outfile

    If outfile is '-', do a round-trip conversion and report differences

    The differences are printed as unified diff.  Return True if there are
    differences and False if there are none.
    """
    # NOTE(review): `infile` is opened as a regular file, so the default
    # '-' is not treated as stdin here -- confirm whether that is intended.

    import difflib

    # for diffing, we need a copy of the data as list::
    instream = open(infile)
    try:
        data = instream.readlines()
    finally:
        instream.close()
    # convert
    converter = get_converter(data, txt2code, **keyw)
    new = str(converter).splitlines(True)

    if outfile != '-':
        outstream = open(outfile)
        try:
            old = outstream.readlines()
        finally:
            outstream.close()
        oldname = outfile
        newname = "<conversion of %s>"%infile
    else:
        old = data
        oldname = infile
        # back-convert the output data
        # NOTE(review): the back-conversion does not get `keyw`; verify
        # that this is intended when non-default options are in use.
        converter = get_converter(new, not txt2code)
        new = str(converter).splitlines(True)
        newname = "<round-conversion of %s>"%infile

    # find and print the differences
    delta = list(difflib.unified_diff(old, new, fromfile=oldname,
                                      tofile=newname))
    if not delta:
        print(oldname)
        print(newname)
        print("no differences found")
        return False
    print("".join(delta))
    return True
1161 # main
1162 # ----
1164 # If this script is called from the command line, the `main` function will
1165 # convert the input (file or stdin) between text and code formats.
1167 # Customization
1168 # ~~~~~~~~~~~~~
1170 # Option defaults for the conversion can be passed as keyword arguments to `main`_.
1171 # The option defaults will be updated by command line options and extended
1172 # with "intelligent guesses" by `PylitOptions` and passed on to helper
1173 # functions and the converter instantiation.
1175 # This allows easy customization for programmatic use -- just call `main`
1176 # with the appropriate keyword options (or with an `option_defaults`
1177 # dictionary), e.g.:
1179 # >>> option_defaults = {'language': "c++",
1180 # ... 'codeindent': 4,
1181 # ... 'header_string': '..admonition::'
1182 # ... }
1184 # >>> main(**option_defaults)
1186 # ::
def main(args=sys.argv[1:], **option_defaults):
    """%prog [options] FILE [OUTFILE]

    Convert between reStructured Text with embedded code, and
    Source code with embedded text comment blocks"""

    # `args` is the list of command line arguments, `option_defaults` are
    # defaults for the options; both are handed to `PylitOptions`.
    # The return value is the result of `run_doctest` or `diff` if one of
    # these actions is selected, None otherwise.

    # Parse and complete the options::

    options = PylitOptions(args, **option_defaults).values

    # Run doctests if ``--doctest`` option is set::

    if options.ensure_value("doctest", None):
        return run_doctest(**options.as_dict())

    # Do a round-trip and report differences if the ``--diff`` option is set::

    if options.ensure_value("diff", None):
        return diff(**options.as_dict())

    # Open in- and output streams.  With ``--execute`` nothing is written,
    # so the output file must not be opened: opening it for writing would
    # truncate it.  Use stdout as a harmless stand-in in that case::

    stream_options = dict(options.as_dict())
    if getattr(options, "execute", None):
        stream_options["outfile"] = "-"
    try:
        (data, out_stream) = open_streams(**stream_options)
    except IOError:
        ex = sys.exc_info()[1]
        print("IOError: %s %s" % (ex.filename, ex.strerror))
        sys.exit(ex.errno)

    # Get a converter instance::

    converter = get_converter(data, **options.as_dict())

    # Execute if the ``--execute`` option is set::

    if options.ensure_value("execute", None):
        print("executing " + options.infile)
        if options.txt2code:
            code = str(converter)
        else:
            # the input is code already; Python 2's `exec` accepts an open
            # file object
            code = data
        exec(code)
        return

    # Default action: Convert and write to out_stream::

    out_stream.write(str(converter))

    if out_stream is not sys.stdout:
        print("extract written to " + str(out_stream.name))
        out_stream.close()

    # Rename the infile to a backup copy if ``--replace`` is set::

    if options.ensure_value("replace", None):
        os.rename(options.infile, options.infile + "~")

    # If not (and input and output are from files), set the modification time
    # (`mtime`) of the output file to the one of the input file to indicate
    # that the contained information is equal.[#]_ ::

    else:
        try:
            os.utime(options.outfile, (os.path.getatime(options.outfile),
                                       os.path.getmtime(options.infile)))
        except OSError:
            # best effort only: in- or output is not a file on disk
            pass
1256 ## print "mtime", os.path.getmtime(options.infile), options.infile
1257 ## print "mtime", os.path.getmtime(options.outfile), options.outfile
1260 # .. [#] Make sure the corresponding file object (here `out_stream`) is
1261 # closed, as otherwise the change will be overwritten when `close` is
1262 # called afterwards (either explicitly or at program exit).
1264 # Run main, if called from the command line::
# Script entry point: `main` is only called when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
1270 # Open questions
1271 # ==============
1273 # Open questions and ideas for further development
1275 # Options
1276 # -------
1278 # * Collect option defaults in a dictionary (on module level)
1280 # Facilitates the setting of options in programmatic use
1282 # Use templates for the "intelligent guesses" (with Python syntax for string
1283 # replacement with dicts: ``"hello %(what)s" % {'what': 'world'}``)
1285 # * Is it sensible to offer the `header_string` option also as command line
1286 # option?
1288 # * Configurable
1290 # Parsing Problems
1291 # ----------------------
1293 # * How can I include a literal block that should not be in the
1294 # executable code (e.g. an example, an earlier version or variant)?
1296 # Workaround:
1297 # Use a `quoted literal block` (with a quotation different from
1298 # the comment string used for text blocks) to keep it commented over the
1299 # code-text round-trips.
1301 # Python `pydoc` examples can also use the special pydoc block syntax (no
1302 # double colon!).
1304 # Alternative:
1305 # use a special "code block" directive or a special "no code
1306 # block" directive.
1308 # * ignore "matching comments" in literal strings?
1310 # (would need a specific detection algorithm for every language that
1311 # supports multi-line literal strings (C++, PHP, Python))
1313 # * Warn if a comment in code will become text after round-trip?
1315 # code syntax highlight
1316 # ---------------------
1318 # use `listing` package in LaTeX->PDF
1320 # in html, see
1322 # * the syntax highlight support in rest2web
1323 # (uses the Moin-Moin Python colorizer, see a version at
1324 # http://www.standards-schmandards.com/2005/fangs-093/)
1325 # * Pygments (pure Python, many languages, rst integration recipe):
1326 # http://pygments.org/docs/rstdirective/
1327 # * Silvercity, enscript, ...
1329 # Some plug-ins require a special "code block" directive instead of the
1330 # `::`-literal block. TODO: make this an option
1332 # Ask at docutils users|developers
1334 # * How to handle docstrings in code blocks? (it would be nice to convert them
1335 # to rst-text if ``__docformat__ == restructuredtext``)