Clean up system message (source, line) reporting.
[docutils.git] / docutils / statemachine.py
blob4c33102ea6b726ad9183496c69689d37b0817615
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 A finite state machine specialized for regular-expression-based text filters,
7 this module defines the following classes:
9 - `StateMachine`, a state machine
10 - `State`, a state superclass
11 - `StateMachineWS`, a whitespace-sensitive version of `StateMachine`
12 - `StateWS`, a state superclass for use with `StateMachineWS`
13 - `SearchStateMachine`, uses `re.search()` instead of `re.match()`
14 - `SearchStateMachineWS`, uses `re.search()` instead of `re.match()`
15 - `ViewList`, extends standard Python lists.
16 - `StringList`, string-specific ViewList.
18 Exception classes:
20 - `StateMachineError`
21 - `UnknownStateError`
22 - `DuplicateStateError`
23 - `UnknownTransitionError`
24 - `DuplicateTransitionError`
25 - `TransitionPatternNotFound`
26 - `TransitionMethodNotFound`
27 - `UnexpectedIndentationError`
28 - `TransitionCorrection`: Raised to switch to another transition.
29 - `StateCorrection`: Raised to switch to another state & transition.
31 Functions:
33 - `string2lines()`: split a multi-line string into a list of one-line strings
36 How To Use This Module
37 ======================
38 (See the individual classes, methods, and attributes for details.)
40 1. Import it: ``import statemachine`` or ``from statemachine import ...``.
41 You will also need to ``import re``.
43 2. Derive a subclass of `State` (or `StateWS`) for each state in your state
44 machine::
46 class MyState(statemachine.State):
48 Within the state's class definition:
50 a) Include a pattern for each transition, in `State.patterns`::
52 patterns = {'atransition': r'pattern', ...}
54 b) Include a list of initial transitions to be set up automatically, in
55 `State.initial_transitions`::
57 initial_transitions = ['atransition', ...]
59 c) Define a method for each transition, with the same name as the
60 transition pattern::
62 def atransition(self, match, context, next_state):
63 # do something
64 result = [...] # a list
65 return context, next_state, result
66 # context, next_state may be altered
68 Transition methods may raise an `EOFError` to cut processing short.
70 d) You may wish to override the `State.bof()` and/or `State.eof()` implicit
71 transition methods, which handle the beginning- and end-of-file.
73 e) In order to handle nested processing, you may wish to override the
74 attributes `State.nested_sm` and/or `State.nested_sm_kwargs`.
76 If you are using `StateWS` as a base class, in order to handle nested
77 indented blocks, you may wish to:
79 - override the attributes `StateWS.indent_sm`,
80 `StateWS.indent_sm_kwargs`, `StateWS.known_indent_sm`, and/or
81 `StateWS.known_indent_sm_kwargs`;
82 - override the `StateWS.blank()` method; and/or
83 - override or extend the `StateWS.indent()`, `StateWS.known_indent()`,
84 and/or `StateWS.firstknown_indent()` methods.
86 3. Create a state machine object::
88 sm = StateMachine(state_classes=[MyState, ...],
89 initial_state='MyState')
91 4. Obtain the input text, which needs to be converted into a tab-free list of
92 one-line strings. For example, to read text from a file called
93 'inputfile'::
95 input_string = open('inputfile').read()
96 input_lines = statemachine.string2lines(input_string)
98 5. Run the state machine on the input text and collect the results, a list::
100 results = sm.run(input_lines)
102 6. Remove any lingering circular references::
104 sm.unlink()
107 __docformat__ = 'restructuredtext'
109 import sys
110 import re
111 import types
112 import unicodedata
113 from docutils import utils
114 from docutils.error_reporting import ErrorOutput
117 class StateMachine:
120 A finite state machine for text filters using regular expressions.
122 The input is provided in the form of a list of one-line strings (no
123 newlines). States are subclasses of the `State` class. Transitions consist
124 of regular expression patterns and transition methods, and are defined in
125 each state.
127 The state machine is started with the `run()` method, which returns the
128 results of processing in a list.
131 def __init__(self, state_classes, initial_state, debug=0):
133 Initialize a `StateMachine` object; add state objects.
135 Parameters:
137 - `state_classes`: a list of `State` (sub)classes.
138 - `initial_state`: a string, the class name of the initial state.
139 - `debug`: a boolean; produce verbose output if true (nonzero).
142 self.input_lines = None
143 """`StringList` of input lines (without newlines).
144 Filled by `self.run()`."""
146 self.input_offset = 0
147 """Offset of `self.input_lines` from the beginning of the file."""
149 self.line = None
150 """Current input line."""
152 self.line_offset = -1
153 """Current input line offset from beginning of `self.input_lines`."""
155 self.debug = debug
156 """Debugging mode on/off."""
158 self.initial_state = initial_state
159 """The name of the initial state (key to `self.states`)."""
161 self.current_state = initial_state
162 """The name of the current state (key to `self.states`)."""
164 self.states = {}
165 """Mapping of {state_name: State_object}."""
167 self.add_states(state_classes)
169 self.observers = []
170 """List of bound methods or functions to call whenever the current
171 line changes. Observers are called with one argument, ``self``.
172 Cleared at the end of `run()`."""
174 self._stderr = ErrorOutput()
175 """Wrapper around sys.stderr catching en-/decoding errors"""
178 def unlink(self):
179 """Remove circular references to objects no longer required."""
180 for state in self.states.values():
181 state.unlink()
182 self.states = None
184 def run(self, input_lines, input_offset=0, context=None,
185 input_source=None, initial_state=None):
187 Run the state machine on `input_lines`. Return results (a list).
189 Reset `self.line_offset` and `self.current_state`. Run the
190 beginning-of-file transition. Input one line at a time and check for a
191 matching transition. If a match is found, call the transition method
192 and possibly change the state. Store the context returned by the
193 transition method to be passed on to the next transition matched.
194 Accumulate the results returned by the transition methods in a list.
195 Run the end-of-file transition. Finally, return the accumulated
196 results.
198 Parameters:
200 - `input_lines`: a list of strings without newlines, or `StringList`.
201 - `input_offset`: the line offset of `input_lines` from the beginning
202 of the file.
203 - `context`: application-specific storage.
204 - `input_source`: name or path of source of `input_lines`.
205 - `initial_state`: name of initial state.
207 self.runtime_init()
208 if isinstance(input_lines, StringList):
209 self.input_lines = input_lines
210 else:
211 self.input_lines = StringList(input_lines, source=input_source)
212 self.input_offset = input_offset
213 self.line_offset = -1
214 self.current_state = initial_state or self.initial_state
215 if self.debug:
216 print >>self._stderr, (
217 u'\nStateMachine.run: input_lines (line_offset=%s):\n| %s'
218 % (self.line_offset, u'\n| '.join(self.input_lines)))
219 transitions = None
220 results = []
221 state = self.get_state()
222 try:
223 if self.debug:
224 print >>self._stderr, '\nStateMachine.run: bof transition'
225 context, result = state.bof(context)
226 results.extend(result)
227 while 1:
228 try:
229 try:
230 self.next_line()
231 if self.debug:
232 source, offset = self.input_lines.info(
233 self.line_offset)
234 print >>self._stderr, (
235 u'\nStateMachine.run: line (source=%r, '
236 u'offset=%r):\n| %s'
237 % (source, offset, self.line))
238 context, next_state, result = self.check_line(
239 context, state, transitions)
240 except EOFError:
241 if self.debug:
242 print >>self._stderr, (
243 '\nStateMachine.run: %s.eof transition'
244 % state.__class__.__name__)
245 result = state.eof(context)
246 results.extend(result)
247 break
248 else:
249 results.extend(result)
250 except TransitionCorrection, exception:
251 self.previous_line() # back up for another try
252 transitions = (exception.args[0],)
253 if self.debug:
254 print >>self._stderr, (
255 '\nStateMachine.run: TransitionCorrection to '
256 'state "%s", transition %s.'
257 % (state.__class__.__name__, transitions[0]))
258 continue
259 except StateCorrection, exception:
260 self.previous_line() # back up for another try
261 next_state = exception.args[0]
262 if len(exception.args) == 1:
263 transitions = None
264 else:
265 transitions = (exception.args[1],)
266 if self.debug:
267 print >>self._stderr, (
268 '\nStateMachine.run: StateCorrection to state '
269 '"%s", transition %s.'
270 % (next_state, transitions[0]))
271 else:
272 transitions = None
273 state = self.get_state(next_state)
274 except:
275 if self.debug:
276 self.error()
277 raise
278 self.observers = []
279 return results
281 def get_state(self, next_state=None):
283 Return current state object; set it first if `next_state` given.
285 Parameter `next_state`: a string, the name of the next state.
287 Exception: `UnknownStateError` raised if `next_state` unknown.
289 if next_state:
290 if self.debug and next_state != self.current_state:
291 print >>self._stderr, (
292 '\nStateMachine.get_state: Changing state from '
293 '"%s" to "%s" (input line %s).'
294 % (self.current_state, next_state,
295 self.abs_line_number()))
296 self.current_state = next_state
297 try:
298 return self.states[self.current_state]
299 except KeyError:
300 raise UnknownStateError(self.current_state)
302 def next_line(self, n=1):
303 """Load `self.line` with the `n`'th next line and return it."""
304 try:
305 try:
306 self.line_offset += n
307 self.line = self.input_lines[self.line_offset]
308 except IndexError:
309 self.line = None
310 raise EOFError
311 return self.line
312 finally:
313 self.notify_observers()
315 def is_next_line_blank(self):
316 """Return 1 if the next line is blank or non-existant."""
317 try:
318 return not self.input_lines[self.line_offset + 1].strip()
319 except IndexError:
320 return 1
322 def at_eof(self):
323 """Return 1 if the input is at or past end-of-file."""
324 return self.line_offset >= len(self.input_lines) - 1
326 def at_bof(self):
327 """Return 1 if the input is at or before beginning-of-file."""
328 return self.line_offset <= 0
330 def previous_line(self, n=1):
331 """Load `self.line` with the `n`'th previous line and return it."""
332 self.line_offset -= n
333 if self.line_offset < 0:
334 self.line = None
335 else:
336 self.line = self.input_lines[self.line_offset]
337 self.notify_observers()
338 return self.line
340 def goto_line(self, line_offset):
341 """Jump to absolute line offset `line_offset`, load and return it."""
342 try:
343 try:
344 self.line_offset = line_offset - self.input_offset
345 self.line = self.input_lines[self.line_offset]
346 except IndexError:
347 self.line = None
348 raise EOFError
349 return self.line
350 finally:
351 self.notify_observers()
353 def get_source(self, line_offset):
354 """Return source of line at absolute line offset `line_offset`."""
355 return self.input_lines.source(line_offset - self.input_offset)
357 def abs_line_offset(self):
358 """Return line offset of current line, from beginning of file."""
359 return self.line_offset + self.input_offset
361 def abs_line_number(self):
362 """Return line number of current line (counting from 1)."""
363 return self.line_offset + self.input_offset + 1
365 def get_source_and_line(self, lineno=None):
366 """Return (source, line) tuple for current or given line number.
368 Looks up the source and line number in the `self.input_lines`
369 StringList instance to count for included source files.
371 If the optional argument `lineno` is given, convert it from an
372 absolute line number to the corresponding (source, line) pair.
374 if lineno is None:
375 offset = self.line_offset
376 else:
377 offset = lineno - self.input_offset - 1
378 try:
379 src, srcoffset = self.input_lines.info(offset)
380 srcline = srcoffset + 1
381 except (TypeError):
382 # line is None if index is "Just past the end"
383 src, srcline = self.get_source_and_line(offset + self.input_offset)
384 return src, srcline + 1
385 except (IndexError): # `offset` is off the list
386 src, srcline = None, None
387 # raise AssertionError('cannot find line %d in %s lines' %
388 # (offset, len(self.input_lines)))
389 # # list(self.input_lines.lines())))
390 # assert offset == srcoffset, str(self.input_lines)
391 # print "get_source_and_line(%s):" % lineno,
392 # print offset + 1, '->', src, srcline
393 # print self.input_lines
394 return (src, srcline)
396 def insert_input(self, input_lines, source):
397 self.input_lines.insert(self.line_offset + 1, '',
398 source='internal padding after '+source,
399 offset=len(input_lines))
400 self.input_lines.insert(self.line_offset + 1, '',
401 source='internal padding before '+source,
402 offset=-1)
403 self.input_lines.insert(self.line_offset + 2,
404 StringList(input_lines, source))
406 def get_text_block(self, flush_left=0):
408 Return a contiguous block of text.
410 If `flush_left` is true, raise `UnexpectedIndentationError` if an
411 indented line is encountered before the text block ends (with a blank
412 line).
414 try:
415 block = self.input_lines.get_text_block(self.line_offset,
416 flush_left)
417 self.next_line(len(block) - 1)
418 return block
419 except UnexpectedIndentationError, err:
420 block = err.args[0]
421 self.next_line(len(block) - 1) # advance to last line of block
422 raise
424 def check_line(self, context, state, transitions=None):
426 Examine one line of input for a transition match & execute its method.
428 Parameters:
430 - `context`: application-dependent storage.
431 - `state`: a `State` object, the current state.
432 - `transitions`: an optional ordered list of transition names to try,
433 instead of ``state.transition_order``.
435 Return the values returned by the transition method:
437 - context: possibly modified from the parameter `context`;
438 - next state name (`State` subclass name);
439 - the result output of the transition, a list.
441 When there is no match, ``state.no_match()`` is called and its return
442 value is returned.
444 if transitions is None:
445 transitions = state.transition_order
446 state_correction = None
447 if self.debug:
448 print >>self._stderr, (
449 '\nStateMachine.check_line: state="%s", transitions=%r.'
450 % (state.__class__.__name__, transitions))
451 for name in transitions:
452 pattern, method, next_state = state.transitions[name]
453 match = pattern.match(self.line)
454 if match:
455 if self.debug:
456 print >>self._stderr, (
457 '\nStateMachine.check_line: Matched transition '
458 '"%s" in state "%s".'
459 % (name, state.__class__.__name__))
460 return method(match, context, next_state)
461 else:
462 if self.debug:
463 print >>self._stderr, (
464 '\nStateMachine.check_line: No match in state "%s".'
465 % state.__class__.__name__)
466 return state.no_match(context, transitions)
468 def add_state(self, state_class):
470 Initialize & add a `state_class` (`State` subclass) object.
472 Exception: `DuplicateStateError` raised if `state_class` was already
473 added.
475 statename = state_class.__name__
476 if statename in self.states:
477 raise DuplicateStateError(statename)
478 self.states[statename] = state_class(self, self.debug)
480 def add_states(self, state_classes):
482 Add `state_classes` (a list of `State` subclasses).
484 for state_class in state_classes:
485 self.add_state(state_class)
487 def runtime_init(self):
489 Initialize `self.states`.
491 for state in self.states.values():
492 state.runtime_init()
494 def error(self):
495 """Report error details."""
496 type, value, module, line, function = _exception_data()
497 print >>self._stderr, u'%s: %s' % (type, value)
498 print >>self._stderr, 'input line %s' % (self.abs_line_number())
499 print >>self._stderr, (u'module %s, line %s, function %s' %
500 (module, line, function))
502 def attach_observer(self, observer):
504 The `observer` parameter is a function or bound method which takes two
505 arguments, the source and offset of the current line.
507 self.observers.append(observer)
509 def detach_observer(self, observer):
510 self.observers.remove(observer)
512 def notify_observers(self):
513 for observer in self.observers:
514 try:
515 info = self.input_lines.info(self.line_offset)
516 except IndexError:
517 info = (None, None)
518 observer(*info)
521 class State:
524 State superclass. Contains a list of transitions, and transition methods.
526 Transition methods all have the same signature. They take 3 parameters:
528 - An `re` match object. ``match.string`` contains the matched input line,
529 ``match.start()`` gives the start index of the match, and
530 ``match.end()`` gives the end index.
531 - A context object, whose meaning is application-defined (initial value
532 ``None``). It can be used to store any information required by the state
533 machine, and the retured context is passed on to the next transition
534 method unchanged.
535 - The name of the next state, a string, taken from the transitions list;
536 normally it is returned unchanged, but it may be altered by the
537 transition method if necessary.
539 Transition methods all return a 3-tuple:
541 - A context object, as (potentially) modified by the transition method.
542 - The next state name (a return value of ``None`` means no state change).
543 - The processing result, a list, which is accumulated by the state
544 machine.
546 Transition methods may raise an `EOFError` to cut processing short.
548 There are two implicit transitions, and corresponding transition methods
549 are defined: `bof()` handles the beginning-of-file, and `eof()` handles
550 the end-of-file. These methods have non-standard signatures and return
551 values. `bof()` returns the initial context and results, and may be used
552 to return a header string, or do any other processing needed. `eof()`
553 should handle any remaining context and wrap things up; it returns the
554 final processing result.
556 Typical applications need only subclass `State` (or a subclass), set the
557 `patterns` and `initial_transitions` class attributes, and provide
558 corresponding transition methods. The default object initialization will
559 take care of constructing the list of transitions.
562 patterns = None
564 {Name: pattern} mapping, used by `make_transition()`. Each pattern may
565 be a string or a compiled `re` pattern. Override in subclasses.
568 initial_transitions = None
570 A list of transitions to initialize when a `State` is instantiated.
571 Each entry is either a transition name string, or a (transition name, next
572 state name) pair. See `make_transitions()`. Override in subclasses.
575 nested_sm = None
577 The `StateMachine` class for handling nested processing.
579 If left as ``None``, `nested_sm` defaults to the class of the state's
580 controlling state machine. Override it in subclasses to avoid the default.
583 nested_sm_kwargs = None
585 Keyword arguments dictionary, passed to the `nested_sm` constructor.
587 Two keys must have entries in the dictionary:
589 - Key 'state_classes' must be set to a list of `State` classes.
590 - Key 'initial_state' must be set to the name of the initial state class.
592 If `nested_sm_kwargs` is left as ``None``, 'state_classes' defaults to the
593 class of the current state, and 'initial_state' defaults to the name of
594 the class of the current state. Override in subclasses to avoid the
595 defaults.
598 def __init__(self, state_machine, debug=0):
600 Initialize a `State` object; make & add initial transitions.
602 Parameters:
604 - `statemachine`: the controlling `StateMachine` object.
605 - `debug`: a boolean; produce verbose output if true (nonzero).
608 self.transition_order = []
609 """A list of transition names in search order."""
611 self.transitions = {}
613 A mapping of transition names to 3-tuples containing
614 (compiled_pattern, transition_method, next_state_name). Initialized as
615 an instance attribute dynamically (instead of as a class attribute)
616 because it may make forward references to patterns and methods in this
617 or other classes.
620 self.add_initial_transitions()
622 self.state_machine = state_machine
623 """A reference to the controlling `StateMachine` object."""
625 self.debug = debug
626 """Debugging mode on/off."""
628 if self.nested_sm is None:
629 self.nested_sm = self.state_machine.__class__
630 if self.nested_sm_kwargs is None:
631 self.nested_sm_kwargs = {'state_classes': [self.__class__],
632 'initial_state': self.__class__.__name__}
634 def runtime_init(self):
636 Initialize this `State` before running the state machine; called from
637 `self.state_machine.run()`.
639 pass
641 def unlink(self):
642 """Remove circular references to objects no longer required."""
643 self.state_machine = None
645 def add_initial_transitions(self):
646 """Make and add transitions listed in `self.initial_transitions`."""
647 if self.initial_transitions:
648 names, transitions = self.make_transitions(
649 self.initial_transitions)
650 self.add_transitions(names, transitions)
652 def add_transitions(self, names, transitions):
654 Add a list of transitions to the start of the transition list.
656 Parameters:
658 - `names`: a list of transition names.
659 - `transitions`: a mapping of names to transition tuples.
661 Exceptions: `DuplicateTransitionError`, `UnknownTransitionError`.
663 for name in names:
664 if name in self.transitions:
665 raise DuplicateTransitionError(name)
666 if name not in transitions:
667 raise UnknownTransitionError(name)
668 self.transition_order[:0] = names
669 self.transitions.update(transitions)
671 def add_transition(self, name, transition):
673 Add a transition to the start of the transition list.
675 Parameter `transition`: a ready-made transition 3-tuple.
677 Exception: `DuplicateTransitionError`.
679 if name in self.transitions:
680 raise DuplicateTransitionError(name)
681 self.transition_order[:0] = [name]
682 self.transitions[name] = transition
684 def remove_transition(self, name):
686 Remove a transition by `name`.
688 Exception: `UnknownTransitionError`.
690 try:
691 del self.transitions[name]
692 self.transition_order.remove(name)
693 except:
694 raise UnknownTransitionError(name)
696 def make_transition(self, name, next_state=None):
698 Make & return a transition tuple based on `name`.
700 This is a convenience function to simplify transition creation.
702 Parameters:
704 - `name`: a string, the name of the transition pattern & method. This
705 `State` object must have a method called '`name`', and a dictionary
706 `self.patterns` containing a key '`name`'.
707 - `next_state`: a string, the name of the next `State` object for this
708 transition. A value of ``None`` (or absent) implies no state change
709 (i.e., continue with the same state).
711 Exceptions: `TransitionPatternNotFound`, `TransitionMethodNotFound`.
713 if next_state is None:
714 next_state = self.__class__.__name__
715 try:
716 pattern = self.patterns[name]
717 if not hasattr(pattern, 'match'):
718 pattern = re.compile(pattern)
719 except KeyError:
720 raise TransitionPatternNotFound(
721 '%s.patterns[%r]' % (self.__class__.__name__, name))
722 try:
723 method = getattr(self, name)
724 except AttributeError:
725 raise TransitionMethodNotFound(
726 '%s.%s' % (self.__class__.__name__, name))
727 return (pattern, method, next_state)
729 def make_transitions(self, name_list):
731 Return a list of transition names and a transition mapping.
733 Parameter `name_list`: a list, where each entry is either a transition
734 name string, or a 1- or 2-tuple (transition name, optional next state
735 name).
737 stringtype = type('')
738 names = []
739 transitions = {}
740 for namestate in name_list:
741 if type(namestate) is stringtype:
742 transitions[namestate] = self.make_transition(namestate)
743 names.append(namestate)
744 else:
745 transitions[namestate[0]] = self.make_transition(*namestate)
746 names.append(namestate[0])
747 return names, transitions
749 def no_match(self, context, transitions):
751 Called when there is no match from `StateMachine.check_line()`.
753 Return the same values returned by transition methods:
755 - context: unchanged;
756 - next state name: ``None``;
757 - empty result list.
759 Override in subclasses to catch this event.
761 return context, None, []
763 def bof(self, context):
765 Handle beginning-of-file. Return unchanged `context`, empty result.
767 Override in subclasses.
769 Parameter `context`: application-defined storage.
771 return context, []
773 def eof(self, context):
775 Handle end-of-file. Return empty result.
777 Override in subclasses.
779 Parameter `context`: application-defined storage.
781 return []
783 def nop(self, match, context, next_state):
785 A "do nothing" transition method.
787 Return unchanged `context` & `next_state`, empty result. Useful for
788 simple state changes (actionless transitions).
790 return context, next_state, []
793 class StateMachineWS(StateMachine):
796 `StateMachine` subclass specialized for whitespace recognition.
798 There are three methods provided for extracting indented text blocks:
800 - `get_indented()`: use when the indent is unknown.
801 - `get_known_indented()`: use when the indent is known for all lines.
802 - `get_first_known_indented()`: use when only the first line's indent is
803 known.
806 def get_indented(self, until_blank=0, strip_indent=1):
808 Return a block of indented lines of text, and info.
810 Extract an indented block where the indent is unknown for all lines.
812 :Parameters:
813 - `until_blank`: Stop collecting at the first blank line if true
814 (1).
815 - `strip_indent`: Strip common leading indent if true (1,
816 default).
818 :Return:
819 - the indented block (a list of lines of text),
820 - its indent,
821 - its first line offset from BOF, and
822 - whether or not it finished with a blank line.
824 offset = self.abs_line_offset()
825 indented, indent, blank_finish = self.input_lines.get_indented(
826 self.line_offset, until_blank, strip_indent)
827 if indented:
828 self.next_line(len(indented) - 1) # advance to last indented line
829 while indented and not indented[0].strip():
830 indented.trim_start()
831 offset += 1
832 return indented, indent, offset, blank_finish
834 def get_known_indented(self, indent, until_blank=0, strip_indent=1):
836 Return an indented block and info.
838 Extract an indented block where the indent is known for all lines.
839 Starting with the current line, extract the entire text block with at
840 least `indent` indentation (which must be whitespace, except for the
841 first line).
843 :Parameters:
844 - `indent`: The number of indent columns/characters.
845 - `until_blank`: Stop collecting at the first blank line if true
846 (1).
847 - `strip_indent`: Strip `indent` characters of indentation if true
848 (1, default).
850 :Return:
851 - the indented block,
852 - its first line offset from BOF, and
853 - whether or not it finished with a blank line.
855 offset = self.abs_line_offset()
856 indented, indent, blank_finish = self.input_lines.get_indented(
857 self.line_offset, until_blank, strip_indent,
858 block_indent=indent)
859 self.next_line(len(indented) - 1) # advance to last indented line
860 while indented and not indented[0].strip():
861 indented.trim_start()
862 offset += 1
863 return indented, offset, blank_finish
865 def get_first_known_indented(self, indent, until_blank=0, strip_indent=1,
866 strip_top=1):
868 Return an indented block and info.
870 Extract an indented block where the indent is known for the first line
871 and unknown for all other lines.
873 :Parameters:
874 - `indent`: The first line's indent (# of columns/characters).
875 - `until_blank`: Stop collecting at the first blank line if true
876 (1).
877 - `strip_indent`: Strip `indent` characters of indentation if true
878 (1, default).
879 - `strip_top`: Strip blank lines from the beginning of the block.
881 :Return:
882 - the indented block,
883 - its indent,
884 - its first line offset from BOF, and
885 - whether or not it finished with a blank line.
887 offset = self.abs_line_offset()
888 indented, indent, blank_finish = self.input_lines.get_indented(
889 self.line_offset, until_blank, strip_indent,
890 first_indent=indent)
891 self.next_line(len(indented) - 1) # advance to last indented line
892 if strip_top:
893 while indented and not indented[0].strip():
894 indented.trim_start()
895 offset += 1
896 return indented, indent, offset, blank_finish
899 class StateWS(State):
902 State superclass specialized for whitespace (blank lines & indents).
904 Use this class with `StateMachineWS`. The transitions 'blank' (for blank
905 lines) and 'indent' (for indented text blocks) are added automatically,
906 before any other transitions. The transition method `blank()` handles
907 blank lines and `indent()` handles nested indented blocks. Indented
908 blocks trigger a new state machine to be created by `indent()` and run.
909 The class of the state machine to be created is in `indent_sm`, and the
910 constructor keyword arguments are in the dictionary `indent_sm_kwargs`.
912 The methods `known_indent()` and `firstknown_indent()` are provided for
913 indented blocks where the indent (all lines' and first line's only,
914 respectively) is known to the transition method, along with the attributes
915 `known_indent_sm` and `known_indent_sm_kwargs`. Neither transition method
916 is triggered automatically.
919 indent_sm = None
921 The `StateMachine` class handling indented text blocks.
923 If left as ``None``, `indent_sm` defaults to the value of
924 `State.nested_sm`. Override it in subclasses to avoid the default.
927 indent_sm_kwargs = None
929 Keyword arguments dictionary, passed to the `indent_sm` constructor.
931 If left as ``None``, `indent_sm_kwargs` defaults to the value of
932 `State.nested_sm_kwargs`. Override it in subclasses to avoid the default.
935 known_indent_sm = None
937 The `StateMachine` class handling known-indented text blocks.
939 If left as ``None``, `known_indent_sm` defaults to the value of
940 `indent_sm`. Override it in subclasses to avoid the default.
943 known_indent_sm_kwargs = None
945 Keyword arguments dictionary, passed to the `known_indent_sm` constructor.
947 If left as ``None``, `known_indent_sm_kwargs` defaults to the value of
948 `indent_sm_kwargs`. Override it in subclasses to avoid the default.
951 ws_patterns = {'blank': ' *$',
952 'indent': ' +'}
953 """Patterns for default whitespace transitions. May be overridden in
954 subclasses."""
956 ws_initial_transitions = ('blank', 'indent')
957 """Default initial whitespace transitions, added before those listed in
958 `State.initial_transitions`. May be overridden in subclasses."""
960 def __init__(self, state_machine, debug=0):
962 Initialize a `StateSM` object; extends `State.__init__()`.
964 Check for indent state machine attributes, set defaults if not set.
966 State.__init__(self, state_machine, debug)
967 if self.indent_sm is None:
968 self.indent_sm = self.nested_sm
969 if self.indent_sm_kwargs is None:
970 self.indent_sm_kwargs = self.nested_sm_kwargs
971 if self.known_indent_sm is None:
972 self.known_indent_sm = self.indent_sm
973 if self.known_indent_sm_kwargs is None:
974 self.known_indent_sm_kwargs = self.indent_sm_kwargs
976 def add_initial_transitions(self):
978 Add whitespace-specific transitions before those defined in subclass.
980 Extends `State.add_initial_transitions()`.
982 State.add_initial_transitions(self)
983 if self.patterns is None:
984 self.patterns = {}
985 self.patterns.update(self.ws_patterns)
986 names, transitions = self.make_transitions(
987 self.ws_initial_transitions)
988 self.add_transitions(names, transitions)
990 def blank(self, match, context, next_state):
991 """Handle blank lines. Does nothing. Override in subclasses."""
992 return self.nop(match, context, next_state)
994 def indent(self, match, context, next_state):
996 Handle an indented text block. Extend or override in subclasses.
998 Recursively run the registered state machine for indented blocks
999 (`self.indent_sm`).
1001 indented, indent, line_offset, blank_finish = \
1002 self.state_machine.get_indented()
1003 sm = self.indent_sm(debug=self.debug, **self.indent_sm_kwargs)
1004 results = sm.run(indented, input_offset=line_offset)
1005 return context, next_state, results
1007 def known_indent(self, match, context, next_state):
1009 Handle a known-indent text block. Extend or override in subclasses.
1011 Recursively run the registered state machine for known-indent indented
1012 blocks (`self.known_indent_sm`). The indent is the length of the
1013 match, ``match.end()``.
1015 indented, line_offset, blank_finish = \
1016 self.state_machine.get_known_indented(match.end())
1017 sm = self.known_indent_sm(debug=self.debug,
1018 **self.known_indent_sm_kwargs)
1019 results = sm.run(indented, input_offset=line_offset)
1020 return context, next_state, results
1022 def first_known_indent(self, match, context, next_state):
1024 Handle an indented text block (first line's indent known).
1026 Extend or override in subclasses.
1028 Recursively run the registered state machine for known-indent indented
1029 blocks (`self.known_indent_sm`). The indent is the length of the
1030 match, ``match.end()``.
1032 indented, line_offset, blank_finish = \
1033 self.state_machine.get_first_known_indented(match.end())
1034 sm = self.known_indent_sm(debug=self.debug,
1035 **self.known_indent_sm_kwargs)
1036 results = sm.run(indented, input_offset=line_offset)
1037 return context, next_state, results
1040 class _SearchOverride:
1043 Mix-in class to override `StateMachine` regular expression behavior.
1045 Changes regular expression matching, from the default `re.match()`
1046 (succeeds only if the pattern matches at the start of `self.line`) to
1047 `re.search()` (succeeds if the pattern matches anywhere in `self.line`).
1048 When subclassing a `StateMachine`, list this class **first** in the
1049 inheritance list of the class definition.
1052 def match(self, pattern):
1054 Return the result of a regular expression search.
1056 Overrides `StateMachine.match()`.
1058 Parameter `pattern`: `re` compiled regular expression.
1060 return pattern.search(self.line)
1063 class SearchStateMachine(_SearchOverride, StateMachine):
1064 """`StateMachine` which uses `re.search()` instead of `re.match()`."""
1065 pass
1068 class SearchStateMachineWS(_SearchOverride, StateMachineWS):
1069 """`StateMachineWS` which uses `re.search()` instead of `re.match()`."""
1070 pass
1073 class ViewList:
1076 List with extended functionality: slices of ViewList objects are child
1077 lists, linked to their parents. Changes made to a child list also affect
1078 the parent list. A child list is effectively a "view" (in the SQL sense)
1079 of the parent list. Changes to parent lists, however, do *not* affect
1080 active child lists. If a parent list is changed, any active child lists
1081 should be recreated.
1083 The start and end of the slice can be trimmed using the `trim_start()` and
1084 `trim_end()` methods, without affecting the parent list. The link between
1085 child and parent lists can be broken by calling `disconnect()` on the
1086 child list.
1088 Also, ViewList objects keep track of the source & offset of each item.
1089 This information is accessible via the `source()`, `offset()`, and
1090 `info()` methods.
1093 def __init__(self, initlist=None, source=None, items=None,
1094 parent=None, parent_offset=None):
1095 self.data = []
1096 """The actual list of data, flattened from various sources."""
1098 self.items = []
1099 """A list of (source, offset) pairs, same length as `self.data`: the
1100 source of each line and the offset of each line from the beginning of
1101 its source."""
1103 self.parent = parent
1104 """The parent list."""
1106 self.parent_offset = parent_offset
1107 """Offset of this list from the beginning of the parent list."""
1109 if isinstance(initlist, ViewList):
1110 self.data = initlist.data[:]
1111 self.items = initlist.items[:]
1112 elif initlist is not None:
1113 self.data = list(initlist)
1114 if items:
1115 self.items = items
1116 else:
1117 self.items = [(source, i) for i in range(len(initlist))]
1118 assert len(self.data) == len(self.items), 'data mismatch'
1120 def __str__(self):
1121 return str(self.data)
1123 def __repr__(self):
1124 return '%s(%s, items=%s)' % (self.__class__.__name__,
1125 self.data, self.items)
1127 def __lt__(self, other): return self.data < self.__cast(other)
1128 def __le__(self, other): return self.data <= self.__cast(other)
1129 def __eq__(self, other): return self.data == self.__cast(other)
1130 def __ne__(self, other): return self.data != self.__cast(other)
1131 def __gt__(self, other): return self.data > self.__cast(other)
1132 def __ge__(self, other): return self.data >= self.__cast(other)
1133 def __cmp__(self, other): return cmp(self.data, self.__cast(other))
1135 def __cast(self, other):
1136 if isinstance(other, ViewList):
1137 return other.data
1138 else:
1139 return other
1141 def __contains__(self, item): return item in self.data
1142 def __len__(self): return len(self.data)
1144 # The __getitem__()/__setitem__() methods check whether the index
1145 # is a slice first, since indexing a native list with a slice object
1146 # just works.
1148 def __getitem__(self, i):
1149 if isinstance(i, types.SliceType):
1150 assert i.step in (None, 1), 'cannot handle slice with stride'
1151 return self.__class__(self.data[i.start:i.stop],
1152 items=self.items[i.start:i.stop],
1153 parent=self, parent_offset=i.start or 0)
1154 else:
1155 return self.data[i]
1157 def __setitem__(self, i, item):
1158 if isinstance(i, types.SliceType):
1159 assert i.step in (None, 1), 'cannot handle slice with stride'
1160 if not isinstance(item, ViewList):
1161 raise TypeError('assigning non-ViewList to ViewList slice')
1162 self.data[i.start:i.stop] = item.data
1163 self.items[i.start:i.stop] = item.items
1164 assert len(self.data) == len(self.items), 'data mismatch'
1165 if self.parent:
1166 self.parent[(i.start or 0) + self.parent_offset
1167 : (i.stop or len(self)) + self.parent_offset] = item
1168 else:
1169 self.data[i] = item
1170 if self.parent:
1171 self.parent[i + self.parent_offset] = item
1173 def __delitem__(self, i):
1174 try:
1175 del self.data[i]
1176 del self.items[i]
1177 if self.parent:
1178 del self.parent[i + self.parent_offset]
1179 except TypeError:
1180 assert i.step is None, 'cannot handle slice with stride'
1181 del self.data[i.start:i.stop]
1182 del self.items[i.start:i.stop]
1183 if self.parent:
1184 del self.parent[(i.start or 0) + self.parent_offset
1185 : (i.stop or len(self)) + self.parent_offset]
1187 def __add__(self, other):
1188 if isinstance(other, ViewList):
1189 return self.__class__(self.data + other.data,
1190 items=(self.items + other.items))
1191 else:
1192 raise TypeError('adding non-ViewList to a ViewList')
1194 def __radd__(self, other):
1195 if isinstance(other, ViewList):
1196 return self.__class__(other.data + self.data,
1197 items=(other.items + self.items))
1198 else:
1199 raise TypeError('adding ViewList to a non-ViewList')
1201 def __iadd__(self, other):
1202 if isinstance(other, ViewList):
1203 self.data += other.data
1204 else:
1205 raise TypeError('argument to += must be a ViewList')
1206 return self
1208 def __mul__(self, n):
1209 return self.__class__(self.data * n, items=(self.items * n))
1211 __rmul__ = __mul__
1213 def __imul__(self, n):
1214 self.data *= n
1215 self.items *= n
1216 return self
1218 def extend(self, other):
1219 if not isinstance(other, ViewList):
1220 raise TypeError('extending a ViewList with a non-ViewList')
1221 if self.parent:
1222 self.parent.insert(len(self.data) + self.parent_offset, other)
1223 self.data.extend(other.data)
1224 self.items.extend(other.items)
1226 def append(self, item, source=None, offset=0):
1227 if source is None:
1228 self.extend(item)
1229 else:
1230 if self.parent:
1231 self.parent.insert(len(self.data) + self.parent_offset, item,
1232 source, offset)
1233 self.data.append(item)
1234 self.items.append((source, offset))
1236 def insert(self, i, item, source=None, offset=0):
1237 if source is None:
1238 if not isinstance(item, ViewList):
1239 raise TypeError('inserting non-ViewList with no source given')
1240 self.data[i:i] = item.data
1241 self.items[i:i] = item.items
1242 if self.parent:
1243 index = (len(self.data) + i) % len(self.data)
1244 self.parent.insert(index + self.parent_offset, item)
1245 else:
1246 self.data.insert(i, item)
1247 self.items.insert(i, (source, offset))
1248 if self.parent:
1249 index = (len(self.data) + i) % len(self.data)
1250 self.parent.insert(index + self.parent_offset, item,
1251 source, offset)
1253 def pop(self, i=-1):
1254 if self.parent:
1255 index = (len(self.data) + i) % len(self.data)
1256 self.parent.pop(index + self.parent_offset)
1257 self.items.pop(i)
1258 return self.data.pop(i)
1260 def trim_start(self, n=1):
1262 Remove items from the start of the list, without touching the parent.
1264 if n > len(self.data):
1265 raise IndexError("Size of trim too large; can't trim %s items "
1266 "from a list of size %s." % (n, len(self.data)))
1267 elif n < 0:
1268 raise IndexError('Trim size must be >= 0.')
1269 del self.data[:n]
1270 del self.items[:n]
1271 if self.parent:
1272 self.parent_offset += n
1274 def trim_end(self, n=1):
1276 Remove items from the end of the list, without touching the parent.
1278 if n > len(self.data):
1279 raise IndexError("Size of trim too large; can't trim %s items "
1280 "from a list of size %s." % (n, len(self.data)))
1281 elif n < 0:
1282 raise IndexError('Trim size must be >= 0.')
1283 del self.data[-n:]
1284 del self.items[-n:]
1286 def remove(self, item):
1287 index = self.index(item)
1288 del self[index]
1290 def count(self, item): return self.data.count(item)
1291 def index(self, item): return self.data.index(item)
1293 def reverse(self):
1294 self.data.reverse()
1295 self.items.reverse()
1296 self.parent = None
1298 def sort(self, *args):
1299 tmp = zip(self.data, self.items)
1300 tmp.sort(*args)
1301 self.data = [entry[0] for entry in tmp]
1302 self.items = [entry[1] for entry in tmp]
1303 self.parent = None
1305 def info(self, i):
1306 """Return source & offset for index `i`."""
1307 try:
1308 return self.items[i]
1309 except IndexError:
1310 if i == len(self.data): # Just past the end
1311 return self.items[i - 1][0], None
1312 else:
1313 raise
1315 def source(self, i):
1316 """Return source for index `i`."""
1317 return self.info(i)[0]
1319 def offset(self, i):
1320 """Return offset for index `i`."""
1321 return self.info(i)[1]
1323 def disconnect(self):
1324 """Break link between this list and parent list."""
1325 self.parent = None
1327 def xitems(self):
1328 """Return iterator yielding (source, offset, value) tuples."""
1329 for (value, (source, offset)) in zip(self.data, self.items):
1330 yield (source, offset, value)
1332 def pprint(self):
1333 """Print the list in `grep` format (`source:offset:value` lines)"""
1334 for line in self.xitems():
1335 print "%s:%d:%s" % line
1338 class StringList(ViewList):
1340 """A `ViewList` with string-specific methods."""
1342 def trim_left(self, length, start=0, end=sys.maxint):
1344 Trim `length` characters off the beginning of each item, in-place,
1345 from index `start` to `end`. No whitespace-checking is done on the
1346 trimmed text. Does not affect slice parent.
1348 self.data[start:end] = [line[length:]
1349 for line in self.data[start:end]]
1351 def get_text_block(self, start, flush_left=0):
1353 Return a contiguous block of text.
1355 If `flush_left` is true, raise `UnexpectedIndentationError` if an
1356 indented line is encountered before the text block ends (with a blank
1357 line).
1359 end = start
1360 last = len(self.data)
1361 while end < last:
1362 line = self.data[end]
1363 if not line.strip():
1364 break
1365 if flush_left and (line[0] == ' '):
1366 source, offset = self.info(end)
1367 raise UnexpectedIndentationError(self[start:end], source,
1368 offset + 1)
1369 end += 1
1370 return self[start:end]
1372 def get_indented(self, start=0, until_blank=0, strip_indent=1,
1373 block_indent=None, first_indent=None):
1375 Extract and return a StringList of indented lines of text.
1377 Collect all lines with indentation, determine the minimum indentation,
1378 remove the minimum indentation from all indented lines (unless
1379 `strip_indent` is false), and return them. All lines up to but not
1380 including the first unindented line will be returned.
1382 :Parameters:
1383 - `start`: The index of the first line to examine.
1384 - `until_blank`: Stop collecting at the first blank line if true.
1385 - `strip_indent`: Strip common leading indent if true (default).
1386 - `block_indent`: The indent of the entire block, if known.
1387 - `first_indent`: The indent of the first line, if known.
1389 :Return:
1390 - a StringList of indented lines with mininum indent removed;
1391 - the amount of the indent;
1392 - a boolean: did the indented block finish with a blank line or EOF?
1394 indent = block_indent # start with None if unknown
1395 end = start
1396 if block_indent is not None and first_indent is None:
1397 first_indent = block_indent
1398 if first_indent is not None:
1399 end += 1
1400 last = len(self.data)
1401 while end < last:
1402 line = self.data[end]
1403 if line and (line[0] != ' '
1404 or (block_indent is not None
1405 and line[:block_indent].strip())):
1406 # Line not indented or insufficiently indented.
1407 # Block finished properly iff the last indented line blank:
1408 blank_finish = ((end > start)
1409 and not self.data[end - 1].strip())
1410 break
1411 stripped = line.lstrip()
1412 if not stripped: # blank line
1413 if until_blank:
1414 blank_finish = 1
1415 break
1416 elif block_indent is None:
1417 line_indent = len(line) - len(stripped)
1418 if indent is None:
1419 indent = line_indent
1420 else:
1421 indent = min(indent, line_indent)
1422 end += 1
1423 else:
1424 blank_finish = 1 # block ends at end of lines
1425 block = self[start:end]
1426 if first_indent is not None and block:
1427 block.data[0] = block.data[0][first_indent:]
1428 if indent and strip_indent:
1429 block.trim_left(indent, start=(first_indent is not None))
1430 return block, indent or 0, blank_finish
1432 def get_2D_block(self, top, left, bottom, right, strip_indent=1):
1433 block = self[top:bottom]
1434 indent = right
1435 for i in range(len(block.data)):
1436 # get slice from line, care for combining characters
1437 ci = utils.column_indices(block.data[i])
1438 try:
1439 left = ci[left]
1440 except IndexError:
1441 left += len(block.data[i]) - len(ci)
1442 try:
1443 right = ci[right]
1444 except IndexError:
1445 right += len(block.data[i]) - len(ci)
1446 block.data[i] = line = block.data[i][left:right].rstrip()
1447 if line:
1448 indent = min(indent, len(line) - len(line.lstrip()))
1449 if strip_indent and 0 < indent < right:
1450 block.data = [line[indent:] for line in block.data]
1451 return block
1453 def pad_double_width(self, pad_char):
1455 Pad all double-width characters in self by appending `pad_char` to each.
1456 For East Asian language support.
1458 if hasattr(unicodedata, 'east_asian_width'):
1459 east_asian_width = unicodedata.east_asian_width
1460 else:
1461 return # new in Python 2.4
1462 for i in range(len(self.data)):
1463 line = self.data[i]
1464 if isinstance(line, unicode):
1465 new = []
1466 for char in line:
1467 new.append(char)
1468 if east_asian_width(char) in 'WF': # 'W'ide & 'F'ull-width
1469 new.append(pad_char)
1470 self.data[i] = ''.join(new)
1472 def replace(self, old, new):
1473 """Replace all occurrences of substring `old` with `new`."""
1474 for i in range(len(self.data)):
1475 self.data[i] = self.data[i].replace(old, new)
1478 class StateMachineError(Exception): pass
1479 class UnknownStateError(StateMachineError): pass
1480 class DuplicateStateError(StateMachineError): pass
1481 class UnknownTransitionError(StateMachineError): pass
1482 class DuplicateTransitionError(StateMachineError): pass
1483 class TransitionPatternNotFound(StateMachineError): pass
1484 class TransitionMethodNotFound(StateMachineError): pass
1485 class UnexpectedIndentationError(StateMachineError): pass
1488 class TransitionCorrection(Exception):
1491 Raise from within a transition method to switch to another transition.
1493 Raise with one argument, the new transition name.
1497 class StateCorrection(Exception):
1500 Raise from within a transition method to switch to another state.
1502 Raise with one or two arguments: new state name, and an optional new
1503 transition name.
1507 def string2lines(astring, tab_width=8, convert_whitespace=0,
1508 whitespace=re.compile('[\v\f]')):
1510 Return a list of one-line strings with tabs expanded, no newlines, and
1511 trailing whitespace stripped.
1513 Each tab is expanded with between 1 and `tab_width` spaces, so that the
1514 next character's index becomes a multiple of `tab_width` (8 by default).
1516 Parameters:
1518 - `astring`: a multi-line string.
1519 - `tab_width`: the number of columns between tab stops.
1520 - `convert_whitespace`: convert form feeds and vertical tabs to spaces?
1522 if convert_whitespace:
1523 astring = whitespace.sub(' ', astring)
1524 return [s.expandtabs(tab_width).rstrip() for s in astring.splitlines()]
1526 def _exception_data():
1528 Return exception information:
1530 - the exception's class name;
1531 - the exception object;
1532 - the name of the file containing the offending code;
1533 - the line number of the offending code;
1534 - the function name of the offending code.
1536 type, value, traceback = sys.exc_info()
1537 while traceback.tb_next:
1538 traceback = traceback.tb_next
1539 code = traceback.tb_frame.f_code
1540 return (type.__name__, value, code.co_filename, traceback.tb_lineno,
1541 code.co_name)