Spelling fixes
[docutils.git] / docutils / statemachine.py
blob6a29b9747b533f559f13f1b701cff671a0feb202
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 A finite state machine specialized for regular-expression-based text filters,
7 this module defines the following classes:
9 - `StateMachine`, a state machine
10 - `State`, a state superclass
11 - `StateMachineWS`, a whitespace-sensitive version of `StateMachine`
12 - `StateWS`, a state superclass for use with `StateMachineWS`
13 - `SearchStateMachine`, uses `re.search()` instead of `re.match()`
14 - `SearchStateMachineWS`, uses `re.search()` instead of `re.match()`
15 - `ViewList`, extends standard Python lists.
16 - `StringList`, string-specific ViewList.
18 Exception classes:
20 - `StateMachineError`
21 - `UnknownStateError`
22 - `DuplicateStateError`
23 - `UnknownTransitionError`
24 - `DuplicateTransitionError`
25 - `TransitionPatternNotFound`
26 - `TransitionMethodNotFound`
27 - `UnexpectedIndentationError`
28 - `TransitionCorrection`: Raised to switch to another transition.
29 - `StateCorrection`: Raised to switch to another state & transition.
31 Functions:
33 - `string2lines()`: split a multi-line string into a list of one-line strings
36 How To Use This Module
37 ======================
38 (See the individual classes, methods, and attributes for details.)
40 1. Import it: ``import statemachine`` or ``from statemachine import ...``.
41 You will also need to ``import re``.
43 2. Derive a subclass of `State` (or `StateWS`) for each state in your state
44 machine::
46 class MyState(statemachine.State):
48 Within the state's class definition:
50 a) Include a pattern for each transition, in `State.patterns`::
52 patterns = {'atransition': r'pattern', ...}
54 b) Include a list of initial transitions to be set up automatically, in
55 `State.initial_transitions`::
57 initial_transitions = ['atransition', ...]
59 c) Define a method for each transition, with the same name as the
60 transition pattern::
62 def atransition(self, match, context, next_state):
63 # do something
64 result = [...] # a list
65 return context, next_state, result
66 # context, next_state may be altered
68 Transition methods may raise an `EOFError` to cut processing short.
70 d) You may wish to override the `State.bof()` and/or `State.eof()` implicit
71 transition methods, which handle the beginning- and end-of-file.
73 e) In order to handle nested processing, you may wish to override the
74 attributes `State.nested_sm` and/or `State.nested_sm_kwargs`.
76 If you are using `StateWS` as a base class, in order to handle nested
77 indented blocks, you may wish to:
79 - override the attributes `StateWS.indent_sm`,
80 `StateWS.indent_sm_kwargs`, `StateWS.known_indent_sm`, and/or
81 `StateWS.known_indent_sm_kwargs`;
82 - override the `StateWS.blank()` method; and/or
83 - override or extend the `StateWS.indent()`, `StateWS.known_indent()`,
84 and/or `StateWS.firstknown_indent()` methods.
86 3. Create a state machine object::
88 sm = StateMachine(state_classes=[MyState, ...],
89 initial_state='MyState')
91 4. Obtain the input text, which needs to be converted into a tab-free list of
92 one-line strings. For example, to read text from a file called
93 'inputfile'::
95 input_string = open('inputfile').read()
96 input_lines = statemachine.string2lines(input_string)
98 5. Run the state machine on the input text and collect the results, a list::
100 results = sm.run(input_lines)
102 6. Remove any lingering circular references::
104 sm.unlink()
107 __docformat__ = 'restructuredtext'
109 import sys
110 import re
111 import types
112 import unicodedata
113 from docutils import utils
114 from docutils.utils.error_reporting import ErrorOutput
117 class StateMachine:
120 A finite state machine for text filters using regular expressions.
122 The input is provided in the form of a list of one-line strings (no
123 newlines). States are subclasses of the `State` class. Transitions consist
124 of regular expression patterns and transition methods, and are defined in
125 each state.
127 The state machine is started with the `run()` method, which returns the
128 results of processing in a list.
131 def __init__(self, state_classes, initial_state, debug=False):
133 Initialize a `StateMachine` object; add state objects.
135 Parameters:
137 - `state_classes`: a list of `State` (sub)classes.
138 - `initial_state`: a string, the class name of the initial state.
139 - `debug`: a boolean; produce verbose output if true (nonzero).
142 self.input_lines = None
143 """`StringList` of input lines (without newlines).
144 Filled by `self.run()`."""
146 self.input_offset = 0
147 """Offset of `self.input_lines` from the beginning of the file."""
149 self.line = None
150 """Current input line."""
152 self.line_offset = -1
153 """Current input line offset from beginning of `self.input_lines`."""
155 self.debug = debug
156 """Debugging mode on/off."""
158 self.initial_state = initial_state
159 """The name of the initial state (key to `self.states`)."""
161 self.current_state = initial_state
162 """The name of the current state (key to `self.states`)."""
164 self.states = {}
165 """Mapping of {state_name: State_object}."""
167 self.add_states(state_classes)
169 self.observers = []
170 """List of bound methods or functions to call whenever the current
171 line changes. Observers are called with one argument, ``self``.
172 Cleared at the end of `run()`."""
174 self._stderr = ErrorOutput()
175 """Wrapper around sys.stderr catching en-/decoding errors"""
178 def unlink(self):
179 """Remove circular references to objects no longer required."""
180 for state in self.states.values():
181 state.unlink()
182 self.states = None
184 def run(self, input_lines, input_offset=0, context=None,
185 input_source=None, initial_state=None):
187 Run the state machine on `input_lines`. Return results (a list).
189 Reset `self.line_offset` and `self.current_state`. Run the
190 beginning-of-file transition. Input one line at a time and check for a
191 matching transition. If a match is found, call the transition method
192 and possibly change the state. Store the context returned by the
193 transition method to be passed on to the next transition matched.
194 Accumulate the results returned by the transition methods in a list.
195 Run the end-of-file transition. Finally, return the accumulated
196 results.
198 Parameters:
200 - `input_lines`: a list of strings without newlines, or `StringList`.
201 - `input_offset`: the line offset of `input_lines` from the beginning
202 of the file.
203 - `context`: application-specific storage.
204 - `input_source`: name or path of source of `input_lines`.
205 - `initial_state`: name of initial state.
207 self.runtime_init()
208 if isinstance(input_lines, StringList):
209 self.input_lines = input_lines
210 else:
211 self.input_lines = StringList(input_lines, source=input_source)
212 self.input_offset = input_offset
213 self.line_offset = -1
214 self.current_state = initial_state or self.initial_state
215 if self.debug:
216 print >>self._stderr, (
217 u'\nStateMachine.run: input_lines (line_offset=%s):\n| %s'
218 % (self.line_offset, u'\n| '.join(self.input_lines)))
219 transitions = None
220 results = []
221 state = self.get_state()
222 try:
223 if self.debug:
224 print >>self._stderr, '\nStateMachine.run: bof transition'
225 context, result = state.bof(context)
226 results.extend(result)
227 while True:
228 try:
229 try:
230 self.next_line()
231 if self.debug:
232 source, offset = self.input_lines.info(
233 self.line_offset)
234 print >>self._stderr, (
235 u'\nStateMachine.run: line (source=%r, '
236 u'offset=%r):\n| %s'
237 % (source, offset, self.line))
238 context, next_state, result = self.check_line(
239 context, state, transitions)
240 except EOFError:
241 if self.debug:
242 print >>self._stderr, (
243 '\nStateMachine.run: %s.eof transition'
244 % state.__class__.__name__)
245 result = state.eof(context)
246 results.extend(result)
247 break
248 else:
249 results.extend(result)
250 except TransitionCorrection, exception:
251 self.previous_line() # back up for another try
252 transitions = (exception.args[0],)
253 if self.debug:
254 print >>self._stderr, (
255 '\nStateMachine.run: TransitionCorrection to '
256 'state "%s", transition %s.'
257 % (state.__class__.__name__, transitions[0]))
258 continue
259 except StateCorrection, exception:
260 self.previous_line() # back up for another try
261 next_state = exception.args[0]
262 if len(exception.args) == 1:
263 transitions = None
264 else:
265 transitions = (exception.args[1],)
266 if self.debug:
267 print >>self._stderr, (
268 '\nStateMachine.run: StateCorrection to state '
269 '"%s", transition %s.'
270 % (next_state, transitions[0]))
271 else:
272 transitions = None
273 state = self.get_state(next_state)
274 except:
275 if self.debug:
276 self.error()
277 raise
278 self.observers = []
279 return results
281 def get_state(self, next_state=None):
283 Return current state object; set it first if `next_state` given.
285 Parameter `next_state`: a string, the name of the next state.
287 Exception: `UnknownStateError` raised if `next_state` unknown.
289 if next_state:
290 if self.debug and next_state != self.current_state:
291 print >>self._stderr, (
292 '\nStateMachine.get_state: Changing state from '
293 '"%s" to "%s" (input line %s).'
294 % (self.current_state, next_state,
295 self.abs_line_number()))
296 self.current_state = next_state
297 try:
298 return self.states[self.current_state]
299 except KeyError:
300 raise UnknownStateError(self.current_state)
302 def next_line(self, n=1):
303 """Load `self.line` with the `n`'th next line and return it."""
304 try:
305 try:
306 self.line_offset += n
307 self.line = self.input_lines[self.line_offset]
308 except IndexError:
309 self.line = None
310 raise EOFError
311 return self.line
312 finally:
313 self.notify_observers()
315 def is_next_line_blank(self):
316 """Return 1 if the next line is blank or non-existent."""
317 try:
318 return not self.input_lines[self.line_offset + 1].strip()
319 except IndexError:
320 return 1
322 def at_eof(self):
323 """Return 1 if the input is at or past end-of-file."""
324 return self.line_offset >= len(self.input_lines) - 1
326 def at_bof(self):
327 """Return 1 if the input is at or before beginning-of-file."""
328 return self.line_offset <= 0
330 def previous_line(self, n=1):
331 """Load `self.line` with the `n`'th previous line and return it."""
332 self.line_offset -= n
333 if self.line_offset < 0:
334 self.line = None
335 else:
336 self.line = self.input_lines[self.line_offset]
337 self.notify_observers()
338 return self.line
340 def goto_line(self, line_offset):
341 """Jump to absolute line offset `line_offset`, load and return it."""
342 try:
343 try:
344 self.line_offset = line_offset - self.input_offset
345 self.line = self.input_lines[self.line_offset]
346 except IndexError:
347 self.line = None
348 raise EOFError
349 return self.line
350 finally:
351 self.notify_observers()
353 def get_source(self, line_offset):
354 """Return source of line at absolute line offset `line_offset`."""
355 return self.input_lines.source(line_offset - self.input_offset)
357 def abs_line_offset(self):
358 """Return line offset of current line, from beginning of file."""
359 return self.line_offset + self.input_offset
361 def abs_line_number(self):
362 """Return line number of current line (counting from 1)."""
363 return self.line_offset + self.input_offset + 1
365 def get_source_and_line(self, lineno=None):
366 """Return (source, line) tuple for current or given line number.
368 Looks up the source and line number in the `self.input_lines`
369 StringList instance to count for included source files.
371 If the optional argument `lineno` is given, convert it from an
372 absolute line number to the corresponding (source, line) pair.
374 if lineno is None:
375 offset = self.line_offset
376 else:
377 offset = lineno - self.input_offset - 1
378 try:
379 src, srcoffset = self.input_lines.info(offset)
380 srcline = srcoffset + 1
381 except (TypeError):
382 # line is None if index is "Just past the end"
383 src, srcline = self.get_source_and_line(offset + self.input_offset)
384 return src, srcline + 1
385 except (IndexError): # `offset` is off the list
386 src, srcline = None, None
387 # raise AssertionError('cannot find line %d in %s lines' %
388 # (offset, len(self.input_lines)))
389 # # list(self.input_lines.lines())))
390 # assert offset == srcoffset, str(self.input_lines)
391 # print "get_source_and_line(%s):" % lineno,
392 # print offset + 1, '->', src, srcline
393 # print self.input_lines
394 return (src, srcline)
396 def insert_input(self, input_lines, source):
397 self.input_lines.insert(self.line_offset + 1, '',
398 source='internal padding after '+source,
399 offset=len(input_lines))
400 self.input_lines.insert(self.line_offset + 1, '',
401 source='internal padding before '+source,
402 offset=-1)
403 self.input_lines.insert(self.line_offset + 2,
404 StringList(input_lines, source))
406 def get_text_block(self, flush_left=False):
408 Return a contiguous block of text.
410 If `flush_left` is true, raise `UnexpectedIndentationError` if an
411 indented line is encountered before the text block ends (with a blank
412 line).
414 try:
415 block = self.input_lines.get_text_block(self.line_offset,
416 flush_left)
417 self.next_line(len(block) - 1)
418 return block
419 except UnexpectedIndentationError, err:
420 block = err.args[0]
421 self.next_line(len(block) - 1) # advance to last line of block
422 raise
424 def check_line(self, context, state, transitions=None):
426 Examine one line of input for a transition match & execute its method.
428 Parameters:
430 - `context`: application-dependent storage.
431 - `state`: a `State` object, the current state.
432 - `transitions`: an optional ordered list of transition names to try,
433 instead of ``state.transition_order``.
435 Return the values returned by the transition method:
437 - context: possibly modified from the parameter `context`;
438 - next state name (`State` subclass name);
439 - the result output of the transition, a list.
441 When there is no match, ``state.no_match()`` is called and its return
442 value is returned.
444 if transitions is None:
445 transitions = state.transition_order
446 state_correction = None
447 if self.debug:
448 print >>self._stderr, (
449 '\nStateMachine.check_line: state="%s", transitions=%r.'
450 % (state.__class__.__name__, transitions))
451 for name in transitions:
452 pattern, method, next_state = state.transitions[name]
453 match = pattern.match(self.line)
454 if match:
455 if self.debug:
456 print >>self._stderr, (
457 '\nStateMachine.check_line: Matched transition '
458 '"%s" in state "%s".'
459 % (name, state.__class__.__name__))
460 return method(match, context, next_state)
461 else:
462 if self.debug:
463 print >>self._stderr, (
464 '\nStateMachine.check_line: No match in state "%s".'
465 % state.__class__.__name__)
466 return state.no_match(context, transitions)
468 def add_state(self, state_class):
470 Initialize & add a `state_class` (`State` subclass) object.
472 Exception: `DuplicateStateError` raised if `state_class` was already
473 added.
475 statename = state_class.__name__
476 if statename in self.states:
477 raise DuplicateStateError(statename)
478 self.states[statename] = state_class(self, self.debug)
480 def add_states(self, state_classes):
482 Add `state_classes` (a list of `State` subclasses).
484 for state_class in state_classes:
485 self.add_state(state_class)
487 def runtime_init(self):
489 Initialize `self.states`.
491 for state in self.states.values():
492 state.runtime_init()
494 def error(self):
495 """Report error details."""
496 type, value, module, line, function = _exception_data()
497 print >>self._stderr, u'%s: %s' % (type, value)
498 print >>self._stderr, 'input line %s' % (self.abs_line_number())
499 print >>self._stderr, (u'module %s, line %s, function %s' %
500 (module, line, function))
502 def attach_observer(self, observer):
504 The `observer` parameter is a function or bound method which takes two
505 arguments, the source and offset of the current line.
507 self.observers.append(observer)
509 def detach_observer(self, observer):
510 self.observers.remove(observer)
512 def notify_observers(self):
513 for observer in self.observers:
514 try:
515 info = self.input_lines.info(self.line_offset)
516 except IndexError:
517 info = (None, None)
518 observer(*info)
521 class State:
524 State superclass. Contains a list of transitions, and transition methods.
526 Transition methods all have the same signature. They take 3 parameters:
528 - An `re` match object. ``match.string`` contains the matched input line,
529 ``match.start()`` gives the start index of the match, and
530 ``match.end()`` gives the end index.
531 - A context object, whose meaning is application-defined (initial value
532 ``None``). It can be used to store any information required by the state
533 machine, and the retured context is passed on to the next transition
534 method unchanged.
535 - The name of the next state, a string, taken from the transitions list;
536 normally it is returned unchanged, but it may be altered by the
537 transition method if necessary.
539 Transition methods all return a 3-tuple:
541 - A context object, as (potentially) modified by the transition method.
542 - The next state name (a return value of ``None`` means no state change).
543 - The processing result, a list, which is accumulated by the state
544 machine.
546 Transition methods may raise an `EOFError` to cut processing short.
548 There are two implicit transitions, and corresponding transition methods
549 are defined: `bof()` handles the beginning-of-file, and `eof()` handles
550 the end-of-file. These methods have non-standard signatures and return
551 values. `bof()` returns the initial context and results, and may be used
552 to return a header string, or do any other processing needed. `eof()`
553 should handle any remaining context and wrap things up; it returns the
554 final processing result.
556 Typical applications need only subclass `State` (or a subclass), set the
557 `patterns` and `initial_transitions` class attributes, and provide
558 corresponding transition methods. The default object initialization will
559 take care of constructing the list of transitions.
562 patterns = None
564 {Name: pattern} mapping, used by `make_transition()`. Each pattern may
565 be a string or a compiled `re` pattern. Override in subclasses.
568 initial_transitions = None
570 A list of transitions to initialize when a `State` is instantiated.
571 Each entry is either a transition name string, or a (transition name, next
572 state name) pair. See `make_transitions()`. Override in subclasses.
575 nested_sm = None
577 The `StateMachine` class for handling nested processing.
579 If left as ``None``, `nested_sm` defaults to the class of the state's
580 controlling state machine. Override it in subclasses to avoid the default.
583 nested_sm_kwargs = None
585 Keyword arguments dictionary, passed to the `nested_sm` constructor.
587 Two keys must have entries in the dictionary:
589 - Key 'state_classes' must be set to a list of `State` classes.
590 - Key 'initial_state' must be set to the name of the initial state class.
592 If `nested_sm_kwargs` is left as ``None``, 'state_classes' defaults to the
593 class of the current state, and 'initial_state' defaults to the name of
594 the class of the current state. Override in subclasses to avoid the
595 defaults.
598 def __init__(self, state_machine, debug=False):
600 Initialize a `State` object; make & add initial transitions.
602 Parameters:
604 - `statemachine`: the controlling `StateMachine` object.
605 - `debug`: a boolean; produce verbose output if true.
608 self.transition_order = []
609 """A list of transition names in search order."""
611 self.transitions = {}
613 A mapping of transition names to 3-tuples containing
614 (compiled_pattern, transition_method, next_state_name). Initialized as
615 an instance attribute dynamically (instead of as a class attribute)
616 because it may make forward references to patterns and methods in this
617 or other classes.
620 self.add_initial_transitions()
622 self.state_machine = state_machine
623 """A reference to the controlling `StateMachine` object."""
625 self.debug = debug
626 """Debugging mode on/off."""
628 if self.nested_sm is None:
629 self.nested_sm = self.state_machine.__class__
630 if self.nested_sm_kwargs is None:
631 self.nested_sm_kwargs = {'state_classes': [self.__class__],
632 'initial_state': self.__class__.__name__}
634 def runtime_init(self):
636 Initialize this `State` before running the state machine; called from
637 `self.state_machine.run()`.
639 pass
641 def unlink(self):
642 """Remove circular references to objects no longer required."""
643 self.state_machine = None
645 def add_initial_transitions(self):
646 """Make and add transitions listed in `self.initial_transitions`."""
647 if self.initial_transitions:
648 names, transitions = self.make_transitions(
649 self.initial_transitions)
650 self.add_transitions(names, transitions)
652 def add_transitions(self, names, transitions):
654 Add a list of transitions to the start of the transition list.
656 Parameters:
658 - `names`: a list of transition names.
659 - `transitions`: a mapping of names to transition tuples.
661 Exceptions: `DuplicateTransitionError`, `UnknownTransitionError`.
663 for name in names:
664 if name in self.transitions:
665 raise DuplicateTransitionError(name)
666 if name not in transitions:
667 raise UnknownTransitionError(name)
668 self.transition_order[:0] = names
669 self.transitions.update(transitions)
671 def add_transition(self, name, transition):
673 Add a transition to the start of the transition list.
675 Parameter `transition`: a ready-made transition 3-tuple.
677 Exception: `DuplicateTransitionError`.
679 if name in self.transitions:
680 raise DuplicateTransitionError(name)
681 self.transition_order[:0] = [name]
682 self.transitions[name] = transition
684 def remove_transition(self, name):
686 Remove a transition by `name`.
688 Exception: `UnknownTransitionError`.
690 try:
691 del self.transitions[name]
692 self.transition_order.remove(name)
693 except:
694 raise UnknownTransitionError(name)
696 def make_transition(self, name, next_state=None):
698 Make & return a transition tuple based on `name`.
700 This is a convenience function to simplify transition creation.
702 Parameters:
704 - `name`: a string, the name of the transition pattern & method. This
705 `State` object must have a method called '`name`', and a dictionary
706 `self.patterns` containing a key '`name`'.
707 - `next_state`: a string, the name of the next `State` object for this
708 transition. A value of ``None`` (or absent) implies no state change
709 (i.e., continue with the same state).
711 Exceptions: `TransitionPatternNotFound`, `TransitionMethodNotFound`.
713 if next_state is None:
714 next_state = self.__class__.__name__
715 try:
716 pattern = self.patterns[name]
717 if not hasattr(pattern, 'match'):
718 pattern = re.compile(pattern)
719 except KeyError:
720 raise TransitionPatternNotFound(
721 '%s.patterns[%r]' % (self.__class__.__name__, name))
722 try:
723 method = getattr(self, name)
724 except AttributeError:
725 raise TransitionMethodNotFound(
726 '%s.%s' % (self.__class__.__name__, name))
727 return (pattern, method, next_state)
729 def make_transitions(self, name_list):
731 Return a list of transition names and a transition mapping.
733 Parameter `name_list`: a list, where each entry is either a transition
734 name string, or a 1- or 2-tuple (transition name, optional next state
735 name).
737 stringtype = type('')
738 names = []
739 transitions = {}
740 for namestate in name_list:
741 if type(namestate) is stringtype:
742 transitions[namestate] = self.make_transition(namestate)
743 names.append(namestate)
744 else:
745 transitions[namestate[0]] = self.make_transition(*namestate)
746 names.append(namestate[0])
747 return names, transitions
749 def no_match(self, context, transitions):
751 Called when there is no match from `StateMachine.check_line()`.
753 Return the same values returned by transition methods:
755 - context: unchanged;
756 - next state name: ``None``;
757 - empty result list.
759 Override in subclasses to catch this event.
761 return context, None, []
763 def bof(self, context):
765 Handle beginning-of-file. Return unchanged `context`, empty result.
767 Override in subclasses.
769 Parameter `context`: application-defined storage.
771 return context, []
773 def eof(self, context):
775 Handle end-of-file. Return empty result.
777 Override in subclasses.
779 Parameter `context`: application-defined storage.
781 return []
783 def nop(self, match, context, next_state):
785 A "do nothing" transition method.
787 Return unchanged `context` & `next_state`, empty result. Useful for
788 simple state changes (actionless transitions).
790 return context, next_state, []
793 class StateMachineWS(StateMachine):
796 `StateMachine` subclass specialized for whitespace recognition.
798 There are three methods provided for extracting indented text blocks:
800 - `get_indented()`: use when the indent is unknown.
801 - `get_known_indented()`: use when the indent is known for all lines.
802 - `get_first_known_indented()`: use when only the first line's indent is
803 known.
806 def get_indented(self, until_blank=False, strip_indent=True):
808 Return a block of indented lines of text, and info.
810 Extract an indented block where the indent is unknown for all lines.
812 :Parameters:
813 - `until_blank`: Stop collecting at the first blank line if true.
814 - `strip_indent`: Strip common leading indent if true (default).
816 :Return:
817 - the indented block (a list of lines of text),
818 - its indent,
819 - its first line offset from BOF, and
820 - whether or not it finished with a blank line.
822 offset = self.abs_line_offset()
823 indented, indent, blank_finish = self.input_lines.get_indented(
824 self.line_offset, until_blank, strip_indent)
825 if indented:
826 self.next_line(len(indented) - 1) # advance to last indented line
827 while indented and not indented[0].strip():
828 indented.trim_start()
829 offset += 1
830 return indented, indent, offset, blank_finish
832 def get_known_indented(self, indent, until_blank=False, strip_indent=True):
834 Return an indented block and info.
836 Extract an indented block where the indent is known for all lines.
837 Starting with the current line, extract the entire text block with at
838 least `indent` indentation (which must be whitespace, except for the
839 first line).
841 :Parameters:
842 - `indent`: The number of indent columns/characters.
843 - `until_blank`: Stop collecting at the first blank line if true.
844 - `strip_indent`: Strip `indent` characters of indentation if true
845 (default).
847 :Return:
848 - the indented block,
849 - its first line offset from BOF, and
850 - whether or not it finished with a blank line.
852 offset = self.abs_line_offset()
853 indented, indent, blank_finish = self.input_lines.get_indented(
854 self.line_offset, until_blank, strip_indent,
855 block_indent=indent)
856 self.next_line(len(indented) - 1) # advance to last indented line
857 while indented and not indented[0].strip():
858 indented.trim_start()
859 offset += 1
860 return indented, offset, blank_finish
862 def get_first_known_indented(self, indent, until_blank=False,
863 strip_indent=True, strip_top=True):
865 Return an indented block and info.
867 Extract an indented block where the indent is known for the first line
868 and unknown for all other lines.
870 :Parameters:
871 - `indent`: The first line's indent (# of columns/characters).
872 - `until_blank`: Stop collecting at the first blank line if true
873 (1).
874 - `strip_indent`: Strip `indent` characters of indentation if true
875 (1, default).
876 - `strip_top`: Strip blank lines from the beginning of the block.
878 :Return:
879 - the indented block,
880 - its indent,
881 - its first line offset from BOF, and
882 - whether or not it finished with a blank line.
884 offset = self.abs_line_offset()
885 indented, indent, blank_finish = self.input_lines.get_indented(
886 self.line_offset, until_blank, strip_indent,
887 first_indent=indent)
888 self.next_line(len(indented) - 1) # advance to last indented line
889 if strip_top:
890 while indented and not indented[0].strip():
891 indented.trim_start()
892 offset += 1
893 return indented, indent, offset, blank_finish
896 class StateWS(State):
899 State superclass specialized for whitespace (blank lines & indents).
901 Use this class with `StateMachineWS`. The transitions 'blank' (for blank
902 lines) and 'indent' (for indented text blocks) are added automatically,
903 before any other transitions. The transition method `blank()` handles
904 blank lines and `indent()` handles nested indented blocks. Indented
905 blocks trigger a new state machine to be created by `indent()` and run.
906 The class of the state machine to be created is in `indent_sm`, and the
907 constructor keyword arguments are in the dictionary `indent_sm_kwargs`.
909 The methods `known_indent()` and `firstknown_indent()` are provided for
910 indented blocks where the indent (all lines' and first line's only,
911 respectively) is known to the transition method, along with the attributes
912 `known_indent_sm` and `known_indent_sm_kwargs`. Neither transition method
913 is triggered automatically.
916 indent_sm = None
918 The `StateMachine` class handling indented text blocks.
920 If left as ``None``, `indent_sm` defaults to the value of
921 `State.nested_sm`. Override it in subclasses to avoid the default.
924 indent_sm_kwargs = None
926 Keyword arguments dictionary, passed to the `indent_sm` constructor.
928 If left as ``None``, `indent_sm_kwargs` defaults to the value of
929 `State.nested_sm_kwargs`. Override it in subclasses to avoid the default.
932 known_indent_sm = None
934 The `StateMachine` class handling known-indented text blocks.
936 If left as ``None``, `known_indent_sm` defaults to the value of
937 `indent_sm`. Override it in subclasses to avoid the default.
940 known_indent_sm_kwargs = None
942 Keyword arguments dictionary, passed to the `known_indent_sm` constructor.
944 If left as ``None``, `known_indent_sm_kwargs` defaults to the value of
945 `indent_sm_kwargs`. Override it in subclasses to avoid the default.
948 ws_patterns = {'blank': ' *$',
949 'indent': ' +'}
950 """Patterns for default whitespace transitions. May be overridden in
951 subclasses."""
953 ws_initial_transitions = ('blank', 'indent')
954 """Default initial whitespace transitions, added before those listed in
955 `State.initial_transitions`. May be overridden in subclasses."""
957 def __init__(self, state_machine, debug=False):
959 Initialize a `StateSM` object; extends `State.__init__()`.
961 Check for indent state machine attributes, set defaults if not set.
963 State.__init__(self, state_machine, debug)
964 if self.indent_sm is None:
965 self.indent_sm = self.nested_sm
966 if self.indent_sm_kwargs is None:
967 self.indent_sm_kwargs = self.nested_sm_kwargs
968 if self.known_indent_sm is None:
969 self.known_indent_sm = self.indent_sm
970 if self.known_indent_sm_kwargs is None:
971 self.known_indent_sm_kwargs = self.indent_sm_kwargs
973 def add_initial_transitions(self):
975 Add whitespace-specific transitions before those defined in subclass.
977 Extends `State.add_initial_transitions()`.
979 State.add_initial_transitions(self)
980 if self.patterns is None:
981 self.patterns = {}
982 self.patterns.update(self.ws_patterns)
983 names, transitions = self.make_transitions(
984 self.ws_initial_transitions)
985 self.add_transitions(names, transitions)
987 def blank(self, match, context, next_state):
988 """Handle blank lines. Does nothing. Override in subclasses."""
989 return self.nop(match, context, next_state)
991 def indent(self, match, context, next_state):
993 Handle an indented text block. Extend or override in subclasses.
995 Recursively run the registered state machine for indented blocks
996 (`self.indent_sm`).
998 indented, indent, line_offset, blank_finish = \
999 self.state_machine.get_indented()
1000 sm = self.indent_sm(debug=self.debug, **self.indent_sm_kwargs)
1001 results = sm.run(indented, input_offset=line_offset)
1002 return context, next_state, results
1004 def known_indent(self, match, context, next_state):
1006 Handle a known-indent text block. Extend or override in subclasses.
1008 Recursively run the registered state machine for known-indent indented
1009 blocks (`self.known_indent_sm`). The indent is the length of the
1010 match, ``match.end()``.
1012 indented, line_offset, blank_finish = \
1013 self.state_machine.get_known_indented(match.end())
1014 sm = self.known_indent_sm(debug=self.debug,
1015 **self.known_indent_sm_kwargs)
1016 results = sm.run(indented, input_offset=line_offset)
1017 return context, next_state, results
1019 def first_known_indent(self, match, context, next_state):
1021 Handle an indented text block (first line's indent known).
1023 Extend or override in subclasses.
1025 Recursively run the registered state machine for known-indent indented
1026 blocks (`self.known_indent_sm`). The indent is the length of the
1027 match, ``match.end()``.
1029 indented, line_offset, blank_finish = \
1030 self.state_machine.get_first_known_indented(match.end())
1031 sm = self.known_indent_sm(debug=self.debug,
1032 **self.known_indent_sm_kwargs)
1033 results = sm.run(indented, input_offset=line_offset)
1034 return context, next_state, results
1037 class _SearchOverride:
1040 Mix-in class to override `StateMachine` regular expression behavior.
1042 Changes regular expression matching, from the default `re.match()`
1043 (succeeds only if the pattern matches at the start of `self.line`) to
1044 `re.search()` (succeeds if the pattern matches anywhere in `self.line`).
1045 When subclassing a `StateMachine`, list this class **first** in the
1046 inheritance list of the class definition.
1049 def match(self, pattern):
1051 Return the result of a regular expression search.
1053 Overrides `StateMachine.match()`.
1055 Parameter `pattern`: `re` compiled regular expression.
1057 return pattern.search(self.line)
1060 class SearchStateMachine(_SearchOverride, StateMachine):
1061 """`StateMachine` which uses `re.search()` instead of `re.match()`."""
1062 pass
1065 class SearchStateMachineWS(_SearchOverride, StateMachineWS):
1066 """`StateMachineWS` which uses `re.search()` instead of `re.match()`."""
1067 pass
1070 class ViewList:
1073 List with extended functionality: slices of ViewList objects are child
1074 lists, linked to their parents. Changes made to a child list also affect
1075 the parent list. A child list is effectively a "view" (in the SQL sense)
1076 of the parent list. Changes to parent lists, however, do *not* affect
1077 active child lists. If a parent list is changed, any active child lists
1078 should be recreated.
1080 The start and end of the slice can be trimmed using the `trim_start()` and
1081 `trim_end()` methods, without affecting the parent list. The link between
1082 child and parent lists can be broken by calling `disconnect()` on the
1083 child list.
1085 Also, ViewList objects keep track of the source & offset of each item.
1086 This information is accessible via the `source()`, `offset()`, and
1087 `info()` methods.
1090 def __init__(self, initlist=None, source=None, items=None,
1091 parent=None, parent_offset=None):
1092 self.data = []
1093 """The actual list of data, flattened from various sources."""
1095 self.items = []
1096 """A list of (source, offset) pairs, same length as `self.data`: the
1097 source of each line and the offset of each line from the beginning of
1098 its source."""
1100 self.parent = parent
1101 """The parent list."""
1103 self.parent_offset = parent_offset
1104 """Offset of this list from the beginning of the parent list."""
1106 if isinstance(initlist, ViewList):
1107 self.data = initlist.data[:]
1108 self.items = initlist.items[:]
1109 elif initlist is not None:
1110 self.data = list(initlist)
1111 if items:
1112 self.items = items
1113 else:
1114 self.items = [(source, i) for i in range(len(initlist))]
1115 assert len(self.data) == len(self.items), 'data mismatch'
1117 def __str__(self):
1118 return str(self.data)
1120 def __repr__(self):
1121 return '%s(%s, items=%s)' % (self.__class__.__name__,
1122 self.data, self.items)
1124 def __lt__(self, other): return self.data < self.__cast(other)
1125 def __le__(self, other): return self.data <= self.__cast(other)
1126 def __eq__(self, other): return self.data == self.__cast(other)
1127 def __ne__(self, other): return self.data != self.__cast(other)
1128 def __gt__(self, other): return self.data > self.__cast(other)
1129 def __ge__(self, other): return self.data >= self.__cast(other)
1130 def __cmp__(self, other): return cmp(self.data, self.__cast(other))
1132 def __cast(self, other):
1133 if isinstance(other, ViewList):
1134 return other.data
1135 else:
1136 return other
1138 def __contains__(self, item): return item in self.data
1139 def __len__(self): return len(self.data)
1141 # The __getitem__()/__setitem__() methods check whether the index
1142 # is a slice first, since indexing a native list with a slice object
1143 # just works.
1145 def __getitem__(self, i):
1146 if isinstance(i, types.SliceType):
1147 assert i.step in (None, 1), 'cannot handle slice with stride'
1148 return self.__class__(self.data[i.start:i.stop],
1149 items=self.items[i.start:i.stop],
1150 parent=self, parent_offset=i.start or 0)
1151 else:
1152 return self.data[i]
1154 def __setitem__(self, i, item):
1155 if isinstance(i, types.SliceType):
1156 assert i.step in (None, 1), 'cannot handle slice with stride'
1157 if not isinstance(item, ViewList):
1158 raise TypeError('assigning non-ViewList to ViewList slice')
1159 self.data[i.start:i.stop] = item.data
1160 self.items[i.start:i.stop] = item.items
1161 assert len(self.data) == len(self.items), 'data mismatch'
1162 if self.parent:
1163 self.parent[(i.start or 0) + self.parent_offset
1164 : (i.stop or len(self)) + self.parent_offset] = item
1165 else:
1166 self.data[i] = item
1167 if self.parent:
1168 self.parent[i + self.parent_offset] = item
1170 def __delitem__(self, i):
1171 try:
1172 del self.data[i]
1173 del self.items[i]
1174 if self.parent:
1175 del self.parent[i + self.parent_offset]
1176 except TypeError:
1177 assert i.step is None, 'cannot handle slice with stride'
1178 del self.data[i.start:i.stop]
1179 del self.items[i.start:i.stop]
1180 if self.parent:
1181 del self.parent[(i.start or 0) + self.parent_offset
1182 : (i.stop or len(self)) + self.parent_offset]
1184 def __add__(self, other):
1185 if isinstance(other, ViewList):
1186 return self.__class__(self.data + other.data,
1187 items=(self.items + other.items))
1188 else:
1189 raise TypeError('adding non-ViewList to a ViewList')
1191 def __radd__(self, other):
1192 if isinstance(other, ViewList):
1193 return self.__class__(other.data + self.data,
1194 items=(other.items + self.items))
1195 else:
1196 raise TypeError('adding ViewList to a non-ViewList')
1198 def __iadd__(self, other):
1199 if isinstance(other, ViewList):
1200 self.data += other.data
1201 else:
1202 raise TypeError('argument to += must be a ViewList')
1203 return self
1205 def __mul__(self, n):
1206 return self.__class__(self.data * n, items=(self.items * n))
1208 __rmul__ = __mul__
1210 def __imul__(self, n):
1211 self.data *= n
1212 self.items *= n
1213 return self
1215 def extend(self, other):
1216 if not isinstance(other, ViewList):
1217 raise TypeError('extending a ViewList with a non-ViewList')
1218 if self.parent:
1219 self.parent.insert(len(self.data) + self.parent_offset, other)
1220 self.data.extend(other.data)
1221 self.items.extend(other.items)
1223 def append(self, item, source=None, offset=0):
1224 if source is None:
1225 self.extend(item)
1226 else:
1227 if self.parent:
1228 self.parent.insert(len(self.data) + self.parent_offset, item,
1229 source, offset)
1230 self.data.append(item)
1231 self.items.append((source, offset))
1233 def insert(self, i, item, source=None, offset=0):
1234 if source is None:
1235 if not isinstance(item, ViewList):
1236 raise TypeError('inserting non-ViewList with no source given')
1237 self.data[i:i] = item.data
1238 self.items[i:i] = item.items
1239 if self.parent:
1240 index = (len(self.data) + i) % len(self.data)
1241 self.parent.insert(index + self.parent_offset, item)
1242 else:
1243 self.data.insert(i, item)
1244 self.items.insert(i, (source, offset))
1245 if self.parent:
1246 index = (len(self.data) + i) % len(self.data)
1247 self.parent.insert(index + self.parent_offset, item,
1248 source, offset)
1250 def pop(self, i=-1):
1251 if self.parent:
1252 index = (len(self.data) + i) % len(self.data)
1253 self.parent.pop(index + self.parent_offset)
1254 self.items.pop(i)
1255 return self.data.pop(i)
1257 def trim_start(self, n=1):
1259 Remove items from the start of the list, without touching the parent.
1261 if n > len(self.data):
1262 raise IndexError("Size of trim too large; can't trim %s items "
1263 "from a list of size %s." % (n, len(self.data)))
1264 elif n < 0:
1265 raise IndexError('Trim size must be >= 0.')
1266 del self.data[:n]
1267 del self.items[:n]
1268 if self.parent:
1269 self.parent_offset += n
1271 def trim_end(self, n=1):
1273 Remove items from the end of the list, without touching the parent.
1275 if n > len(self.data):
1276 raise IndexError("Size of trim too large; can't trim %s items "
1277 "from a list of size %s." % (n, len(self.data)))
1278 elif n < 0:
1279 raise IndexError('Trim size must be >= 0.')
1280 del self.data[-n:]
1281 del self.items[-n:]
1283 def remove(self, item):
1284 index = self.index(item)
1285 del self[index]
1287 def count(self, item): return self.data.count(item)
1288 def index(self, item): return self.data.index(item)
1290 def reverse(self):
1291 self.data.reverse()
1292 self.items.reverse()
1293 self.parent = None
1295 def sort(self, *args):
1296 tmp = zip(self.data, self.items)
1297 tmp.sort(*args)
1298 self.data = [entry[0] for entry in tmp]
1299 self.items = [entry[1] for entry in tmp]
1300 self.parent = None
1302 def info(self, i):
1303 """Return source & offset for index `i`."""
1304 try:
1305 return self.items[i]
1306 except IndexError:
1307 if i == len(self.data): # Just past the end
1308 return self.items[i - 1][0], None
1309 else:
1310 raise
1312 def source(self, i):
1313 """Return source for index `i`."""
1314 return self.info(i)[0]
1316 def offset(self, i):
1317 """Return offset for index `i`."""
1318 return self.info(i)[1]
1320 def disconnect(self):
1321 """Break link between this list and parent list."""
1322 self.parent = None
1324 def xitems(self):
1325 """Return iterator yielding (source, offset, value) tuples."""
1326 for (value, (source, offset)) in zip(self.data, self.items):
1327 yield (source, offset, value)
1329 def pprint(self):
1330 """Print the list in `grep` format (`source:offset:value` lines)"""
1331 for line in self.xitems():
1332 print "%s:%d:%s" % line
1335 class StringList(ViewList):
1337 """A `ViewList` with string-specific methods."""
1339 def trim_left(self, length, start=0, end=sys.maxint):
1341 Trim `length` characters off the beginning of each item, in-place,
1342 from index `start` to `end`. No whitespace-checking is done on the
1343 trimmed text. Does not affect slice parent.
1345 self.data[start:end] = [line[length:]
1346 for line in self.data[start:end]]
1348 def get_text_block(self, start, flush_left=False):
1350 Return a contiguous block of text.
1352 If `flush_left` is true, raise `UnexpectedIndentationError` if an
1353 indented line is encountered before the text block ends (with a blank
1354 line).
1356 end = start
1357 last = len(self.data)
1358 while end < last:
1359 line = self.data[end]
1360 if not line.strip():
1361 break
1362 if flush_left and (line[0] == ' '):
1363 source, offset = self.info(end)
1364 raise UnexpectedIndentationError(self[start:end], source,
1365 offset + 1)
1366 end += 1
1367 return self[start:end]
1369 def get_indented(self, start=0, until_blank=False, strip_indent=True,
1370 block_indent=None, first_indent=None):
1372 Extract and return a StringList of indented lines of text.
1374 Collect all lines with indentation, determine the minimum indentation,
1375 remove the minimum indentation from all indented lines (unless
1376 `strip_indent` is false), and return them. All lines up to but not
1377 including the first unindented line will be returned.
1379 :Parameters:
1380 - `start`: The index of the first line to examine.
1381 - `until_blank`: Stop collecting at the first blank line if true.
1382 - `strip_indent`: Strip common leading indent if true (default).
1383 - `block_indent`: The indent of the entire block, if known.
1384 - `first_indent`: The indent of the first line, if known.
1386 :Return:
1387 - a StringList of indented lines with mininum indent removed;
1388 - the amount of the indent;
1389 - a boolean: did the indented block finish with a blank line or EOF?
1391 indent = block_indent # start with None if unknown
1392 end = start
1393 if block_indent is not None and first_indent is None:
1394 first_indent = block_indent
1395 if first_indent is not None:
1396 end += 1
1397 last = len(self.data)
1398 while end < last:
1399 line = self.data[end]
1400 if line and (line[0] != ' '
1401 or (block_indent is not None
1402 and line[:block_indent].strip())):
1403 # Line not indented or insufficiently indented.
1404 # Block finished properly iff the last indented line blank:
1405 blank_finish = ((end > start)
1406 and not self.data[end - 1].strip())
1407 break
1408 stripped = line.lstrip()
1409 if not stripped: # blank line
1410 if until_blank:
1411 blank_finish = 1
1412 break
1413 elif block_indent is None:
1414 line_indent = len(line) - len(stripped)
1415 if indent is None:
1416 indent = line_indent
1417 else:
1418 indent = min(indent, line_indent)
1419 end += 1
1420 else:
1421 blank_finish = 1 # block ends at end of lines
1422 block = self[start:end]
1423 if first_indent is not None and block:
1424 block.data[0] = block.data[0][first_indent:]
1425 if indent and strip_indent:
1426 block.trim_left(indent, start=(first_indent is not None))
1427 return block, indent or 0, blank_finish
1429 def get_2D_block(self, top, left, bottom, right, strip_indent=True):
1430 block = self[top:bottom]
1431 indent = right
1432 for i in range(len(block.data)):
1433 # get slice from line, care for combining characters
1434 ci = utils.column_indices(block.data[i])
1435 try:
1436 left = ci[left]
1437 except IndexError:
1438 left += len(block.data[i]) - len(ci)
1439 try:
1440 right = ci[right]
1441 except IndexError:
1442 right += len(block.data[i]) - len(ci)
1443 block.data[i] = line = block.data[i][left:right].rstrip()
1444 if line:
1445 indent = min(indent, len(line) - len(line.lstrip()))
1446 if strip_indent and 0 < indent < right:
1447 block.data = [line[indent:] for line in block.data]
1448 return block
1450 def pad_double_width(self, pad_char):
1452 Pad all double-width characters in self by appending `pad_char` to each.
1453 For East Asian language support.
1455 if hasattr(unicodedata, 'east_asian_width'):
1456 east_asian_width = unicodedata.east_asian_width
1457 else:
1458 return # new in Python 2.4
1459 for i in range(len(self.data)):
1460 line = self.data[i]
1461 if isinstance(line, unicode):
1462 new = []
1463 for char in line:
1464 new.append(char)
1465 if east_asian_width(char) in 'WF': # 'W'ide & 'F'ull-width
1466 new.append(pad_char)
1467 self.data[i] = ''.join(new)
1469 def replace(self, old, new):
1470 """Replace all occurrences of substring `old` with `new`."""
1471 for i in range(len(self.data)):
1472 self.data[i] = self.data[i].replace(old, new)
1475 class StateMachineError(Exception): pass
1476 class UnknownStateError(StateMachineError): pass
1477 class DuplicateStateError(StateMachineError): pass
1478 class UnknownTransitionError(StateMachineError): pass
1479 class DuplicateTransitionError(StateMachineError): pass
1480 class TransitionPatternNotFound(StateMachineError): pass
1481 class TransitionMethodNotFound(StateMachineError): pass
1482 class UnexpectedIndentationError(StateMachineError): pass
1485 class TransitionCorrection(Exception):
1488 Raise from within a transition method to switch to another transition.
1490 Raise with one argument, the new transition name.
1494 class StateCorrection(Exception):
1497 Raise from within a transition method to switch to another state.
1499 Raise with one or two arguments: new state name, and an optional new
1500 transition name.
1504 def string2lines(astring, tab_width=8, convert_whitespace=False,
1505 whitespace=re.compile('[\v\f]')):
1507 Return a list of one-line strings with tabs expanded, no newlines, and
1508 trailing whitespace stripped.
1510 Each tab is expanded with between 1 and `tab_width` spaces, so that the
1511 next character's index becomes a multiple of `tab_width` (8 by default).
1513 Parameters:
1515 - `astring`: a multi-line string.
1516 - `tab_width`: the number of columns between tab stops.
1517 - `convert_whitespace`: convert form feeds and vertical tabs to spaces?
1519 if convert_whitespace:
1520 astring = whitespace.sub(' ', astring)
1521 return [s.expandtabs(tab_width).rstrip() for s in astring.splitlines()]
1523 def _exception_data():
1525 Return exception information:
1527 - the exception's class name;
1528 - the exception object;
1529 - the name of the file containing the offending code;
1530 - the line number of the offending code;
1531 - the function name of the offending code.
1533 type, value, traceback = sys.exc_info()
1534 while traceback.tb_next:
1535 traceback = traceback.tb_next
1536 code = traceback.tb_frame.f_code
1537 return (type.__name__, value, code.co_filename, traceback.tb_lineno,
1538 code.co_name)