Rewrite tests with r"strings", to make them better readable.
[docutils.git] / docutils / statemachine.py
blob6c0c6fc037112390b87e3141e60e236d69e7c82d
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 A finite state machine specialized for regular-expression-based text filters,
7 this module defines the following classes:
9 - `StateMachine`, a state machine
10 - `State`, a state superclass
11 - `StateMachineWS`, a whitespace-sensitive version of `StateMachine`
12 - `StateWS`, a state superclass for use with `StateMachineWS`
13 - `SearchStateMachine`, uses `re.search()` instead of `re.match()`
14 - `SearchStateMachineWS`, uses `re.search()` instead of `re.match()`
15 - `ViewList`, extends standard Python lists.
16 - `StringList`, string-specific ViewList.
18 Exception classes:
20 - `StateMachineError`
21 - `UnknownStateError`
22 - `DuplicateStateError`
23 - `UnknownTransitionError`
24 - `DuplicateTransitionError`
25 - `TransitionPatternNotFound`
26 - `TransitionMethodNotFound`
27 - `UnexpectedIndentationError`
28 - `TransitionCorrection`: Raised to switch to another transition.
29 - `StateCorrection`: Raised to switch to another state & transition.
31 Functions:
33 - `string2lines()`: split a multi-line string into a list of one-line strings
36 How To Use This Module
37 ======================
38 (See the individual classes, methods, and attributes for details.)
40 1. Import it: ``import statemachine`` or ``from statemachine import ...``.
41 You will also need to ``import re``.
43 2. Derive a subclass of `State` (or `StateWS`) for each state in your state
44 machine::
46 class MyState(statemachine.State):
48 Within the state's class definition:
50 a) Include a pattern for each transition, in `State.patterns`::
52 patterns = {'atransition': r'pattern', ...}
54 b) Include a list of initial transitions to be set up automatically, in
55 `State.initial_transitions`::
57 initial_transitions = ['atransition', ...]
59 c) Define a method for each transition, with the same name as the
60 transition pattern::
62 def atransition(self, match, context, next_state):
63 # do something
64 result = [...] # a list
65 return context, next_state, result
66 # context, next_state may be altered
68 Transition methods may raise an `EOFError` to cut processing short.
70 d) You may wish to override the `State.bof()` and/or `State.eof()` implicit
71 transition methods, which handle the beginning- and end-of-file.
73 e) In order to handle nested processing, you may wish to override the
74 attributes `State.nested_sm` and/or `State.nested_sm_kwargs`.
76 If you are using `StateWS` as a base class, in order to handle nested
77 indented blocks, you may wish to:
79 - override the attributes `StateWS.indent_sm`,
80 `StateWS.indent_sm_kwargs`, `StateWS.known_indent_sm`, and/or
81 `StateWS.known_indent_sm_kwargs`;
82 - override the `StateWS.blank()` method; and/or
83 - override or extend the `StateWS.indent()`, `StateWS.known_indent()`,
84 and/or `StateWS.firstknown_indent()` methods.
86 3. Create a state machine object::
88 sm = StateMachine(state_classes=[MyState, ...],
89 initial_state='MyState')
91 4. Obtain the input text, which needs to be converted into a tab-free list of
92 one-line strings. For example, to read text from a file called
93 'inputfile'::
95 input_string = open('inputfile').read()
96 input_lines = statemachine.string2lines(input_string)
98 5. Run the state machine on the input text and collect the results, a list::
100 results = sm.run(input_lines)
102 6. Remove any lingering circular references::
104 sm.unlink()
107 __docformat__ = 'restructuredtext'
109 import sys
110 import re
111 import types
112 import unicodedata
113 from docutils.error_reporting import ErrorOutput
115 class StateMachine:
118 A finite state machine for text filters using regular expressions.
120 The input is provided in the form of a list of one-line strings (no
121 newlines). States are subclasses of the `State` class. Transitions consist
122 of regular expression patterns and transition methods, and are defined in
123 each state.
125 The state machine is started with the `run()` method, which returns the
126 results of processing in a list.
129 def __init__(self, state_classes, initial_state, debug=0):
131 Initialize a `StateMachine` object; add state objects.
133 Parameters:
135 - `state_classes`: a list of `State` (sub)classes.
136 - `initial_state`: a string, the class name of the initial state.
137 - `debug`: a boolean; produce verbose output if true (nonzero).
140 self.input_lines = None
141 """`StringList` of input lines (without newlines).
142 Filled by `self.run()`."""
144 self.input_offset = 0
145 """Offset of `self.input_lines` from the beginning of the file."""
147 self.line = None
148 """Current input line."""
150 self.line_offset = -1
151 """Current input line offset from beginning of `self.input_lines`."""
153 self.debug = debug
154 """Debugging mode on/off."""
156 self.initial_state = initial_state
157 """The name of the initial state (key to `self.states`)."""
159 self.current_state = initial_state
160 """The name of the current state (key to `self.states`)."""
162 self.states = {}
163 """Mapping of {state_name: State_object}."""
165 self.add_states(state_classes)
167 self.observers = []
168 """List of bound methods or functions to call whenever the current
169 line changes. Observers are called with one argument, ``self``.
170 Cleared at the end of `run()`."""
172 self._stderr = ErrorOutput()
173 """Wrapper around sys.stderr catching en-/decoding errors"""
176 def unlink(self):
177 """Remove circular references to objects no longer required."""
178 for state in self.states.values():
179 state.unlink()
180 self.states = None
182 def run(self, input_lines, input_offset=0, context=None,
183 input_source=None, initial_state=None):
185 Run the state machine on `input_lines`. Return results (a list).
187 Reset `self.line_offset` and `self.current_state`. Run the
188 beginning-of-file transition. Input one line at a time and check for a
189 matching transition. If a match is found, call the transition method
190 and possibly change the state. Store the context returned by the
191 transition method to be passed on to the next transition matched.
192 Accumulate the results returned by the transition methods in a list.
193 Run the end-of-file transition. Finally, return the accumulated
194 results.
196 Parameters:
198 - `input_lines`: a list of strings without newlines, or `StringList`.
199 - `input_offset`: the line offset of `input_lines` from the beginning
200 of the file.
201 - `context`: application-specific storage.
202 - `input_source`: name or path of source of `input_lines`.
203 - `initial_state`: name of initial state.
205 self.runtime_init()
206 if isinstance(input_lines, StringList):
207 self.input_lines = input_lines
208 else:
209 self.input_lines = StringList(input_lines, source=input_source)
210 self.input_offset = input_offset
211 self.line_offset = -1
212 self.current_state = initial_state or self.initial_state
213 if self.debug:
214 print >>self._stderr, (
215 u'\nStateMachine.run: input_lines (line_offset=%s):\n| %s'
216 % (self.line_offset, u'\n| '.join(self.input_lines)))
217 transitions = None
218 results = []
219 state = self.get_state()
220 try:
221 if self.debug:
222 print >>self._stderr, '\nStateMachine.run: bof transition'
223 context, result = state.bof(context)
224 results.extend(result)
225 while 1:
226 try:
227 try:
228 self.next_line()
229 if self.debug:
230 source, offset = self.input_lines.info(
231 self.line_offset)
232 print >>self._stderr, (
233 u'\nStateMachine.run: line (source=%r, '
234 u'offset=%r):\n| %s'
235 % (source, offset, self.line))
236 context, next_state, result = self.check_line(
237 context, state, transitions)
238 except EOFError:
239 if self.debug:
240 print >>self._stderr, (
241 '\nStateMachine.run: %s.eof transition'
242 % state.__class__.__name__)
243 result = state.eof(context)
244 results.extend(result)
245 break
246 else:
247 results.extend(result)
248 except TransitionCorrection, exception:
249 self.previous_line() # back up for another try
250 transitions = (exception.args[0],)
251 if self.debug:
252 print >>self._stderr, (
253 '\nStateMachine.run: TransitionCorrection to '
254 'state "%s", transition %s.'
255 % (state.__class__.__name__, transitions[0]))
256 continue
257 except StateCorrection, exception:
258 self.previous_line() # back up for another try
259 next_state = exception.args[0]
260 if len(exception.args) == 1:
261 transitions = None
262 else:
263 transitions = (exception.args[1],)
264 if self.debug:
265 print >>self._stderr, (
266 '\nStateMachine.run: StateCorrection to state '
267 '"%s", transition %s.'
268 % (next_state, transitions[0]))
269 else:
270 transitions = None
271 state = self.get_state(next_state)
272 except:
273 if self.debug:
274 self.error()
275 raise
276 self.observers = []
277 return results
279 def get_state(self, next_state=None):
281 Return current state object; set it first if `next_state` given.
283 Parameter `next_state`: a string, the name of the next state.
285 Exception: `UnknownStateError` raised if `next_state` unknown.
287 if next_state:
288 if self.debug and next_state != self.current_state:
289 print >>self._stderr, (
290 '\nStateMachine.get_state: Changing state from '
291 '"%s" to "%s" (input line %s).'
292 % (self.current_state, next_state,
293 self.abs_line_number()))
294 self.current_state = next_state
295 try:
296 return self.states[self.current_state]
297 except KeyError:
298 raise UnknownStateError(self.current_state)
300 def next_line(self, n=1):
301 """Load `self.line` with the `n`'th next line and return it."""
302 try:
303 try:
304 self.line_offset += n
305 self.line = self.input_lines[self.line_offset]
306 except IndexError:
307 self.line = None
308 raise EOFError
309 return self.line
310 finally:
311 self.notify_observers()
313 def is_next_line_blank(self):
314 """Return 1 if the next line is blank or non-existant."""
315 try:
316 return not self.input_lines[self.line_offset + 1].strip()
317 except IndexError:
318 return 1
320 def at_eof(self):
321 """Return 1 if the input is at or past end-of-file."""
322 return self.line_offset >= len(self.input_lines) - 1
324 def at_bof(self):
325 """Return 1 if the input is at or before beginning-of-file."""
326 return self.line_offset <= 0
328 def previous_line(self, n=1):
329 """Load `self.line` with the `n`'th previous line and return it."""
330 self.line_offset -= n
331 if self.line_offset < 0:
332 self.line = None
333 else:
334 self.line = self.input_lines[self.line_offset]
335 self.notify_observers()
336 return self.line
338 def goto_line(self, line_offset):
339 """Jump to absolute line offset `line_offset`, load and return it."""
340 try:
341 try:
342 self.line_offset = line_offset - self.input_offset
343 self.line = self.input_lines[self.line_offset]
344 except IndexError:
345 self.line = None
346 raise EOFError
347 return self.line
348 finally:
349 self.notify_observers()
351 def get_source(self, line_offset):
352 """Return source of line at absolute line offset `line_offset`."""
353 return self.input_lines.source(line_offset - self.input_offset)
355 def abs_line_offset(self):
356 """Return line offset of current line, from beginning of file."""
357 return self.line_offset + self.input_offset
359 def abs_line_number(self):
360 """Return line number of current line (counting from 1)."""
361 return self.line_offset + self.input_offset + 1
363 def get_source_and_line(self, lineno=None):
364 """Return (source, line) tuple for current or given line number.
366 Looks up the source and line number in the `self.input_lines`
367 StringList instance to count for included source files.
369 If the optional argument `lineno` is given, convert it from an
370 absolute line number to the corresponding (source, line) pair.
372 if lineno is None:
373 offset = self.line_offset
374 else:
375 offset = lineno - self.input_offset - 1
376 try:
377 src, srcoffset = self.input_lines.info(offset)
378 srcline = srcoffset + 1
379 except (TypeError):
380 # line is None if index is "Just past the end"
381 src, srcline = self.get_source_and_line(offset + self.input_offset)
382 return src, srcline + 1
383 except (IndexError): # `offset` is off the list
384 src, srcline = None, None
385 # raise AssertionError('cannot find line %d in %s lines' %
386 # (offset, len(self.input_lines)))
387 # # list(self.input_lines.lines())))
388 # assert offset == srcoffset, str(self.input_lines)
389 # print "get_source_and_line(%s):" % lineno,
390 # print offset + 1, '->', src, srcline
391 # print self.input_lines
392 return (src, srcline)
394 def insert_input(self, input_lines, source):
395 self.input_lines.insert(self.line_offset + 1, '',
396 source='internal padding after '+source,
397 offset=len(input_lines))
398 self.input_lines.insert(self.line_offset + 1, '',
399 source='internal padding before '+source,
400 offset=-1)
401 self.input_lines.insert(self.line_offset + 2,
402 StringList(input_lines, source))
404 def get_text_block(self, flush_left=0):
406 Return a contiguous block of text.
408 If `flush_left` is true, raise `UnexpectedIndentationError` if an
409 indented line is encountered before the text block ends (with a blank
410 line).
412 try:
413 block = self.input_lines.get_text_block(self.line_offset,
414 flush_left)
415 self.next_line(len(block) - 1)
416 return block
417 except UnexpectedIndentationError, error:
418 block, source, lineno = error.args
419 self.next_line(len(block) - 1) # advance to last line of block
420 raise
422 def check_line(self, context, state, transitions=None):
424 Examine one line of input for a transition match & execute its method.
426 Parameters:
428 - `context`: application-dependent storage.
429 - `state`: a `State` object, the current state.
430 - `transitions`: an optional ordered list of transition names to try,
431 instead of ``state.transition_order``.
433 Return the values returned by the transition method:
435 - context: possibly modified from the parameter `context`;
436 - next state name (`State` subclass name);
437 - the result output of the transition, a list.
439 When there is no match, ``state.no_match()`` is called and its return
440 value is returned.
442 if transitions is None:
443 transitions = state.transition_order
444 state_correction = None
445 if self.debug:
446 print >>self._stderr, (
447 '\nStateMachine.check_line: state="%s", transitions=%r.'
448 % (state.__class__.__name__, transitions))
449 for name in transitions:
450 pattern, method, next_state = state.transitions[name]
451 match = pattern.match(self.line)
452 if match:
453 if self.debug:
454 print >>self._stderr, (
455 '\nStateMachine.check_line: Matched transition '
456 '"%s" in state "%s".'
457 % (name, state.__class__.__name__))
458 return method(match, context, next_state)
459 else:
460 if self.debug:
461 print >>self._stderr, (
462 '\nStateMachine.check_line: No match in state "%s".'
463 % state.__class__.__name__)
464 return state.no_match(context, transitions)
466 def add_state(self, state_class):
468 Initialize & add a `state_class` (`State` subclass) object.
470 Exception: `DuplicateStateError` raised if `state_class` was already
471 added.
473 statename = state_class.__name__
474 if statename in self.states:
475 raise DuplicateStateError(statename)
476 self.states[statename] = state_class(self, self.debug)
478 def add_states(self, state_classes):
480 Add `state_classes` (a list of `State` subclasses).
482 for state_class in state_classes:
483 self.add_state(state_class)
485 def runtime_init(self):
487 Initialize `self.states`.
489 for state in self.states.values():
490 state.runtime_init()
492 def error(self):
493 """Report error details."""
494 type, value, module, line, function = _exception_data()
495 print >>self._stderr, u'%s: %s' % (type, value)
496 print >>self._stderr, 'input line %s' % (self.abs_line_number())
497 print >>self._stderr, (u'module %s, line %s, function %s' %
498 (module, line, function))
500 def attach_observer(self, observer):
502 The `observer` parameter is a function or bound method which takes two
503 arguments, the source and offset of the current line.
505 self.observers.append(observer)
507 def detach_observer(self, observer):
508 self.observers.remove(observer)
510 def notify_observers(self):
511 for observer in self.observers:
512 try:
513 info = self.input_lines.info(self.line_offset)
514 except IndexError:
515 info = (None, None)
516 observer(*info)
519 class State:
522 State superclass. Contains a list of transitions, and transition methods.
524 Transition methods all have the same signature. They take 3 parameters:
526 - An `re` match object. ``match.string`` contains the matched input line,
527 ``match.start()`` gives the start index of the match, and
528 ``match.end()`` gives the end index.
529 - A context object, whose meaning is application-defined (initial value
530 ``None``). It can be used to store any information required by the state
531 machine, and the retured context is passed on to the next transition
532 method unchanged.
533 - The name of the next state, a string, taken from the transitions list;
534 normally it is returned unchanged, but it may be altered by the
535 transition method if necessary.
537 Transition methods all return a 3-tuple:
539 - A context object, as (potentially) modified by the transition method.
540 - The next state name (a return value of ``None`` means no state change).
541 - The processing result, a list, which is accumulated by the state
542 machine.
544 Transition methods may raise an `EOFError` to cut processing short.
546 There are two implicit transitions, and corresponding transition methods
547 are defined: `bof()` handles the beginning-of-file, and `eof()` handles
548 the end-of-file. These methods have non-standard signatures and return
549 values. `bof()` returns the initial context and results, and may be used
550 to return a header string, or do any other processing needed. `eof()`
551 should handle any remaining context and wrap things up; it returns the
552 final processing result.
554 Typical applications need only subclass `State` (or a subclass), set the
555 `patterns` and `initial_transitions` class attributes, and provide
556 corresponding transition methods. The default object initialization will
557 take care of constructing the list of transitions.
560 patterns = None
562 {Name: pattern} mapping, used by `make_transition()`. Each pattern may
563 be a string or a compiled `re` pattern. Override in subclasses.
566 initial_transitions = None
568 A list of transitions to initialize when a `State` is instantiated.
569 Each entry is either a transition name string, or a (transition name, next
570 state name) pair. See `make_transitions()`. Override in subclasses.
573 nested_sm = None
575 The `StateMachine` class for handling nested processing.
577 If left as ``None``, `nested_sm` defaults to the class of the state's
578 controlling state machine. Override it in subclasses to avoid the default.
581 nested_sm_kwargs = None
583 Keyword arguments dictionary, passed to the `nested_sm` constructor.
585 Two keys must have entries in the dictionary:
587 - Key 'state_classes' must be set to a list of `State` classes.
588 - Key 'initial_state' must be set to the name of the initial state class.
590 If `nested_sm_kwargs` is left as ``None``, 'state_classes' defaults to the
591 class of the current state, and 'initial_state' defaults to the name of
592 the class of the current state. Override in subclasses to avoid the
593 defaults.
596 def __init__(self, state_machine, debug=0):
598 Initialize a `State` object; make & add initial transitions.
600 Parameters:
602 - `statemachine`: the controlling `StateMachine` object.
603 - `debug`: a boolean; produce verbose output if true (nonzero).
606 self.transition_order = []
607 """A list of transition names in search order."""
609 self.transitions = {}
611 A mapping of transition names to 3-tuples containing
612 (compiled_pattern, transition_method, next_state_name). Initialized as
613 an instance attribute dynamically (instead of as a class attribute)
614 because it may make forward references to patterns and methods in this
615 or other classes.
618 self.add_initial_transitions()
620 self.state_machine = state_machine
621 """A reference to the controlling `StateMachine` object."""
623 self.debug = debug
624 """Debugging mode on/off."""
626 if self.nested_sm is None:
627 self.nested_sm = self.state_machine.__class__
628 if self.nested_sm_kwargs is None:
629 self.nested_sm_kwargs = {'state_classes': [self.__class__],
630 'initial_state': self.__class__.__name__}
632 def runtime_init(self):
634 Initialize this `State` before running the state machine; called from
635 `self.state_machine.run()`.
637 pass
639 def unlink(self):
640 """Remove circular references to objects no longer required."""
641 self.state_machine = None
643 def add_initial_transitions(self):
644 """Make and add transitions listed in `self.initial_transitions`."""
645 if self.initial_transitions:
646 names, transitions = self.make_transitions(
647 self.initial_transitions)
648 self.add_transitions(names, transitions)
650 def add_transitions(self, names, transitions):
652 Add a list of transitions to the start of the transition list.
654 Parameters:
656 - `names`: a list of transition names.
657 - `transitions`: a mapping of names to transition tuples.
659 Exceptions: `DuplicateTransitionError`, `UnknownTransitionError`.
661 for name in names:
662 if name in self.transitions:
663 raise DuplicateTransitionError(name)
664 if name not in transitions:
665 raise UnknownTransitionError(name)
666 self.transition_order[:0] = names
667 self.transitions.update(transitions)
669 def add_transition(self, name, transition):
671 Add a transition to the start of the transition list.
673 Parameter `transition`: a ready-made transition 3-tuple.
675 Exception: `DuplicateTransitionError`.
677 if name in self.transitions:
678 raise DuplicateTransitionError(name)
679 self.transition_order[:0] = [name]
680 self.transitions[name] = transition
682 def remove_transition(self, name):
684 Remove a transition by `name`.
686 Exception: `UnknownTransitionError`.
688 try:
689 del self.transitions[name]
690 self.transition_order.remove(name)
691 except:
692 raise UnknownTransitionError(name)
694 def make_transition(self, name, next_state=None):
696 Make & return a transition tuple based on `name`.
698 This is a convenience function to simplify transition creation.
700 Parameters:
702 - `name`: a string, the name of the transition pattern & method. This
703 `State` object must have a method called '`name`', and a dictionary
704 `self.patterns` containing a key '`name`'.
705 - `next_state`: a string, the name of the next `State` object for this
706 transition. A value of ``None`` (or absent) implies no state change
707 (i.e., continue with the same state).
709 Exceptions: `TransitionPatternNotFound`, `TransitionMethodNotFound`.
711 if next_state is None:
712 next_state = self.__class__.__name__
713 try:
714 pattern = self.patterns[name]
715 if not hasattr(pattern, 'match'):
716 pattern = re.compile(pattern)
717 except KeyError:
718 raise TransitionPatternNotFound(
719 '%s.patterns[%r]' % (self.__class__.__name__, name))
720 try:
721 method = getattr(self, name)
722 except AttributeError:
723 raise TransitionMethodNotFound(
724 '%s.%s' % (self.__class__.__name__, name))
725 return (pattern, method, next_state)
727 def make_transitions(self, name_list):
729 Return a list of transition names and a transition mapping.
731 Parameter `name_list`: a list, where each entry is either a transition
732 name string, or a 1- or 2-tuple (transition name, optional next state
733 name).
735 stringtype = type('')
736 names = []
737 transitions = {}
738 for namestate in name_list:
739 if type(namestate) is stringtype:
740 transitions[namestate] = self.make_transition(namestate)
741 names.append(namestate)
742 else:
743 transitions[namestate[0]] = self.make_transition(*namestate)
744 names.append(namestate[0])
745 return names, transitions
747 def no_match(self, context, transitions):
749 Called when there is no match from `StateMachine.check_line()`.
751 Return the same values returned by transition methods:
753 - context: unchanged;
754 - next state name: ``None``;
755 - empty result list.
757 Override in subclasses to catch this event.
759 return context, None, []
761 def bof(self, context):
763 Handle beginning-of-file. Return unchanged `context`, empty result.
765 Override in subclasses.
767 Parameter `context`: application-defined storage.
769 return context, []
771 def eof(self, context):
773 Handle end-of-file. Return empty result.
775 Override in subclasses.
777 Parameter `context`: application-defined storage.
779 return []
781 def nop(self, match, context, next_state):
783 A "do nothing" transition method.
785 Return unchanged `context` & `next_state`, empty result. Useful for
786 simple state changes (actionless transitions).
788 return context, next_state, []
791 class StateMachineWS(StateMachine):
794 `StateMachine` subclass specialized for whitespace recognition.
796 There are three methods provided for extracting indented text blocks:
798 - `get_indented()`: use when the indent is unknown.
799 - `get_known_indented()`: use when the indent is known for all lines.
800 - `get_first_known_indented()`: use when only the first line's indent is
801 known.
804 def get_indented(self, until_blank=0, strip_indent=1):
806 Return a block of indented lines of text, and info.
808 Extract an indented block where the indent is unknown for all lines.
810 :Parameters:
811 - `until_blank`: Stop collecting at the first blank line if true
812 (1).
813 - `strip_indent`: Strip common leading indent if true (1,
814 default).
816 :Return:
817 - the indented block (a list of lines of text),
818 - its indent,
819 - its first line offset from BOF, and
820 - whether or not it finished with a blank line.
822 offset = self.abs_line_offset()
823 indented, indent, blank_finish = self.input_lines.get_indented(
824 self.line_offset, until_blank, strip_indent)
825 if indented:
826 self.next_line(len(indented) - 1) # advance to last indented line
827 while indented and not indented[0].strip():
828 indented.trim_start()
829 offset += 1
830 return indented, indent, offset, blank_finish
832 def get_known_indented(self, indent, until_blank=0, strip_indent=1):
834 Return an indented block and info.
836 Extract an indented block where the indent is known for all lines.
837 Starting with the current line, extract the entire text block with at
838 least `indent` indentation (which must be whitespace, except for the
839 first line).
841 :Parameters:
842 - `indent`: The number of indent columns/characters.
843 - `until_blank`: Stop collecting at the first blank line if true
844 (1).
845 - `strip_indent`: Strip `indent` characters of indentation if true
846 (1, default).
848 :Return:
849 - the indented block,
850 - its first line offset from BOF, and
851 - whether or not it finished with a blank line.
853 offset = self.abs_line_offset()
854 indented, indent, blank_finish = self.input_lines.get_indented(
855 self.line_offset, until_blank, strip_indent,
856 block_indent=indent)
857 self.next_line(len(indented) - 1) # advance to last indented line
858 while indented and not indented[0].strip():
859 indented.trim_start()
860 offset += 1
861 return indented, offset, blank_finish
863 def get_first_known_indented(self, indent, until_blank=0, strip_indent=1,
864 strip_top=1):
866 Return an indented block and info.
868 Extract an indented block where the indent is known for the first line
869 and unknown for all other lines.
871 :Parameters:
872 - `indent`: The first line's indent (# of columns/characters).
873 - `until_blank`: Stop collecting at the first blank line if true
874 (1).
875 - `strip_indent`: Strip `indent` characters of indentation if true
876 (1, default).
877 - `strip_top`: Strip blank lines from the beginning of the block.
879 :Return:
880 - the indented block,
881 - its indent,
882 - its first line offset from BOF, and
883 - whether or not it finished with a blank line.
885 offset = self.abs_line_offset()
886 indented, indent, blank_finish = self.input_lines.get_indented(
887 self.line_offset, until_blank, strip_indent,
888 first_indent=indent)
889 self.next_line(len(indented) - 1) # advance to last indented line
890 if strip_top:
891 while indented and not indented[0].strip():
892 indented.trim_start()
893 offset += 1
894 return indented, indent, offset, blank_finish
897 class StateWS(State):
900 State superclass specialized for whitespace (blank lines & indents).
902 Use this class with `StateMachineWS`. The transitions 'blank' (for blank
903 lines) and 'indent' (for indented text blocks) are added automatically,
904 before any other transitions. The transition method `blank()` handles
905 blank lines and `indent()` handles nested indented blocks. Indented
906 blocks trigger a new state machine to be created by `indent()` and run.
907 The class of the state machine to be created is in `indent_sm`, and the
908 constructor keyword arguments are in the dictionary `indent_sm_kwargs`.
910 The methods `known_indent()` and `firstknown_indent()` are provided for
911 indented blocks where the indent (all lines' and first line's only,
912 respectively) is known to the transition method, along with the attributes
913 `known_indent_sm` and `known_indent_sm_kwargs`. Neither transition method
914 is triggered automatically.
917 indent_sm = None
919 The `StateMachine` class handling indented text blocks.
921 If left as ``None``, `indent_sm` defaults to the value of
922 `State.nested_sm`. Override it in subclasses to avoid the default.
925 indent_sm_kwargs = None
927 Keyword arguments dictionary, passed to the `indent_sm` constructor.
929 If left as ``None``, `indent_sm_kwargs` defaults to the value of
930 `State.nested_sm_kwargs`. Override it in subclasses to avoid the default.
933 known_indent_sm = None
935 The `StateMachine` class handling known-indented text blocks.
937 If left as ``None``, `known_indent_sm` defaults to the value of
938 `indent_sm`. Override it in subclasses to avoid the default.
941 known_indent_sm_kwargs = None
943 Keyword arguments dictionary, passed to the `known_indent_sm` constructor.
945 If left as ``None``, `known_indent_sm_kwargs` defaults to the value of
946 `indent_sm_kwargs`. Override it in subclasses to avoid the default.
949 ws_patterns = {'blank': ' *$',
950 'indent': ' +'}
951 """Patterns for default whitespace transitions. May be overridden in
952 subclasses."""
954 ws_initial_transitions = ('blank', 'indent')
955 """Default initial whitespace transitions, added before those listed in
956 `State.initial_transitions`. May be overridden in subclasses."""
958 def __init__(self, state_machine, debug=0):
960 Initialize a `StateSM` object; extends `State.__init__()`.
962 Check for indent state machine attributes, set defaults if not set.
964 State.__init__(self, state_machine, debug)
965 if self.indent_sm is None:
966 self.indent_sm = self.nested_sm
967 if self.indent_sm_kwargs is None:
968 self.indent_sm_kwargs = self.nested_sm_kwargs
969 if self.known_indent_sm is None:
970 self.known_indent_sm = self.indent_sm
971 if self.known_indent_sm_kwargs is None:
972 self.known_indent_sm_kwargs = self.indent_sm_kwargs
974 def add_initial_transitions(self):
976 Add whitespace-specific transitions before those defined in subclass.
978 Extends `State.add_initial_transitions()`.
980 State.add_initial_transitions(self)
981 if self.patterns is None:
982 self.patterns = {}
983 self.patterns.update(self.ws_patterns)
984 names, transitions = self.make_transitions(
985 self.ws_initial_transitions)
986 self.add_transitions(names, transitions)
988 def blank(self, match, context, next_state):
989 """Handle blank lines. Does nothing. Override in subclasses."""
990 return self.nop(match, context, next_state)
992 def indent(self, match, context, next_state):
994 Handle an indented text block. Extend or override in subclasses.
996 Recursively run the registered state machine for indented blocks
997 (`self.indent_sm`).
999 indented, indent, line_offset, blank_finish = \
1000 self.state_machine.get_indented()
1001 sm = self.indent_sm(debug=self.debug, **self.indent_sm_kwargs)
1002 results = sm.run(indented, input_offset=line_offset)
1003 return context, next_state, results
1005 def known_indent(self, match, context, next_state):
1007 Handle a known-indent text block. Extend or override in subclasses.
1009 Recursively run the registered state machine for known-indent indented
1010 blocks (`self.known_indent_sm`). The indent is the length of the
1011 match, ``match.end()``.
1013 indented, line_offset, blank_finish = \
1014 self.state_machine.get_known_indented(match.end())
1015 sm = self.known_indent_sm(debug=self.debug,
1016 **self.known_indent_sm_kwargs)
1017 results = sm.run(indented, input_offset=line_offset)
1018 return context, next_state, results
1020 def first_known_indent(self, match, context, next_state):
1022 Handle an indented text block (first line's indent known).
1024 Extend or override in subclasses.
1026 Recursively run the registered state machine for known-indent indented
1027 blocks (`self.known_indent_sm`). The indent is the length of the
1028 match, ``match.end()``.
1030 indented, line_offset, blank_finish = \
1031 self.state_machine.get_first_known_indented(match.end())
1032 sm = self.known_indent_sm(debug=self.debug,
1033 **self.known_indent_sm_kwargs)
1034 results = sm.run(indented, input_offset=line_offset)
1035 return context, next_state, results
1038 class _SearchOverride:
1041 Mix-in class to override `StateMachine` regular expression behavior.
1043 Changes regular expression matching, from the default `re.match()`
1044 (succeeds only if the pattern matches at the start of `self.line`) to
1045 `re.search()` (succeeds if the pattern matches anywhere in `self.line`).
1046 When subclassing a `StateMachine`, list this class **first** in the
1047 inheritance list of the class definition.
1050 def match(self, pattern):
1052 Return the result of a regular expression search.
1054 Overrides `StateMachine.match()`.
1056 Parameter `pattern`: `re` compiled regular expression.
1058 return pattern.search(self.line)
1061 class SearchStateMachine(_SearchOverride, StateMachine):
1062 """`StateMachine` which uses `re.search()` instead of `re.match()`."""
1063 pass
1066 class SearchStateMachineWS(_SearchOverride, StateMachineWS):
1067 """`StateMachineWS` which uses `re.search()` instead of `re.match()`."""
1068 pass
1071 class ViewList:
1074 List with extended functionality: slices of ViewList objects are child
1075 lists, linked to their parents. Changes made to a child list also affect
1076 the parent list. A child list is effectively a "view" (in the SQL sense)
1077 of the parent list. Changes to parent lists, however, do *not* affect
1078 active child lists. If a parent list is changed, any active child lists
1079 should be recreated.
1081 The start and end of the slice can be trimmed using the `trim_start()` and
1082 `trim_end()` methods, without affecting the parent list. The link between
1083 child and parent lists can be broken by calling `disconnect()` on the
1084 child list.
1086 Also, ViewList objects keep track of the source & offset of each item.
1087 This information is accessible via the `source()`, `offset()`, and
1088 `info()` methods.
1091 def __init__(self, initlist=None, source=None, items=None,
1092 parent=None, parent_offset=None):
1093 self.data = []
1094 """The actual list of data, flattened from various sources."""
1096 self.items = []
1097 """A list of (source, offset) pairs, same length as `self.data`: the
1098 source of each line and the offset of each line from the beginning of
1099 its source."""
1101 self.parent = parent
1102 """The parent list."""
1104 self.parent_offset = parent_offset
1105 """Offset of this list from the beginning of the parent list."""
1107 if isinstance(initlist, ViewList):
1108 self.data = initlist.data[:]
1109 self.items = initlist.items[:]
1110 elif initlist is not None:
1111 self.data = list(initlist)
1112 if items:
1113 self.items = items
1114 else:
1115 self.items = [(source, i) for i in range(len(initlist))]
1116 assert len(self.data) == len(self.items), 'data mismatch'
1118 def __str__(self):
1119 return str(self.data)
1121 def __repr__(self):
1122 return '%s(%s, items=%s)' % (self.__class__.__name__,
1123 self.data, self.items)
1125 def __lt__(self, other): return self.data < self.__cast(other)
1126 def __le__(self, other): return self.data <= self.__cast(other)
1127 def __eq__(self, other): return self.data == self.__cast(other)
1128 def __ne__(self, other): return self.data != self.__cast(other)
1129 def __gt__(self, other): return self.data > self.__cast(other)
1130 def __ge__(self, other): return self.data >= self.__cast(other)
1131 def __cmp__(self, other): return cmp(self.data, self.__cast(other))
1133 def __cast(self, other):
1134 if isinstance(other, ViewList):
1135 return other.data
1136 else:
1137 return other
1139 def __contains__(self, item): return item in self.data
1140 def __len__(self): return len(self.data)
1142 # The __getitem__()/__setitem__() methods check whether the index
1143 # is a slice first, since indexing a native list with a slice object
1144 # just works.
1146 def __getitem__(self, i):
1147 if isinstance(i, types.SliceType):
1148 assert i.step in (None, 1), 'cannot handle slice with stride'
1149 return self.__class__(self.data[i.start:i.stop],
1150 items=self.items[i.start:i.stop],
1151 parent=self, parent_offset=i.start or 0)
1152 else:
1153 return self.data[i]
1155 def __setitem__(self, i, item):
1156 if isinstance(i, types.SliceType):
1157 assert i.step in (None, 1), 'cannot handle slice with stride'
1158 if not isinstance(item, ViewList):
1159 raise TypeError('assigning non-ViewList to ViewList slice')
1160 self.data[i.start:i.stop] = item.data
1161 self.items[i.start:i.stop] = item.items
1162 assert len(self.data) == len(self.items), 'data mismatch'
1163 if self.parent:
1164 self.parent[(i.start or 0) + self.parent_offset
1165 : (i.stop or len(self)) + self.parent_offset] = item
1166 else:
1167 self.data[i] = item
1168 if self.parent:
1169 self.parent[i + self.parent_offset] = item
1171 def __delitem__(self, i):
1172 try:
1173 del self.data[i]
1174 del self.items[i]
1175 if self.parent:
1176 del self.parent[i + self.parent_offset]
1177 except TypeError:
1178 assert i.step is None, 'cannot handle slice with stride'
1179 del self.data[i.start:i.stop]
1180 del self.items[i.start:i.stop]
1181 if self.parent:
1182 del self.parent[(i.start or 0) + self.parent_offset
1183 : (i.stop or len(self)) + self.parent_offset]
1185 def __add__(self, other):
1186 if isinstance(other, ViewList):
1187 return self.__class__(self.data + other.data,
1188 items=(self.items + other.items))
1189 else:
1190 raise TypeError('adding non-ViewList to a ViewList')
1192 def __radd__(self, other):
1193 if isinstance(other, ViewList):
1194 return self.__class__(other.data + self.data,
1195 items=(other.items + self.items))
1196 else:
1197 raise TypeError('adding ViewList to a non-ViewList')
1199 def __iadd__(self, other):
1200 if isinstance(other, ViewList):
1201 self.data += other.data
1202 else:
1203 raise TypeError('argument to += must be a ViewList')
1204 return self
1206 def __mul__(self, n):
1207 return self.__class__(self.data * n, items=(self.items * n))
1209 __rmul__ = __mul__
1211 def __imul__(self, n):
1212 self.data *= n
1213 self.items *= n
1214 return self
1216 def extend(self, other):
1217 if not isinstance(other, ViewList):
1218 raise TypeError('extending a ViewList with a non-ViewList')
1219 if self.parent:
1220 self.parent.insert(len(self.data) + self.parent_offset, other)
1221 self.data.extend(other.data)
1222 self.items.extend(other.items)
1224 def append(self, item, source=None, offset=0):
1225 if source is None:
1226 self.extend(item)
1227 else:
1228 if self.parent:
1229 self.parent.insert(len(self.data) + self.parent_offset, item,
1230 source, offset)
1231 self.data.append(item)
1232 self.items.append((source, offset))
1234 def insert(self, i, item, source=None, offset=0):
1235 if source is None:
1236 if not isinstance(item, ViewList):
1237 raise TypeError('inserting non-ViewList with no source given')
1238 self.data[i:i] = item.data
1239 self.items[i:i] = item.items
1240 if self.parent:
1241 index = (len(self.data) + i) % len(self.data)
1242 self.parent.insert(index + self.parent_offset, item)
1243 else:
1244 self.data.insert(i, item)
1245 self.items.insert(i, (source, offset))
1246 if self.parent:
1247 index = (len(self.data) + i) % len(self.data)
1248 self.parent.insert(index + self.parent_offset, item,
1249 source, offset)
1251 def pop(self, i=-1):
1252 if self.parent:
1253 index = (len(self.data) + i) % len(self.data)
1254 self.parent.pop(index + self.parent_offset)
1255 self.items.pop(i)
1256 return self.data.pop(i)
1258 def trim_start(self, n=1):
1260 Remove items from the start of the list, without touching the parent.
1262 if n > len(self.data):
1263 raise IndexError("Size of trim too large; can't trim %s items "
1264 "from a list of size %s." % (n, len(self.data)))
1265 elif n < 0:
1266 raise IndexError('Trim size must be >= 0.')
1267 del self.data[:n]
1268 del self.items[:n]
1269 if self.parent:
1270 self.parent_offset += n
1272 def trim_end(self, n=1):
1274 Remove items from the end of the list, without touching the parent.
1276 if n > len(self.data):
1277 raise IndexError("Size of trim too large; can't trim %s items "
1278 "from a list of size %s." % (n, len(self.data)))
1279 elif n < 0:
1280 raise IndexError('Trim size must be >= 0.')
1281 del self.data[-n:]
1282 del self.items[-n:]
1284 def remove(self, item):
1285 index = self.index(item)
1286 del self[index]
1288 def count(self, item): return self.data.count(item)
1289 def index(self, item): return self.data.index(item)
1291 def reverse(self):
1292 self.data.reverse()
1293 self.items.reverse()
1294 self.parent = None
1296 def sort(self, *args):
1297 tmp = zip(self.data, self.items)
1298 tmp.sort(*args)
1299 self.data = [entry[0] for entry in tmp]
1300 self.items = [entry[1] for entry in tmp]
1301 self.parent = None
1303 def info(self, i):
1304 """Return source & offset for index `i`."""
1305 try:
1306 return self.items[i]
1307 except IndexError:
1308 if i == len(self.data): # Just past the end
1309 return self.items[i - 1][0], None
1310 else:
1311 raise
1313 def source(self, i):
1314 """Return source for index `i`."""
1315 return self.info(i)[0]
1317 def offset(self, i):
1318 """Return offset for index `i`."""
1319 return self.info(i)[1]
1321 def disconnect(self):
1322 """Break link between this list and parent list."""
1323 self.parent = None
1325 def xitems(self):
1326 """Return iterator yielding (source, offset, value) tuples."""
1327 for (value, (source, offset)) in zip(self.data, self.items):
1328 yield (source, offset, value)
1330 def pprint(self):
1331 """Print the list in `grep` format (`source:offset:value` lines)"""
1332 for line in self.xitems():
1333 print "%s:%d:%s" % line
1336 class StringList(ViewList):
1338 """A `ViewList` with string-specific methods."""
1340 def trim_left(self, length, start=0, end=sys.maxint):
1342 Trim `length` characters off the beginning of each item, in-place,
1343 from index `start` to `end`. No whitespace-checking is done on the
1344 trimmed text. Does not affect slice parent.
1346 self.data[start:end] = [line[length:]
1347 for line in self.data[start:end]]
1349 def get_text_block(self, start, flush_left=0):
1351 Return a contiguous block of text.
1353 If `flush_left` is true, raise `UnexpectedIndentationError` if an
1354 indented line is encountered before the text block ends (with a blank
1355 line).
1357 end = start
1358 last = len(self.data)
1359 while end < last:
1360 line = self.data[end]
1361 if not line.strip():
1362 break
1363 if flush_left and (line[0] == ' '):
1364 source, offset = self.info(end)
1365 raise UnexpectedIndentationError(self[start:end], source,
1366 offset + 1)
1367 end += 1
1368 return self[start:end]
1370 def get_indented(self, start=0, until_blank=0, strip_indent=1,
1371 block_indent=None, first_indent=None):
1373 Extract and return a StringList of indented lines of text.
1375 Collect all lines with indentation, determine the minimum indentation,
1376 remove the minimum indentation from all indented lines (unless
1377 `strip_indent` is false), and return them. All lines up to but not
1378 including the first unindented line will be returned.
1380 :Parameters:
1381 - `start`: The index of the first line to examine.
1382 - `until_blank`: Stop collecting at the first blank line if true.
1383 - `strip_indent`: Strip common leading indent if true (default).
1384 - `block_indent`: The indent of the entire block, if known.
1385 - `first_indent`: The indent of the first line, if known.
1387 :Return:
1388 - a StringList of indented lines with mininum indent removed;
1389 - the amount of the indent;
1390 - a boolean: did the indented block finish with a blank line or EOF?
1392 indent = block_indent # start with None if unknown
1393 end = start
1394 if block_indent is not None and first_indent is None:
1395 first_indent = block_indent
1396 if first_indent is not None:
1397 end += 1
1398 last = len(self.data)
1399 while end < last:
1400 line = self.data[end]
1401 if line and (line[0] != ' '
1402 or (block_indent is not None
1403 and line[:block_indent].strip())):
1404 # Line not indented or insufficiently indented.
1405 # Block finished properly iff the last indented line blank:
1406 blank_finish = ((end > start)
1407 and not self.data[end - 1].strip())
1408 break
1409 stripped = line.lstrip()
1410 if not stripped: # blank line
1411 if until_blank:
1412 blank_finish = 1
1413 break
1414 elif block_indent is None:
1415 line_indent = len(line) - len(stripped)
1416 if indent is None:
1417 indent = line_indent
1418 else:
1419 indent = min(indent, line_indent)
1420 end += 1
1421 else:
1422 blank_finish = 1 # block ends at end of lines
1423 block = self[start:end]
1424 if first_indent is not None and block:
1425 block.data[0] = block.data[0][first_indent:]
1426 if indent and strip_indent:
1427 block.trim_left(indent, start=(first_indent is not None))
1428 return block, indent or 0, blank_finish
1430 def get_2D_block(self, top, left, bottom, right, strip_indent=1):
1431 block = self[top:bottom]
1432 indent = right
1433 for i in range(len(block.data)):
1434 block.data[i] = line = block.data[i][left:right].rstrip()
1435 if line:
1436 indent = min(indent, len(line) - len(line.lstrip()))
1437 if strip_indent and 0 < indent < right:
1438 block.data = [line[indent:] for line in block.data]
1439 return block
1441 def pad_double_width(self, pad_char):
1443 Pad all double-width characters in self by appending `pad_char` to each.
1444 For East Asian language support.
1446 if hasattr(unicodedata, 'east_asian_width'):
1447 east_asian_width = unicodedata.east_asian_width
1448 else:
1449 return # new in Python 2.4
1450 for i in range(len(self.data)):
1451 line = self.data[i]
1452 if isinstance(line, unicode):
1453 new = []
1454 for char in line:
1455 new.append(char)
1456 if east_asian_width(char) in 'WF': # 'W'ide & 'F'ull-width
1457 new.append(pad_char)
1458 self.data[i] = ''.join(new)
1460 def replace(self, old, new):
1461 """Replace all occurrences of substring `old` with `new`."""
1462 for i in range(len(self.data)):
1463 self.data[i] = self.data[i].replace(old, new)
1466 class StateMachineError(Exception): pass
1467 class UnknownStateError(StateMachineError): pass
1468 class DuplicateStateError(StateMachineError): pass
1469 class UnknownTransitionError(StateMachineError): pass
1470 class DuplicateTransitionError(StateMachineError): pass
1471 class TransitionPatternNotFound(StateMachineError): pass
1472 class TransitionMethodNotFound(StateMachineError): pass
1473 class UnexpectedIndentationError(StateMachineError): pass
1476 class TransitionCorrection(Exception):
1479 Raise from within a transition method to switch to another transition.
1481 Raise with one argument, the new transition name.
1485 class StateCorrection(Exception):
1488 Raise from within a transition method to switch to another state.
1490 Raise with one or two arguments: new state name, and an optional new
1491 transition name.
1495 def string2lines(astring, tab_width=8, convert_whitespace=0,
1496 whitespace=re.compile('[\v\f]')):
1498 Return a list of one-line strings with tabs expanded, no newlines, and
1499 trailing whitespace stripped.
1501 Each tab is expanded with between 1 and `tab_width` spaces, so that the
1502 next character's index becomes a multiple of `tab_width` (8 by default).
1504 Parameters:
1506 - `astring`: a multi-line string.
1507 - `tab_width`: the number of columns between tab stops.
1508 - `convert_whitespace`: convert form feeds and vertical tabs to spaces?
1510 if convert_whitespace:
1511 astring = whitespace.sub(' ', astring)
1512 return [s.expandtabs(tab_width).rstrip() for s in astring.splitlines()]
1514 def _exception_data():
1516 Return exception information:
1518 - the exception's class name;
1519 - the exception object;
1520 - the name of the file containing the offending code;
1521 - the line number of the offending code;
1522 - the function name of the offending code.
1524 type, value, traceback = sys.exc_info()
1525 while traceback.tb_next:
1526 traceback = traceback.tb_next
1527 code = traceback.tb_frame.f_code
1528 return (type.__name__, value, code.co_filename, traceback.tb_lineno,
1529 code.co_name)