1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 This is the ``docutils.parsers.rst.states`` module, the core of
7 the reStructuredText parser. It defines the following:
9 :Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items.
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: An auxiliary collection class.
31 :Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
36 :Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
40 :Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
43 Parser Overview
44 ===============
46 The reStructuredText parser is implemented as a recursive state machine,
47 examining its input one line at a time. To understand how the parser works,
48 please first become familiar with the `docutils.statemachine` module. In the
49 description below, references are made to classes defined in this module;
50 please see the individual classes for details.
52 Parsing proceeds as follows:
54 1. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
59 2. The method associated with the matched transition pattern is called.
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
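
An illustrative usage sketch (assuming the standard docutils front-end
utilities; the entry point used by a given application may differ): the
state machine is normally driven indirectly, by handing reStructuredText
source to `docutils.parsers.rst.Parser`::

    from docutils.frontend import OptionParser
    from docutils.parsers.rst import Parser
    from docutils.utils import new_document

    parser = Parser()
    settings = OptionParser(components=(Parser,)).get_default_values()
    document = new_document('<sketch>', settings)
    parser.parse('A paragraph with *emphasis*.', document)
    print(document.pformat())   # pseudo-XML view of the document tree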
103 __docformat__ = 'reStructuredText'
106 import sys
107 import re
108 from types import FunctionType, MethodType
110 from docutils import nodes, statemachine, utils
111 from docutils import ApplicationError, DataError
112 from docutils.statemachine import StateMachineWS, StateWS
113 from docutils.nodes import fully_normalize_name as normalize_name
114 from docutils.nodes import whitespace_normalize_name
115 import docutils.parsers.rst
116 from docutils.parsers.rst import directives, languages, tableparser, roles
117 from docutils.parsers.rst.languages import en as _fallback_language_module
118 from docutils.utils import escape2null, unescape, column_width
119 from docutils.utils import punctuation_chars, roman, urischemes
121 class MarkupError(DataError): pass
122 class UnknownInterpretedRoleError(DataError): pass
123 class InterpretedRoleNotImplementedError(DataError): pass
124 class ParserError(ApplicationError): pass
125 class MarkupMismatch(Exception): pass
128 class Struct:
130 """Stores data attributes for dotted-attribute access."""
132 def __init__(self, **keywordargs):
133 self.__dict__.update(keywordargs)
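
# Illustrative sketch (comments only; the attribute names are arbitrary):
# `Struct` is simply a keyword-argument bag with dotted-attribute access.
#
#     memo = Struct(reporter=None, section_level=0)
#     memo.section_level += 1      # attribute access instead of memo['...']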
136 class RSTStateMachine(StateMachineWS):
139 reStructuredText's master StateMachine.
141 The entry point to reStructuredText parsing is the `run()` method.
144 def run(self, input_lines, document, input_offset=0, match_titles=True,
145 inliner=None):
147 Parse `input_lines` and modify the `document` node in place.
149 Extend `StateMachineWS.run()`: set up parse-global data and
150 run the StateMachine.
152 self.language = languages.get_language(
153 document.settings.language_code)
154 self.match_titles = match_titles
155 if inliner is None:
156 inliner = Inliner()
157 inliner.init_customizations(document.settings)
158 self.memo = Struct(document=document,
159 reporter=document.reporter,
160 language=self.language,
161 title_styles=[],
162 section_level=0,
163 section_bubble_up_kludge=False,
164 inliner=inliner)
165 self.document = document
166 self.attach_observer(document.note_source)
167 self.reporter = self.memo.reporter
168 self.node = document
169 results = StateMachineWS.run(self, input_lines, input_offset,
170 input_source=document['source'])
171 assert results == [], 'RSTStateMachine.run() results should be empty!'
172 self.node = self.memo = None # remove unneeded references
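
# Illustrative sketch (comments only): roughly how the rst Parser's parse()
# method drives this master state machine; the exact call site lives in
# docutils/parsers/rst/__init__.py and may differ in detail.
#
#     state_machine = RSTStateMachine(state_classes=state_classes,
#                                     initial_state='Body', debug=False)
#     input_lines = statemachine.string2lines(text, tab_width=8,
#                                             convert_whitespace=True)
#     state_machine.run(input_lines, document)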
175 class NestedStateMachine(StateMachineWS):
178 StateMachine run from within other StateMachine runs, to parse nested
179 document structures.
182 def run(self, input_lines, input_offset, memo, node, match_titles=True):
184 Parse `input_lines` and populate a `docutils.nodes.document` instance.
186 Extend `StateMachineWS.run()`: set up document-wide data.
188 self.match_titles = match_titles
189 self.memo = memo
190 self.document = memo.document
191 self.attach_observer(self.document.note_source)
192 self.reporter = memo.reporter
193 self.language = memo.language
194 self.node = node
195 results = StateMachineWS.run(self, input_lines, input_offset)
196 assert results == [], ('NestedStateMachine.run() results should be '
197 'empty!')
198 return results
201 class RSTState(StateWS):
204 reStructuredText State superclass.
206 Contains methods used by all State subclasses.
209 nested_sm = NestedStateMachine
210 nested_sm_cache = []
212 def __init__(self, state_machine, debug=False):
213 self.nested_sm_kwargs = {'state_classes': state_classes,
214 'initial_state': 'Body'}
215 StateWS.__init__(self, state_machine, debug)
217 def runtime_init(self):
218 StateWS.runtime_init(self)
219 memo = self.state_machine.memo
220 self.memo = memo
221 self.reporter = memo.reporter
222 self.inliner = memo.inliner
223 self.document = memo.document
224 self.parent = self.state_machine.node
225 # enable the reporter to determine source and source-line
226 if not hasattr(self.reporter, 'get_source_and_line'):
227 self.reporter.get_source_and_line = self.state_machine.get_source_and_line
230 def goto_line(self, abs_line_offset):
232 Jump to input line `abs_line_offset`, ignoring jumps past the end.
234 try:
235 self.state_machine.goto_line(abs_line_offset)
236 except EOFError:
237 pass
239 def no_match(self, context, transitions):
241 Override `StateWS.no_match` to generate a system message.
243 This code should never be run.
245 self.reporter.severe(
246 'Internal error: no transition pattern match. State: "%s"; '
247 'transitions: %s; context: %s; current line: %r.'
248 % (self.__class__.__name__, transitions, context,
249 self.state_machine.line))
250 return context, None, []
252 def bof(self, context):
253 """Called at beginning of file."""
254 return [], []
256 def nested_parse(self, block, input_offset, node, match_titles=False,
257 state_machine_class=None, state_machine_kwargs=None):
259 Create a new StateMachine rooted at `node` and run it over the input
260 `block`.
262 use_default = 0
263 if state_machine_class is None:
264 state_machine_class = self.nested_sm
265 use_default += 1
266 if state_machine_kwargs is None:
267 state_machine_kwargs = self.nested_sm_kwargs
268 use_default += 1
269 block_length = len(block)
271 state_machine = None
272 if use_default == 2:
273 try:
274 state_machine = self.nested_sm_cache.pop()
275 except IndexError:
276 pass
277 if not state_machine:
278 state_machine = state_machine_class(debug=self.debug,
279 **state_machine_kwargs)
280 state_machine.run(block, input_offset, memo=self.memo,
281 node=node, match_titles=match_titles)
282 if use_default == 2:
283 self.nested_sm_cache.append(state_machine)
284 else:
285 state_machine.unlink()
286 new_offset = state_machine.abs_line_offset()
287 # No `block.parent` implies disconnected -- lines aren't in sync:
288 if block.parent and (len(block) - block_length) != 0:
289 # Adjustment for block if modified in nested parse:
290 self.state_machine.next_line(len(block) - block_length)
291 return new_offset
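
    # Illustrative sketch (comments only; `nodes.container` is just an
    # example wrapper node): directives conventionally re-enter the parser
    # through this method from their run() implementation.
    #
    #     node = nodes.container()
    #     self.state.nested_parse(self.content, self.content_offset, node)
    #     return [node]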
293 def nested_list_parse(self, block, input_offset, node, initial_state,
294 blank_finish,
295 blank_finish_state=None,
296 extra_settings={},
297 match_titles=False,
298 state_machine_class=None,
299 state_machine_kwargs=None):
301 Create a new StateMachine rooted at `node` and run it over the input
302 `block`. Also keep track of optional intermediate blank lines and the
303 required final one.
305 if state_machine_class is None:
306 state_machine_class = self.nested_sm
307 if state_machine_kwargs is None:
308 state_machine_kwargs = self.nested_sm_kwargs.copy()
309 state_machine_kwargs['initial_state'] = initial_state
310 state_machine = state_machine_class(debug=self.debug,
311 **state_machine_kwargs)
312 if blank_finish_state is None:
313 blank_finish_state = initial_state
314 state_machine.states[blank_finish_state].blank_finish = blank_finish
315 for key, value in extra_settings.items():
316 setattr(state_machine.states[initial_state], key, value)
317 state_machine.run(block, input_offset, memo=self.memo,
318 node=node, match_titles=match_titles)
319 blank_finish = state_machine.states[blank_finish_state].blank_finish
320 state_machine.unlink()
321 return state_machine.abs_line_offset(), blank_finish
323 def section(self, title, source, style, lineno, messages):
324 """Check for a valid subsection and create one if it checks out."""
325 if self.check_subsection(source, style, lineno):
326 self.new_subsection(title, lineno, messages)
328 def check_subsection(self, source, style, lineno):
330 Check for a valid subsection header. Return 1 (true) or None (false).
332 When a new section is reached that isn't a subsection of the current
333 section, back up the line count (use ``previous_line(-x)``), then
334 ``raise EOFError``. The current StateMachine will finish, then the
335 calling StateMachine can re-examine the title. This will work its way
336 back up the calling chain until the correct section level is reached.
338 @@@ Alternative: Evaluate the title, store the title info & level, and
339 back up the chain until that level is reached. Store in memo? Or
340 return in results?
342 :Exception: `EOFError` when a sibling or supersection encountered.
344 memo = self.memo
345 title_styles = memo.title_styles
346 mylevel = memo.section_level
347 try: # check for existing title style
348 level = title_styles.index(style) + 1
349 except ValueError: # new title style
350 if len(title_styles) == memo.section_level: # new subsection
351 title_styles.append(style)
352 return 1
353 else: # not at lowest level
354 self.parent += self.title_inconsistent(source, lineno)
355 return None
356 if level <= mylevel: # sibling or supersection
357 memo.section_level = level # bubble up to parent section
358 if len(style) == 2:
359 memo.section_bubble_up_kludge = True
360 # back up 2 lines for underline title, 3 for overline title
361 self.state_machine.previous_line(len(style) + 1)
362 raise EOFError # let parent section re-evaluate
363 if level == mylevel + 1: # immediate subsection
364 return 1
365 else: # invalid subsection
366 self.parent += self.title_inconsistent(source, lineno)
367 return None
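
    # Illustrative sketch (comments only; the underline characters are an
    # assumed example): for underlines '=', '-', '=' in sequence,
    # `title_styles` grows to ['=', '-'] while the level-2 section is being
    # parsed; the second '=' is then found at index 0, so
    #
    #     level = title_styles.index('=') + 1    # == 1, <= section_level 2
    #
    # and check_subsection() backs up the input and raises EOFError so the
    # level-1 state machine can re-handle the sibling section.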
369 def title_inconsistent(self, sourcetext, lineno):
370 error = self.reporter.severe(
371 'Title level inconsistent:', nodes.literal_block('', sourcetext),
372 line=lineno)
373 return error
375 def new_subsection(self, title, lineno, messages):
376 """Append new subsection to document tree. On return, check level."""
377 memo = self.memo
378 mylevel = memo.section_level
379 memo.section_level += 1
380 section_node = nodes.section()
381 self.parent += section_node
382 textnodes, title_messages = self.inline_text(title, lineno)
383 titlenode = nodes.title(title, '', *textnodes)
384 name = normalize_name(titlenode.astext())
385 section_node['names'].append(name)
386 section_node += titlenode
387 section_node += messages
388 section_node += title_messages
389 self.document.note_implicit_target(section_node, section_node)
390 offset = self.state_machine.line_offset + 1
391 absoffset = self.state_machine.abs_line_offset() + 1
392 newabsoffset = self.nested_parse(
393 self.state_machine.input_lines[offset:], input_offset=absoffset,
394 node=section_node, match_titles=True)
395 self.goto_line(newabsoffset)
396 if memo.section_level <= mylevel: # can't handle next section?
397 raise EOFError # bubble up to supersection
398 # reset section_level; next pass will detect it properly
399 memo.section_level = mylevel
401 def paragraph(self, lines, lineno):
403 Return a list (paragraph & messages) & a boolean: literal_block next?
405 data = '\n'.join(lines).rstrip()
406 if re.search(r'(?<!\\)(\\\\)*::$', data):
407 if len(data) == 2:
408 return [], 1
409 elif data[-3] in ' \n':
410 text = data[:-3].rstrip()
411 else:
412 text = data[:-1]
413 literalnext = 1
414 else:
415 text = data
416 literalnext = 0
417 textnodes, messages = self.inline_text(text, lineno)
418 p = nodes.paragraph(data, '', *textnodes)
419 p.source, p.line = self.state_machine.get_source_and_line(lineno)
420 return [p] + messages, literalnext
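
    # Illustrative examples (comments only) of the '::' handling above:
    #
    #     'Example::'   -> paragraph text 'Example:',  literalnext == 1
    #     'Example ::'  -> paragraph text 'Example',   literalnext == 1
    #     '::'          -> no paragraph ([]),          literalnext == 1
    #     'Example'     -> paragraph text 'Example',   literalnext == 0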
422 def inline_text(self, text, lineno):
424 Return 2 lists: nodes (text and inline elements), and system_messages.
426 return self.inliner.parse(text, lineno, self.memo, self.parent)
428 def unindent_warning(self, node_name):
429 # the actual problem is one line below the current line
430 lineno = self.state_machine.abs_line_number()+1
431 return self.reporter.warning('%s ends without a blank line; '
432 'unexpected unindent.' % node_name,
433 line=lineno)
436 def build_regexp(definition, compile=True):
438 Build, compile and return a regular expression based on `definition`.
440 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
441 where "parts" is a list of regular expressions and/or regular
442 expression definitions to be joined into an or-group.
444 name, prefix, suffix, parts = definition
445 part_strings = []
446 for part in parts:
447 if type(part) is tuple:
448 part_strings.append(build_regexp(part, None))
449 else:
450 part_strings.append(part)
451 or_group = '|'.join(part_strings)
452 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
453 if compile:
454 return re.compile(regexp, re.UNICODE)
455 else:
456 return regexp
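
# Illustrative sketch (comments only; the group names 'outer' and 'inner'
# are made up for this example):
#
#     build_regexp(('outer', '', '',
#                   ['foo', ('inner', '<', '>', ['a', 'b'])]),
#                  compile=False)
#     # -> '(?P<outer>foo|<(?P<inner>a|b)>)'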
459 class Inliner:
462 Parse inline markup; call the `parse()` method.
465 def __init__(self):
466 self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),]
467 """List of (pattern, bound method) tuples, used by
468 `self.implicit_inline`."""
470 def init_customizations(self, settings):
471 """Setting-based customizations; run when parsing begins."""
472 if settings.pep_references:
473 self.implicit_dispatch.append((self.patterns.pep,
474 self.pep_reference))
475 if settings.rfc_references:
476 self.implicit_dispatch.append((self.patterns.rfc,
477 self.rfc_reference))
479 def parse(self, text, lineno, memo, parent):
480 # Needs to be refactored for nested inline markup.
481 # Add nested_parse() method?
483 Return 2 lists: nodes (text and inline elements), and system_messages.
485 Using `self.patterns.initial`, a pattern which matches start-strings
486 (emphasis, strong, interpreted, phrase reference, literal,
487 substitution reference, and inline target) and complete constructs
488 (simple reference, footnote reference), search for a candidate. When
489 one is found, check for validity (e.g., not a quoted '*' character).
490 If valid, search for the corresponding end string if applicable, and
491 check it for validity. If not found or invalid, generate a warning
492 and ignore the start-string. Implicit inline markup (e.g. standalone
493 URIs) is found last.
495 self.reporter = memo.reporter
496 self.document = memo.document
497 self.language = memo.language
498 self.parent = parent
499 pattern_search = self.patterns.initial.search
500 dispatch = self.dispatch
501 remaining = escape2null(text)
502 processed = []
503 unprocessed = []
504 messages = []
505 while remaining:
506 match = pattern_search(remaining)
507 if match:
508 groups = match.groupdict()
509 method = dispatch[groups['start'] or groups['backquote']
510 or groups['refend'] or groups['fnend']]
511 before, inlines, remaining, sysmessages = method(self, match,
512 lineno)
513 unprocessed.append(before)
514 messages += sysmessages
515 if inlines:
516 processed += self.implicit_inline(''.join(unprocessed),
517 lineno)
518 processed += inlines
519 unprocessed = []
520 else:
521 break
522 remaining = ''.join(unprocessed) + remaining
523 if remaining:
524 processed += self.implicit_inline(remaining, lineno)
525 return processed, messages
527 # Inline object recognition
528 # -------------------------
529 # lookahead and look-behind expressions for inline markup rules
530 start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
531 (punctuation_chars.openers,
532 punctuation_chars.delimiters))
533 end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
534 (punctuation_chars.closing_delimiters,
535 punctuation_chars.delimiters,
536 punctuation_chars.closers))
537 # print start_string_prefix.encode('utf8')
538 # TODO: support non-ASCII whitespace in the following 4 patterns?
539 non_whitespace_before = r'(?<![ \n])'
540 non_whitespace_escape_before = r'(?<![ \n\x00])'
541 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[ \n\x00])'
542 non_whitespace_after = r'(?![ \n])'
543 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
544 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
545 # Valid URI characters (see RFC 2396 & RFC 2732);
546 # final \x00 allows backslash escapes in URIs:
547 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
548 # Delimiter indicating the end of a URI (not part of the URI):
549 uri_end_delim = r"""[>]"""
550 # Last URI character; same as uric but no punctuation:
551 urilast = r"""[_~*/=+a-zA-Z0-9]"""
552 # End of a URI (either 'urilast' or 'uric followed by a
553 # uri_end_delim'):
554 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
555 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
556 email_pattern = r"""
557 %(emailc)s+(?:\.%(emailc)s+)* # name
558 (?<!\x00)@ # at
559 %(emailc)s+(?:\.%(emailc)s*)* # host
560 %(uri_end)s # final URI char
562 parts = ('initial_inline', start_string_prefix, '',
563 [('start', '', non_whitespace_after, # simple start-strings
564 [r'\*\*', # strong
565 r'\*(?!\*)', # emphasis but not strong
566 r'``', # literal
567 r'_`', # inline internal target
568 r'\|(?!\|)'] # substitution reference
570 ('whole', '', end_string_suffix, # whole constructs
571 [# reference name & end-string
572 r'(?P<refname>%s)(?P<refend>__?)' % simplename,
573 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
574 [r'[0-9]+', # manually numbered
575 r'\#(%s)?' % simplename, # auto-numbered (w/ label?)
576 r'\*', # auto-symbol
577 r'(?P<citationlabel>%s)' % simplename] # citation reference
581 ('backquote', # interpreted text or phrase reference
582 '(?P<role>(:%s:)?)' % simplename, # optional role
583 non_whitespace_after,
584 ['`(?!`)'] # but not literal
588 patterns = Struct(
589 initial=build_regexp(parts),
590 emphasis=re.compile(non_whitespace_escape_before
591 + r'(\*)' + end_string_suffix, re.UNICODE),
592 strong=re.compile(non_whitespace_escape_before
593 + r'(\*\*)' + end_string_suffix, re.UNICODE),
594 interpreted_or_phrase_ref=re.compile(
595 r"""
596 %(non_unescaped_whitespace_escape_before)s
599 (?P<suffix>
600 (?P<role>:%(simplename)s:)?
601 (?P<refend>__?)?
604 %(end_string_suffix)s
605 """ % locals(), re.VERBOSE | re.UNICODE),
606 embedded_link=re.compile(
607 r"""
609 (?:[ \n]+|^) # spaces or beginning of line/string
610 < # open bracket
611 %(non_whitespace_after)s
612 ([^<>\x00]+(\x00_)?) # anything but angle brackets & nulls
613 # except escaped trailing low line
614 %(non_whitespace_before)s
615 > # close bracket w/o whitespace before
617 $ # end of string
618 """ % locals(), re.VERBOSE | re.UNICODE),
619 literal=re.compile(non_whitespace_before + '(``)'
620 + end_string_suffix),
621 target=re.compile(non_whitespace_escape_before
622 + r'(`)' + end_string_suffix),
623 substitution_ref=re.compile(non_whitespace_escape_before
624 + r'(\|_{0,2})'
625 + end_string_suffix),
626 email=re.compile(email_pattern % locals() + '$',
627 re.VERBOSE | re.UNICODE),
628 uri=re.compile(
629 (r"""
630 %(start_string_prefix)s
631 (?P<whole>
632 (?P<absolute> # absolute URI
633 (?P<scheme> # scheme (http, ftp, mailto)
634 [a-zA-Z][a-zA-Z0-9.+-]*
638 ( # either:
639 (//?)? # hierarchical URI
640 %(uric)s* # URI characters
641 %(uri_end)s # final URI char
643 ( # optional query
644 \?%(uric)s*
645 %(uri_end)s
647 ( # optional fragment
648 \#%(uric)s*
649 %(uri_end)s
653 | # *OR*
654 (?P<email> # email address
655 """ + email_pattern + r"""
658 %(end_string_suffix)s
659 """) % locals(), re.VERBOSE | re.UNICODE),
660 pep=re.compile(
661 r"""
662 %(start_string_prefix)s
664 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
666 (PEP\s+(?P<pepnum2>\d+)) # reference by name
668 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE),
669 rfc=re.compile(
670 r"""
671 %(start_string_prefix)s
672 (RFC(-|\s+)?(?P<rfcnum>\d+))
673 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE))
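
    # Illustrative sketch (comments only; the sample text is an assumption):
    # `patterns.initial` finds candidate start-strings, and the text of the
    # matched group is the key into the `dispatch` table defined below.
    #
    #     m = Inliner.patterns.initial.search('with *emphasis* inside')
    #     m.groupdict()['start']     # -> '*', so dispatch['*'] (emphasis)
    #                                #    handles this candidate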
675 def quoted_start(self, match):
676 """Test if inline markup start-string is 'quoted'.
678 'Quoted' in this context means the start-string is enclosed in a pair
679 of matching opening/closing delimiters (not necessarily quotes)
680 or at the end of the match.
682 string = match.string
683 start = match.start()
684 if start == 0: # start-string at beginning of text
685 return False
686 prestart = string[start - 1]
687 try:
688 poststart = string[match.end()]
689 except IndexError: # start-string at end of text
690 return True # not "quoted" but no markup start-string either
691 return punctuation_chars.match_chars(prestart, poststart)
693 def inline_obj(self, match, lineno, end_pattern, nodeclass,
694 restore_backslashes=False):
695 string = match.string
696 matchstart = match.start('start')
697 matchend = match.end('start')
698 if self.quoted_start(match):
699 return (string[:matchend], [], string[matchend:], [], '')
700 endmatch = end_pattern.search(string[matchend:])
701 if endmatch and endmatch.start(1): # 1 or more chars
702 text = unescape(endmatch.string[:endmatch.start(1)],
703 restore_backslashes)
704 textend = matchend + endmatch.end(1)
705 rawsource = unescape(string[matchstart:textend], 1)
706 return (string[:matchstart], [nodeclass(rawsource, text)],
707 string[textend:], [], endmatch.group(1))
708 msg = self.reporter.warning(
709 'Inline %s start-string without end-string.'
710 % nodeclass.__name__, line=lineno)
711 text = unescape(string[matchstart:matchend], 1)
712 rawsource = unescape(string[matchstart:matchend], 1)
713 prb = self.problematic(text, rawsource, msg)
714 return string[:matchstart], [prb], string[matchend:], [msg], ''
716 def problematic(self, text, rawsource, message):
717 msgid = self.document.set_id(message, self.parent)
718 problematic = nodes.problematic(rawsource, text, refid=msgid)
719 prbid = self.document.set_id(problematic)
720 message.add_backref(prbid)
721 return problematic
723 def emphasis(self, match, lineno):
724 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
725 match, lineno, self.patterns.emphasis, nodes.emphasis)
726 return before, inlines, remaining, sysmessages
728 def strong(self, match, lineno):
729 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
730 match, lineno, self.patterns.strong, nodes.strong)
731 return before, inlines, remaining, sysmessages
733 def interpreted_or_phrase_ref(self, match, lineno):
734 end_pattern = self.patterns.interpreted_or_phrase_ref
735 string = match.string
736 matchstart = match.start('backquote')
737 matchend = match.end('backquote')
738 rolestart = match.start('role')
739 role = match.group('role')
740 position = ''
741 if role:
742 role = role[1:-1]
743 position = 'prefix'
744 elif self.quoted_start(match):
745 return (string[:matchend], [], string[matchend:], [])
746 endmatch = end_pattern.search(string[matchend:])
747 if endmatch and endmatch.start(1): # 1 or more chars
748 textend = matchend + endmatch.end()
749 if endmatch.group('role'):
750 if role:
751 msg = self.reporter.warning(
752 'Multiple roles in interpreted text (both '
753 'prefix and suffix present; only one allowed).',
754 line=lineno)
755 text = unescape(string[rolestart:textend], 1)
756 prb = self.problematic(text, text, msg)
757 return string[:rolestart], [prb], string[textend:], [msg]
758 role = endmatch.group('suffix')[1:-1]
759 position = 'suffix'
760 escaped = endmatch.string[:endmatch.start(1)]
761 rawsource = unescape(string[matchstart:textend], 1)
762 if rawsource[-1:] == '_':
763 if role:
764 msg = self.reporter.warning(
765 'Mismatch: both interpreted text role %s and '
766 'reference suffix.' % position, line=lineno)
767 text = unescape(string[rolestart:textend], 1)
768 prb = self.problematic(text, text, msg)
769 return string[:rolestart], [prb], string[textend:], [msg]
770 return self.phrase_ref(string[:matchstart], string[textend:],
771 rawsource, escaped, unescape(escaped))
772 else:
773 rawsource = unescape(string[rolestart:textend], 1)
774 nodelist, messages = self.interpreted(rawsource, escaped, role,
775 lineno)
776 return (string[:rolestart], nodelist,
777 string[textend:], messages)
778 msg = self.reporter.warning(
779 'Inline interpreted text or phrase reference start-string '
780 'without end-string.', line=lineno)
781 text = unescape(string[matchstart:matchend], 1)
782 prb = self.problematic(text, text, msg)
783 return string[:matchstart], [prb], string[matchend:], [msg]
785 def phrase_ref(self, before, after, rawsource, escaped, text):
786 match = self.patterns.embedded_link.search(escaped)
787 if match: # embedded <URI> or <alias_>
788 text = unescape(escaped[:match.start(0)])
789 aliastext = unescape(match.group(2), restore_backslashes=True)
790 if aliastext.endswith('_') and not (aliastext.endswith(r'\_')
791 or self.patterns.uri.match(aliastext)):
792 aliastype = 'name'
793 alias = normalize_name(aliastext[:-1])
794 target = nodes.target(match.group(1), refname=alias)
795 target.indirect_reference_name = aliastext[:-1]
796 else:
797 aliastype = 'uri'
798 alias = ''.join(aliastext.split())
799 alias = self.adjust_uri(alias)
800 if alias.endswith(r'\_'):
801 alias = alias[:-2] + '_'
802 target = nodes.target(match.group(1), refuri=alias)
803 target.referenced = 1
804 if not aliastext:
805 raise ApplicationError('problem with embedded link: %r'
806 % aliastext)
807 if not text:
808 text = alias
809 else:
810 target = None
812 refname = normalize_name(text)
813 reference = nodes.reference(rawsource, text,
814 name=whitespace_normalize_name(text))
815 node_list = [reference]
817 if rawsource[-2:] == '__':
818 if target and (aliastype == 'name'):
819 reference['refname'] = alias
820 self.document.note_refname(reference)
821 # self.document.note_indirect_target(target) # required?
822 elif target and (aliastype == 'uri'):
823 reference['refuri'] = alias
824 else:
825 reference['anonymous'] = 1
826 else:
827 if target:
828 target['names'].append(refname)
829 if aliastype == 'name':
830 reference['refname'] = alias
831 self.document.note_indirect_target(target)
832 self.document.note_refname(reference)
833 else:
834 reference['refuri'] = alias
835 self.document.note_explicit_target(target, self.parent)
836 # target.note_referenced_by(name=refname)
837 node_list.append(target)
838 else:
839 reference['refname'] = refname
840 self.document.note_refname(reference)
841 return before, node_list, after, []
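
    # Illustrative sketch (comments only): for a phrase reference with an
    # embedded URI such as
    #
    #     `Docutils <http://docutils.sourceforge.net/>`_
    #
    # the text before '<' becomes a `reference` node with the refuri set,
    # and a matching `target` node (named 'docutils') is appended to
    # `node_list` so the URI can later be referenced by name.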
844 def adjust_uri(self, uri):
845 match = self.patterns.email.match(uri)
846 if match:
847 return 'mailto:' + uri
848 else:
849 return uri
851 def interpreted(self, rawsource, text, role, lineno):
852 role_fn, messages = roles.role(role, self.language, lineno,
853 self.reporter)
854 if role_fn:
855 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
856 return nodes, messages + messages2
857 else:
858 msg = self.reporter.error(
859 'Unknown interpreted text role "%s".' % role,
860 line=lineno)
861 return ([self.problematic(rawsource, rawsource, msg)],
862 messages + [msg])
864 def literal(self, match, lineno):
865 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
866 match, lineno, self.patterns.literal, nodes.literal,
867 restore_backslashes=True)
868 return before, inlines, remaining, sysmessages
870 def inline_internal_target(self, match, lineno):
871 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
872 match, lineno, self.patterns.target, nodes.target)
873 if inlines and isinstance(inlines[0], nodes.target):
874 assert len(inlines) == 1
875 target = inlines[0]
876 name = normalize_name(target.astext())
877 target['names'].append(name)
878 self.document.note_explicit_target(target, self.parent)
879 return before, inlines, remaining, sysmessages
881 def substitution_reference(self, match, lineno):
882 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
883 match, lineno, self.patterns.substitution_ref,
884 nodes.substitution_reference)
885 if len(inlines) == 1:
886 subref_node = inlines[0]
887 if isinstance(subref_node, nodes.substitution_reference):
888 subref_text = subref_node.astext()
889 self.document.note_substitution_ref(subref_node, subref_text)
890 if endstring[-1:] == '_':
891 reference_node = nodes.reference(
892 '|%s%s' % (subref_text, endstring), '')
893 if endstring[-2:] == '__':
894 reference_node['anonymous'] = 1
895 else:
896 reference_node['refname'] = normalize_name(subref_text)
897 self.document.note_refname(reference_node)
898 reference_node += subref_node
899 inlines = [reference_node]
900 return before, inlines, remaining, sysmessages
902 def footnote_reference(self, match, lineno):
904 Handles `nodes.footnote_reference` and `nodes.citation_reference`
905 elements.
907 label = match.group('footnotelabel')
908 refname = normalize_name(label)
909 string = match.string
910 before = string[:match.start('whole')]
911 remaining = string[match.end('whole'):]
912 if match.group('citationlabel'):
913 refnode = nodes.citation_reference('[%s]_' % label,
914 refname=refname)
915 refnode += nodes.Text(label)
916 self.document.note_citation_ref(refnode)
917 else:
918 refnode = nodes.footnote_reference('[%s]_' % label)
919 if refname[0] == '#':
920 refname = refname[1:]
921 refnode['auto'] = 1
922 self.document.note_autofootnote_ref(refnode)
923 elif refname == '*':
924 refname = ''
925 refnode['auto'] = '*'
926 self.document.note_symbol_footnote_ref(
927 refnode)
928 else:
929 refnode += nodes.Text(label)
930 if refname:
931 refnode['refname'] = refname
932 self.document.note_footnote_ref(refnode)
933 if utils.get_trim_footnote_ref_space(self.document.settings):
934 before = before.rstrip()
935 return (before, [refnode], remaining, [])
937 def reference(self, match, lineno, anonymous=False):
938 referencename = match.group('refname')
939 refname = normalize_name(referencename)
940 referencenode = nodes.reference(
941 referencename + match.group('refend'), referencename,
942 name=whitespace_normalize_name(referencename))
943 if anonymous:
944 referencenode['anonymous'] = 1
945 else:
946 referencenode['refname'] = refname
947 self.document.note_refname(referencenode)
948 string = match.string
949 matchstart = match.start('whole')
950 matchend = match.end('whole')
951 return (string[:matchstart], [referencenode], string[matchend:], [])
953 def anonymous_reference(self, match, lineno):
954 return self.reference(match, lineno, anonymous=1)
956 def standalone_uri(self, match, lineno):
957 if (not match.group('scheme')
958 or match.group('scheme').lower() in urischemes.schemes):
959 if match.group('email'):
960 addscheme = 'mailto:'
961 else:
962 addscheme = ''
963 text = match.group('whole')
964 unescaped = unescape(text, 0)
965 return [nodes.reference(unescape(text, 1), unescaped,
966 refuri=addscheme + unescaped)]
967 else: # not a valid scheme
968 raise MarkupMismatch
970 def pep_reference(self, match, lineno):
971 text = match.group(0)
972 if text.startswith('pep-'):
973 pepnum = int(match.group('pepnum1'))
974 elif text.startswith('PEP'):
975 pepnum = int(match.group('pepnum2'))
976 else:
977 raise MarkupMismatch
978 ref = (self.document.settings.pep_base_url
979 + self.document.settings.pep_file_url_template % pepnum)
980 unescaped = unescape(text, 0)
981 return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
983 rfc_url = 'rfc%d.html'
985 def rfc_reference(self, match, lineno):
986 text = match.group(0)
987 if text.startswith('RFC'):
988 rfcnum = int(match.group('rfcnum'))
989 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
990 else:
991 raise MarkupMismatch
992 unescaped = unescape(text, 0)
993 return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
995 def implicit_inline(self, text, lineno):
997 Check each of the patterns in `self.implicit_dispatch` for a match,
998 and dispatch to the stored method for the pattern. Recursively check
999 the text before and after the match. Return a list of `nodes.Text`
1000 and inline element nodes.
1002 if not text:
1003 return []
1004 for pattern, method in self.implicit_dispatch:
1005 match = pattern.search(text)
1006 if match:
1007 try:
1008 # Must recurse on strings before *and* after the match;
1009 # there may be multiple patterns.
1010 return (self.implicit_inline(text[:match.start()], lineno)
1011 + method(match, lineno) +
1012 self.implicit_inline(text[match.end():], lineno))
1013 except MarkupMismatch:
1014 pass
1015 return [nodes.Text(unescape(text), rawsource=unescape(text, 1))]
1017 dispatch = {'*': emphasis,
1018 '**': strong,
1019 '`': interpreted_or_phrase_ref,
1020 '``': literal,
1021 '_`': inline_internal_target,
1022 ']_': footnote_reference,
1023 '|': substitution_reference,
1024 '_': reference,
1025 '__': anonymous_reference}
1028 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1029 return ord(s) - _zero
1031 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1032 return ord(s) - _zero
1034 def _lowerroman_to_int(s):
1035 return roman.fromRoman(s.upper())
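
# Illustrative examples (comments only) of the enumerator converters:
#
#     _loweralpha_to_int('c')    # -> 3
#     _upperalpha_to_int('B')    # -> 2
#     _lowerroman_to_int('iv')   # -> 4
#     roman.fromRoman('XII')     # -> 12 (used for 'upperroman')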
1038 class Body(RSTState):
1041 Generic classifier of the first line of a block.
1044 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1045 """Padding character for East Asian double-width text."""
1047 enum = Struct()
1048 """Enumerated list parsing information."""
1050 enum.formatinfo = {
1051 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1052 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1053 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1054 enum.formats = enum.formatinfo.keys()
1055 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1056 'lowerroman', 'upperroman'] # ORDERED!
1057 enum.sequencepats = {'arabic': '[0-9]+',
1058 'loweralpha': '[a-z]',
1059 'upperalpha': '[A-Z]',
1060 'lowerroman': '[ivxlcdm]+',
1061 'upperroman': '[IVXLCDM]+',}
1062 enum.converters = {'arabic': int,
1063 'loweralpha': _loweralpha_to_int,
1064 'upperalpha': _upperalpha_to_int,
1065 'lowerroman': _lowerroman_to_int,
1066 'upperroman': roman.fromRoman}
1068 enum.sequenceregexps = {}
1069 for sequence in enum.sequences:
1070 enum.sequenceregexps[sequence] = re.compile(
1071 enum.sequencepats[sequence] + '$', re.UNICODE)
1073 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1074 """Matches the top (& bottom) of a full table)."""
1076 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1077 """Matches the top of a simple table."""
1079 simple_table_border_pat = re.compile('=+[ =]*$')
1080 """Matches the bottom & header bottom of a simple table."""
1082 pats = {}
1083 """Fragments of patterns used by transitions."""
1085 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1086 pats['alpha'] = '[a-zA-Z]'
1087 pats['alphanum'] = '[a-zA-Z0-9]'
1088 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1089 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1090 '|%(upperroman)s|#)' % enum.sequencepats)
1091 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1092 # @@@ Loosen up the pattern? Allow Unicode?
1093 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1094 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1095 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1096 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1098 for format in enum.formats:
1099 pats[format] = '(?P<%s>%s%s%s)' % (
1100 format, re.escape(enum.formatinfo[format].prefix),
1101 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1103 patterns = {
1104 'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
1105 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1106 'field_marker': r':(?![: ])([^:\\]|\\.)*(?<! ):( +|$)',
1107 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1108 'doctest': r'>>>( +|$)',
1109 'line_block': r'\|( +|$)',
1110 'grid_table_top': grid_table_top_pat,
1111 'simple_table_top': simple_table_top_pat,
1112 'explicit_markup': r'\.\.( +|$)',
1113 'anonymous': r'__( +|$)',
1114 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1115 'text': r''}
1116 initial_transitions = (
1117 'bullet',
1118 'enumerator',
1119 'field_marker',
1120 'option_marker',
1121 'doctest',
1122 'line_block',
1123 'grid_table_top',
1124 'simple_table_top',
1125 'explicit_markup',
1126 'anonymous',
1127 'line',
1128 'text')
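
    # Illustrative examples (comments only; the sample lines are assumed
    # input) of which transition pattern claims a line:
    #
    #     '- item'            -> 'bullet'
    #     '3. item'           -> 'enumerator'
    #     ':field name: body' -> 'field_marker'
    #     '-a  description'   -> 'option_marker'
    #     '.. note:: text'    -> 'explicit_markup'
    #     'plain words'       -> 'text' (the catch-all)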
1130 def indent(self, match, context, next_state):
1131 """Block quote."""
1132 indented, indent, line_offset, blank_finish = \
1133 self.state_machine.get_indented()
1134 elements = self.block_quote(indented, line_offset)
1135 self.parent += elements
1136 if not blank_finish:
1137 self.parent += self.unindent_warning('Block quote')
1138 return context, next_state, []
1140 def block_quote(self, indented, line_offset):
1141 elements = []
1142 while indented:
1143 (blockquote_lines,
1144 attribution_lines,
1145 attribution_offset,
1146 indented,
1147 new_line_offset) = self.split_attribution(indented, line_offset)
1148 blockquote = nodes.block_quote()
1149 self.nested_parse(blockquote_lines, line_offset, blockquote)
1150 elements.append(blockquote)
1151 if attribution_lines:
1152 attribution, messages = self.parse_attribution(
1153 attribution_lines, attribution_offset)
1154 blockquote += attribution
1155 elements += messages
1156 line_offset = new_line_offset
1157 while indented and not indented[0]:
1158 indented = indented[1:]
1159 line_offset += 1
1160 return elements
1162 # U+2014 is an em-dash:
1163 attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
1164 re.UNICODE)
1166 def split_attribution(self, indented, line_offset):
1168 Check for a block quote attribution and split it off:
1170 * First line after a blank line must begin with a dash ("--", "---",
1171 em-dash; matches `self.attribution_pattern`).
1172 * Every line after that must have consistent indentation.
1173 * Attributions must be preceded by block quote content.
1175 Return a tuple of: (block quote content lines, attribution lines,
1176 attribution offset, remaining indented lines, remaining lines offset).
1178 blank = None
1179 nonblank_seen = False
1180 for i in range(len(indented)):
1181 line = indented[i].rstrip()
1182 if line:
1183 if nonblank_seen and blank == i - 1: # last line blank
1184 match = self.attribution_pattern.match(line)
1185 if match:
1186 attribution_end, indent = self.check_attribution(
1187 indented, i)
1188 if attribution_end:
1189 a_lines = indented[i:attribution_end]
1190 a_lines.trim_left(match.end(), end=1)
1191 a_lines.trim_left(indent, start=1)
1192 return (indented[:i], a_lines,
1193 i, indented[attribution_end:],
1194 line_offset + attribution_end)
1195 nonblank_seen = True
1196 else:
1197 blank = i
1198 else:
1199 return (indented, None, None, None, None)
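
    # Illustrative sketch (comments only; the quoted text is an assumed
    # example): for indented input
    #
    #     A quoted sentence.
    #
    #     -- An Author
    #
    # the split yields block quote content ['A quoted sentence.', ''] and
    # attribution lines ['An Author'] (the dash prefix is trimmed off).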
1201 def check_attribution(self, indented, attribution_start):
1203 Check attribution shape.
1204 Return the index past the end of the attribution, and the indent.
1206 indent = None
1207 i = attribution_start + 1
1208 for i in range(attribution_start + 1, len(indented)):
1209 line = indented[i].rstrip()
1210 if not line:
1211 break
1212 if indent is None:
1213 indent = len(line) - len(line.lstrip())
1214 elif len(line) - len(line.lstrip()) != indent:
1215 return None, None # bad shape; not an attribution
1216 else:
1217 # return index of line after last attribution line:
1218 i += 1
1219 return i, (indent or 0)
1221 def parse_attribution(self, indented, line_offset):
1222 text = '\n'.join(indented).rstrip()
1223 lineno = self.state_machine.abs_line_number() + line_offset
1224 textnodes, messages = self.inline_text(text, lineno)
1225 node = nodes.attribution(text, '', *textnodes)
1226 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1227 return node, messages
1229 def bullet(self, match, context, next_state):
1230 """Bullet list item."""
1231 bulletlist = nodes.bullet_list()
1232 (bulletlist.source,
1233 bulletlist.line) = self.state_machine.get_source_and_line()
1234 self.parent += bulletlist
1235 bulletlist['bullet'] = match.string[0]
1236 i, blank_finish = self.list_item(match.end())
1237 bulletlist += i
1238 offset = self.state_machine.line_offset + 1 # next line
1239 new_line_offset, blank_finish = self.nested_list_parse(
1240 self.state_machine.input_lines[offset:],
1241 input_offset=self.state_machine.abs_line_offset() + 1,
1242 node=bulletlist, initial_state='BulletList',
1243 blank_finish=blank_finish)
1244 self.goto_line(new_line_offset)
1245 if not blank_finish:
1246 self.parent += self.unindent_warning('Bullet list')
1247 return [], next_state, []
1249 def list_item(self, indent):
1250 if self.state_machine.line[indent:]:
1251 indented, line_offset, blank_finish = (
1252 self.state_machine.get_known_indented(indent))
1253 else:
1254 indented, indent, line_offset, blank_finish = (
1255 self.state_machine.get_first_known_indented(indent))
1256 listitem = nodes.list_item('\n'.join(indented))
1257 if indented:
1258 self.nested_parse(indented, input_offset=line_offset,
1259 node=listitem)
1260 return listitem, blank_finish
1262 def enumerator(self, match, context, next_state):
1263 """Enumerated List Item"""
1264 format, sequence, text, ordinal = self.parse_enumerator(match)
1265 if not self.is_enumerated_list_item(ordinal, sequence, format):
1266 raise statemachine.TransitionCorrection('text')
1267 enumlist = nodes.enumerated_list()
1268 self.parent += enumlist
1269 if sequence == '#':
1270 enumlist['enumtype'] = 'arabic'
1271 else:
1272 enumlist['enumtype'] = sequence
1273 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1274 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1275 if ordinal != 1:
1276 enumlist['start'] = ordinal
1277 msg = self.reporter.info(
1278 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1279 % (text, ordinal))
1280 self.parent += msg
1281 listitem, blank_finish = self.list_item(match.end())
1282 enumlist += listitem
1283 offset = self.state_machine.line_offset + 1 # next line
1284 newline_offset, blank_finish = self.nested_list_parse(
1285 self.state_machine.input_lines[offset:],
1286 input_offset=self.state_machine.abs_line_offset() + 1,
1287 node=enumlist, initial_state='EnumeratedList',
1288 blank_finish=blank_finish,
1289 extra_settings={'lastordinal': ordinal,
1290 'format': format,
1291 'auto': sequence == '#'})
1292 self.goto_line(newline_offset)
1293 if not blank_finish:
1294 self.parent += self.unindent_warning('Enumerated list')
1295 return [], next_state, []
1297 def parse_enumerator(self, match, expected_sequence=None):
1299 Analyze an enumerator and return the results.
1301 :Return:
1302 - the enumerator format ('period', 'parens', or 'rparen'),
1303 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1304 - the text of the enumerator, stripped of formatting, and
1305 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1306 ``None`` is returned for invalid enumerator text).
1308 The enumerator format has already been determined by the regular
1309 expression match. If `expected_sequence` is given, that sequence is
1310 tried first. If not, we check for Roman numeral 1. This way,
1311 single-character Roman numerals (which are also alphabetical) can be
1312 matched. If no sequence has been matched, all sequences are checked in
1313 order.
1315 groupdict = match.groupdict()
1316 sequence = ''
1317 for format in self.enum.formats:
1318 if groupdict[format]: # was this the format matched?
1319 break # yes; keep `format`
1320 else: # shouldn't happen
1321 raise ParserError('enumerator format not matched')
1322 text = groupdict[format][self.enum.formatinfo[format].start
1323 :self.enum.formatinfo[format].end]
1324 if text == '#':
1325 sequence = '#'
1326 elif expected_sequence:
1327 try:
1328 if self.enum.sequenceregexps[expected_sequence].match(text):
1329 sequence = expected_sequence
1330 except KeyError: # shouldn't happen
1331 raise ParserError('unknown enumerator sequence: %s'
1332 % sequence)
1333 elif text == 'i':
1334 sequence = 'lowerroman'
1335 elif text == 'I':
1336 sequence = 'upperroman'
1337 if not sequence:
1338 for sequence in self.enum.sequences:
1339 if self.enum.sequenceregexps[sequence].match(text):
1340 break
1341 else: # shouldn't happen
1342 raise ParserError('enumerator sequence not matched')
1343 if sequence == '#':
1344 ordinal = 1
1345 else:
1346 try:
1347 ordinal = self.enum.converters[sequence](text)
1348 except roman.InvalidRomanNumeralError:
1349 ordinal = None
1350 return format, sequence, text, ordinal
1352 def is_enumerated_list_item(self, ordinal, sequence, format):
1354 Check validity based on the ordinal value and the second line.
1356 Return true if the ordinal is valid and the second line is blank,
1357 indented, or starts with the next enumerator or an auto-enumerator.
1359 if ordinal is None:
1360 return None
1361 try:
1362 next_line = self.state_machine.next_line()
1363 except EOFError: # end of input lines
1364 self.state_machine.previous_line()
1365 return 1
1366 else:
1367 self.state_machine.previous_line()
1368 if not next_line[:1].strip(): # blank or indented
1369 return 1
1370 result = self.make_enumerator(ordinal + 1, sequence, format)
1371 if result:
1372 next_enumerator, auto_enumerator = result
1373 try:
1374 if ( next_line.startswith(next_enumerator) or
1375 next_line.startswith(auto_enumerator) ):
1376 return 1
1377 except TypeError:
1378 pass
1379 return None
1381 def make_enumerator(self, ordinal, sequence, format):
1383 Construct and return the next enumerated list item marker, and an
1384 auto-enumerator ("#" instead of the regular enumerator).
1386 Return ``None`` for invalid (out of range) ordinals.
1387 """ #"
1388 if sequence == '#':
1389 enumerator = '#'
1390 elif sequence == 'arabic':
1391 enumerator = str(ordinal)
1392 else:
1393 if sequence.endswith('alpha'):
1394 if ordinal > 26:
1395 return None
1396 enumerator = chr(ordinal + ord('a') - 1)
1397 elif sequence.endswith('roman'):
1398 try:
1399 enumerator = roman.toRoman(ordinal)
1400 except roman.RomanError:
1401 return None
1402 else: # shouldn't happen
1403 raise ParserError('unknown enumerator sequence: "%s"'
1404 % sequence)
1405 if sequence.startswith('lower'):
1406 enumerator = enumerator.lower()
1407 elif sequence.startswith('upper'):
1408 enumerator = enumerator.upper()
1409 else: # shouldn't happen
1410 raise ParserError('unknown enumerator sequence: "%s"'
1411 % sequence)
1412 formatinfo = self.enum.formatinfo[format]
1413 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1414 + ' ')
1415 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1416 return next_enumerator, auto_enumerator
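
    # Illustrative examples (comments only):
    #
    #     make_enumerator(3, 'loweralpha', 'parens')   # -> ('(c) ', '(#) ')
    #     make_enumerator(4, 'upperroman', 'period')   # -> ('IV. ', '#. ')
    #     make_enumerator(27, 'loweralpha', 'period')  # -> None (out of range)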
1418 def field_marker(self, match, context, next_state):
1419 """Field list item."""
1420 field_list = nodes.field_list()
1421 self.parent += field_list
1422 field, blank_finish = self.field(match)
1423 field_list += field
1424 offset = self.state_machine.line_offset + 1 # next line
1425 newline_offset, blank_finish = self.nested_list_parse(
1426 self.state_machine.input_lines[offset:],
1427 input_offset=self.state_machine.abs_line_offset() + 1,
1428 node=field_list, initial_state='FieldList',
1429 blank_finish=blank_finish)
1430 self.goto_line(newline_offset)
1431 if not blank_finish:
1432 self.parent += self.unindent_warning('Field list')
1433 return [], next_state, []
1435 def field(self, match):
1436 name = self.parse_field_marker(match)
1437 src, srcline = self.state_machine.get_source_and_line()
1438 lineno = self.state_machine.abs_line_number()
1439 indented, indent, line_offset, blank_finish = \
1440 self.state_machine.get_first_known_indented(match.end())
1441 field_node = nodes.field()
1442 field_node.source = src
1443 field_node.line = srcline
1444 name_nodes, name_messages = self.inline_text(name, lineno)
1445 field_node += nodes.field_name(name, '', *name_nodes)
1446 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1447 field_node += field_body
1448 if indented:
1449 self.parse_field_body(indented, line_offset, field_body)
1450 return field_node, blank_finish
1452 def parse_field_marker(self, match):
1453 """Extract & return field name from a field marker match."""
1454 field = match.group()[1:] # strip off leading ':'
1455 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1456 return field
1458 def parse_field_body(self, indented, offset, node):
1459 self.nested_parse(indented, input_offset=offset, node=node)
1461 def option_marker(self, match, context, next_state):
1462 """Option list item."""
1463 optionlist = nodes.option_list()
1464 (optionlist.source, optionlist.line) = self.state_machine.get_source_and_line()
1465 try:
1466 listitem, blank_finish = self.option_list_item(match)
1467 except MarkupError, error:
1468 # This shouldn't happen; pattern won't match.
1469 msg = self.reporter.error(u'Invalid option list marker: %s' %
1470 error)
1471 self.parent += msg
1472 indented, indent, line_offset, blank_finish = \
1473 self.state_machine.get_first_known_indented(match.end())
1474 elements = self.block_quote(indented, line_offset)
1475 self.parent += elements
1476 if not blank_finish:
1477 self.parent += self.unindent_warning('Option list')
1478 return [], next_state, []
1479 self.parent += optionlist
1480 optionlist += listitem
1481 offset = self.state_machine.line_offset + 1 # next line
1482 newline_offset, blank_finish = self.nested_list_parse(
1483 self.state_machine.input_lines[offset:],
1484 input_offset=self.state_machine.abs_line_offset() + 1,
1485 node=optionlist, initial_state='OptionList',
1486 blank_finish=blank_finish)
1487 self.goto_line(newline_offset)
1488 if not blank_finish:
1489 self.parent += self.unindent_warning('Option list')
1490 return [], next_state, []
1492 def option_list_item(self, match):
1493 offset = self.state_machine.abs_line_offset()
1494 options = self.parse_option_marker(match)
1495 indented, indent, line_offset, blank_finish = \
1496 self.state_machine.get_first_known_indented(match.end())
1497 if not indented: # not an option list item
1498 self.goto_line(offset)
1499 raise statemachine.TransitionCorrection('text')
1500 option_group = nodes.option_group('', *options)
1501 description = nodes.description('\n'.join(indented))
1502 option_list_item = nodes.option_list_item('', option_group,
1503 description)
1504 if indented:
1505 self.nested_parse(indented, input_offset=line_offset,
1506 node=description)
1507 return option_list_item, blank_finish
1509 def parse_option_marker(self, match):
1511 Return a list of `nodes.option` and `nodes.option_argument` objects,
1512 parsed from an option marker match.
1514 :Exception: `MarkupError` for invalid option markers.
1516 optlist = []
1517 optionstrings = match.group().rstrip().split(', ')
1518 for optionstring in optionstrings:
1519 tokens = optionstring.split()
1520 delimiter = ' '
1521 firstopt = tokens[0].split('=', 1)
1522 if len(firstopt) > 1:
1523 # "--opt=value" form
1524 tokens[:1] = firstopt
1525 delimiter = '='
1526 elif (len(tokens[0]) > 2
1527 and ((tokens[0].startswith('-')
1528 and not tokens[0].startswith('--'))
1529 or tokens[0].startswith('+'))):
1530 # "-ovalue" form
1531 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1532 delimiter = ''
1533 if len(tokens) > 1 and (tokens[1].startswith('<')
1534 and tokens[-1].endswith('>')):
1535 # "-o <value1 value2>" form; join all values into one token
1536 tokens[1:] = [' '.join(tokens[1:])]
1537 if 0 < len(tokens) <= 2:
1538 option = nodes.option(optionstring)
1539 option += nodes.option_string(tokens[0], tokens[0])
1540 if len(tokens) > 1:
1541 option += nodes.option_argument(tokens[1], tokens[1],
1542 delimiter=delimiter)
1543 optlist.append(option)
1544 else:
1545 raise MarkupError(
1546 'wrong number of option tokens (=%s), should be 1 or 2: '
1547 '"%s"' % (len(tokens), optionstring))
1548 return optlist
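
    # Illustrative sketch (comments only; the option strings are an assumed
    # example): a marker such as
    #
    #     -o FILE, --output=FILE
    #
    # yields two `option` nodes: ('-o', argument 'FILE', delimiter ' ') and
    # ('--output', argument 'FILE', delimiter '=').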
1550 def doctest(self, match, context, next_state):
1551 data = '\n'.join(self.state_machine.get_text_block())
1552 # TODO: prepend class value ['pycon'] (Python Console)
1553 # parse with `directives.body.CodeBlock` (returns literal-block
1554 # with class "code" and syntax highlight markup).
1555 self.parent += nodes.doctest_block(data, data)
1556 return [], next_state, []
1558 def line_block(self, match, context, next_state):
1559 """First line of a line block."""
1560 block = nodes.line_block()
1561 self.parent += block
1562 lineno = self.state_machine.abs_line_number()
1563 line, messages, blank_finish = self.line_block_line(match, lineno)
1564 block += line
1565 self.parent += messages
1566 if not blank_finish:
1567 offset = self.state_machine.line_offset + 1 # next line
1568 new_line_offset, blank_finish = self.nested_list_parse(
1569 self.state_machine.input_lines[offset:],
1570 input_offset=self.state_machine.abs_line_offset() + 1,
1571 node=block, initial_state='LineBlock',
1572 blank_finish=0)
1573 self.goto_line(new_line_offset)
1574 if not blank_finish:
1575 self.parent += self.reporter.warning(
1576 'Line block ends without a blank line.',
1577 line=lineno+1)
1578 if len(block):
1579 if block[0].indent is None:
1580 block[0].indent = 0
1581 self.nest_line_block_lines(block)
1582 return [], next_state, []
1584 def line_block_line(self, match, lineno):
1585 """Return one line element of a line_block."""
1586 indented, indent, line_offset, blank_finish = \
1587 self.state_machine.get_first_known_indented(match.end(),
1588 until_blank=True)
1589 text = u'\n'.join(indented)
1590 text_nodes, messages = self.inline_text(text, lineno)
1591 line = nodes.line(text, '', *text_nodes)
1592 if match.string.rstrip() != '|': # not empty
1593 line.indent = len(match.group(1)) - 1
1594 return line, messages, blank_finish
1596 def nest_line_block_lines(self, block):
1597 for index in range(1, len(block)):
1598 if getattr(block[index], 'indent', None) is None:
1599 block[index].indent = block[index - 1].indent
1600 self.nest_line_block_segment(block)
1602 def nest_line_block_segment(self, block):
1603 indents = [item.indent for item in block]
1604 least = min(indents)
1605 new_items = []
1606 new_block = nodes.line_block()
1607 for item in block:
1608 if item.indent > least:
1609 new_block.append(item)
1610 else:
1611 if len(new_block):
1612 self.nest_line_block_segment(new_block)
1613 new_items.append(new_block)
1614 new_block = nodes.line_block()
1615 new_items.append(item)
1616 if len(new_block):
1617 self.nest_line_block_segment(new_block)
1618 new_items.append(new_block)
1619 block[:] = new_items
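# Rough illustration (hypothetical indents): for line indents [0, 2, 2, 0],
# the flat block
#
#   line_block(line, line, line, line)
#
# is regrouped by the recursion above into
#
#   line_block(line, line_block(line, line), line)
#
# i.e. runs of lines indented deeper than the least-indented line are wrapped
# in a nested line_block.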
1621 def grid_table_top(self, match, context, next_state):
1622 """Top border of a full table."""
1623 return self.table_top(match, context, next_state,
1624 self.isolate_grid_table,
1625 tableparser.GridTableParser)
1627 def simple_table_top(self, match, context, next_state):
1628 """Top border of a simple table."""
1629 return self.table_top(match, context, next_state,
1630 self.isolate_simple_table,
1631 tableparser.SimpleTableParser)
1633 def table_top(self, match, context, next_state,
1634 isolate_function, parser_class):
1635 """Top border of a generic table."""
1636 nodelist, blank_finish = self.table(isolate_function, parser_class)
1637 self.parent += nodelist
1638 if not blank_finish:
1639 msg = self.reporter.warning(
1640 'Blank line required after table.',
1641 line=self.state_machine.abs_line_number()+1)
1642 self.parent += msg
1643 return [], next_state, []
1645 def table(self, isolate_function, parser_class):
1646 """Parse a table."""
1647 block, messages, blank_finish = isolate_function()
1648 if block:
1649 try:
1650 parser = parser_class()
1651 tabledata = parser.parse(block)
1652 tableline = (self.state_machine.abs_line_number() - len(block)
1653 + 1)
1654 table = self.build_table(tabledata, tableline)
1655 nodelist = [table] + messages
1656 except tableparser.TableMarkupError, err:
1657 nodelist = self.malformed_table(block, ' '.join(err.args),
1658 offset=err.offset) + messages
1659 else:
1660 nodelist = messages
1661 return nodelist, blank_finish
1663 def isolate_grid_table(self):
1664 messages = []
1665 blank_finish = 1
1666 try:
1667 block = self.state_machine.get_text_block(flush_left=True)
1668 except statemachine.UnexpectedIndentationError, err:
1669 block, src, srcline = err.args
1670 messages.append(self.reporter.error('Unexpected indentation.',
1671 source=src, line=srcline))
1672 blank_finish = 0
1673 block.disconnect()
1674 # for East Asian chars:
1675 block.pad_double_width(self.double_width_pad_char)
1676 width = len(block[0].strip())
1677 for i in range(len(block)):
1678 block[i] = block[i].strip()
1679 if block[i][0] not in '+|': # check left edge
1680 blank_finish = 0
1681 self.state_machine.previous_line(len(block) - i)
1682 del block[i:]
1683 break
1684 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1685 blank_finish = 0
1686 # from second-last to third line of table:
1687 for i in range(len(block) - 2, 1, -1):
1688 if self.grid_table_top_pat.match(block[i]):
1689 self.state_machine.previous_line(len(block) - i + 1)
1690 del block[i+1:]
1691 break
1692 else:
1693 messages.extend(self.malformed_table(block))
1694 return [], messages, blank_finish
1695 for i in range(len(block)): # check right edge
1696 if len(block[i]) != width or block[i][-1] not in '+|':
1697 messages.extend(self.malformed_table(block))
1698 return [], messages, blank_finish
1699 return block, messages, blank_finish
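# Example (illustrative) of a block that passes the checks above: every line
# is flush left, starts and ends with '+' or '|', all lines have the same
# width, and the last line is a valid top/bottom border:
#
#   +------------+------------+
#   | header 1   | header 2   |
#   +============+============+
#   | body row   | cells      |
#   +------------+------------+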
1701 def isolate_simple_table(self):
1702 start = self.state_machine.line_offset
1703 lines = self.state_machine.input_lines
1704 limit = len(lines) - 1
1705 toplen = len(lines[start].strip())
1706 pattern_match = self.simple_table_border_pat.match
1707 found = 0
1708 found_at = None
1709 i = start + 1
1710 while i <= limit:
1711 line = lines[i]
1712 match = pattern_match(line)
1713 if match:
1714 if len(line.strip()) != toplen:
1715 self.state_machine.next_line(i - start)
1716 messages = self.malformed_table(
1717 lines[start:i+1], 'Bottom/header table border does '
1718 'not match top border.')
1719 return [], messages, i == limit or not lines[i+1].strip()
1720 found += 1
1721 found_at = i
1722 if found == 2 or i == limit or not lines[i+1].strip():
1723 end = i
1724 break
1725 i += 1
1726 else: # reached end of input_lines
1727 if found:
1728 extra = ' or no blank line after table bottom'
1729 self.state_machine.next_line(found_at - start)
1730 block = lines[start:found_at+1]
1731 else:
1732 extra = ''
1733 self.state_machine.next_line(i - start - 1)
1734 block = lines[start:]
1735 messages = self.malformed_table(
1736 block, 'No bottom table border found%s.' % extra)
1737 return [], messages, not extra
1738 self.state_machine.next_line(end - start)
1739 block = lines[start:end+1]
1740 # for East Asian chars:
1741 block.pad_double_width(self.double_width_pad_char)
1742 return block, [], end == limit or not lines[end+1].strip()
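# Example (illustrative) of the block isolate_simple_table() extracts; the
# second border after the top one, or a border followed by a blank line,
# ends the table:
#
#   =====  =====
#   col 1  col 2
#   =====  =====
#   1      2
#   =====  =====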
1744 def malformed_table(self, block, detail='', offset=0):
1745 block.replace(self.double_width_pad_char, '')
1746 data = '\n'.join(block)
1747 message = 'Malformed table.'
1748 startline = self.state_machine.abs_line_number() - len(block) + 1
1749 if detail:
1750 message += '\n' + detail
1751 error = self.reporter.error(message, nodes.literal_block(data, data),
1752 line=startline+offset)
1753 return [error]
1755 def build_table(self, tabledata, tableline, stub_columns=0, widths='auto'):
1756 colwidths, headrows, bodyrows = tabledata
1757 table = nodes.table()
1758 tgroup = nodes.tgroup(cols=len(colwidths), colwidths=widths)
1759 table += tgroup
1760 for colwidth in colwidths:
1761 colspec = nodes.colspec(colwidth=colwidth)
1762 if stub_columns:
1763 colspec.attributes['stub'] = 1
1764 stub_columns -= 1
1765 tgroup += colspec
1766 if headrows:
1767 thead = nodes.thead()
1768 tgroup += thead
1769 for row in headrows:
1770 thead += self.build_table_row(row, tableline)
1771 tbody = nodes.tbody()
1772 tgroup += tbody
1773 for row in bodyrows:
1774 tbody += self.build_table_row(row, tableline)
1775 return table
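# Sketch (abridged, illustrative) of the doctree produced for a 2-column
# table with one header row:
#
#   <table>
#       <tgroup cols="2">
#           <colspec colwidth="...">
#           <colspec colwidth="...">
#           <thead>
#               <row> <entry> ... <entry> ...
#           <tbody>
#               <row> <entry> ... <entry> ...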
1777 def build_table_row(self, rowdata, tableline):
1778 row = nodes.row()
1779 for cell in rowdata:
1780 if cell is None:
1781 continue
1782 morerows, morecols, offset, cellblock = cell
1783 attributes = {}
1784 if morerows:
1785 attributes['morerows'] = morerows
1786 if morecols:
1787 attributes['morecols'] = morecols
1788 entry = nodes.entry(**attributes)
1789 row += entry
1790 if ''.join(cellblock):
1791 self.nested_parse(cellblock, input_offset=tableline+offset,
1792 node=entry)
1793 return row
1796 explicit = Struct()
1797 """Patterns and constants used for explicit markup recognition."""
1799 explicit.patterns = Struct(
1800 target=re.compile(r"""
1801 (
1802 _ # anonymous target
1803 | # *OR*
1804 (?!_) # no underscore at the beginning
1805 (?P<quote>`?) # optional open quote
1806 (?![ `]) # first char. not space or
1807 # backquote
1808 (?P<name> # reference name
1809 .+?
1810 )
1811 %(non_whitespace_escape_before)s
1812 (?P=quote) # close quote if open quote used
1813 )
1814 (?<!(?<!\x00):) # no unescaped colon at end
1815 %(non_whitespace_escape_before)s
1816 [ ]? # optional space
1817 : # end of reference name
1818 ([ ]+|$) # followed by whitespace
1819 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1820 reference=re.compile(r"""
1821 (
1822 (?P<simple>%(simplename)s)_
1823 | # *OR*
1824 ` # open backquote
1825 (?![ ]) # not space
1826 (?P<phrase>.+?) # hyperlink phrase
1827 %(non_whitespace_escape_before)s
1828 `_ # close backquote,
1829 # reference mark
1830 )
1831 $ # end of string
1832 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1833 substitution=re.compile(r"""
1834 (
1835 (?![ ]) # first char. not space
1836 (?P<name>.+?) # substitution text
1837 %(non_whitespace_escape_before)s
1838 \| # close delimiter
1839 )
1840 ([ ]+|$) # followed by whitespace
1841 """ % vars(Inliner),
1842 re.VERBOSE | re.UNICODE),)
1844 def footnote(self, match):
1845 src, srcline = self.state_machine.get_source_and_line()
1846 indented, indent, offset, blank_finish = \
1847 self.state_machine.get_first_known_indented(match.end())
1848 label = match.group(1)
1849 name = normalize_name(label)
1850 footnote = nodes.footnote('\n'.join(indented))
1851 footnote.source = src
1852 footnote.line = srcline
1853 if name[0] == '#': # auto-numbered
1854 name = name[1:] # autonumber label
1855 footnote['auto'] = 1
1856 if name:
1857 footnote['names'].append(name)
1858 self.document.note_autofootnote(footnote)
1859 elif name == '*': # auto-symbol
1860 name = ''
1861 footnote['auto'] = '*'
1862 self.document.note_symbol_footnote(footnote)
1863 else: # manually numbered
1864 footnote += nodes.label('', label)
1865 footnote['names'].append(name)
1866 self.document.note_footnote(footnote)
1867 if name:
1868 self.document.note_explicit_target(footnote, footnote)
1869 else:
1870 self.document.set_id(footnote, footnote)
1871 if indented:
1872 self.nested_parse(indented, input_offset=offset, node=footnote)
1873 return [footnote], blank_finish
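# The label forms dispatched above, as they appear in reStructuredText
# source (illustrative):
#
#   .. [1] manually numbered footnote
#   .. [#] anonymous auto-numbered footnote
#   .. [#label] auto-numbered footnote with a reference name
#   .. [*] auto-symbol footnote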
1875 def citation(self, match):
1876 src, srcline = self.state_machine.get_source_and_line()
1877 indented, indent, offset, blank_finish = \
1878 self.state_machine.get_first_known_indented(match.end())
1879 label = match.group(1)
1880 name = normalize_name(label)
1881 citation = nodes.citation('\n'.join(indented))
1882 citation.source = src
1883 citation.line = srcline
1884 citation += nodes.label('', label)
1885 citation['names'].append(name)
1886 self.document.note_citation(citation)
1887 self.document.note_explicit_target(citation, citation)
1888 if indented:
1889 self.nested_parse(indented, input_offset=offset, node=citation)
1890 return [citation], blank_finish
1892 def hyperlink_target(self, match):
1893 pattern = self.explicit.patterns.target
1894 lineno = self.state_machine.abs_line_number()
1895 block, indent, offset, blank_finish = \
1896 self.state_machine.get_first_known_indented(
1897 match.end(), until_blank=True, strip_indent=False)
1898 blocktext = match.string[:match.end()] + '\n'.join(block)
1899 block = [escape2null(line) for line in block]
1900 escaped = block[0]
1901 blockindex = 0
1902 while True:
1903 targetmatch = pattern.match(escaped)
1904 if targetmatch:
1905 break
1906 blockindex += 1
1907 try:
1908 escaped += block[blockindex]
1909 except IndexError:
1910 raise MarkupError('malformed hyperlink target.')
1911 del block[:blockindex]
1912 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1913 target = self.make_target(block, blocktext, lineno,
1914 targetmatch.group('name'))
1915 return [target], blank_finish
1917 def make_target(self, block, block_text, lineno, target_name):
1918 target_type, data = self.parse_target(block, block_text, lineno)
1919 if target_type == 'refname':
1920 target = nodes.target(block_text, '', refname=normalize_name(data))
1921 target.indirect_reference_name = data
1922 self.add_target(target_name, '', target, lineno)
1923 self.document.note_indirect_target(target)
1924 return target
1925 elif target_type == 'refuri':
1926 target = nodes.target(block_text, '')
1927 self.add_target(target_name, data, target, lineno)
1928 return target
1929 else:
1930 return data
1932 def parse_target(self, block, block_text, lineno):
1933 """
1934 Determine the type of reference of a target.
1936 :Return: A 2-tuple, one of:
1938 - 'refname' and the indirect reference name
1939 - 'refuri' and the URI
1940 - 'malformed' and a system_message node
1941 """
1942 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1943 reference = ' '.join([line.strip() for line in block])
1944 refname = self.is_reference(reference)
1945 if refname:
1946 return 'refname', refname
1947 reference = ''.join([''.join(line.split()) for line in block])
1948 return 'refuri', unescape(reference)
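# Illustrative targets and their classification by parse_target():
#
#   .. _docutils: http://docutils.sourceforge.net/    -> ('refuri', <URI>)
#   .. _shortcut: docutils_                           -> ('refname', 'docutils')
#
# A block whose last line ends with "_" is treated as a possible indirect
# target; anything else is joined, stripped of whitespace, and returned as a
# URI.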
1950 def is_reference(self, reference):
1951 match = self.explicit.patterns.reference.match(
1952 whitespace_normalize_name(reference))
1953 if not match:
1954 return None
1955 return unescape(match.group('simple') or match.group('phrase'))
1957 def add_target(self, targetname, refuri, target, lineno):
1958 target.line = lineno
1959 if targetname:
1960 name = normalize_name(unescape(targetname))
1961 target['names'].append(name)
1962 if refuri:
1963 uri = self.inliner.adjust_uri(refuri)
1964 if uri:
1965 target['refuri'] = uri
1966 else:
1967 raise ApplicationError('problem with URI: %r' % refuri)
1968 self.document.note_explicit_target(target, self.parent)
1969 else: # anonymous target
1970 if refuri:
1971 target['refuri'] = refuri
1972 target['anonymous'] = 1
1973 self.document.note_anonymous_target(target)
1975 def substitution_def(self, match):
1976 pattern = self.explicit.patterns.substitution
1977 src, srcline = self.state_machine.get_source_and_line()
1978 block, indent, offset, blank_finish = \
1979 self.state_machine.get_first_known_indented(match.end(),
1980 strip_indent=False)
1981 blocktext = (match.string[:match.end()] + '\n'.join(block))
1982 block.disconnect()
1983 escaped = escape2null(block[0].rstrip())
1984 blockindex = 0
1985 while True:
1986 subdefmatch = pattern.match(escaped)
1987 if subdefmatch:
1988 break
1989 blockindex += 1
1990 try:
1991 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
1992 except IndexError:
1993 raise MarkupError('malformed substitution definition.')
1994 del block[:blockindex] # strip out the substitution marker
1995 block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
1996 if not block[0]:
1997 del block[0]
1998 offset += 1
1999 while block and not block[-1].strip():
2000 block.pop()
2001 subname = subdefmatch.group('name')
2002 substitution_node = nodes.substitution_definition(blocktext)
2003 substitution_node.source = src
2004 substitution_node.line = srcline
2005 if not block:
2006 msg = self.reporter.warning(
2007 'Substitution definition "%s" missing contents.' % subname,
2008 nodes.literal_block(blocktext, blocktext),
2009 source=src, line=srcline)
2010 return [msg], blank_finish
2011 block[0] = block[0].strip()
2012 substitution_node['names'].append(
2013 nodes.whitespace_normalize_name(subname))
2014 new_abs_offset, blank_finish = self.nested_list_parse(
2015 block, input_offset=offset, node=substitution_node,
2016 initial_state='SubstitutionDef', blank_finish=blank_finish)
2017 i = 0
2018 for node in substitution_node[:]:
2019 if not (isinstance(node, nodes.Inline) or
2020 isinstance(node, nodes.Text)):
2021 self.parent += substitution_node[i]
2022 del substitution_node[i]
2023 else:
2024 i += 1
2025 for node in substitution_node.traverse(nodes.Element):
2026 if self.disallowed_inside_substitution_definitions(node):
2027 pformat = nodes.literal_block('', node.pformat().rstrip())
2028 msg = self.reporter.error(
2029 'Substitution definition contains illegal element:',
2030 pformat, nodes.literal_block(blocktext, blocktext),
2031 source=src, line=srcline)
2032 return [msg], blank_finish
2033 if len(substitution_node) == 0:
2034 msg = self.reporter.warning(
2035 'Substitution definition "%s" empty or invalid.' % subname,
2036 nodes.literal_block(blocktext, blocktext),
2037 source=src, line=srcline)
2038 return [msg], blank_finish
2039 self.document.note_substitution_def(
2040 substitution_node, subname, self.parent)
2041 return [substitution_node], blank_finish
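# A substitution definition embeds a directive, e.g. (hypothetical file name):
#
#   .. |logo| image:: logo.png
#
# This yields a substitution_definition node named "logo" whose children are
# produced by the embedded "image" directive, parsed via the
# 'SubstitutionDef' state defined later in this module.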
2043 def disallowed_inside_substitution_definitions(self, node):
2044 if (node['ids'] or
2045 isinstance(node, nodes.reference) and node.get('anonymous') or
2046 isinstance(node, nodes.footnote_reference) and node.get('auto')):
2047 return 1
2048 else:
2049 return 0
2051 def directive(self, match, **option_presets):
2052 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2053 type_name = match.group(1)
2054 directive_class, messages = directives.directive(
2055 type_name, self.memo.language, self.document)
2056 self.parent += messages
2057 if directive_class:
2058 return self.run_directive(
2059 directive_class, match, type_name, option_presets)
2060 else:
2061 return self.unknown_directive(type_name)
2063 def run_directive(self, directive, match, type_name, option_presets):
2064 """
2065 Parse a directive then run its directive function.
2067 Parameters:
2069 - `directive`: The class implementing the directive. Must be
2070 a subclass of `rst.Directive`.
2072 - `match`: A regular expression match object which matched the first
2073 line of the directive.
2075 - `type_name`: The directive name, as used in the source text.
2077 - `option_presets`: A dictionary of preset options, defaults for the
2078 directive options. Currently, only an "alt" option is passed by
2079 substitution definitions (value: the substitution name), which may
2080 be used by an embedded image directive.
2082 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2083 """
2084 if isinstance(directive, (FunctionType, MethodType)):
2085 from docutils.parsers.rst import convert_directive_function
2086 directive = convert_directive_function(directive)
2087 lineno = self.state_machine.abs_line_number()
2088 initial_line_offset = self.state_machine.line_offset
2089 indented, indent, line_offset, blank_finish \
2090 = self.state_machine.get_first_known_indented(match.end(),
2091 strip_top=0)
2092 block_text = '\n'.join(self.state_machine.input_lines[
2093 initial_line_offset : self.state_machine.line_offset + 1])
2094 try:
2095 arguments, options, content, content_offset = (
2096 self.parse_directive_block(indented, line_offset,
2097 directive, option_presets))
2098 except MarkupError, detail:
2099 error = self.reporter.error(
2100 'Error in "%s" directive:\n%s.' % (type_name,
2101 ' '.join(detail.args)),
2102 nodes.literal_block(block_text, block_text), line=lineno)
2103 return [error], blank_finish
2104 directive_instance = directive(
2105 type_name, arguments, options, content, lineno,
2106 content_offset, block_text, self, self.state_machine)
2107 try:
2108 result = directive_instance.run()
2109 except docutils.parsers.rst.DirectiveError, error:
2110 msg_node = self.reporter.system_message(error.level, error.msg,
2111 line=lineno)
2112 msg_node += nodes.literal_block(block_text, block_text)
2113 result = [msg_node]
2114 assert isinstance(result, list), \
2115 'Directive "%s" must return a list of nodes.' % type_name
2116 for i in range(len(result)):
2117 assert isinstance(result[i], nodes.Node), \
2118 ('Directive "%s" returned non-Node object (index %s): %r'
2119 % (type_name, i, result[i]))
2120 return (result,
2121 blank_finish or self.state_machine.is_next_line_blank())
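# Minimal sketch of a directive class as expected by run_directive() above
# (hypothetical directive; see docutils.parsers.rst.Directive for the API):
#
#   class NoteToSelf(Directive):
#       has_content = True
#       def run(self):
#           node = nodes.note()
#           self.state.nested_parse(self.content, self.content_offset, node)
#           return [node]        # must be a list of nodes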
2123 def parse_directive_block(self, indented, line_offset, directive,
2124 option_presets):
2125 option_spec = directive.option_spec
2126 has_content = directive.has_content
2127 if indented and not indented[0].strip():
2128 indented.trim_start()
2129 line_offset += 1
2130 while indented and not indented[-1].strip():
2131 indented.trim_end()
2132 if indented and (directive.required_arguments
2133 or directive.optional_arguments
2134 or option_spec):
2135 for i, line in enumerate(indented):
2136 if not line.strip():
2137 break
2138 else:
2139 i += 1
2140 arg_block = indented[:i]
2141 content = indented[i+1:]
2142 content_offset = line_offset + i + 1
2143 else:
2144 content = indented
2145 content_offset = line_offset
2146 arg_block = []
2147 if option_spec:
2148 options, arg_block = self.parse_directive_options(
2149 option_presets, option_spec, arg_block)
2150 else:
2151 options = {}
2152 if arg_block and not (directive.required_arguments
2153 or directive.optional_arguments):
2154 content = arg_block + indented[i:]
2155 content_offset = line_offset
2156 arg_block = []
2157 while content and not content[0].strip():
2158 content.trim_start()
2159 content_offset += 1
2160 if directive.required_arguments or directive.optional_arguments:
2161 arguments = self.parse_directive_arguments(
2162 directive, arg_block)
2163 else:
2164 arguments = []
2165 if content and not has_content:
2166 raise MarkupError('no content permitted')
2167 return (arguments, options, content, content_offset)
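# Worked example (hypothetical "figure" directive block): the indented block
#
#   picture.png              <- lines up to the first blank line: arg_block
#   :width: 200px            <- field-marker lines inside arg_block: options
#                             (blank line)
#   The caption paragraph.   <- remainder: content, starting at content_offset
#
# is split into arguments ['picture.png'], options {'width': ...} and the
# caption paragraph as content.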
2169 def parse_directive_options(self, option_presets, option_spec, arg_block):
2170 options = option_presets.copy()
2171 for i, line in enumerate(arg_block):
2172 if re.match(Body.patterns['field_marker'], line):
2173 opt_block = arg_block[i:]
2174 arg_block = arg_block[:i]
2175 break
2176 else:
2177 opt_block = []
2178 if opt_block:
2179 success, data = self.parse_extension_options(option_spec,
2180 opt_block)
2181 if success: # data is a dict of options
2182 options.update(data)
2183 else: # data is an error string
2184 raise MarkupError(data)
2185 return options, arg_block
2187 def parse_directive_arguments(self, directive, arg_block):
2188 required = directive.required_arguments
2189 optional = directive.optional_arguments
2190 arg_text = '\n'.join(arg_block)
2191 arguments = arg_text.split()
2192 if len(arguments) < required:
2193 raise MarkupError('%s argument(s) required, %s supplied'
2194 % (required, len(arguments)))
2195 elif len(arguments) > required + optional:
2196 if directive.final_argument_whitespace:
2197 arguments = arg_text.split(None, required + optional - 1)
2198 else:
2199 raise MarkupError(
2200 'maximum %s argument(s) allowed, %s supplied'
2201 % (required + optional, len(arguments)))
2202 return arguments
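# Worked example (hypothetical directive settings): with required_arguments=1,
# optional_arguments=1 and final_argument_whitespace=True, the argument text
# "first second third fourth" is re-split into
# ['first', 'second third fourth']; with final_argument_whitespace=False the
# same text raises MarkupError ("maximum 2 argument(s) allowed, 4 supplied").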
2204 def parse_extension_options(self, option_spec, datalines):
2205 """
2206 Parse `datalines` for a field list containing extension options
2207 matching `option_spec`.
2209 :Parameters:
2210 - `option_spec`: a mapping of option name to conversion
2211 function, which should raise an exception on bad input.
2212 - `datalines`: a list of input strings.
2214 :Return:
2215 - Success value, 1 or 0.
2216 - An option dictionary on success, an error string on failure.
2217 """
2218 node = nodes.field_list()
2219 newline_offset, blank_finish = self.nested_list_parse(
2220 datalines, 0, node, initial_state='ExtensionOptions',
2221 blank_finish=True)
2222 if newline_offset != len(datalines): # incomplete parse of block
2223 return 0, 'invalid option block'
2224 try:
2225 options = utils.extract_extension_options(node, option_spec)
2226 except KeyError, detail:
2227 return 0, ('unknown option: "%s"' % detail.args[0])
2228 except (ValueError, TypeError), detail:
2229 return 0, ('invalid option value: %s' % ' '.join(detail.args))
2230 except utils.ExtensionOptionError, detail:
2231 return 0, ('invalid option data: %s' % ' '.join(detail.args))
2232 if blank_finish:
2233 return 1, options
2234 else:
2235 return 0, 'option data incompletely parsed'
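# Rough usage sketch (hypothetical option_spec; conversion functions from
# docutils.parsers.rst.directives):
#
#   option_spec = {'name': directives.unchanged,
#                  'level': directives.nonnegative_int}
#   success, data = self.parse_extension_options(
#       option_spec, [':name: example', ':level: 2'])
#   # success == 1 and data == {'name': 'example', 'level': 2} on clean input;
#   # on any error, success == 0 and data is an error string.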
2237 def unknown_directive(self, type_name):
2238 lineno = self.state_machine.abs_line_number()
2239 indented, indent, offset, blank_finish = \
2240 self.state_machine.get_first_known_indented(0, strip_indent=False)
2241 text = '\n'.join(indented)
2242 error = self.reporter.error(
2243 'Unknown directive type "%s".' % type_name,
2244 nodes.literal_block(text, text), line=lineno)
2245 return [error], blank_finish
2247 def comment(self, match):
2248 if not match.string[match.end():].strip() \
2249 and self.state_machine.is_next_line_blank(): # an empty comment?
2250 return [nodes.comment()], 1 # "A tiny but practical wart."
2251 indented, indent, offset, blank_finish = \
2252 self.state_machine.get_first_known_indented(match.end())
2253 while indented and not indented[-1].strip():
2254 indented.trim_end()
2255 text = '\n'.join(indented)
2256 return [nodes.comment(text, text)], blank_finish
2258 explicit.constructs = [
2259 (footnote,
2260 re.compile(r"""
2261 \.\.[ ]+ # explicit markup start
2262 \[
2263 ( # footnote label:
2264 [0-9]+ # manually numbered footnote
2265 | # *OR*
2266 \# # anonymous auto-numbered footnote
2267 | # *OR*
2268 \#%s # auto-numbered footnote with a label
2269 | # *OR*
2270 \* # auto-symbol footnote
2271 )
2272 \]
2273 ([ ]+|$) # whitespace or end of line
2274 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2275 (citation,
2276 re.compile(r"""
2277 \.\.[ ]+ # explicit markup start
2278 \[(%s)\] # citation label
2279 ([ ]+|$) # whitespace or end of line
2280 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2281 (hyperlink_target,
2282 re.compile(r"""
2283 \.\.[ ]+ # explicit markup start
2284 _ # target indicator
2285 (?![ ]|$) # first char. not space or EOL
2286 """, re.VERBOSE | re.UNICODE)),
2287 (substitution_def,
2288 re.compile(r"""
2289 \.\.[ ]+ # explicit markup start
2290 \| # substitution indicator
2291 (?![ ]|$) # first char. not space or EOL
2292 """, re.VERBOSE | re.UNICODE)),
2293 (directive,
2294 re.compile(r"""
2295 \.\.[ ]+ # explicit markup start
2296 (%s) # directive name
2297 [ ]? # optional space
2298 :: # directive delimiter
2299 ([ ]+|$) # whitespace or end of line
2300 """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
2302 def explicit_markup(self, match, context, next_state):
2303 """Footnotes, hyperlink targets, directives, comments."""
2304 nodelist, blank_finish = self.explicit_construct(match)
2305 self.parent += nodelist
2306 self.explicit_list(blank_finish)
2307 return [], next_state, []
2309 def explicit_construct(self, match):
2310 """Determine which explicit construct this is, parse & return it."""
2311 errors = []
2312 for method, pattern in self.explicit.constructs:
2313 expmatch = pattern.match(match.string)
2314 if expmatch:
2315 try:
2316 return method(self, expmatch)
2317 except MarkupError, error:
2318 lineno = self.state_machine.abs_line_number()
2319 message = ' '.join(error.args)
2320 errors.append(self.reporter.warning(message, line=lineno))
2321 break
2322 nodelist, blank_finish = self.comment(match)
2323 return nodelist + errors, blank_finish
2325 def explicit_list(self, blank_finish):
2326 """
2327 Create a nested state machine for a series of explicit markup
2328 constructs (including anonymous hyperlink targets).
2329 """
2330 offset = self.state_machine.line_offset + 1 # next line
2331 newline_offset, blank_finish = self.nested_list_parse(
2332 self.state_machine.input_lines[offset:],
2333 input_offset=self.state_machine.abs_line_offset() + 1,
2334 node=self.parent, initial_state='Explicit',
2335 blank_finish=blank_finish,
2336 match_titles=self.state_machine.match_titles)
2337 self.goto_line(newline_offset)
2338 if not blank_finish:
2339 self.parent += self.unindent_warning('Explicit markup')
2341 def anonymous(self, match, context, next_state):
2342 """Anonymous hyperlink targets."""
2343 nodelist, blank_finish = self.anonymous_target(match)
2344 self.parent += nodelist
2345 self.explicit_list(blank_finish)
2346 return [], next_state, []
2348 def anonymous_target(self, match):
2349 lineno = self.state_machine.abs_line_number()
2350 block, indent, offset, blank_finish \
2351 = self.state_machine.get_first_known_indented(match.end(),
2352 until_blank=True)
2353 blocktext = match.string[:match.end()] + '\n'.join(block)
2354 block = [escape2null(line) for line in block]
2355 target = self.make_target(block, blocktext, lineno, '')
2356 return [target], blank_finish
2358 def line(self, match, context, next_state):
2359 """Section title overline or transition marker."""
2360 if self.state_machine.match_titles:
2361 return [match.string], 'Line', []
2362 elif match.string.strip() == '::':
2363 raise statemachine.TransitionCorrection('text')
2364 elif len(match.string.strip()) < 4:
2365 msg = self.reporter.info(
2366 'Unexpected possible title overline or transition.\n'
2367 "Treating it as ordinary text because it's so short.",
2368 line=self.state_machine.abs_line_number())
2369 self.parent += msg
2370 raise statemachine.TransitionCorrection('text')
2371 else:
2372 blocktext = self.state_machine.line
2373 msg = self.reporter.severe(
2374 'Unexpected section title or transition.',
2375 nodes.literal_block(blocktext, blocktext),
2376 line=self.state_machine.abs_line_number())
2377 self.parent += msg
2378 return [], next_state, []
2380 def text(self, match, context, next_state):
2381 """Titles, definition lists, paragraphs."""
2382 return [match.string], 'Text', []
2385 class RFC2822Body(Body):
2387 """
2388 RFC2822 headers are only valid as the first constructs in documents. As
2389 soon as anything else appears, the `Body` state should take over.
2390 """
2392 patterns = Body.patterns.copy() # can't modify the original
2393 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2394 initial_transitions = [(name, 'Body')
2395 for name in Body.initial_transitions]
2396 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2398 def rfc2822(self, match, context, next_state):
2399 """RFC2822-style field list item."""
2400 fieldlist = nodes.field_list(classes=['rfc2822'])
2401 self.parent += fieldlist
2402 field, blank_finish = self.rfc2822_field(match)
2403 fieldlist += field
2404 offset = self.state_machine.line_offset + 1 # next line
2405 newline_offset, blank_finish = self.nested_list_parse(
2406 self.state_machine.input_lines[offset:],
2407 input_offset=self.state_machine.abs_line_offset() + 1,
2408 node=fieldlist, initial_state='RFC2822List',
2409 blank_finish=blank_finish)
2410 self.goto_line(newline_offset)
2411 if not blank_finish:
2412 self.parent += self.unindent_warning(
2413 'RFC2822-style field list')
2414 return [], next_state, []
2416 def rfc2822_field(self, match):
2417 name = match.string[:match.string.find(':')]
2418 indented, indent, line_offset, blank_finish = \
2419 self.state_machine.get_first_known_indented(match.end(),
2420 until_blank=True)
2421 fieldnode = nodes.field()
2422 fieldnode += nodes.field_name(name, name)
2423 fieldbody = nodes.field_body('\n'.join(indented))
2424 fieldnode += fieldbody
2425 if indented:
2426 self.nested_parse(indented, input_offset=line_offset,
2427 node=fieldbody)
2428 return fieldnode, blank_finish
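# Example (illustrative) of input handled when parsing starts in the
# RFC2822Body state; such headers are only valid at the very beginning of a
# document:
#
#   Author: A. Nonymous
#   Date: 2002-08-18
#   Version: 1
#
# Each "Name: value" line becomes a field of a field_list carrying the
# "rfc2822" class.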
2431 class SpecializedBody(Body):
2433 """
2434 Superclass for second and subsequent compound element members. Compound
2435 elements are lists and list-like constructs.
2437 All transition methods are disabled (redefined as `invalid_input`).
2438 Override individual methods in subclasses to re-enable.
2440 For example, once an initial bullet list item, say, is recognized, the
2441 `BulletList` subclass takes over, with a "bullet_list" node as its
2442 container. Upon encountering the initial bullet list item, `Body.bullet`
2443 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2444 starts up a nested parsing session with `BulletList` as the initial state.
2445 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2446 as only bullet list items are encountered, they are parsed and inserted
2447 into the container. The first construct which is *not* a bullet list item
2448 triggers the `invalid_input` method, which ends the nested parse and
2449 closes the container. `BulletList` needs to recognize input that is
2450 invalid in the context of a bullet list, which means everything *other
2451 than* bullet list items, so it inherits the transition list created in
2452 `Body`.
2453 """
2455 def invalid_input(self, match=None, context=None, next_state=None):
2456 """Not a compound element member. Abort this state machine."""
2457 self.state_machine.previous_line() # back up so parent SM can reassess
2458 raise EOFError
2460 indent = invalid_input
2461 bullet = invalid_input
2462 enumerator = invalid_input
2463 field_marker = invalid_input
2464 option_marker = invalid_input
2465 doctest = invalid_input
2466 line_block = invalid_input
2467 grid_table_top = invalid_input
2468 simple_table_top = invalid_input
2469 explicit_markup = invalid_input
2470 anonymous = invalid_input
2471 line = invalid_input
2472 text = invalid_input
2475 class BulletList(SpecializedBody):
2477 """Second and subsequent bullet_list list_items."""
2479 def bullet(self, match, context, next_state):
2480 """Bullet list item."""
2481 if match.string[0] != self.parent['bullet']:
2482 # different bullet: new list
2483 self.invalid_input()
2484 listitem, blank_finish = self.list_item(match.end())
2485 self.parent += listitem
2486 self.blank_finish = blank_finish
2487 return [], next_state, []
2490 class DefinitionList(SpecializedBody):
2492 """Second and subsequent definition_list_items."""
2494 def text(self, match, context, next_state):
2495 """Definition lists."""
2496 return [match.string], 'Definition', []
2499 class EnumeratedList(SpecializedBody):
2501 """Second and subsequent enumerated_list list_items."""
2503 def enumerator(self, match, context, next_state):
2504 """Enumerated list item."""
2505 format, sequence, text, ordinal = self.parse_enumerator(
2506 match, self.parent['enumtype'])
2507 if ( format != self.format
2508 or (sequence != '#' and (sequence != self.parent['enumtype']
2509 or self.auto
2510 or ordinal != (self.lastordinal + 1)))
2511 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2512 # different enumeration: new list
2513 self.invalid_input()
2514 if sequence == '#':
2515 self.auto = 1
2516 listitem, blank_finish = self.list_item(match.end())
2517 self.parent += listitem
2518 self.blank_finish = blank_finish
2519 self.lastordinal = ordinal
2520 return [], next_state, []
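# For illustration: an item continues the current enumerated list only if its
# enumerator has the same format and sequence and the expected next ordinal
# (or is "#", i.e. auto-enumerated):
#
#   1. first item
#   2. second item      continues the list
#   a) third item       different format/sequence: invalid_input() ends the
#                       nested parse and a new list is begun by `Body`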
2523 class FieldList(SpecializedBody):
2525 """Second and subsequent field_list fields."""
2527 def field_marker(self, match, context, next_state):
2528 """Field list field."""
2529 field, blank_finish = self.field(match)
2530 self.parent += field
2531 self.blank_finish = blank_finish
2532 return [], next_state, []
2535 class OptionList(SpecializedBody):
2537 """Second and subsequent option_list option_list_items."""
2539 def option_marker(self, match, context, next_state):
2540 """Option list item."""
2541 try:
2542 option_list_item, blank_finish = self.option_list_item(match)
2543 except MarkupError:
2544 self.invalid_input()
2545 self.parent += option_list_item
2546 self.blank_finish = blank_finish
2547 return [], next_state, []
2550 class RFC2822List(SpecializedBody, RFC2822Body):
2552 """Second and subsequent RFC2822-style field_list fields."""
2554 patterns = RFC2822Body.patterns
2555 initial_transitions = RFC2822Body.initial_transitions
2557 def rfc2822(self, match, context, next_state):
2558 """RFC2822-style field list item."""
2559 field, blank_finish = self.rfc2822_field(match)
2560 self.parent += field
2561 self.blank_finish = blank_finish
2562 return [], 'RFC2822List', []
2564 blank = SpecializedBody.invalid_input
2567 class ExtensionOptions(FieldList):
2569 """
2570 Parse field_list fields for extension options.
2572 No nested parsing is done (including inline markup parsing).
2573 """
2575 def parse_field_body(self, indented, offset, node):
2576 """Override `Body.parse_field_body` for simpler parsing."""
2577 lines = []
2578 for line in list(indented) + ['']:
2579 if line.strip():
2580 lines.append(line)
2581 elif lines:
2582 text = '\n'.join(lines)
2583 node += nodes.paragraph(text, text)
2584 lines = []
2587 class LineBlock(SpecializedBody):
2589 """Second and subsequent lines of a line_block."""
2591 blank = SpecializedBody.invalid_input
2593 def line_block(self, match, context, next_state):
2594 """New line of line block."""
2595 lineno = self.state_machine.abs_line_number()
2596 line, messages, blank_finish = self.line_block_line(match, lineno)
2597 self.parent += line
2598 self.parent.parent += messages
2599 self.blank_finish = blank_finish
2600 return [], next_state, []
2603 class Explicit(SpecializedBody):
2605 """Second and subsequent explicit markup construct."""
2607 def explicit_markup(self, match, context, next_state):
2608 """Footnotes, hyperlink targets, directives, comments."""
2609 nodelist, blank_finish = self.explicit_construct(match)
2610 self.parent += nodelist
2611 self.blank_finish = blank_finish
2612 return [], next_state, []
2614 def anonymous(self, match, context, next_state):
2615 """Anonymous hyperlink targets."""
2616 nodelist, blank_finish = self.anonymous_target(match)
2617 self.parent += nodelist
2618 self.blank_finish = blank_finish
2619 return [], next_state, []
2621 blank = SpecializedBody.invalid_input
2624 class SubstitutionDef(Body):
2626 """
2627 Parser for the contents of a substitution_definition element.
2628 """
2630 patterns = {
2631 'embedded_directive': re.compile(r'(%s)::( +|$)'
2632 % Inliner.simplename, re.UNICODE),
2633 'text': r''}
2634 initial_transitions = ['embedded_directive', 'text']
2636 def embedded_directive(self, match, context, next_state):
2637 nodelist, blank_finish = self.directive(match,
2638 alt=self.parent['names'][0])
2639 self.parent += nodelist
2640 if not self.state_machine.at_eof():
2641 self.blank_finish = blank_finish
2642 raise EOFError
2644 def text(self, match, context, next_state):
2645 if not self.state_machine.at_eof():
2646 self.blank_finish = self.state_machine.is_next_line_blank()
2647 raise EOFError
2650 class Text(RSTState):
2652 """
2653 Classifier of second line of a text block.
2655 Could be a paragraph, a definition list item, or a title.
2656 """
2658 patterns = {'underline': Body.patterns['line'],
2659 'text': r''}
2660 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2662 def blank(self, match, context, next_state):
2663 """End of paragraph."""
2664 # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2665 paragraph, literalnext = self.paragraph(
2666 context, self.state_machine.abs_line_number() - 1)
2667 self.parent += paragraph
2668 if literalnext:
2669 self.parent += self.literal_block()
2670 return [], 'Body', []
2672 def eof(self, context):
2673 if context:
2674 self.blank(None, context, None)
2675 return []
2677 def indent(self, match, context, next_state):
2678 """Definition list item."""
2679 definitionlist = nodes.definition_list()
2680 definitionlistitem, blank_finish = self.definition_list_item(context)
2681 definitionlist += definitionlistitem
2682 self.parent += definitionlist
2683 offset = self.state_machine.line_offset + 1 # next line
2684 newline_offset, blank_finish = self.nested_list_parse(
2685 self.state_machine.input_lines[offset:],
2686 input_offset=self.state_machine.abs_line_offset() + 1,
2687 node=definitionlist, initial_state='DefinitionList',
2688 blank_finish=blank_finish, blank_finish_state='Definition')
2689 self.goto_line(newline_offset)
2690 if not blank_finish:
2691 self.parent += self.unindent_warning('Definition list')
2692 return [], 'Body', []
2694 def underline(self, match, context, next_state):
2695 """Section title."""
2696 lineno = self.state_machine.abs_line_number()
2697 title = context[0].rstrip()
2698 underline = match.string.rstrip()
2699 source = title + '\n' + underline
2700 messages = []
2701 if column_width(title) > len(underline):
2702 if len(underline) < 4:
2703 if self.state_machine.match_titles:
2704 msg = self.reporter.info(
2705 'Possible title underline, too short for the title.\n'
2706 "Treating it as ordinary text because it's so short.",
2707 line=lineno)
2708 self.parent += msg
2709 raise statemachine.TransitionCorrection('text')
2710 else:
2711 blocktext = context[0] + '\n' + self.state_machine.line
2712 msg = self.reporter.warning('Title underline too short.',
2713 nodes.literal_block(blocktext, blocktext), line=lineno)
2714 messages.append(msg)
2715 if not self.state_machine.match_titles:
2716 blocktext = context[0] + '\n' + self.state_machine.line
2717 # We need get_source_and_line() here to report correctly
2718 src, srcline = self.state_machine.get_source_and_line()
2719 # TODO: why is abs_line_number() == srcline+1
2720 # if the error is in a table (try with test_tables.py)?
2721 # print "get_source_and_line", srcline
2722 # print "abs_line_number", self.state_machine.abs_line_number()
2723 msg = self.reporter.severe('Unexpected section title.',
2724 nodes.literal_block(blocktext, blocktext),
2725 source=src, line=srcline)
2726 self.parent += messages
2727 self.parent += msg
2728 return [], next_state, []
2729 style = underline[0]
2730 context[:] = []
2731 self.section(title, source, style, lineno - 1, messages)
2732 return [], next_state, []
2734 def text(self, match, context, next_state):
2735 """Paragraph."""
2736 startline = self.state_machine.abs_line_number() - 1
2737 msg = None
2738 try:
2739 block = self.state_machine.get_text_block(flush_left=True)
2740 except statemachine.UnexpectedIndentationError, err:
2741 block, src, srcline = err.args
2742 msg = self.reporter.error('Unexpected indentation.',
2743 source=src, line=srcline)
2744 lines = context + list(block)
2745 paragraph, literalnext = self.paragraph(lines, startline)
2746 self.parent += paragraph
2747 self.parent += msg
2748 if literalnext:
2749 try:
2750 self.state_machine.next_line()
2751 except EOFError:
2752 pass
2753 self.parent += self.literal_block()
2754 return [], next_state, []
2756 def literal_block(self):
2757 """Return a list of nodes."""
2758 indented, indent, offset, blank_finish = \
2759 self.state_machine.get_indented()
2760 while indented and not indented[-1].strip():
2761 indented.trim_end()
2762 if not indented:
2763 return self.quoted_literal_block()
2764 data = '\n'.join(indented)
2765 literal_block = nodes.literal_block(data, data)
2766 literal_block.line = offset + 1
2767 nodelist = [literal_block]
2768 if not blank_finish:
2769 nodelist.append(self.unindent_warning('Literal block'))
2770 return nodelist
2772 def quoted_literal_block(self):
2773 abs_line_offset = self.state_machine.abs_line_offset()
2774 offset = self.state_machine.line_offset
2775 parent_node = nodes.Element()
2776 new_abs_offset = self.nested_parse(
2777 self.state_machine.input_lines[offset:],
2778 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2779 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2780 'initial_state': 'QuotedLiteralBlock'})
2781 self.goto_line(new_abs_offset)
2782 return parent_node.children
2784 def definition_list_item(self, termline):
2785 indented, indent, line_offset, blank_finish = \
2786 self.state_machine.get_indented()
2787 itemnode = nodes.definition_list_item(
2788 '\n'.join(termline + list(indented)))
2789 lineno = self.state_machine.abs_line_number() - 1
2790 (itemnode.source,
2791 itemnode.line) = self.state_machine.get_source_and_line(lineno)
2792 termlist, messages = self.term(termline, lineno)
2793 itemnode += termlist
2794 definition = nodes.definition('', *messages)
2795 itemnode += definition
2796 if termline[0][-2:] == '::':
2797 definition += self.reporter.info(
2798 'Blank line missing before literal block (after the "::")? '
2799 'Interpreted as a definition list item.',
2800 line=lineno+1)
2801 self.nested_parse(indented, input_offset=line_offset, node=definition)
2802 return itemnode, blank_finish
2804 classifier_delimiter = re.compile(' +: +')
2806 def term(self, lines, lineno):
2807 """Return a definition_list's term and optional classifiers."""
2808 assert len(lines) == 1
2809 text_nodes, messages = self.inline_text(lines[0], lineno)
2810 term_node = nodes.term()
2811 (term_node.source,
2812 term_node.line) = self.state_machine.get_source_and_line(lineno)
2813 term_node.rawsource = unescape(lines[0])
2814 node_list = [term_node]
2815 for i in range(len(text_nodes)):
2816 node = text_nodes[i]
2817 if isinstance(node, nodes.Text):
2818 parts = self.classifier_delimiter.split(node.rawsource)
2819 if len(parts) == 1:
2820 node_list[-1] += node
2821 else:
2823 node_list[-1] += nodes.Text(parts[0].rstrip())
2824 for part in parts[1:]:
2825 classifier_node = nodes.classifier('', part)
2826 node_list.append(classifier_node)
2827 else:
2828 node_list[-1] += node
2829 return node_list, messages
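# Classifier handling, for illustration: a definition-list term line such as
#
#   term : classifier one : classifier two
#
# is split on " : " (classifier_delimiter), yielding a term node "term"
# followed by classifier nodes "classifier one" and "classifier two".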
2832 class SpecializedText(Text):
2834 """
2835 Superclass for second and subsequent lines of Text-variants.
2837 All transition methods are disabled. Override individual methods in
2838 subclasses to re-enable.
2839 """
2841 def eof(self, context):
2842 """Incomplete construct."""
2843 return []
2845 def invalid_input(self, match=None, context=None, next_state=None):
2846 """Not a compound element member. Abort this state machine."""
2847 raise EOFError
2849 blank = invalid_input
2850 indent = invalid_input
2851 underline = invalid_input
2852 text = invalid_input
2855 class Definition(SpecializedText):
2857 """Second line of potential definition_list_item."""
2859 def eof(self, context):
2860 """Not a definition."""
2861 self.state_machine.previous_line(2) # so parent SM can reassess
2862 return []
2864 def indent(self, match, context, next_state):
2865 """Definition list item."""
2866 itemnode, blank_finish = self.definition_list_item(context)
2867 self.parent += itemnode
2868 self.blank_finish = blank_finish
2869 return [], 'DefinitionList', []
2872 class Line(SpecializedText):
2874 """
2875 Second line of over- & underlined section title or transition marker.
2876 """
2878 eofcheck = 1 # @@@ ???
2879 """Set to 0 while parsing sections, so that we don't catch the EOF."""
2881 def eof(self, context):
2882 """Transition marker at end of section or document."""
2883 marker = context[0].strip()
2884 if self.memo.section_bubble_up_kludge:
2885 self.memo.section_bubble_up_kludge = False
2886 elif len(marker) < 4:
2887 self.state_correction(context)
2888 if self.eofcheck: # ignore EOFError with sections
2889 lineno = self.state_machine.abs_line_number() - 1
2890 transition = nodes.transition(rawsource=context[0])
2891 transition.line = lineno
2892 self.parent += transition
2893 self.eofcheck = 1
2894 return []
2896 def blank(self, match, context, next_state):
2897 """Transition marker."""
2898 src, srcline = self.state_machine.get_source_and_line()
2899 marker = context[0].strip()
2900 if len(marker) < 4:
2901 self.state_correction(context)
2902 transition = nodes.transition(rawsource=marker)
2903 transition.source = src
2904 transition.line = srcline - 1
2905 self.parent += transition
2906 return [], 'Body', []
2908 def text(self, match, context, next_state):
2909 """Potential over- & underlined title."""
2910 lineno = self.state_machine.abs_line_number() - 1
2911 overline = context[0]
2912 title = match.string
2913 underline = ''
2914 try:
2915 underline = self.state_machine.next_line()
2916 except EOFError:
2917 blocktext = overline + '\n' + title
2918 if len(overline.rstrip()) < 4:
2919 self.short_overline(context, blocktext, lineno, 2)
2920 else:
2921 msg = self.reporter.severe(
2922 'Incomplete section title.',
2923 nodes.literal_block(blocktext, blocktext),
2924 line=lineno)
2925 self.parent += msg
2926 return [], 'Body', []
2927 source = '%s\n%s\n%s' % (overline, title, underline)
2928 overline = overline.rstrip()
2929 underline = underline.rstrip()
2930 if not self.transitions['underline'][0].match(underline):
2931 blocktext = overline + '\n' + title + '\n' + underline
2932 if len(overline.rstrip()) < 4:
2933 self.short_overline(context, blocktext, lineno, 2)
2934 else:
2935 msg = self.reporter.severe(
2936 'Missing matching underline for section title overline.',
2937 nodes.literal_block(source, source),
2938 line=lineno)
2939 self.parent += msg
2940 return [], 'Body', []
2941 elif overline != underline:
2942 blocktext = overline + '\n' + title + '\n' + underline
2943 if len(overline.rstrip()) < 4:
2944 self.short_overline(context, blocktext, lineno, 2)
2945 else:
2946 msg = self.reporter.severe(
2947 'Title overline & underline mismatch.',
2948 nodes.literal_block(source, source),
2949 line=lineno)
2950 self.parent += msg
2951 return [], 'Body', []
2952 title = title.rstrip()
2953 messages = []
2954 if column_width(title) > len(overline):
2955 blocktext = overline + '\n' + title + '\n' + underline
2956 if len(overline.rstrip()) < 4:
2957 self.short_overline(context, blocktext, lineno, 2)
2958 else:
2959 msg = self.reporter.warning(
2960 'Title overline too short.',
2961 nodes.literal_block(source, source),
2962 line=lineno)
2963 messages.append(msg)
2964 style = (overline[0], underline[0])
2965 self.eofcheck = 0 # @@@ not sure this is correct
2966 self.section(title.lstrip(), source, style, lineno + 1, messages)
2967 self.eofcheck = 1
2968 return [], 'Body', []
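# For illustration, the over- & underlined title form handled above:
#
#   =========
#    Section
#   =========
#
# The overline and underline must match exactly; an overline shorter than the
# title text (but at least 4 characters long) is reported as
# "Title overline too short."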
2970 indent = text # indented title
2972 def underline(self, match, context, next_state):
2973 overline = context[0]
2974 blocktext = overline + '\n' + self.state_machine.line
2975 lineno = self.state_machine.abs_line_number() - 1
2976 if len(overline.rstrip()) < 4:
2977 self.short_overline(context, blocktext, lineno, 1)
2978 msg = self.reporter.error(
2979 'Invalid section title or transition marker.',
2980 nodes.literal_block(blocktext, blocktext),
2981 line=lineno)
2982 self.parent += msg
2983 return [], 'Body', []
2985 def short_overline(self, context, blocktext, lineno, lines=1):
2986 msg = self.reporter.info(
2987 'Possible incomplete section title.\nTreating the overline as '
2988 "ordinary text because it's so short.",
2989 line=lineno)
2990 self.parent += msg
2991 self.state_correction(context, lines)
2993 def state_correction(self, context, lines=1):
2994 self.state_machine.previous_line(lines)
2995 context[:] = []
2996 raise statemachine.StateCorrection('Body', 'text')
2999 class QuotedLiteralBlock(RSTState):
3001 """
3002 Nested parse handler for quoted (unindented) literal blocks.
3004 Special-purpose. Not for inclusion in `state_classes`.
3005 """
3007 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
3008 'text': r''}
3009 initial_transitions = ('initial_quoted', 'text')
3011 def __init__(self, state_machine, debug=False):
3012 RSTState.__init__(self, state_machine, debug)
3013 self.messages = []
3014 self.initial_lineno = None
3016 def blank(self, match, context, next_state):
3017 if context:
3018 raise EOFError
3019 else:
3020 return context, next_state, []
3022 def eof(self, context):
3023 if context:
3024 src, srcline = self.state_machine.get_source_and_line(
3025 self.initial_lineno)
3026 text = '\n'.join(context)
3027 literal_block = nodes.literal_block(text, text)
3028 literal_block.source = src
3029 literal_block.line = srcline
3030 self.parent += literal_block
3031 else:
3032 self.parent += self.reporter.warning(
3033 'Literal block expected; none found.',
3034 line=self.state_machine.abs_line_number())
3035 # src not available, because statemachine.input_lines is empty
3036 self.state_machine.previous_line()
3037 self.parent += self.messages
3038 return []
3040 def indent(self, match, context, next_state):
3041 assert context, ('QuotedLiteralBlock.indent: context should not '
3042 'be empty!')
3043 self.messages.append(
3044 self.reporter.error('Unexpected indentation.',
3045 line=self.state_machine.abs_line_number()))
3046 self.state_machine.previous_line()
3047 raise EOFError
3049 def initial_quoted(self, match, context, next_state):
3050 """Match arbitrary quote character on the first line only."""
3051 self.remove_transition('initial_quoted')
3052 quote = match.string[0]
3053 pattern = re.compile(re.escape(quote), re.UNICODE)
3054 # New transition matches consistent quotes only:
3055 self.add_transition('quoted',
3056 (pattern, self.quoted, self.__class__.__name__))
3057 self.initial_lineno = self.state_machine.abs_line_number()
3058 return [match.string], next_state, []
3060 def quoted(self, match, context, next_state):
3061 """Match consistent quotes on subsequent lines."""
3062 context.append(match.string)
3063 return context, next_state, []
3065 def text(self, match, context, next_state):
3066 if context:
3067 self.messages.append(
3068 self.reporter.error('Inconsistent literal block quoting.',
3069 line=self.state_machine.abs_line_number()))
3070 self.state_machine.previous_line()
3071 raise EOFError
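# Example (illustrative) of the quoted literal block this state parses,
# following a paragraph that ends with "::":
#
#   The next block is a quoted literal block::
#
#   > quoted line one
#   > quoted line two
#
# The first line fixes the quote character ('>' here); a later line starting
# with a different character is reported as inconsistent quoting, and
# indentation is reported as unexpected.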
3074 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3075 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3076 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3077 """Standard set of State classes used to start `RSTStateMachine`."""