Use `field_marker` pattern to look for start of a directive option block

# $Id$
# Author: David Goodger <goodger@python.org>
# Copyright: This module has been placed in the public domain.

"""
This is the ``docutils.parsers.rst.states`` module, the core of
the reStructuredText parser. It defines the following:

:Classes:
    - `RSTStateMachine`: reStructuredText parser's entry point.
    - `NestedStateMachine`: recursive StateMachine.
    - `RSTState`: reStructuredText State superclass.
    - `Inliner`: For parsing inline markup.
    - `Body`: Generic classifier of the first line of a block.
    - `SpecializedBody`: Superclass for compound element members.
    - `BulletList`: Second and subsequent bullet_list list_items
    - `DefinitionList`: Second+ definition_list_items.
    - `EnumeratedList`: Second+ enumerated_list list_items.
    - `FieldList`: Second+ fields.
    - `OptionList`: Second+ option_list_items.
    - `RFC2822List`: Second+ RFC2822-style fields.
    - `ExtensionOptions`: Parses directive option fields.
    - `Explicit`: Second+ explicit markup constructs.
    - `SubstitutionDef`: For embedded directives in substitution definitions.
    - `Text`: Classifier of second line of a text block.
    - `SpecializedText`: Superclass for continuation lines of Text-variants.
    - `Definition`: Second line of potential definition_list_item.
    - `Line`: Second line of overlined section title or transition marker.
    - `Struct`: An auxiliary collection class.

:Exception classes:
    - `MarkupError`
    - `ParserError`
    - `MarkupMismatch`

:Functions:
    - `escape2null()`: Return a string, escape-backslashes converted to nulls.
    - `unescape()`: Return a string, nulls removed or restored to backslashes.

:Attributes:
    - `state_classes`: set of State classes used with `RSTStateMachine`.

Parser Overview
===============

The reStructuredText parser is implemented as a recursive state machine,
examining its input one line at a time. To understand how the parser works,
please first become familiar with the `docutils.statemachine` module. In the
description below, references are made to classes defined in this module;
please see the individual classes for details.

Parsing proceeds as follows:

1. The state machine examines each line of input, checking each of the
   transition patterns of the state `Body`, in order, looking for a match.
   The implicit transitions (blank lines and indentation) are checked before
   any others. The 'text' transition is a catch-all (matches anything).

2. The method associated with the matched transition pattern is called.

   A. Some transition methods are self-contained, appending elements to the
      document tree (`Body.doctest` parses a doctest block). The parser's
      current line index is advanced to the end of the element, and parsing
      continues with step 1.

   B. Other transition methods trigger the creation of a nested state machine,
      whose job is to parse a compound construct ('indent' does a block quote,
      'bullet' does a bullet list, 'overline' does a section [first checking
      for a valid section header], etc.).

      - In the case of lists and explicit markup, a one-off state machine is
        created and run to parse contents of the first item.

      - A new state machine is created and its initial state is set to the
        appropriate specialized state (`BulletList` in the case of the
        'bullet' transition; see `SpecializedBody` for more detail). This
        state machine is run to parse the compound element (or series of
        explicit markup elements), and returns as soon as a non-member element
        is encountered. For example, the `BulletList` state machine ends as
        soon as it encounters an element which is not a list item of that
        bullet list. The optional omission of inter-element blank lines is
        enabled by this nested state machine.

      - The current line index is advanced to the end of the elements parsed,
        and parsing continues with step 1.

   C. The result of the 'text' transition depends on the next line of text.
      The current state is changed to `Text`, under which the second line is
      examined. If the second line is:

      - Indented: The element is a definition list item, and parsing proceeds
        similarly to step 2.B, using the `DefinitionList` state.

      - A line of uniform punctuation characters: The element is a section
        header; again, parsing proceeds as in step 2.B, and `Body` is still
        used.

      - Anything else: The element is a paragraph, which is examined for
        inline markup and appended to the parent element. Processing
        continues with step 1.
"""
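
# Illustrative sketch only (not part of the original module): the states
# defined here are normally driven indirectly through the public parser
# API rather than by instantiating RSTStateMachine directly, e.g.:
#
#     from docutils.frontend import OptionParser
#     from docutils.parsers.rst import Parser
#     from docutils.utils import new_document
#
#     parser = Parser()
#     settings = OptionParser(components=(Parser,)).get_default_values()
#     document = new_document('<string>', settings)
#     parser.parse('Some *emphasized* text.\n', document)
#
# Parser.parse() creates an RSTStateMachine with this module's state classes
# and runs it over the input lines, populating `document` in place.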

__docformat__ = 'reStructuredText'


import sys
import re
try:
    import roman
except ImportError:
    import docutils.utils.roman as roman
from types import FunctionType, MethodType

from docutils import nodes, statemachine, utils, urischemes
from docutils import ApplicationError, DataError
from docutils.statemachine import StateMachineWS, StateWS
from docutils.nodes import fully_normalize_name as normalize_name
from docutils.nodes import whitespace_normalize_name
import docutils.parsers.rst
from docutils.parsers.rst import directives, languages, tableparser, roles
from docutils.parsers.rst.languages import en as _fallback_language_module
from docutils.utils import escape2null, unescape, column_width
from docutils.utils import punctuation_chars


class MarkupError(DataError): pass
class UnknownInterpretedRoleError(DataError): pass
class InterpretedRoleNotImplementedError(DataError): pass
class ParserError(ApplicationError): pass
class MarkupMismatch(Exception): pass


class Struct:

    """Stores data attributes for dotted-attribute access."""

    def __init__(self, **keywordargs):
        self.__dict__.update(keywordargs)


class RSTStateMachine(StateMachineWS):

    """
    reStructuredText's master StateMachine.

    The entry point to reStructuredText parsing is the `run()` method.
    """

    def run(self, input_lines, document, input_offset=0, match_titles=True,
            inliner=None):
        """
        Parse `input_lines` and modify the `document` node in place.

        Extend `StateMachineWS.run()`: set up parse-global data and
        run the StateMachine.
        """
        self.language = languages.get_language(
            document.settings.language_code)
        self.match_titles = match_titles
        if inliner is None:
            inliner = Inliner()
        inliner.init_customizations(document.settings)
        self.memo = Struct(document=document,
                           reporter=document.reporter,
                           language=self.language,
                           title_styles=[],
                           section_level=0,
                           section_bubble_up_kludge=False,
                           inliner=inliner)
        self.document = document
        self.attach_observer(document.note_source)
        self.reporter = self.memo.reporter
        self.node = document
        results = StateMachineWS.run(self, input_lines, input_offset,
                                     input_source=document['source'])
        assert results == [], 'RSTStateMachine.run() results should be empty!'
        self.node = self.memo = None    # remove unneeded references


class NestedStateMachine(StateMachineWS):

    """
    StateMachine run from within other StateMachine runs, to parse nested
    document structures.
    """

    def run(self, input_lines, input_offset, memo, node, match_titles=True):
        """
        Parse `input_lines` and populate a `docutils.nodes.document` instance.

        Extend `StateMachineWS.run()`: set up document-wide data.
        """
        self.match_titles = match_titles
        self.memo = memo
        self.document = memo.document
        self.attach_observer(self.document.note_source)
        self.reporter = memo.reporter
        self.language = memo.language
        self.node = node
        results = StateMachineWS.run(self, input_lines, input_offset)
        assert results == [], ('NestedStateMachine.run() results should be '
                               'empty!')
        return results


class RSTState(StateWS):

    """
    reStructuredText State superclass.

    Contains methods used by all State subclasses.
    """

    nested_sm = NestedStateMachine
    nested_sm_cache = []

    def __init__(self, state_machine, debug=False):
        self.nested_sm_kwargs = {'state_classes': state_classes,
                                 'initial_state': 'Body'}
        StateWS.__init__(self, state_machine, debug)

    def runtime_init(self):
        StateWS.runtime_init(self)
        memo = self.state_machine.memo
        self.memo = memo
        self.reporter = memo.reporter
        self.inliner = memo.inliner
        self.document = memo.document
        self.parent = self.state_machine.node
        # enable the reporter to determine source and source-line
        if not hasattr(self.reporter, 'get_source_and_line'):
            self.reporter.get_source_and_line = self.state_machine.get_source_and_line
            # print "adding get_source_and_line to reporter", self.state_machine.input_offset

    def goto_line(self, abs_line_offset):
        """
        Jump to input line `abs_line_offset`, ignoring jumps past the end.
        """
        try:
            self.state_machine.goto_line(abs_line_offset)
        except EOFError:
            pass

    def no_match(self, context, transitions):
        """
        Override `StateWS.no_match` to generate a system message.

        This code should never be run.
        """
        self.reporter.severe(
            'Internal error: no transition pattern match. State: "%s"; '
            'transitions: %s; context: %s; current line: %r.'
            % (self.__class__.__name__, transitions, context,
               self.state_machine.line))
        return context, None, []

    def bof(self, context):
        """Called at beginning of file."""
        return [], []

    def nested_parse(self, block, input_offset, node, match_titles=False,
                     state_machine_class=None, state_machine_kwargs=None):
        """
        Create a new StateMachine rooted at `node` and run it over the input
        `block`.
        """
        use_default = 0
        if state_machine_class is None:
            state_machine_class = self.nested_sm
            use_default += 1
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs
            use_default += 1
        block_length = len(block)

        state_machine = None
        if use_default == 2:
            try:
                state_machine = self.nested_sm_cache.pop()
            except IndexError:
                pass
        if not state_machine:
            state_machine = state_machine_class(debug=self.debug,
                                                **state_machine_kwargs)
        state_machine.run(block, input_offset, memo=self.memo,
                          node=node, match_titles=match_titles)
        if use_default == 2:
            self.nested_sm_cache.append(state_machine)
        else:
            state_machine.unlink()
        new_offset = state_machine.abs_line_offset()
        # No `block.parent` implies disconnected -- lines aren't in sync:
        if block.parent and (len(block) - block_length) != 0:
            # Adjustment for block if modified in nested parse:
            self.state_machine.next_line(len(block) - block_length)
        return new_offset

    def nested_list_parse(self, block, input_offset, node, initial_state,
                          blank_finish,
                          blank_finish_state=None,
                          extra_settings={},
                          match_titles=False,
                          state_machine_class=None,
                          state_machine_kwargs=None):
        """
        Create a new StateMachine rooted at `node` and run it over the input
        `block`. Also keep track of optional intermediate blank lines and the
        required final one.
        """
        if state_machine_class is None:
            state_machine_class = self.nested_sm
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs.copy()
        state_machine_kwargs['initial_state'] = initial_state
        state_machine = state_machine_class(debug=self.debug,
                                            **state_machine_kwargs)
        if blank_finish_state is None:
            blank_finish_state = initial_state
        state_machine.states[blank_finish_state].blank_finish = blank_finish
        for key, value in extra_settings.items():
            setattr(state_machine.states[initial_state], key, value)
        state_machine.run(block, input_offset, memo=self.memo,
                          node=node, match_titles=match_titles)
        blank_finish = state_machine.states[blank_finish_state].blank_finish
        state_machine.unlink()
        return state_machine.abs_line_offset(), blank_finish

    def section(self, title, source, style, lineno, messages):
        """Check for a valid subsection and create one if it checks out."""
        if self.check_subsection(source, style, lineno):
            self.new_subsection(title, lineno, messages)

    def check_subsection(self, source, style, lineno):
        """
        Check for a valid subsection header. Return 1 (true) or None (false).

        When a new section is reached that isn't a subsection of the current
        section, back up the line count (use ``previous_line(-x)``), then
        ``raise EOFError``. The current StateMachine will finish, then the
        calling StateMachine can re-examine the title. This will work its way
        back up the calling chain until the correct section level is reached.

        @@@ Alternative: Evaluate the title, store the title info & level, and
        back up the chain until that level is reached. Store in memo? Or
        return in results?

        :Exception: `EOFError` when a sibling or supersection encountered.
        """
        memo = self.memo
        title_styles = memo.title_styles
        mylevel = memo.section_level
        try:                            # check for existing title style
            level = title_styles.index(style) + 1
        except ValueError:              # new title style
            if len(title_styles) == memo.section_level: # new subsection
                title_styles.append(style)
                return 1
            else:                       # not at lowest level
                self.parent += self.title_inconsistent(source, lineno)
                return None
        if level <= mylevel:            # sibling or supersection
            memo.section_level = level  # bubble up to parent section
            if len(style) == 2:
                memo.section_bubble_up_kludge = True
            # back up 2 lines for underline title, 3 for overline title
            self.state_machine.previous_line(len(style) + 1)
            raise EOFError              # let parent section re-evaluate
        if level == mylevel + 1:        # immediate subsection
            return 1
        else:                           # invalid subsection
            self.parent += self.title_inconsistent(source, lineno)
            return None

    def title_inconsistent(self, sourcetext, lineno):
        error = self.reporter.severe(
            'Title level inconsistent:', nodes.literal_block('', sourcetext),
            line=lineno)
        return error

    def new_subsection(self, title, lineno, messages):
        """Append new subsection to document tree. On return, check level."""
        memo = self.memo
        mylevel = memo.section_level
        memo.section_level += 1
        section_node = nodes.section()
        self.parent += section_node
        textnodes, title_messages = self.inline_text(title, lineno)
        titlenode = nodes.title(title, '', *textnodes)
        name = normalize_name(titlenode.astext())
        section_node['names'].append(name)
        section_node += titlenode
        section_node += messages
        section_node += title_messages
        self.document.note_implicit_target(section_node, section_node)
        offset = self.state_machine.line_offset + 1
        absoffset = self.state_machine.abs_line_offset() + 1
        newabsoffset = self.nested_parse(
            self.state_machine.input_lines[offset:], input_offset=absoffset,
            node=section_node, match_titles=True)
        self.goto_line(newabsoffset)
        if memo.section_level <= mylevel: # can't handle next section?
            raise EOFError              # bubble up to supersection
        # reset section_level; next pass will detect it properly
        memo.section_level = mylevel

    def paragraph(self, lines, lineno):
        """
        Return a list (paragraph & messages) & a boolean: literal_block next?
        """
        data = '\n'.join(lines).rstrip()
        if re.search(r'(?<!\\)(\\\\)*::$', data):
            if len(data) == 2:
                return [], 1
            elif data[-3] in ' \n':
                text = data[:-3].rstrip()
            else:
                text = data[:-1]
            literalnext = 1
        else:
            text = data
            literalnext = 0
        textnodes, messages = self.inline_text(text, lineno)
        p = nodes.paragraph(data, '', *textnodes)
        p.source, p.line = self.state_machine.get_source_and_line(lineno)
        return [p] + messages, literalnext
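
    # Worked example (derived from the logic above): for the paragraph text
    # "Example::" this returns the paragraph "Example:" with literalnext=1;
    # for "Example: ::" it returns "Example:"; and for a paragraph that is
    # just "::" it returns no paragraph at all, only literalnext=1, matching
    # reStructuredText's expanded/minimized literal-block marker rules.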

    def inline_text(self, text, lineno):
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.
        """
        return self.inliner.parse(text, lineno, self.memo, self.parent)

    def unindent_warning(self, node_name):
        # the actual problem is one line below the current line
        lineno = self.state_machine.abs_line_number()+1
        return self.reporter.warning('%s ends without a blank line; '
                                     'unexpected unindent.' % node_name,
                                     line=lineno)


def build_regexp(definition, compile=True):
    """
    Build, compile and return a regular expression based on `definition`.

    :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
        where "parts" is a list of regular expressions and/or regular
        expression definitions to be joined into an or-group.
    """
    name, prefix, suffix, parts = definition
    part_strings = []
    for part in parts:
        if type(part) is tuple:
            part_strings.append(build_regexp(part, None))
        else:
            part_strings.append(part)
    or_group = '|'.join(part_strings)
    regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
    if compile:
        return re.compile(regexp, re.UNICODE)
    else:
        return regexp
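
# Worked example: build_regexp(('init', '^', '$',
#     ['foo', ('bar', '', '', ['baz', 'qux'])]))
# compiles the pattern '^(?P<init>foo|(?P<bar>baz|qux))$' -- nested
# definition tuples become nested named or-groups.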


class Inliner:

    """
    Parse inline markup; call the `parse()` method.
    """

    def __init__(self):
        self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),]
        """List of (pattern, bound method) tuples, used by
        `self.implicit_inline`."""

    def init_customizations(self, settings):
        """Setting-based customizations; run when parsing begins."""
        if settings.pep_references:
            self.implicit_dispatch.append((self.patterns.pep,
                                           self.pep_reference))
        if settings.rfc_references:
            self.implicit_dispatch.append((self.patterns.rfc,
                                           self.rfc_reference))

    def parse(self, text, lineno, memo, parent):
        # Needs to be refactored for nested inline markup.
        # Add nested_parse() method?
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.

        Using `self.patterns.initial`, a pattern which matches start-strings
        (emphasis, strong, interpreted, phrase reference, literal,
        substitution reference, and inline target) and complete constructs
        (simple reference, footnote reference), search for a candidate. When
        one is found, check for validity (e.g., not a quoted '*' character).
        If valid, search for the corresponding end string if applicable, and
        check it for validity. If not found or invalid, generate a warning
        and ignore the start-string. Implicit inline markup (e.g. standalone
        URIs) is found last.
        """
        self.reporter = memo.reporter
        self.document = memo.document
        self.language = memo.language
        self.parent = parent
        pattern_search = self.patterns.initial.search
        dispatch = self.dispatch
        remaining = escape2null(text)
        processed = []
        unprocessed = []
        messages = []
        while remaining:
            match = pattern_search(remaining)
            if match:
                groups = match.groupdict()
                method = dispatch[groups['start'] or groups['backquote']
                                  or groups['refend'] or groups['fnend']]
                before, inlines, remaining, sysmessages = method(self, match,
                                                                 lineno)
                unprocessed.append(before)
                messages += sysmessages
                if inlines:
                    processed += self.implicit_inline(''.join(unprocessed),
                                                      lineno)
                    processed += inlines
                    unprocessed = []
            else:
                break
        remaining = ''.join(unprocessed) + remaining
        if remaining:
            processed += self.implicit_inline(remaining, lineno)
        return processed, messages

    # Inline object recognition
    # -------------------------
    # lookahead and look-behind expressions for inline markup rules
    start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
                           (punctuation_chars.openers,
                            punctuation_chars.delimiters))
    end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
                         (punctuation_chars.closing_delimiters,
                          punctuation_chars.delimiters,
                          punctuation_chars.closers))
    # print start_string_prefix.encode('utf8')
    # TODO: support non-ASCII whitespace in the following 4 patterns?
    non_whitespace_before = r'(?<![ \n])'
    non_whitespace_escape_before = r'(?<![ \n\x00])'
    non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[ \n\x00])'
    non_whitespace_after = r'(?![ \n])'
    # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
    simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
    # Valid URI characters (see RFC 2396 & RFC 2732);
    # final \x00 allows backslash escapes in URIs:
    uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
    # Delimiter indicating the end of a URI (not part of the URI):
    uri_end_delim = r"""[>]"""
    # Last URI character; same as uric but no punctuation:
    urilast = r"""[_~*/=+a-zA-Z0-9]"""
    # End of a URI (either 'urilast' or 'uric followed by a
    # uri_end_delim'):
    uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
    emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
    email_pattern = r"""
          %(emailc)s+(?:\.%(emailc)s+)*   # name
          (?<!\x00)@                      # at
          %(emailc)s+(?:\.%(emailc)s*)*   # host
          %(uri_end)s                     # final URI char
          """

    parts = ('initial_inline', start_string_prefix, '',
             [('start', '', non_whitespace_after,   # simple start-strings
               [r'\*\*',                # strong
                r'\*(?!\*)',            # emphasis but not strong
                r'``',                  # literal
                r'_`',                  # inline internal target
                r'\|(?!\|)']            # substitution reference
               ),
              ('whole', '', end_string_suffix,      # whole constructs
               [# reference name & end-string
                r'(?P<refname>%s)(?P<refend>__?)' % simplename,
                ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
                 [r'[0-9]+',                    # manually numbered
                  r'\#(%s)?' % simplename,      # auto-numbered (w/ label?)
                  r'\*',                        # auto-symbol
                  r'(?P<citationlabel>%s)' % simplename] # citation reference
                 )
                ]
               ),
              ('backquote',             # interpreted text or phrase reference
               '(?P<role>(:%s:)?)' % simplename, # optional role
               non_whitespace_after,
               ['`(?!`)']               # but not literal
               )
              ]
             )

    patterns = Struct(
          initial=build_regexp(parts),
          emphasis=re.compile(non_whitespace_escape_before
                              + r'(\*)' + end_string_suffix, re.UNICODE),
          strong=re.compile(non_whitespace_escape_before
                            + r'(\*\*)' + end_string_suffix, re.UNICODE),
          interpreted_or_phrase_ref=re.compile(
              r"""
              %(non_unescaped_whitespace_escape_before)s
              (
                `
                (?P<suffix>
                  (?P<role>:%(simplename)s:)?
                  (?P<refend>__?)?
                )
              )
              %(end_string_suffix)s
              """ % locals(), re.VERBOSE | re.UNICODE),
          embedded_uri=re.compile(
              r"""
              (
                (?:[ \n]+|^)            # spaces or beginning of line/string
                <                       # open bracket
                %(non_whitespace_after)s
                ([^<>\x00]+)            # anything but angle brackets & nulls
                %(non_whitespace_before)s
                >                       # close bracket w/o whitespace before
              )
              $                         # end of string
              """ % locals(), re.VERBOSE | re.UNICODE),
          literal=re.compile(non_whitespace_before + '(``)'
                             + end_string_suffix),
          target=re.compile(non_whitespace_escape_before
                            + r'(`)' + end_string_suffix),
          substitution_ref=re.compile(non_whitespace_escape_before
                                      + r'(\|_{0,2})'
                                      + end_string_suffix),
          email=re.compile(email_pattern % locals() + '$',
                           re.VERBOSE | re.UNICODE),
          uri=re.compile(
                (r"""
                %(start_string_prefix)s
                (?P<whole>
                  (?P<absolute>           # absolute URI
                    (?P<scheme>             # scheme (http, ftp, mailto)
                      [a-zA-Z][a-zA-Z0-9.+-]*
                    )
                    :
                    (
                      (                       # either:
                        (//?)?                  # hierarchical URI
                        %(uric)s*               # URI characters
                        %(uri_end)s             # final URI char
                      )
                      (                       # optional query
                        \?%(uric)s*
                        %(uri_end)s
                      )?
                      (                       # optional fragment
                        \#%(uric)s*
                        %(uri_end)s
                      )?
                    )
                  )
                |                       # *OR*
                  (?P<email>              # email address
                    """ + email_pattern + r"""
                  )
                )
                %(end_string_suffix)s
                """) % locals(), re.VERBOSE | re.UNICODE),
          pep=re.compile(
                r"""
                %(start_string_prefix)s
                (
                  (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
                |
                  (PEP\s+(?P<pepnum2>\d+))      # reference by name
                )
                %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE),
          rfc=re.compile(
                r"""
                %(start_string_prefix)s
                (RFC(-|\s+)?(?P<rfcnum>\d+))
                %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE))
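
    # Note: `patterns.initial` tags each match with one of the group names
    # 'start', 'backquote', 'refend' or 'fnend'; parse() uses whichever group
    # matched as the key into the `dispatch` mapping defined at the end of
    # this class to select the handler method.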

    def quoted_start(self, match):
        """Test if inline markup start-string is 'quoted'.

        'Quoted' in this context means the start-string is enclosed in a pair
        of matching opening/closing delimiters (not necessarily quotes)
        or at the end of the match.
        """
        string = match.string
        start = match.start()
        if start == 0:                  # start-string at beginning of text
            return False
        prestart = string[start - 1]
        try:
            poststart = string[match.end()]
        except IndexError:              # start-string at end of text
            return True  # not "quoted" but no markup start-string either
        return punctuation_chars.match_chars(prestart, poststart)
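
    # For example, in the text "(*)" the "*" is immediately preceded by "("
    # and followed by the matching ")", so quoted_start() returns true and
    # the "*" is not treated as an emphasis start-string.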

    def inline_obj(self, match, lineno, end_pattern, nodeclass,
                   restore_backslashes=False):
        string = match.string
        matchstart = match.start('start')
        matchend = match.end('start')
        if self.quoted_start(match):
            return (string[:matchend], [], string[matchend:], [], '')
        endmatch = end_pattern.search(string[matchend:])
        if endmatch and endmatch.start(1):  # 1 or more chars
            text = unescape(endmatch.string[:endmatch.start(1)],
                            restore_backslashes)
            textend = matchend + endmatch.end(1)
            rawsource = unescape(string[matchstart:textend], 1)
            return (string[:matchstart], [nodeclass(rawsource, text)],
                    string[textend:], [], endmatch.group(1))
        msg = self.reporter.warning(
            'Inline %s start-string without end-string.'
            % nodeclass.__name__, line=lineno)
        text = unescape(string[matchstart:matchend], 1)
        rawsource = unescape(string[matchstart:matchend], 1)
        prb = self.problematic(text, rawsource, msg)
        return string[:matchstart], [prb], string[matchend:], [msg], ''

    def problematic(self, text, rawsource, message):
        msgid = self.document.set_id(message, self.parent)
        problematic = nodes.problematic(rawsource, text, refid=msgid)
        prbid = self.document.set_id(problematic)
        message.add_backref(prbid)
        return problematic

    def emphasis(self, match, lineno):
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
              match, lineno, self.patterns.emphasis, nodes.emphasis)
        return before, inlines, remaining, sysmessages

    def strong(self, match, lineno):
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
              match, lineno, self.patterns.strong, nodes.strong)
        return before, inlines, remaining, sysmessages

    def interpreted_or_phrase_ref(self, match, lineno):
        end_pattern = self.patterns.interpreted_or_phrase_ref
        string = match.string
        matchstart = match.start('backquote')
        matchend = match.end('backquote')
        rolestart = match.start('role')
        role = match.group('role')
        position = ''
        if role:
            role = role[1:-1]
            position = 'prefix'
        elif self.quoted_start(match):
            return (string[:matchend], [], string[matchend:], [])
        endmatch = end_pattern.search(string[matchend:])
        if endmatch and endmatch.start(1):  # 1 or more chars
            textend = matchend + endmatch.end()
            if endmatch.group('role'):
                if role:
                    msg = self.reporter.warning(
                        'Multiple roles in interpreted text (both '
                        'prefix and suffix present; only one allowed).',
                        line=lineno)
                    text = unescape(string[rolestart:textend], 1)
                    prb = self.problematic(text, text, msg)
                    return string[:rolestart], [prb], string[textend:], [msg]
                role = endmatch.group('suffix')[1:-1]
                position = 'suffix'
            escaped = endmatch.string[:endmatch.start(1)]
            rawsource = unescape(string[matchstart:textend], 1)
            if rawsource[-1:] == '_':
                if role:
                    msg = self.reporter.warning(
                        'Mismatch: both interpreted text role %s and '
                        'reference suffix.' % position, line=lineno)
                    text = unescape(string[rolestart:textend], 1)
                    prb = self.problematic(text, text, msg)
                    return string[:rolestart], [prb], string[textend:], [msg]
                return self.phrase_ref(string[:matchstart], string[textend:],
                                       rawsource, escaped, unescape(escaped))
            else:
                rawsource = unescape(string[rolestart:textend], 1)
                nodelist, messages = self.interpreted(rawsource, escaped, role,
                                                      lineno)
                return (string[:rolestart], nodelist,
                        string[textend:], messages)
        msg = self.reporter.warning(
            'Inline interpreted text or phrase reference start-string '
            'without end-string.', line=lineno)
        text = unescape(string[matchstart:matchend], 1)
        prb = self.problematic(text, text, msg)
        return string[:matchstart], [prb], string[matchend:], [msg]

    def phrase_ref(self, before, after, rawsource, escaped, text):
        match = self.patterns.embedded_uri.search(escaped)
        if match:
            text = unescape(escaped[:match.start(0)])
            uri_text = match.group(2)
            uri = ''.join(uri_text.split())
            uri = self.adjust_uri(uri)
            if uri:
                target = nodes.target(match.group(1), refuri=uri)
            else:
                raise ApplicationError('problem with URI: %r' % uri_text)
            if not text:
                text = uri
        else:
            target = None
        refname = normalize_name(text)
        reference = nodes.reference(rawsource, text,
                                    name=whitespace_normalize_name(text))
        node_list = [reference]
        if rawsource[-2:] == '__':
            if target:
                reference['refuri'] = uri
            else:
                reference['anonymous'] = 1
        else:
            if target:
                reference['refuri'] = uri
                target['names'].append(refname)
                self.document.note_explicit_target(target, self.parent)
                node_list.append(target)
            else:
                reference['refname'] = refname
                self.document.note_refname(reference)
        return before, node_list, after, []

    def adjust_uri(self, uri):
        match = self.patterns.email.match(uri)
        if match:
            return 'mailto:' + uri
        else:
            return uri

    def interpreted(self, rawsource, text, role, lineno):
        role_fn, messages = roles.role(role, self.language, lineno,
                                       self.reporter)
        if role_fn:
            nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
            return nodes, messages + messages2
        else:
            msg = self.reporter.error(
                'Unknown interpreted text role "%s".' % role,
                line=lineno)
            return ([self.problematic(rawsource, rawsource, msg)],
                    messages + [msg])

    def literal(self, match, lineno):
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
              match, lineno, self.patterns.literal, nodes.literal,
              restore_backslashes=True)
        return before, inlines, remaining, sysmessages

    def inline_internal_target(self, match, lineno):
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
              match, lineno, self.patterns.target, nodes.target)
        if inlines and isinstance(inlines[0], nodes.target):
            assert len(inlines) == 1
            target = inlines[0]
            name = normalize_name(target.astext())
            target['names'].append(name)
            self.document.note_explicit_target(target, self.parent)
        return before, inlines, remaining, sysmessages

    def substitution_reference(self, match, lineno):
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
              match, lineno, self.patterns.substitution_ref,
              nodes.substitution_reference)
        if len(inlines) == 1:
            subref_node = inlines[0]
            if isinstance(subref_node, nodes.substitution_reference):
                subref_text = subref_node.astext()
                self.document.note_substitution_ref(subref_node, subref_text)
                if endstring[-1:] == '_':
                    reference_node = nodes.reference(
                        '|%s%s' % (subref_text, endstring), '')
                    if endstring[-2:] == '__':
                        reference_node['anonymous'] = 1
                    else:
                        reference_node['refname'] = normalize_name(subref_text)
                        self.document.note_refname(reference_node)
                    reference_node += subref_node
                    inlines = [reference_node]
        return before, inlines, remaining, sysmessages

    def footnote_reference(self, match, lineno):
        """
        Handles `nodes.footnote_reference` and `nodes.citation_reference`
        elements.
        """
        label = match.group('footnotelabel')
        refname = normalize_name(label)
        string = match.string
        before = string[:match.start('whole')]
        remaining = string[match.end('whole'):]
        if match.group('citationlabel'):
            refnode = nodes.citation_reference('[%s]_' % label,
                                               refname=refname)
            refnode += nodes.Text(label)
            self.document.note_citation_ref(refnode)
        else:
            refnode = nodes.footnote_reference('[%s]_' % label)
            if refname[0] == '#':
                refname = refname[1:]
                refnode['auto'] = 1
                self.document.note_autofootnote_ref(refnode)
            elif refname == '*':
                refname = ''
                refnode['auto'] = '*'
                self.document.note_symbol_footnote_ref(
                      refnode)
            else:
                refnode += nodes.Text(label)
            if refname:
                refnode['refname'] = refname
                self.document.note_footnote_ref(refnode)
            if utils.get_trim_footnote_ref_space(self.document.settings):
                before = before.rstrip()
        return (before, [refnode], remaining, [])

    def reference(self, match, lineno, anonymous=False):
        referencename = match.group('refname')
        refname = normalize_name(referencename)
        referencenode = nodes.reference(
            referencename + match.group('refend'), referencename,
            name=whitespace_normalize_name(referencename))
        if anonymous:
            referencenode['anonymous'] = 1
        else:
            referencenode['refname'] = refname
            self.document.note_refname(referencenode)
        string = match.string
        matchstart = match.start('whole')
        matchend = match.end('whole')
        return (string[:matchstart], [referencenode], string[matchend:], [])

    def anonymous_reference(self, match, lineno):
        return self.reference(match, lineno, anonymous=1)

    def standalone_uri(self, match, lineno):
        if (not match.group('scheme')
            or match.group('scheme').lower() in urischemes.schemes):
            if match.group('email'):
                addscheme = 'mailto:'
            else:
                addscheme = ''
            text = match.group('whole')
            unescaped = unescape(text, 0)
            return [nodes.reference(unescape(text, 1), unescaped,
                                    refuri=addscheme + unescaped)]
        else:                           # not a valid scheme
            raise MarkupMismatch

    def pep_reference(self, match, lineno):
        text = match.group(0)
        if text.startswith('pep-'):
            pepnum = int(match.group('pepnum1'))
        elif text.startswith('PEP'):
            pepnum = int(match.group('pepnum2'))
        else:
            raise MarkupMismatch
        ref = (self.document.settings.pep_base_url
               + self.document.settings.pep_file_url_template % pepnum)
        unescaped = unescape(text, 0)
        return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]

    rfc_url = 'rfc%d.html'

    def rfc_reference(self, match, lineno):
        text = match.group(0)
        if text.startswith('RFC'):
            rfcnum = int(match.group('rfcnum'))
            ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
        else:
            raise MarkupMismatch
        unescaped = unescape(text, 0)
        return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]

    def implicit_inline(self, text, lineno):
        """
        Check each of the patterns in `self.implicit_dispatch` for a match,
        and dispatch to the stored method for the pattern. Recursively check
        the text before and after the match. Return a list of `nodes.Text`
        and inline element nodes.
        """
        if not text:
            return []
        for pattern, method in self.implicit_dispatch:
            match = pattern.search(text)
            if match:
                try:
                    # Must recurse on strings before *and* after the match;
                    # there may be multiple patterns.
                    return (self.implicit_inline(text[:match.start()], lineno)
                            + method(match, lineno) +
                            self.implicit_inline(text[match.end():], lineno))
                except MarkupMismatch:
                    pass
        return [nodes.Text(unescape(text), rawsource=unescape(text, 1))]

    dispatch = {'*': emphasis,
                '**': strong,
                '`': interpreted_or_phrase_ref,
                '``': literal,
                '_`': inline_internal_target,
                ']_': footnote_reference,
                '|': substitution_reference,
                '_': reference,
                '__': anonymous_reference}


def _loweralpha_to_int(s, _zero=(ord('a')-1)):
    return ord(s) - _zero

def _upperalpha_to_int(s, _zero=(ord('A')-1)):
    return ord(s) - _zero

def _lowerroman_to_int(s):
    return roman.fromRoman(s.upper())


class Body(RSTState):

    """
    Generic classifier of the first line of a block.
    """

    double_width_pad_char = tableparser.TableParser.double_width_pad_char
    """Padding character for East Asian double-width text."""

    enum = Struct()
    """Enumerated list parsing information."""

    enum.formatinfo = {
          'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
          'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
          'period': Struct(prefix='', suffix='.', start=0, end=-1)}
    enum.formats = enum.formatinfo.keys()
    enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
                      'lowerroman', 'upperroman'] # ORDERED!
    enum.sequencepats = {'arabic': '[0-9]+',
                         'loweralpha': '[a-z]',
                         'upperalpha': '[A-Z]',
                         'lowerroman': '[ivxlcdm]+',
                         'upperroman': '[IVXLCDM]+',}
    enum.converters = {'arabic': int,
                       'loweralpha': _loweralpha_to_int,
                       'upperalpha': _upperalpha_to_int,
                       'lowerroman': _lowerroman_to_int,
                       'upperroman': roman.fromRoman}

    enum.sequenceregexps = {}
    for sequence in enum.sequences:
        enum.sequenceregexps[sequence] = re.compile(
              enum.sequencepats[sequence] + '$', re.UNICODE)

    grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
    """Matches the top (& bottom) of a full table)."""

    simple_table_top_pat = re.compile('=+( +=+)+ *$')
    """Matches the top of a simple table."""

    simple_table_border_pat = re.compile('=+[ =]*$')
    """Matches the bottom & header bottom of a simple table."""

    pats = {}
    """Fragments of patterns used by transitions."""

    pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
    pats['alpha'] = '[a-zA-Z]'
    pats['alphanum'] = '[a-zA-Z0-9]'
    pats['alphanumplus'] = '[a-zA-Z0-9_-]'
    pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
                    '|%(upperroman)s|#)' % enum.sequencepats)
    pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
    # @@@ Loosen up the pattern? Allow Unicode?
    pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
    pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
    pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
    pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats

    for format in enum.formats:
        pats[format] = '(?P<%s>%s%s%s)' % (
              format, re.escape(enum.formatinfo[format].prefix),
              pats['enum'], re.escape(enum.formatinfo[format].suffix))
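
    # The 'field_marker' pattern below matches a field-list marker such as
    # ":name:" at the start of a line: a colon not followed by another colon
    # or a space, a run of characters (colons excluded unless backslash-
    # escaped) that does not end in a space, then a closing colon followed by
    # whitespace or end of line.  Per the commit message at the top of this
    # blob, directive parsing also reuses this pattern to find the start of
    # an option block.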
    patterns = {
          'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
          'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
          'field_marker': r':(?![: ])([^:\\]|\\.)*(?<! ):( +|$)',
          'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
          'doctest': r'>>>( +|$)',
          'line_block': r'\|( +|$)',
          'grid_table_top': grid_table_top_pat,
          'simple_table_top': simple_table_top_pat,
          'explicit_markup': r'\.\.( +|$)',
          'anonymous': r'__( +|$)',
          'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
          'text': r''}
    initial_transitions = (
          'bullet',
          'enumerator',
          'field_marker',
          'option_marker',
          'doctest',
          'line_block',
          'grid_table_top',
          'simple_table_top',
          'explicit_markup',
          'anonymous',
          'line',
          'text')

    def indent(self, match, context, next_state):
        """Block quote."""
        indented, indent, line_offset, blank_finish = \
              self.state_machine.get_indented()
        elements = self.block_quote(indented, line_offset)
        self.parent += elements
        if not blank_finish:
            self.parent += self.unindent_warning('Block quote')
        return context, next_state, []

    def block_quote(self, indented, line_offset):
        elements = []
        while indented:
            (blockquote_lines,
             attribution_lines,
             attribution_offset,
             indented,
             new_line_offset) = self.split_attribution(indented, line_offset)
            blockquote = nodes.block_quote()
            self.nested_parse(blockquote_lines, line_offset, blockquote)
            elements.append(blockquote)
            if attribution_lines:
                attribution, messages = self.parse_attribution(
                    attribution_lines, attribution_offset)
                blockquote += attribution
                elements += messages
            line_offset = new_line_offset
            while indented and not indented[0]:
                indented = indented[1:]
                line_offset += 1
        return elements

    # U+2014 is an em-dash:
    attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
                                     re.UNICODE)

    def split_attribution(self, indented, line_offset):
        """
        Check for a block quote attribution and split it off:

        * First line after a blank line must begin with a dash ("--", "---",
          em-dash; matches `self.attribution_pattern`).
        * Every line after that must have consistent indentation.
        * Attributions must be preceded by block quote content.

        Return a tuple of: (block quote content lines, attribution lines,
        attribution offset, remaining indented lines, new line offset).
        """
        blank = None
        nonblank_seen = False
        for i in range(len(indented)):
            line = indented[i].rstrip()
            if line:
                if nonblank_seen and blank == i - 1: # last line blank
                    match = self.attribution_pattern.match(line)
                    if match:
                        attribution_end, indent = self.check_attribution(
                            indented, i)
                        if attribution_end:
                            a_lines = indented[i:attribution_end]
                            a_lines.trim_left(match.end(), end=1)
                            a_lines.trim_left(indent, start=1)
                            return (indented[:i], a_lines,
                                    i, indented[attribution_end:],
                                    line_offset + attribution_end)
                nonblank_seen = True
            else:
                blank = i
        else:
            return (indented, None, None, None, None)

    def check_attribution(self, indented, attribution_start):
        """
        Check attribution shape.
        Return the index past the end of the attribution, and the indent.
        """
        indent = None
        i = attribution_start + 1
        for i in range(attribution_start + 1, len(indented)):
            line = indented[i].rstrip()
            if not line:
                break
            if indent is None:
                indent = len(line) - len(line.lstrip())
            elif len(line) - len(line.lstrip()) != indent:
                return None, None       # bad shape; not an attribution
        else:
            # return index of line after last attribution line:
            i += 1
        return i, (indent or 0)

    def parse_attribution(self, indented, line_offset):
        text = '\n'.join(indented).rstrip()
        lineno = self.state_machine.abs_line_number() + line_offset
        textnodes, messages = self.inline_text(text, lineno)
        node = nodes.attribution(text, '', *textnodes)
        node.source, node.line = self.state_machine.get_source_and_line(lineno)
        return node, messages

    def bullet(self, match, context, next_state):
        """Bullet list item."""
        bulletlist = nodes.bullet_list()
        self.parent += bulletlist
        bulletlist['bullet'] = match.string[0]
        i, blank_finish = self.list_item(match.end())
        bulletlist += i
        offset = self.state_machine.line_offset + 1 # next line
        new_line_offset, blank_finish = self.nested_list_parse(
              self.state_machine.input_lines[offset:],
              input_offset=self.state_machine.abs_line_offset() + 1,
              node=bulletlist, initial_state='BulletList',
              blank_finish=blank_finish)
        self.goto_line(new_line_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Bullet list')
        return [], next_state, []

    def list_item(self, indent):
        if self.state_machine.line[indent:]:
            indented, line_offset, blank_finish = (
                self.state_machine.get_known_indented(indent))
        else:
            indented, indent, line_offset, blank_finish = (
                self.state_machine.get_first_known_indented(indent))
        listitem = nodes.list_item('\n'.join(indented))
        if indented:
            self.nested_parse(indented, input_offset=line_offset,
                              node=listitem)
        return listitem, blank_finish

    def enumerator(self, match, context, next_state):
        """Enumerated List Item"""
        format, sequence, text, ordinal = self.parse_enumerator(match)
        if not self.is_enumerated_list_item(ordinal, sequence, format):
            raise statemachine.TransitionCorrection('text')
        enumlist = nodes.enumerated_list()
        self.parent += enumlist
        if sequence == '#':
            enumlist['enumtype'] = 'arabic'
        else:
            enumlist['enumtype'] = sequence
        enumlist['prefix'] = self.enum.formatinfo[format].prefix
        enumlist['suffix'] = self.enum.formatinfo[format].suffix
        if ordinal != 1:
            enumlist['start'] = ordinal
            msg = self.reporter.info(
                'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
                % (text, ordinal))
            self.parent += msg
        listitem, blank_finish = self.list_item(match.end())
        enumlist += listitem
        offset = self.state_machine.line_offset + 1 # next line
        newline_offset, blank_finish = self.nested_list_parse(
              self.state_machine.input_lines[offset:],
              input_offset=self.state_machine.abs_line_offset() + 1,
              node=enumlist, initial_state='EnumeratedList',
              blank_finish=blank_finish,
              extra_settings={'lastordinal': ordinal,
                              'format': format,
                              'auto': sequence == '#'})
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Enumerated list')
        return [], next_state, []

    def parse_enumerator(self, match, expected_sequence=None):
        """
        Analyze an enumerator and return the results.

        :Return:
            - the enumerator format ('period', 'parens', or 'rparen'),
            - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
            - the text of the enumerator, stripped of formatting, and
            - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
              ``None`` is returned for invalid enumerator text).

        The enumerator format has already been determined by the regular
        expression match. If `expected_sequence` is given, that sequence is
        tried first. If not, we check for Roman numeral 1. This way,
        single-character Roman numerals (which are also alphabetical) can be
        matched. If no sequence has been matched, all sequences are checked in
        order.
        """
        groupdict = match.groupdict()
        sequence = ''
        for format in self.enum.formats:
            if groupdict[format]:       # was this the format matched?
                break                   # yes; keep `format`
        else:                           # shouldn't happen
            raise ParserError('enumerator format not matched')
        text = groupdict[format][self.enum.formatinfo[format].start
                                 :self.enum.formatinfo[format].end]
        if text == '#':
            sequence = '#'
        elif expected_sequence:
            try:
                if self.enum.sequenceregexps[expected_sequence].match(text):
                    sequence = expected_sequence
            except KeyError:            # shouldn't happen
                raise ParserError('unknown enumerator sequence: %s'
                                  % sequence)
        elif text == 'i':
            sequence = 'lowerroman'
        elif text == 'I':
            sequence = 'upperroman'
        if not sequence:
            for sequence in self.enum.sequences:
                if self.enum.sequenceregexps[sequence].match(text):
                    break
            else:                       # shouldn't happen
                raise ParserError('enumerator sequence not matched')
        if sequence == '#':
            ordinal = 1
        else:
            try:
                ordinal = self.enum.converters[sequence](text)
            except roman.InvalidRomanNumeralError:
                ordinal = None
        return format, sequence, text, ordinal
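
    # For example, a match on "(iv) " yields ('parens', 'lowerroman', 'iv', 4)
    # and a match on "3. " yields ('period', 'arabic', '3', 3).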

    def is_enumerated_list_item(self, ordinal, sequence, format):
        """
        Check validity based on the ordinal value and the second line.

        Return true if the ordinal is valid and the second line is blank,
        indented, or starts with the next enumerator or an auto-enumerator.
        """
        if ordinal is None:
            return None
        try:
            next_line = self.state_machine.next_line()
        except EOFError:                # end of input lines
            self.state_machine.previous_line()
            return 1
        else:
            self.state_machine.previous_line()
        if not next_line[:1].strip():   # blank or indented
            return 1
        result = self.make_enumerator(ordinal + 1, sequence, format)
        if result:
            next_enumerator, auto_enumerator = result
            try:
                if ( next_line.startswith(next_enumerator) or
                     next_line.startswith(auto_enumerator) ):
                    return 1
            except TypeError:
                pass
        return None

    def make_enumerator(self, ordinal, sequence, format):
        """
        Construct and return the next enumerated list item marker, and an
        auto-enumerator ("#" instead of the regular enumerator).

        Return ``None`` for invalid (out of range) ordinals.
        """ #"
        if sequence == '#':
            enumerator = '#'
        elif sequence == 'arabic':
            enumerator = str(ordinal)
        else:
            if sequence.endswith('alpha'):
                if ordinal > 26:
                    return None
                enumerator = chr(ordinal + ord('a') - 1)
            elif sequence.endswith('roman'):
                try:
                    enumerator = roman.toRoman(ordinal)
                except roman.RomanError:
                    return None
            else:                       # shouldn't happen
                raise ParserError('unknown enumerator sequence: "%s"'
                                  % sequence)
            if sequence.startswith('lower'):
                enumerator = enumerator.lower()
            elif sequence.startswith('upper'):
                enumerator = enumerator.upper()
            else:                       # shouldn't happen
                raise ParserError('unknown enumerator sequence: "%s"'
                                  % sequence)
        formatinfo = self.enum.formatinfo[format]
        next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
                           + ' ')
        auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
        return next_enumerator, auto_enumerator
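
    # For example, make_enumerator(3, 'loweralpha', 'parens') returns the
    # pair ('(c) ', '(#) '); make_enumerator(27, 'loweralpha', 'period')
    # returns None because the alphabetic sequence runs out at 26.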

    def field_marker(self, match, context, next_state):
        """Field list item."""
        field_list = nodes.field_list()
        self.parent += field_list
        field, blank_finish = self.field(match)
        field_list += field
        offset = self.state_machine.line_offset + 1 # next line
        newline_offset, blank_finish = self.nested_list_parse(
              self.state_machine.input_lines[offset:],
              input_offset=self.state_machine.abs_line_offset() + 1,
              node=field_list, initial_state='FieldList',
              blank_finish=blank_finish)
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Field list')
        return [], next_state, []

    def field(self, match):
        name = self.parse_field_marker(match)
        src, srcline = self.state_machine.get_source_and_line()
        lineno = self.state_machine.abs_line_number()
        indented, indent, line_offset, blank_finish = \
              self.state_machine.get_first_known_indented(match.end())
        field_node = nodes.field()
        field_node.source = src
        field_node.line = srcline
        name_nodes, name_messages = self.inline_text(name, lineno)
        field_node += nodes.field_name(name, '', *name_nodes)
        field_body = nodes.field_body('\n'.join(indented), *name_messages)
        field_node += field_body
        if indented:
            self.parse_field_body(indented, line_offset, field_body)
        return field_node, blank_finish

    def parse_field_marker(self, match):
        """Extract & return field name from a field marker match."""
        field = match.group()[1:]        # strip off leading ':'
        field = field[:field.rfind(':')] # strip off trailing ':' etc.
        return field
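
    # For example, a 'field_marker' match on ":parameter type: value" yields
    # the field name "parameter type"; the field body is then parsed from the
    # text after the marker plus any following indented lines.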

    def parse_field_body(self, indented, offset, node):
        self.nested_parse(indented, input_offset=offset, node=node)

    def option_marker(self, match, context, next_state):
        """Option list item."""
        optionlist = nodes.option_list()
        try:
            listitem, blank_finish = self.option_list_item(match)
        except MarkupError, error:
            # This shouldn't happen; pattern won't match.
            msg = self.reporter.error(u'Invalid option list marker: %s' %
                                      error)
            self.parent += msg
            indented, indent, line_offset, blank_finish = \
                  self.state_machine.get_first_known_indented(match.end())
            elements = self.block_quote(indented, line_offset)
            self.parent += elements
            if not blank_finish:
                self.parent += self.unindent_warning('Option list')
            return [], next_state, []
        self.parent += optionlist
        optionlist += listitem
        offset = self.state_machine.line_offset + 1 # next line
        newline_offset, blank_finish = self.nested_list_parse(
              self.state_machine.input_lines[offset:],
              input_offset=self.state_machine.abs_line_offset() + 1,
              node=optionlist, initial_state='OptionList',
              blank_finish=blank_finish)
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Option list')
        return [], next_state, []

    def option_list_item(self, match):
        offset = self.state_machine.abs_line_offset()
        options = self.parse_option_marker(match)
        indented, indent, line_offset, blank_finish = \
              self.state_machine.get_first_known_indented(match.end())
        if not indented:                # not an option list item
            self.goto_line(offset)
            raise statemachine.TransitionCorrection('text')
        option_group = nodes.option_group('', *options)
        description = nodes.description('\n'.join(indented))
        option_list_item = nodes.option_list_item('', option_group,
                                                  description)
        if indented:
            self.nested_parse(indented, input_offset=line_offset,
                              node=description)
        return option_list_item, blank_finish

    def parse_option_marker(self, match):
        """
        Return a list of `node.option` and `node.option_argument` objects,
        parsed from an option marker match.

        :Exception: `MarkupError` for invalid option markers.
        """
        optlist = []
        optionstrings = match.group().rstrip().split(', ')
        for optionstring in optionstrings:
            tokens = optionstring.split()
            delimiter = ' '
            firstopt = tokens[0].split('=', 1)
            if len(firstopt) > 1:
                # "--opt=value" form
                tokens[:1] = firstopt
                delimiter = '='
            elif (len(tokens[0]) > 2
                  and ((tokens[0].startswith('-')
                        and not tokens[0].startswith('--'))
                       or tokens[0].startswith('+'))):
                # "-ovalue" form
                tokens[:1] = [tokens[0][:2], tokens[0][2:]]
                delimiter = ''
            if len(tokens) > 1 and (tokens[1].startswith('<')
                                    and tokens[-1].endswith('>')):
                # "-o <value1 value2>" form; join all values into one token
                tokens[1:] = [' '.join(tokens[1:])]
            if 0 < len(tokens) <= 2:
                option = nodes.option(optionstring)
                option += nodes.option_string(tokens[0], tokens[0])
                if len(tokens) > 1:
                    option += nodes.option_argument(tokens[1], tokens[1],
                                                    delimiter=delimiter)
                optlist.append(option)
            else:
                raise MarkupError(
                    'wrong number of option tokens (=%s), should be 1 or 2: '
                    '"%s"' % (len(tokens), optionstring))
        return optlist
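
    # For example, the marker "-a, --all=X" produces two option nodes: one
    # with option_string "-a", and one with option_string "--all" plus an
    # option_argument "X" whose delimiter is "=".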

    def doctest(self, match, context, next_state):
        data = '\n'.join(self.state_machine.get_text_block())
        self.parent += nodes.doctest_block(data, data)
        return [], next_state, []

    def line_block(self, match, context, next_state):
        """First line of a line block."""
        block = nodes.line_block()
        self.parent += block
        lineno = self.state_machine.abs_line_number()
        line, messages, blank_finish = self.line_block_line(match, lineno)
        block += line
        self.parent += messages
        if not blank_finish:
            offset = self.state_machine.line_offset + 1 # next line
            new_line_offset, blank_finish = self.nested_list_parse(
                  self.state_machine.input_lines[offset:],
                  input_offset=self.state_machine.abs_line_offset() + 1,
                  node=block, initial_state='LineBlock',
                  blank_finish=0)
            self.goto_line(new_line_offset)
        if not blank_finish:
            self.parent += self.reporter.warning(
                'Line block ends without a blank line.',
                line=lineno+1)
        if len(block):
            if block[0].indent is None:
                block[0].indent = 0
            self.nest_line_block_lines(block)
        return [], next_state, []

    def line_block_line(self, match, lineno):
        """Return one line element of a line_block."""
        indented, indent, line_offset, blank_finish = \
              self.state_machine.get_first_known_indented(match.end(),
                                                          until_blank=True)
        text = u'\n'.join(indented)
        text_nodes, messages = self.inline_text(text, lineno)
        line = nodes.line(text, '', *text_nodes)
        if match.string.rstrip() != '|': # not empty
            line.indent = len(match.group(1)) - 1
        return line, messages, blank_finish
1570 def nest_line_block_lines(self, block):
1571 for index in range(1, len(block)):
1572 if block[index].indent is None:
1573 block[index].indent = block[index - 1].indent
1574 self.nest_line_block_segment(block)
1576 def nest_line_block_segment(self, block):
1577 indents = [item.indent for item in block]
1578 least = min(indents)
1579 new_items = []
1580 new_block = nodes.line_block()
1581 for item in block:
1582 if item.indent > least:
1583 new_block.append(item)
1584 else:
1585 if len(new_block):
1586 self.nest_line_block_segment(new_block)
1587 new_items.append(new_block)
1588 new_block = nodes.line_block()
1589 new_items.append(item)
1590 if len(new_block):
1591 self.nest_line_block_segment(new_block)
1592 new_items.append(new_block)
1593 block[:] = new_items
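# Usage sketch: deeper-indented lines become nested line_block elements,
# which is what nest_line_block_lines/nest_line_block_segment arrange.
# Sample text is illustrative; only the public docutils API is assumed.
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('| first line\n|     indented line\n| third line\n')
#   # Expected shape: line_block > (line, line_block > line, line)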
1595 def grid_table_top(self, match, context, next_state):
1596 """Top border of a full table."""
1597 return self.table_top(match, context, next_state,
1598 self.isolate_grid_table,
1599 tableparser.GridTableParser)
1601 def simple_table_top(self, match, context, next_state):
1602 """Top border of a simple table."""
1603 return self.table_top(match, context, next_state,
1604 self.isolate_simple_table,
1605 tableparser.SimpleTableParser)
1607 def table_top(self, match, context, next_state,
1608 isolate_function, parser_class):
1609 """Top border of a generic table."""
1610 nodelist, blank_finish = self.table(isolate_function, parser_class)
1611 self.parent += nodelist
1612 if not blank_finish:
1613 msg = self.reporter.warning(
1614 'Blank line required after table.',
1615 line=self.state_machine.abs_line_number()+1)
1616 self.parent += msg
1617 return [], next_state, []
1619 def table(self, isolate_function, parser_class):
1620 """Parse a table."""
1621 block, messages, blank_finish = isolate_function()
1622 if block:
1623 try:
1624 parser = parser_class()
1625 tabledata = parser.parse(block)
1626 tableline = (self.state_machine.abs_line_number() - len(block)
1627 + 1)
1628 table = self.build_table(tabledata, tableline)
1629 nodelist = [table] + messages
1630 except tableparser.TableMarkupError, err:
1631 nodelist = self.malformed_table(block, ' '.join(err.args),
1632 offset=err.offset) + messages
1633 else:
1634 nodelist = messages
1635 return nodelist, blank_finish
1637 def isolate_grid_table(self):
1638 messages = []
1639 blank_finish = 1
1640 try:
1641 block = self.state_machine.get_text_block(flush_left=True)
1642 except statemachine.UnexpectedIndentationError, err:
1643 block, src, srcline = err.args
1644 messages.append(self.reporter.error('Unexpected indentation.',
1645 source=src, line=srcline))
1646 blank_finish = 0
1647 block.disconnect()
1648 # for East Asian chars:
1649 block.pad_double_width(self.double_width_pad_char)
1650 width = len(block[0].strip())
1651 for i in range(len(block)):
1652 block[i] = block[i].strip()
1653 if block[i][0] not in '+|': # check left edge
1654 blank_finish = 0
1655 self.state_machine.previous_line(len(block) - i)
1656 del block[i:]
1657 break
1658 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1659 blank_finish = 0
1660 # from second-last to third line of table:
1661 for i in range(len(block) - 2, 1, -1):
1662 if self.grid_table_top_pat.match(block[i]):
1663 self.state_machine.previous_line(len(block) - i + 1)
1664 del block[i+1:]
1665 break
1666 else:
1667 messages.extend(self.malformed_table(block))
1668 return [], messages, blank_finish
1669 for i in range(len(block)): # check right edge
1670 if len(block[i]) != width or block[i][-1] not in '+|':
1671 messages.extend(self.malformed_table(block))
1672 return [], messages, blank_finish
1673 return block, messages, blank_finish
1675 def isolate_simple_table(self):
1676 start = self.state_machine.line_offset
1677 lines = self.state_machine.input_lines
1678 limit = len(lines) - 1
1679 toplen = len(lines[start].strip())
1680 pattern_match = self.simple_table_border_pat.match
1681 found = 0
1682 found_at = None
1683 i = start + 1
1684 while i <= limit:
1685 line = lines[i]
1686 match = pattern_match(line)
1687 if match:
1688 if len(line.strip()) != toplen:
1689 self.state_machine.next_line(i - start)
1690 messages = self.malformed_table(
1691 lines[start:i+1], 'Bottom/header table border does '
1692 'not match top border.')
1693 return [], messages, i == limit or not lines[i+1].strip()
1694 found += 1
1695 found_at = i
1696 if found == 2 or i == limit or not lines[i+1].strip():
1697 end = i
1698 break
1699 i += 1
1700 else: # reached end of input_lines
1701 if found:
1702 extra = ' or no blank line after table bottom'
1703 self.state_machine.next_line(found_at - start)
1704 block = lines[start:found_at+1]
1705 else:
1706 extra = ''
1707 self.state_machine.next_line(i - start - 1)
1708 block = lines[start:]
1709 messages = self.malformed_table(
1710 block, 'No bottom table border found%s.' % extra)
1711 return [], messages, not extra
1712 self.state_machine.next_line(end - start)
1713 block = lines[start:end+1]
1714 # for East Asian chars:
1715 block.pad_double_width(self.double_width_pad_char)
1716 return block, [], end == limit or not lines[end+1].strip()
1718 def malformed_table(self, block, detail='', offset=0):
1719 block.replace(self.double_width_pad_char, '')
1720 data = '\n'.join(block)
1721 message = 'Malformed table.'
1722 startline = self.state_machine.abs_line_number() - len(block) + 1
1723 if detail:
1724 message += '\n' + detail
1725 error = self.reporter.error(message, nodes.literal_block(data, data),
1726 line=startline+offset)
1727 return [error]
1729 def build_table(self, tabledata, tableline, stub_columns=0):
1730 colwidths, headrows, bodyrows = tabledata
1731 table = nodes.table()
1732 tgroup = nodes.tgroup(cols=len(colwidths))
1733 table += tgroup
1734 for colwidth in colwidths:
1735 colspec = nodes.colspec(colwidth=colwidth)
1736 if stub_columns:
1737 colspec.attributes['stub'] = 1
1738 stub_columns -= 1
1739 tgroup += colspec
1740 if headrows:
1741 thead = nodes.thead()
1742 tgroup += thead
1743 for row in headrows:
1744 thead += self.build_table_row(row, tableline)
1745 tbody = nodes.tbody()
1746 tgroup += tbody
1747 for row in bodyrows:
1748 tbody += self.build_table_row(row, tableline)
1749 return table
1751 def build_table_row(self, rowdata, tableline):
1752 row = nodes.row()
1753 for cell in rowdata:
1754 if cell is None:
1755 continue
1756 morerows, morecols, offset, cellblock = cell
1757 attributes = {}
1758 if morerows:
1759 attributes['morerows'] = morerows
1760 if morecols:
1761 attributes['morecols'] = morecols
1762 entry = nodes.entry(**attributes)
1763 row += entry
1764 if ''.join(cellblock):
1765 self.nested_parse(cellblock, input_offset=tableline+offset,
1766 node=entry)
1767 return row
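# Usage sketch of the structure built above, for a two-column simple table
# (assumes the public docutils API; the table content is made up).
#
#   from docutils.core import publish_doctree
#   source = ('=====  =====\n'
#             'A      B\n'
#             '=====  =====\n'
#             '1      2\n'
#             '=====  =====\n')
#   tree = publish_doctree(source)
#   # Expected shape: table > tgroup(cols=2) > 2 x colspec,
#   #   thead > row > 2 x (entry > paragraph),
#   #   tbody > row > 2 x (entry > paragraph).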
1770 explicit = Struct()
1771 """Patterns and constants used for explicit markup recognition."""
1773 explicit.patterns = Struct(
1774 target=re.compile(r"""
1775 (
1776 _ # anonymous target
1777 | # *OR*
1778 (?!_) # no underscore at the beginning
1779 (?P<quote>`?) # optional open quote
1780 (?![ `]) # first char. not space or
1781 # backquote
1782 (?P<name> # reference name
1783 .+?
1784 )
1785 %(non_whitespace_escape_before)s
1786 (?P=quote) # close quote if open quote used
1787 )
1788 (?<!(?<!\x00):) # no unescaped colon at end
1789 %(non_whitespace_escape_before)s
1790 [ ]? # optional space
1791 : # end of reference name
1792 ([ ]+|$) # followed by whitespace
1793 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1794 reference=re.compile(r"""
1795 (
1796 (?P<simple>%(simplename)s)_
1797 | # *OR*
1798 ` # open backquote
1799 (?![ ]) # not space
1800 (?P<phrase>.+?) # hyperlink phrase
1801 %(non_whitespace_escape_before)s
1802 `_ # close backquote,
1803 # reference mark
1804 )
1805 $ # end of string
1806 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1807 substitution=re.compile(r"""
1808 (
1809 (?![ ]) # first char. not space
1810 (?P<name>.+?) # substitution text
1811 %(non_whitespace_escape_before)s
1812 \| # close delimiter
1813 )
1814 ([ ]+|$) # followed by whitespace
1815 """ % vars(Inliner),
1816 re.VERBOSE | re.UNICODE),)
1818 def footnote(self, match):
1819 src, srcline = self.state_machine.get_source_and_line()
1820 indented, indent, offset, blank_finish = \
1821 self.state_machine.get_first_known_indented(match.end())
1822 label = match.group(1)
1823 name = normalize_name(label)
1824 footnote = nodes.footnote('\n'.join(indented))
1825 footnote.source = src
1826 footnote.line = srcline
1827 if name[0] == '#': # auto-numbered
1828 name = name[1:] # autonumber label
1829 footnote['auto'] = 1
1830 if name:
1831 footnote['names'].append(name)
1832 self.document.note_autofootnote(footnote)
1833 elif name == '*': # auto-symbol
1834 name = ''
1835 footnote['auto'] = '*'
1836 self.document.note_symbol_footnote(footnote)
1837 else: # manually numbered
1838 footnote += nodes.label('', label)
1839 footnote['names'].append(name)
1840 self.document.note_footnote(footnote)
1841 if name:
1842 self.document.note_explicit_target(footnote, footnote)
1843 else:
1844 self.document.set_id(footnote, footnote)
1845 if indented:
1846 self.nested_parse(indented, input_offset=offset, node=footnote)
1847 return [footnote], blank_finish
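# Usage sketch: the three footnote label forms branched on above (assumes
# the public docutils API; sample labels are illustrative).
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('.. [1] manually numbered\n'
#                          '\n'
#                          '.. [#note] auto-numbered with label\n'
#                          '\n'
#                          '.. [*] auto-symbol\n')
#   # The second footnote gets auto=1 and names=['note'];
#   # the third gets auto='*' and no name.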
1849 def citation(self, match):
1850 src, srcline = self.state_machine.get_source_and_line()
1851 indented, indent, offset, blank_finish = \
1852 self.state_machine.get_first_known_indented(match.end())
1853 label = match.group(1)
1854 name = normalize_name(label)
1855 citation = nodes.citation('\n'.join(indented))
1856 citation.source = src
1857 citation.line = srcline
1858 citation += nodes.label('', label)
1859 citation['names'].append(name)
1860 self.document.note_citation(citation)
1861 self.document.note_explicit_target(citation, citation)
1862 if indented:
1863 self.nested_parse(indented, input_offset=offset, node=citation)
1864 return [citation], blank_finish
1866 def hyperlink_target(self, match):
1867 pattern = self.explicit.patterns.target
1868 lineno = self.state_machine.abs_line_number()
1869 block, indent, offset, blank_finish = \
1870 self.state_machine.get_first_known_indented(
1871 match.end(), until_blank=True, strip_indent=False)
1872 blocktext = match.string[:match.end()] + '\n'.join(block)
1873 block = [escape2null(line) for line in block]
1874 escaped = block[0]
1875 blockindex = 0
1876 while True:
1877 targetmatch = pattern.match(escaped)
1878 if targetmatch:
1879 break
1880 blockindex += 1
1881 try:
1882 escaped += block[blockindex]
1883 except IndexError:
1884 raise MarkupError('malformed hyperlink target.')
1885 del block[:blockindex]
1886 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1887 target = self.make_target(block, blocktext, lineno,
1888 targetmatch.group('name'))
1889 return [target], blank_finish
1891 def make_target(self, block, block_text, lineno, target_name):
1892 target_type, data = self.parse_target(block, block_text, lineno)
1893 if target_type == 'refname':
1894 target = nodes.target(block_text, '', refname=normalize_name(data))
1895 target.indirect_reference_name = data
1896 self.add_target(target_name, '', target, lineno)
1897 self.document.note_indirect_target(target)
1898 return target
1899 elif target_type == 'refuri':
1900 target = nodes.target(block_text, '')
1901 self.add_target(target_name, data, target, lineno)
1902 return target
1903 else:
1904 return data
1906 def parse_target(self, block, block_text, lineno):
1907 """
1908 Determine the type of reference of a target.
1910 :Return: A 2-tuple, one of:
1912 - 'refname' and the indirect reference name
1913 - 'refuri' and the URI
1914 - 'malformed' and a system_message node
1915 """
1916 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1917 reference = ' '.join([line.strip() for line in block])
1918 refname = self.is_reference(reference)
1919 if refname:
1920 return 'refname', refname
1921 reference = ''.join([''.join(line.split()) for line in block])
1922 return 'refuri', unescape(reference)
1924 def is_reference(self, reference):
1925 match = self.explicit.patterns.reference.match(
1926 whitespace_normalize_name(reference))
1927 if not match:
1928 return None
1929 return unescape(match.group('simple') or match.group('phrase'))
1931 def add_target(self, targetname, refuri, target, lineno):
1932 target.line = lineno
1933 if targetname:
1934 name = normalize_name(unescape(targetname))
1935 target['names'].append(name)
1936 if refuri:
1937 uri = self.inliner.adjust_uri(refuri)
1938 if uri:
1939 target['refuri'] = uri
1940 else:
1941 raise ApplicationError('problem with URI: %r' % refuri)
1942 self.document.note_explicit_target(target, self.parent)
1943 else: # anonymous target
1944 if refuri:
1945 target['refuri'] = refuri
1946 target['anonymous'] = 1
1947 self.document.note_anonymous_target(target)
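# Usage sketch: the three target flavours handled by make_target/add_target
# (assumes the public docutils API; the URIs and names are illustrative).
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('.. _Docutils: http://docutils.sourceforge.net/\n'
#                          '\n'
#                          '.. _an alias: Docutils_\n'
#                          '\n'
#                          '.. __: http://example.org/\n')
#   # -> a target with a refuri and names=['docutils'],
#   #    an indirect target with refname='docutils' (the 'refname' branch),
#   #    and an anonymous target with anonymous=1.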
1949 def substitution_def(self, match):
1950 pattern = self.explicit.patterns.substitution
1951 src, srcline = self.state_machine.get_source_and_line()
1952 block, indent, offset, blank_finish = \
1953 self.state_machine.get_first_known_indented(match.end(),
1954 strip_indent=False)
1955 blocktext = (match.string[:match.end()] + '\n'.join(block))
1956 block.disconnect()
1957 escaped = escape2null(block[0].rstrip())
1958 blockindex = 0
1959 while True:
1960 subdefmatch = pattern.match(escaped)
1961 if subdefmatch:
1962 break
1963 blockindex += 1
1964 try:
1965 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
1966 except IndexError:
1967 raise MarkupError('malformed substitution definition.')
1968 del block[:blockindex] # strip out the substitution marker
1969 block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
1970 if not block[0]:
1971 del block[0]
1972 offset += 1
1973 while block and not block[-1].strip():
1974 block.pop()
1975 subname = subdefmatch.group('name')
1976 substitution_node = nodes.substitution_definition(blocktext)
1977 substitution_node.source = src
1978 substitution_node.line = srcline
1979 if not block:
1980 msg = self.reporter.warning(
1981 'Substitution definition "%s" missing contents.' % subname,
1982 nodes.literal_block(blocktext, blocktext),
1983 source=src, line=srcline)
1984 return [msg], blank_finish
1985 block[0] = block[0].strip()
1986 substitution_node['names'].append(
1987 nodes.whitespace_normalize_name(subname))
1988 new_abs_offset, blank_finish = self.nested_list_parse(
1989 block, input_offset=offset, node=substitution_node,
1990 initial_state='SubstitutionDef', blank_finish=blank_finish)
1991 i = 0
1992 for node in substitution_node[:]:
1993 if not (isinstance(node, nodes.Inline) or
1994 isinstance(node, nodes.Text)):
1995 self.parent += substitution_node[i]
1996 del substitution_node[i]
1997 else:
1998 i += 1
1999 for node in substitution_node.traverse(nodes.Element):
2000 if self.disallowed_inside_substitution_definitions(node):
2001 pformat = nodes.literal_block('', node.pformat().rstrip())
2002 msg = self.reporter.error(
2003 'Substitution definition contains illegal element:',
2004 pformat, nodes.literal_block(blocktext, blocktext),
2005 source=src, line=srcline)
2006 return [msg], blank_finish
2007 if len(substitution_node) == 0:
2008 msg = self.reporter.warning(
2009 'Substitution definition "%s" empty or invalid.' % subname,
2010 nodes.literal_block(blocktext, blocktext),
2011 source=src, line=srcline)
2012 return [msg], blank_finish
2013 self.document.note_substitution_def(
2014 substitution_node, subname, self.parent)
2015 return [substitution_node], blank_finish
2017 def disallowed_inside_substitution_definitions(self, node):
2018 if (node['ids'] or
2019 isinstance(node, nodes.reference) and node.get('anonymous') or
2020 isinstance(node, nodes.footnote_reference) and node.get('auto')):
2021 return 1
2022 else:
2023 return 0
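# Usage sketch: a "replace" substitution definition as handled by
# substitution_def above (assumes the public docutils API; names are
# illustrative).
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('.. |name| replace:: substituted text\n'
#                          '\n'
#                          'Use |name| here.\n')
#   # -> a substitution_definition with names=['name'] whose children are
#   #    inline/Text nodes, plus a paragraph with a substitution_reference.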
2025 def directive(self, match, **option_presets):
2026 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2027 type_name = match.group(1)
2028 directive_class, messages = directives.directive(
2029 type_name, self.memo.language, self.document)
2030 self.parent += messages
2031 if directive_class:
2032 return self.run_directive(
2033 directive_class, match, type_name, option_presets)
2034 else:
2035 return self.unknown_directive(type_name)
2037 def run_directive(self, directive, match, type_name, option_presets):
2038 """
2039 Parse a directive then run its directive function.
2041 Parameters:
2043 - `directive`: The class implementing the directive. Must be
2044 a subclass of `rst.Directive`.
2046 - `match`: A regular expression match object which matched the first
2047 line of the directive.
2049 - `type_name`: The directive name, as used in the source text.
2051 - `option_presets`: A dictionary of preset options, defaults for the
2052 directive options. Currently, only an "alt" option is passed by
2053 substitution definitions (value: the substitution name), which may
2054 be used by an embedded image directive.
2056 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2057 """
2058 if isinstance(directive, (FunctionType, MethodType)):
2059 from docutils.parsers.rst import convert_directive_function
2060 directive = convert_directive_function(directive)
2061 lineno = self.state_machine.abs_line_number()
2062 initial_line_offset = self.state_machine.line_offset
2063 indented, indent, line_offset, blank_finish \
2064 = self.state_machine.get_first_known_indented(match.end(),
2065 strip_top=0)
2066 block_text = '\n'.join(self.state_machine.input_lines[
2067 initial_line_offset : self.state_machine.line_offset + 1])
2068 try:
2069 arguments, options, content, content_offset = (
2070 self.parse_directive_block(indented, line_offset,
2071 directive, option_presets))
2072 except MarkupError, detail:
2073 error = self.reporter.error(
2074 'Error in "%s" directive:\n%s.' % (type_name,
2075 ' '.join(detail.args)),
2076 nodes.literal_block(block_text, block_text), line=lineno)
2077 return [error], blank_finish
2078 directive_instance = directive(
2079 type_name, arguments, options, content, lineno,
2080 content_offset, block_text, self, self.state_machine)
2081 try:
2082 result = directive_instance.run()
2083 except docutils.parsers.rst.DirectiveError, error:
2084 msg_node = self.reporter.system_message(error.level, error.msg,
2085 line=lineno)
2086 msg_node += nodes.literal_block(block_text, block_text)
2087 result = [msg_node]
2088 assert isinstance(result, list), \
2089 'Directive "%s" must return a list of nodes.' % type_name
2090 for i in range(len(result)):
2091 assert isinstance(result[i], nodes.Node), \
2092 ('Directive "%s" returned non-Node object (index %s): %r'
2093 % (type_name, i, result[i]))
2094 return (result,
2095 blank_finish or self.state_machine.is_next_line_blank())
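# Usage sketch: a minimal custom directive of the kind run_directive drives.
# It uses the documented extension API; the directive name ('highlight-note')
# and its behaviour are made up for illustration.
#
#   from docutils import nodes
#   from docutils.parsers.rst import Directive, directives
#
#   class HighlightNote(Directive):
#       required_arguments = 0
#       optional_arguments = 0
#       option_spec = {'class': directives.class_option}
#       has_content = True
#
#       def run(self):
#           node = nodes.note()
#           # Re-enter the parser for the directive body:
#           self.state.nested_parse(self.content, self.content_offset, node)
#           return [node]
#
#   directives.register_directive('highlight-note', HighlightNote)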
2097 def parse_directive_block(self, indented, line_offset, directive,
2098 option_presets):
2099 option_spec = directive.option_spec
2100 has_content = directive.has_content
2101 if indented and not indented[0].strip():
2102 indented.trim_start()
2103 line_offset += 1
2104 while indented and not indented[-1].strip():
2105 indented.trim_end()
2106 if indented and (directive.required_arguments
2107 or directive.optional_arguments
2108 or option_spec):
2109 for i, line in enumerate(indented):
2110 if not line.strip():
2111 break
2112 else:
2113 i += 1
2114 arg_block = indented[:i]
2115 content = indented[i+1:]
2116 content_offset = line_offset + i + 1
2117 else:
2118 content = indented
2119 content_offset = line_offset
2120 arg_block = []
2121 if option_spec:
2122 options, arg_block = self.parse_directive_options(
2123 option_presets, option_spec, arg_block)
2124 else:
2125 options = {}
2126 if arg_block and not (directive.required_arguments
2127 or directive.optional_arguments):
2128 content = arg_block + indented[i:]
2129 content_offset = line_offset
2130 arg_block = []
2131 while content and not content[0].strip():
2132 content.trim_start()
2133 content_offset += 1
2134 if directive.required_arguments or directive.optional_arguments:
2135 arguments = self.parse_directive_arguments(
2136 directive, arg_block)
2137 else:
2138 arguments = []
2139 if content and not has_content:
2140 raise MarkupError('no content permitted')
2141 return (arguments, options, content, content_offset)
2143 def parse_directive_options(self, option_presets, option_spec, arg_block):
2144 options = option_presets.copy()
2145 for i, line in enumerate(arg_block):
2146 if re.match(Body.patterns['field_marker'], line):
2147 opt_block = arg_block[i:]
2148 arg_block = arg_block[:i]
2149 break
2150 else:
2151 opt_block = []
2152 if opt_block:
2153 success, data = self.parse_extension_options(option_spec,
2154 opt_block)
2155 if success: # data is a dict of options
2156 options.update(data)
2157 else: # data is an error string
2158 raise MarkupError(data)
2159 return options, arg_block
2161 def parse_directive_arguments(self, directive, arg_block):
2162 required = directive.required_arguments
2163 optional = directive.optional_arguments
2164 arg_text = '\n'.join(arg_block)
2165 arguments = arg_text.split()
2166 if len(arguments) < required:
2167 raise MarkupError('%s argument(s) required, %s supplied'
2168 % (required, len(arguments)))
2169 elif len(arguments) > required + optional:
2170 if directive.final_argument_whitespace:
2171 arguments = arg_text.split(None, required + optional - 1)
2172 else:
2173 raise MarkupError(
2174 'maximum %s argument(s) allowed, %s supplied'
2175 % (required + optional, len(arguments)))
2176 return arguments
2178 def parse_extension_options(self, option_spec, datalines):
2179 """
2180 Parse `datalines` for a field list containing extension options
2181 matching `option_spec`.
2183 :Parameters:
2184 - `option_spec`: a mapping of option name to conversion
2185 function, which should raise an exception on bad input.
2186 - `datalines`: a list of input strings.
2188 :Return:
2189 - Success value, 1 or 0.
2190 - An option dictionary on success, an error string on failure.
2191 """
2192 node = nodes.field_list()
2193 newline_offset, blank_finish = self.nested_list_parse(
2194 datalines, 0, node, initial_state='ExtensionOptions',
2195 blank_finish=True)
2196 if newline_offset != len(datalines): # incomplete parse of block
2197 return 0, 'invalid option block'
2198 try:
2199 options = utils.extract_extension_options(node, option_spec)
2200 except KeyError, detail:
2201 return 0, ('unknown option: "%s"' % detail.args[0])
2202 except (ValueError, TypeError), detail:
2203 return 0, ('invalid option value: %s' % ' '.join(detail.args))
2204 except utils.ExtensionOptionError, detail:
2205 return 0, ('invalid option data: %s' % ' '.join(detail.args))
2206 if blank_finish:
2207 return 1, options
2208 else:
2209 return 0, 'option data incompletely parsed'
2211 def unknown_directive(self, type_name):
2212 lineno = self.state_machine.abs_line_number()
2213 indented, indent, offset, blank_finish = \
2214 self.state_machine.get_first_known_indented(0, strip_indent=False)
2215 text = '\n'.join(indented)
2216 error = self.reporter.error(
2217 'Unknown directive type "%s".' % type_name,
2218 nodes.literal_block(text, text), line=lineno)
2219 return [error], blank_finish
2221 def comment(self, match):
2222 if not match.string[match.end():].strip() \
2223 and self.state_machine.is_next_line_blank(): # an empty comment?
2224 return [nodes.comment()], 1 # "A tiny but practical wart."
2225 indented, indent, offset, blank_finish = \
2226 self.state_machine.get_first_known_indented(match.end())
2227 while indented and not indented[-1].strip():
2228 indented.trim_end()
2229 text = '\n'.join(indented)
2230 return [nodes.comment(text, text)], blank_finish
2232 explicit.constructs = [
2233 (footnote,
2234 re.compile(r"""
2235 \.\.[ ]+ # explicit markup start
2236 \[
2237 ( # footnote label:
2238 [0-9]+ # manually numbered footnote
2239 | # *OR*
2240 \# # anonymous auto-numbered footnote
2241 | # *OR*
2242 \#%s # auto-numbered footnote with label
2243 | # *OR*
2244 \* # auto-symbol footnote
2245 )
2246 \]
2247 ([ ]+|$) # whitespace or end of line
2248 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2249 (citation,
2250 re.compile(r"""
2251 \.\.[ ]+ # explicit markup start
2252 \[(%s)\] # citation label
2253 ([ ]+|$) # whitespace or end of line
2254 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2255 (hyperlink_target,
2256 re.compile(r"""
2257 \.\.[ ]+ # explicit markup start
2258 _ # target indicator
2259 (?![ ]|$) # first char. not space or EOL
2260 """, re.VERBOSE | re.UNICODE)),
2261 (substitution_def,
2262 re.compile(r"""
2263 \.\.[ ]+ # explicit markup start
2264 \| # substitution indicator
2265 (?![ ]|$) # first char. not space or EOL
2266 """, re.VERBOSE | re.UNICODE)),
2267 (directive,
2268 re.compile(r"""
2269 \.\.[ ]+ # explicit markup start
2270 (%s) # directive name
2271 [ ]? # optional space
2272 :: # directive delimiter
2273 ([ ]+|$) # whitespace or end of line
2274 """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
2276 def explicit_markup(self, match, context, next_state):
2277 """Footnotes, hyperlink targets, directives, comments."""
2278 nodelist, blank_finish = self.explicit_construct(match)
2279 self.parent += nodelist
2280 self.explicit_list(blank_finish)
2281 return [], next_state, []
2283 def explicit_construct(self, match):
2284 """Determine which explicit construct this is, parse & return it."""
2285 errors = []
2286 for method, pattern in self.explicit.constructs:
2287 expmatch = pattern.match(match.string)
2288 if expmatch:
2289 try:
2290 return method(self, expmatch)
2291 except MarkupError, error:
2292 lineno = self.state_machine.abs_line_number()
2293 message = ' '.join(error.args)
2294 errors.append(self.reporter.warning(message, line=lineno))
2295 break
2296 nodelist, blank_finish = self.comment(match)
2297 return nodelist + errors, blank_finish
2299 def explicit_list(self, blank_finish):
2300 """
2301 Create a nested state machine for a series of explicit markup
2302 constructs (including anonymous hyperlink targets).
2303 """
2304 offset = self.state_machine.line_offset + 1 # next line
2305 newline_offset, blank_finish = self.nested_list_parse(
2306 self.state_machine.input_lines[offset:],
2307 input_offset=self.state_machine.abs_line_offset() + 1,
2308 node=self.parent, initial_state='Explicit',
2309 blank_finish=blank_finish,
2310 match_titles=self.state_machine.match_titles)
2311 self.goto_line(newline_offset)
2312 if not blank_finish:
2313 self.parent += self.unindent_warning('Explicit markup')
2315 def anonymous(self, match, context, next_state):
2316 """Anonymous hyperlink targets."""
2317 nodelist, blank_finish = self.anonymous_target(match)
2318 self.parent += nodelist
2319 self.explicit_list(blank_finish)
2320 return [], next_state, []
2322 def anonymous_target(self, match):
2323 lineno = self.state_machine.abs_line_number()
2324 block, indent, offset, blank_finish \
2325 = self.state_machine.get_first_known_indented(match.end(),
2326 until_blank=True)
2327 blocktext = match.string[:match.end()] + '\n'.join(block)
2328 block = [escape2null(line) for line in block]
2329 target = self.make_target(block, blocktext, lineno, '')
2330 return [target], blank_finish
2332 def line(self, match, context, next_state):
2333 """Section title overline or transition marker."""
2334 if self.state_machine.match_titles:
2335 return [match.string], 'Line', []
2336 elif match.string.strip() == '::':
2337 raise statemachine.TransitionCorrection('text')
2338 elif len(match.string.strip()) < 4:
2339 msg = self.reporter.info(
2340 'Unexpected possible title overline or transition.\n'
2341 "Treating it as ordinary text because it's so short.",
2342 line=self.state_machine.abs_line_number())
2343 self.parent += msg
2344 raise statemachine.TransitionCorrection('text')
2345 else:
2346 blocktext = self.state_machine.line
2347 msg = self.reporter.severe(
2348 'Unexpected section title or transition.',
2349 nodes.literal_block(blocktext, blocktext),
2350 line=self.state_machine.abs_line_number())
2351 self.parent += msg
2352 return [], next_state, []
2354 def text(self, match, context, next_state):
2355 """Titles, definition lists, paragraphs."""
2356 return [match.string], 'Text', []
2359 class RFC2822Body(Body):
2361 """
2362 RFC2822 headers are only valid as the first constructs in documents. As
2363 soon as anything else appears, the `Body` state should take over.
2364 """
2366 patterns = Body.patterns.copy() # can't modify the original
2367 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2368 initial_transitions = [(name, 'Body')
2369 for name in Body.initial_transitions]
2370 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2372 def rfc2822(self, match, context, next_state):
2373 """RFC2822-style field list item."""
2374 fieldlist = nodes.field_list(classes=['rfc2822'])
2375 self.parent += fieldlist
2376 field, blank_finish = self.rfc2822_field(match)
2377 fieldlist += field
2378 offset = self.state_machine.line_offset + 1 # next line
2379 newline_offset, blank_finish = self.nested_list_parse(
2380 self.state_machine.input_lines[offset:],
2381 input_offset=self.state_machine.abs_line_offset() + 1,
2382 node=fieldlist, initial_state='RFC2822List',
2383 blank_finish=blank_finish)
2384 self.goto_line(newline_offset)
2385 if not blank_finish:
2386 self.parent += self.unindent_warning(
2387 'RFC2822-style field list')
2388 return [], next_state, []
2390 def rfc2822_field(self, match):
2391 name = match.string[:match.string.find(':')]
2392 indented, indent, line_offset, blank_finish = \
2393 self.state_machine.get_first_known_indented(match.end(),
2394 until_blank=True)
2395 fieldnode = nodes.field()
2396 fieldnode += nodes.field_name(name, name)
2397 fieldbody = nodes.field_body('\n'.join(indented))
2398 fieldnode += fieldbody
2399 if indented:
2400 self.nested_parse(indented, input_offset=line_offset,
2401 node=fieldbody)
2402 return fieldnode, blank_finish
2405 class SpecializedBody(Body):
2408 Superclass for second and subsequent compound element members. Compound
2409 elements are lists and list-like constructs.
2411 All transition methods are disabled (redefined as `invalid_input`).
2412 Override individual methods in subclasses to re-enable.
2414 For example, once an initial bullet list item, say, is recognized, the
2415 `BulletList` subclass takes over, with a "bullet_list" node as its
2416 container. Upon encountering the initial bullet list item, `Body.bullet`
2417 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2418 starts up a nested parsing session with `BulletList` as the initial state.
2419 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2420 as only bullet list items are encountered, they are parsed and inserted
2421 into the container. The first construct which is *not* a bullet list item
2422 triggers the `invalid_input` method, which ends the nested parse and
2423 closes the container. `BulletList` needs to recognize input that is
2424 invalid in the context of a bullet list, which means everything *other
2425 than* bullet list items, so it inherits the transition list created in
2426 `Body`.
2427 """
2429 def invalid_input(self, match=None, context=None, next_state=None):
2430 """Not a compound element member. Abort this state machine."""
2431 self.state_machine.previous_line() # back up so parent SM can reassess
2432 raise EOFError
2434 indent = invalid_input
2435 bullet = invalid_input
2436 enumerator = invalid_input
2437 field_marker = invalid_input
2438 option_marker = invalid_input
2439 doctest = invalid_input
2440 line_block = invalid_input
2441 grid_table_top = invalid_input
2442 simple_table_top = invalid_input
2443 explicit_markup = invalid_input
2444 anonymous = invalid_input
2445 line = invalid_input
2446 text = invalid_input
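# Usage sketch of the behaviour described in the class docstring above:
# a bullet item with a different bullet character is "invalid input" for
# BulletList, so it closes the first list and starts a new one (assumes the
# public docutils API; the exact warning text may vary).
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('- item one\n* item two\n')
#   # -> bullet_list('-'), a system_message warning that the first list
#   #    ended without a blank line, then bullet_list('*').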
2449 class BulletList(SpecializedBody):
2451 """Second and subsequent bullet_list list_items."""
2453 def bullet(self, match, context, next_state):
2454 """Bullet list item."""
2455 if match.string[0] != self.parent['bullet']:
2456 # different bullet: new list
2457 self.invalid_input()
2458 listitem, blank_finish = self.list_item(match.end())
2459 self.parent += listitem
2460 self.blank_finish = blank_finish
2461 return [], next_state, []
2464 class DefinitionList(SpecializedBody):
2466 """Second and subsequent definition_list_items."""
2468 def text(self, match, context, next_state):
2469 """Definition lists."""
2470 return [match.string], 'Definition', []
2473 class EnumeratedList(SpecializedBody):
2475 """Second and subsequent enumerated_list list_items."""
2477 def enumerator(self, match, context, next_state):
2478 """Enumerated list item."""
2479 format, sequence, text, ordinal = self.parse_enumerator(
2480 match, self.parent['enumtype'])
2481 if ( format != self.format
2482 or (sequence != '#' and (sequence != self.parent['enumtype']
2483 or self.auto
2484 or ordinal != (self.lastordinal + 1)))
2485 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2486 # different enumeration: new list
2487 self.invalid_input()
2488 if sequence == '#':
2489 self.auto = 1
2490 listitem, blank_finish = self.list_item(match.end())
2491 self.parent += listitem
2492 self.blank_finish = blank_finish
2493 self.lastordinal = ordinal
2494 return [], next_state, []
2497 class FieldList(SpecializedBody):
2499 """Second and subsequent field_list fields."""
2501 def field_marker(self, match, context, next_state):
2502 """Field list field."""
2503 field, blank_finish = self.field(match)
2504 self.parent += field
2505 self.blank_finish = blank_finish
2506 return [], next_state, []
2509 class OptionList(SpecializedBody):
2511 """Second and subsequent option_list option_list_items."""
2513 def option_marker(self, match, context, next_state):
2514 """Option list item."""
2515 try:
2516 option_list_item, blank_finish = self.option_list_item(match)
2517 except MarkupError:
2518 self.invalid_input()
2519 self.parent += option_list_item
2520 self.blank_finish = blank_finish
2521 return [], next_state, []
2524 class RFC2822List(SpecializedBody, RFC2822Body):
2526 """Second and subsequent RFC2822-style field_list fields."""
2528 patterns = RFC2822Body.patterns
2529 initial_transitions = RFC2822Body.initial_transitions
2531 def rfc2822(self, match, context, next_state):
2532 """RFC2822-style field list item."""
2533 field, blank_finish = self.rfc2822_field(match)
2534 self.parent += field
2535 self.blank_finish = blank_finish
2536 return [], 'RFC2822List', []
2538 blank = SpecializedBody.invalid_input
2541 class ExtensionOptions(FieldList):
2543 """
2544 Parse field_list fields for extension options.
2546 No nested parsing is done (including inline markup parsing).
2547 """
2549 def parse_field_body(self, indented, offset, node):
2550 """Override `Body.parse_field_body` for simpler parsing."""
2551 lines = []
2552 for line in list(indented) + ['']:
2553 if line.strip():
2554 lines.append(line)
2555 elif lines:
2556 text = '\n'.join(lines)
2557 node += nodes.paragraph(text, text)
2558 lines = []
2561 class LineBlock(SpecializedBody):
2563 """Second and subsequent lines of a line_block."""
2565 blank = SpecializedBody.invalid_input
2567 def line_block(self, match, context, next_state):
2568 """New line of line block."""
2569 lineno = self.state_machine.abs_line_number()
2570 line, messages, blank_finish = self.line_block_line(match, lineno)
2571 self.parent += line
2572 self.parent.parent += messages
2573 self.blank_finish = blank_finish
2574 return [], next_state, []
2577 class Explicit(SpecializedBody):
2579 """Second and subsequent explicit markup construct."""
2581 def explicit_markup(self, match, context, next_state):
2582 """Footnotes, hyperlink targets, directives, comments."""
2583 nodelist, blank_finish = self.explicit_construct(match)
2584 self.parent += nodelist
2585 self.blank_finish = blank_finish
2586 return [], next_state, []
2588 def anonymous(self, match, context, next_state):
2589 """Anonymous hyperlink targets."""
2590 nodelist, blank_finish = self.anonymous_target(match)
2591 self.parent += nodelist
2592 self.blank_finish = blank_finish
2593 return [], next_state, []
2595 blank = SpecializedBody.invalid_input
2598 class SubstitutionDef(Body):
2600 """
2601 Parser for the contents of a substitution_definition element.
2602 """
2604 patterns = {
2605 'embedded_directive': re.compile(r'(%s)::( +|$)'
2606 % Inliner.simplename, re.UNICODE),
2607 'text': r''}
2608 initial_transitions = ['embedded_directive', 'text']
2610 def embedded_directive(self, match, context, next_state):
2611 nodelist, blank_finish = self.directive(match,
2612 alt=self.parent['names'][0])
2613 self.parent += nodelist
2614 if not self.state_machine.at_eof():
2615 self.blank_finish = blank_finish
2616 raise EOFError
2618 def text(self, match, context, next_state):
2619 if not self.state_machine.at_eof():
2620 self.blank_finish = self.state_machine.is_next_line_blank()
2621 raise EOFError
2624 class Text(RSTState):
2626 """
2627 Classifier of second line of a text block.
2629 Could be a paragraph, a definition list item, or a title.
2630 """
2632 patterns = {'underline': Body.patterns['line'],
2633 'text': r''}
2634 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2636 def blank(self, match, context, next_state):
2637 """End of paragraph."""
2638 # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2639 paragraph, literalnext = self.paragraph(
2640 context, self.state_machine.abs_line_number() - 1)
2641 self.parent += paragraph
2642 if literalnext:
2643 self.parent += self.literal_block()
2644 return [], 'Body', []
2646 def eof(self, context):
2647 if context:
2648 self.blank(None, context, None)
2649 return []
2651 def indent(self, match, context, next_state):
2652 """Definition list item."""
2653 definitionlist = nodes.definition_list()
2654 definitionlistitem, blank_finish = self.definition_list_item(context)
2655 definitionlist += definitionlistitem
2656 self.parent += definitionlist
2657 offset = self.state_machine.line_offset + 1 # next line
2658 newline_offset, blank_finish = self.nested_list_parse(
2659 self.state_machine.input_lines[offset:],
2660 input_offset=self.state_machine.abs_line_offset() + 1,
2661 node=definitionlist, initial_state='DefinitionList',
2662 blank_finish=blank_finish, blank_finish_state='Definition')
2663 self.goto_line(newline_offset)
2664 if not blank_finish:
2665 self.parent += self.unindent_warning('Definition list')
2666 return [], 'Body', []
2668 def underline(self, match, context, next_state):
2669 """Section title."""
2670 lineno = self.state_machine.abs_line_number()
2671 title = context[0].rstrip()
2672 underline = match.string.rstrip()
2673 source = title + '\n' + underline
2674 messages = []
2675 if column_width(title) > len(underline):
2676 if len(underline) < 4:
2677 if self.state_machine.match_titles:
2678 msg = self.reporter.info(
2679 'Possible title underline, too short for the title.\n'
2680 "Treating it as ordinary text because it's so short.",
2681 line=lineno)
2682 self.parent += msg
2683 raise statemachine.TransitionCorrection('text')
2684 else:
2685 blocktext = context[0] + '\n' + self.state_machine.line
2686 msg = self.reporter.warning('Title underline too short.',
2687 nodes.literal_block(blocktext, blocktext), line=lineno)
2688 messages.append(msg)
2689 if not self.state_machine.match_titles:
2690 blocktext = context[0] + '\n' + self.state_machine.line
2691 # We need get_source_and_line() here to report correctly
2692 src, srcline = self.state_machine.get_source_and_line()
2693 # TODO: why is abs_line_number() == srcline+1
2694 # if the error is in a table (try with test_tables.py)?
2695 # print "get_source_and_line", srcline
2696 # print "abs_line_number", self.state_machine.abs_line_number()
2697 msg = self.reporter.severe('Unexpected section title.',
2698 nodes.literal_block(blocktext, blocktext),
2699 source=src, line=srcline)
2700 self.parent += messages
2701 self.parent += msg
2702 return [], next_state, []
2703 style = underline[0]
2704 context[:] = []
2705 self.section(title, source, style, lineno - 1, messages)
2706 return [], next_state, []
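# Usage sketch: the underline transition promotes the stored text line to a
# section title (assumes the public docutils API; sample text is made up).
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('A Title\n=======\n\nBody text.\n')
#   # -> section > (title 'A Title', paragraph 'Body text.')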
2708 def text(self, match, context, next_state):
2709 """Paragraph."""
2710 startline = self.state_machine.abs_line_number() - 1
2711 msg = None
2712 try:
2713 block = self.state_machine.get_text_block(flush_left=True)
2714 except statemachine.UnexpectedIndentationError, err:
2715 block, src, srcline = err.args
2716 msg = self.reporter.error('Unexpected indentation.',
2717 source=src, line=srcline)
2718 lines = context + list(block)
2719 paragraph, literalnext = self.paragraph(lines, startline)
2720 self.parent += paragraph
2721 self.parent += msg
2722 if literalnext:
2723 try:
2724 self.state_machine.next_line()
2725 except EOFError:
2726 pass
2727 self.parent += self.literal_block()
2728 return [], next_state, []
2730 def literal_block(self):
2731 """Return a list of nodes."""
2732 indented, indent, offset, blank_finish = \
2733 self.state_machine.get_indented()
2734 while indented and not indented[-1].strip():
2735 indented.trim_end()
2736 if not indented:
2737 return self.quoted_literal_block()
2738 data = '\n'.join(indented)
2739 literal_block = nodes.literal_block(data, data)
2740 literal_block.line = offset + 1
2741 nodelist = [literal_block]
2742 if not blank_finish:
2743 nodelist.append(self.unindent_warning('Literal block'))
2744 return nodelist
2746 def quoted_literal_block(self):
2747 abs_line_offset = self.state_machine.abs_line_offset()
2748 offset = self.state_machine.line_offset
2749 parent_node = nodes.Element()
2750 new_abs_offset = self.nested_parse(
2751 self.state_machine.input_lines[offset:],
2752 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2753 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2754 'initial_state': 'QuotedLiteralBlock'})
2755 self.goto_line(new_abs_offset)
2756 return parent_node.children
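# Usage sketch: literal_block() handles the indented form, while
# quoted_literal_block() handles the unindented, consistently quoted form
# (assumes the public docutils API; sample text is made up).
#
#   from docutils.core import publish_doctree
#   indented = publish_doctree('Paragraph::\n\n    literal text\n')
#   quoted = publish_doctree('Paragraph::\n\n> quoted line\n> another line\n')
#   # Both yield a paragraph 'Paragraph:' followed by a literal_block.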
2758 def definition_list_item(self, termline):
2759 indented, indent, line_offset, blank_finish = \
2760 self.state_machine.get_indented()
2761 itemnode = nodes.definition_list_item(
2762 '\n'.join(termline + list(indented)))
2763 lineno = self.state_machine.abs_line_number() - 1
2764 (itemnode.source,
2765 itemnode.line) = self.state_machine.get_source_and_line(lineno)
2766 termlist, messages = self.term(termline, lineno)
2767 itemnode += termlist
2768 definition = nodes.definition('', *messages)
2769 itemnode += definition
2770 if termline[0][-2:] == '::':
2771 definition += self.reporter.info(
2772 'Blank line missing before literal block (after the "::")? '
2773 'Interpreted as a definition list item.',
2774 line=lineno+1)
2775 self.nested_parse(indented, input_offset=line_offset, node=definition)
2776 return itemnode, blank_finish
2778 classifier_delimiter = re.compile(' +: +')
2780 def term(self, lines, lineno):
2781 """Return a definition_list's term and optional classifiers."""
2782 assert len(lines) == 1
2783 text_nodes, messages = self.inline_text(lines[0], lineno)
2784 term_node = nodes.term()
2785 node_list = [term_node]
2786 for i in range(len(text_nodes)):
2787 node = text_nodes[i]
2788 if isinstance(node, nodes.Text):
2789 parts = self.classifier_delimiter.split(node.rawsource)
2790 if len(parts) == 1:
2791 node_list[-1] += node
2792 else:
2794 node_list[-1] += nodes.Text(parts[0].rstrip())
2795 for part in parts[1:]:
2796 classifier_node = nodes.classifier('', part)
2797 node_list.append(classifier_node)
2798 else:
2799 node_list[-1] += node
2800 return node_list, messages
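# Usage sketch: " : " (space, colon, space) inside a term line separates the
# term from its classifiers, as split by classifier_delimiter above (assumes
# the public docutils API; sample text is made up).
#
#   from docutils.core import publish_doctree
#   tree = publish_doctree('term : classifier one : classifier two\n'
#                          '    Definition paragraph.\n')
#   # -> definition_list_item > (term, classifier, classifier,
#   #    definition > paragraph)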
2803 class SpecializedText(Text):
2805 """
2806 Superclass for second and subsequent lines of Text-variants.
2808 All transition methods are disabled. Override individual methods in
2809 subclasses to re-enable.
2810 """
2812 def eof(self, context):
2813 """Incomplete construct."""
2814 return []
2816 def invalid_input(self, match=None, context=None, next_state=None):
2817 """Not a compound element member. Abort this state machine."""
2818 raise EOFError
2820 blank = invalid_input
2821 indent = invalid_input
2822 underline = invalid_input
2823 text = invalid_input
2826 class Definition(SpecializedText):
2828 """Second line of potential definition_list_item."""
2830 def eof(self, context):
2831 """Not a definition."""
2832 self.state_machine.previous_line(2) # so parent SM can reassess
2833 return []
2835 def indent(self, match, context, next_state):
2836 """Definition list item."""
2837 itemnode, blank_finish = self.definition_list_item(context)
2838 self.parent += itemnode
2839 self.blank_finish = blank_finish
2840 return [], 'DefinitionList', []
2843 class Line(SpecializedText):
2845 """
2846 Second line of over- & underlined section title or transition marker.
2847 """
2849 eofcheck = 1 # @@@ ???
2850 """Set to 0 while parsing sections, so that we don't catch the EOF."""
2852 def eof(self, context):
2853 """Transition marker at end of section or document."""
2854 marker = context[0].strip()
2855 if self.memo.section_bubble_up_kludge:
2856 self.memo.section_bubble_up_kludge = False
2857 elif len(marker) < 4:
2858 self.state_correction(context)
2859 if self.eofcheck: # ignore EOFError with sections
2860 lineno = self.state_machine.abs_line_number() - 1
2861 transition = nodes.transition(rawsource=context[0])
2862 transition.line = lineno
2863 self.parent += transition
2864 self.eofcheck = 1
2865 return []
2867 def blank(self, match, context, next_state):
2868 """Transition marker."""
2869 src, srcline = self.state_machine.get_source_and_line()
2870 marker = context[0].strip()
2871 if len(marker) < 4:
2872 self.state_correction(context)
2873 transition = nodes.transition(rawsource=marker)
2874 transition.source = src
2875 transition.line = srcline - 1
2876 self.parent += transition
2877 return [], 'Body', []
2879 def text(self, match, context, next_state):
2880 """Potential over- & underlined title."""
2881 lineno = self.state_machine.abs_line_number() - 1
2882 overline = context[0]
2883 title = match.string
2884 underline = ''
2885 try:
2886 underline = self.state_machine.next_line()
2887 except EOFError:
2888 blocktext = overline + '\n' + title
2889 if len(overline.rstrip()) < 4:
2890 self.short_overline(context, blocktext, lineno, 2)
2891 else:
2892 msg = self.reporter.severe(
2893 'Incomplete section title.',
2894 nodes.literal_block(blocktext, blocktext),
2895 line=lineno)
2896 self.parent += msg
2897 return [], 'Body', []
2898 source = '%s\n%s\n%s' % (overline, title, underline)
2899 overline = overline.rstrip()
2900 underline = underline.rstrip()
2901 if not self.transitions['underline'][0].match(underline):
2902 blocktext = overline + '\n' + title + '\n' + underline
2903 if len(overline.rstrip()) < 4:
2904 self.short_overline(context, blocktext, lineno, 2)
2905 else:
2906 msg = self.reporter.severe(
2907 'Missing matching underline for section title overline.',
2908 nodes.literal_block(source, source),
2909 line=lineno)
2910 self.parent += msg
2911 return [], 'Body', []
2912 elif overline != underline:
2913 blocktext = overline + '\n' + title + '\n' + underline
2914 if len(overline.rstrip()) < 4:
2915 self.short_overline(context, blocktext, lineno, 2)
2916 else:
2917 msg = self.reporter.severe(
2918 'Title overline & underline mismatch.',
2919 nodes.literal_block(source, source),
2920 line=lineno)
2921 self.parent += msg
2922 return [], 'Body', []
2923 title = title.rstrip()
2924 messages = []
2925 if column_width(title) > len(overline):
2926 blocktext = overline + '\n' + title + '\n' + underline
2927 if len(overline.rstrip()) < 4:
2928 self.short_overline(context, blocktext, lineno, 2)
2929 else:
2930 msg = self.reporter.warning(
2931 'Title overline too short.',
2932 nodes.literal_block(source, source),
2933 line=lineno)
2934 messages.append(msg)
2935 style = (overline[0], underline[0])
2936 self.eofcheck = 0 # @@@ not sure this is correct
2937 self.section(title.lstrip(), source, style, lineno + 1, messages)
2938 self.eofcheck = 1
2939 return [], 'Body', []
2941 indent = text # indented title
2943 def underline(self, match, context, next_state):
2944 overline = context[0]
2945 blocktext = overline + '\n' + self.state_machine.line
2946 lineno = self.state_machine.abs_line_number() - 1
2947 if len(overline.rstrip()) < 4:
2948 self.short_overline(context, blocktext, lineno, 1)
2949 msg = self.reporter.error(
2950 'Invalid section title or transition marker.',
2951 nodes.literal_block(blocktext, blocktext),
2952 line=lineno)
2953 self.parent += msg
2954 return [], 'Body', []
2956 def short_overline(self, context, blocktext, lineno, lines=1):
2957 msg = self.reporter.info(
2958 'Possible incomplete section title.\nTreating the overline as '
2959 "ordinary text because it's so short.",
2960 line=lineno)
2961 self.parent += msg
2962 self.state_correction(context, lines)
2964 def state_correction(self, context, lines=1):
2965 self.state_machine.previous_line(lines)
2966 context[:] = []
2967 raise statemachine.StateCorrection('Body', 'text')
2970 class QuotedLiteralBlock(RSTState):
2972 """
2973 Nested parse handler for quoted (unindented) literal blocks.
2975 Special-purpose. Not for inclusion in `state_classes`.
2976 """
2978 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
2979 'text': r''}
2980 initial_transitions = ('initial_quoted', 'text')
2982 def __init__(self, state_machine, debug=False):
2983 RSTState.__init__(self, state_machine, debug)
2984 self.messages = []
2985 self.initial_lineno = None
2987 def blank(self, match, context, next_state):
2988 if context:
2989 raise EOFError
2990 else:
2991 return context, next_state, []
2993 def eof(self, context):
2994 if context:
2995 src, srcline = self.state_machine.get_source_and_line(
2996 self.initial_lineno)
2997 text = '\n'.join(context)
2998 literal_block = nodes.literal_block(text, text)
2999 literal_block.source = src
3000 literal_block.line = srcline
3001 self.parent += literal_block
3002 else:
3003 self.parent += self.reporter.warning(
3004 'Literal block expected; none found.',
3005 line=self.state_machine.abs_line_number())
3006 # src not available, because statemachine.input_lines is empty
3007 self.state_machine.previous_line()
3008 self.parent += self.messages
3009 return []
3011 def indent(self, match, context, next_state):
3012 assert context, ('QuotedLiteralBlock.indent: context should not '
3013 'be empty!')
3014 self.messages.append(
3015 self.reporter.error('Unexpected indentation.',
3016 line=self.state_machine.abs_line_number()))
3017 self.state_machine.previous_line()
3018 raise EOFError
3020 def initial_quoted(self, match, context, next_state):
3021 """Match arbitrary quote character on the first line only."""
3022 self.remove_transition('initial_quoted')
3023 quote = match.string[0]
3024 pattern = re.compile(re.escape(quote), re.UNICODE)
3025 # New transition matches consistent quotes only:
3026 self.add_transition('quoted',
3027 (pattern, self.quoted, self.__class__.__name__))
3028 self.initial_lineno = self.state_machine.abs_line_number()
3029 return [match.string], next_state, []
3031 def quoted(self, match, context, next_state):
3032 """Match consistent quotes on subsequent lines."""
3033 context.append(match.string)
3034 return context, next_state, []
3036 def text(self, match, context, next_state):
3037 if context:
3038 self.messages.append(
3039 self.reporter.error('Inconsistent literal block quoting.',
3040 line=self.state_machine.abs_line_number()))
3041 self.state_machine.previous_line()
3042 raise EOFError
3045 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3046 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3047 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3048 """Standard set of State classes used to start `RSTStateMachine`."""