1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 This is the ``docutils.parsers.rst.states`` module, the core of
7 the reStructuredText parser. It defines the following:
9 :Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items.
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: An auxiliary collection class.
31 :Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
36 :Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
40 :Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
43 Parser Overview
44 ===============
46 The reStructuredText parser is implemented as a recursive state machine,
47 examining its input one line at a time. To understand how the parser works,
48 please first become familiar with the `docutils.statemachine` module. In the
49 description below, references are made to classes defined in this module;
50 please see the individual classes for details.
52 Parsing proceeds as follows:
54 1. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
59 2. The method associated with the matched transition pattern is called.
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
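
For a quick sense of the machinery described above, here is a minimal
sketch of driving the parser through the public `docutils` API (module
and function names as in that API; details may vary between docutils
versions)::

    from docutils.frontend import OptionParser
    from docutils.parsers.rst import Parser
    from docutils.utils import new_document

    text = 'Title\n=====\n\nA paragraph with *emphasis*.\n'
    parser = Parser()
    settings = OptionParser(components=(Parser,)).get_default_values()
    document = new_document('<sketch>', settings)
    parser.parse(text, document)  # drives RSTStateMachine over `text`
    print document.pformat()      # dump the resulting document tree
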
103 __docformat__ = 'reStructuredText'
106 import sys
107 import re
108 try:
109 import roman
110 except ImportError:
111 import docutils.utils.roman as roman
112 from types import FunctionType, MethodType
114 from docutils import nodes, statemachine, utils
115 from docutils import ApplicationError, DataError
116 from docutils.statemachine import StateMachineWS, StateWS
117 from docutils.nodes import fully_normalize_name as normalize_name
118 from docutils.nodes import whitespace_normalize_name
119 import docutils.parsers.rst
120 from docutils.parsers.rst import directives, languages, tableparser, roles
121 from docutils.parsers.rst.languages import en as _fallback_language_module
122 from docutils.utils import escape2null, unescape, column_width
123 from docutils.utils import punctuation_chars, urischemes
125 class MarkupError(DataError): pass
126 class UnknownInterpretedRoleError(DataError): pass
127 class InterpretedRoleNotImplementedError(DataError): pass
128 class ParserError(ApplicationError): pass
129 class MarkupMismatch(Exception): pass
132 class Struct:
134 """Stores data attributes for dotted-attribute access."""
136 def __init__(self, **keywordargs):
137 self.__dict__.update(keywordargs)
140 class RSTStateMachine(StateMachineWS):
143 reStructuredText's master StateMachine.
145 The entry point to reStructuredText parsing is the `run()` method.
148 def run(self, input_lines, document, input_offset=0, match_titles=True,
149 inliner=None):
151 Parse `input_lines` and modify the `document` node in place.
153 Extend `StateMachineWS.run()`: set up parse-global data and
154 run the StateMachine.
156 self.language = languages.get_language(
157 document.settings.language_code)
158 self.match_titles = match_titles
159 if inliner is None:
160 inliner = Inliner()
161 inliner.init_customizations(document.settings)
162 self.memo = Struct(document=document,
163 reporter=document.reporter,
164 language=self.language,
165 title_styles=[],
166 section_level=0,
167 section_bubble_up_kludge=False,
168 inliner=inliner)
169 self.document = document
170 self.attach_observer(document.note_source)
171 self.reporter = self.memo.reporter
172 self.node = document
173 results = StateMachineWS.run(self, input_lines, input_offset,
174 input_source=document['source'])
175 assert results == [], 'RSTStateMachine.run() results should be empty!'
176 self.node = self.memo = None # remove unneeded references
179 class NestedStateMachine(StateMachineWS):
182 StateMachine run from within other StateMachine runs, to parse nested
183 document structures.
186 def run(self, input_lines, input_offset, memo, node, match_titles=True):
188 Parse `input_lines` and populate a `docutils.nodes.document` instance.
190 Extend `StateMachineWS.run()`: set up document-wide data.
192 self.match_titles = match_titles
193 self.memo = memo
194 self.document = memo.document
195 self.attach_observer(self.document.note_source)
196 self.reporter = memo.reporter
197 self.language = memo.language
198 self.node = node
199 results = StateMachineWS.run(self, input_lines, input_offset)
200 assert results == [], ('NestedStateMachine.run() results should be '
201 'empty!')
202 return results
205 class RSTState(StateWS):
208 reStructuredText State superclass.
210 Contains methods used by all State subclasses.
213 nested_sm = NestedStateMachine
214 nested_sm_cache = []
216 def __init__(self, state_machine, debug=False):
217 self.nested_sm_kwargs = {'state_classes': state_classes,
218 'initial_state': 'Body'}
219 StateWS.__init__(self, state_machine, debug)
221 def runtime_init(self):
222 StateWS.runtime_init(self)
223 memo = self.state_machine.memo
224 self.memo = memo
225 self.reporter = memo.reporter
226 self.inliner = memo.inliner
227 self.document = memo.document
228 self.parent = self.state_machine.node
229 # enable the reporter to determine source and source-line
230 if not hasattr(self.reporter, 'get_source_and_line'):
231 self.reporter.get_source_and_line = self.state_machine.get_source_and_line
232 # print "adding get_source_and_line to reporter", self.state_machine.input_offset
235 def goto_line(self, abs_line_offset):
237 Jump to input line `abs_line_offset`, ignoring jumps past the end.
239 try:
240 self.state_machine.goto_line(abs_line_offset)
241 except EOFError:
242 pass
244 def no_match(self, context, transitions):
246 Override `StateWS.no_match` to generate a system message.
248 This code should never be run.
250 self.reporter.severe(
251 'Internal error: no transition pattern match. State: "%s"; '
252 'transitions: %s; context: %s; current line: %r.'
253 % (self.__class__.__name__, transitions, context,
254 self.state_machine.line))
255 return context, None, []
257 def bof(self, context):
258 """Called at beginning of file."""
259 return [], []
261 def nested_parse(self, block, input_offset, node, match_titles=False,
262 state_machine_class=None, state_machine_kwargs=None):
264 Create a new StateMachine rooted at `node` and run it over the input
265 `block`.
267 use_default = 0
268 if state_machine_class is None:
269 state_machine_class = self.nested_sm
270 use_default += 1
271 if state_machine_kwargs is None:
272 state_machine_kwargs = self.nested_sm_kwargs
273 use_default += 1
274 block_length = len(block)
276 state_machine = None
277 if use_default == 2:
278 try:
279 state_machine = self.nested_sm_cache.pop()
280 except IndexError:
281 pass
282 if not state_machine:
283 state_machine = state_machine_class(debug=self.debug,
284 **state_machine_kwargs)
285 state_machine.run(block, input_offset, memo=self.memo,
286 node=node, match_titles=match_titles)
287 if use_default == 2:
288 self.nested_sm_cache.append(state_machine)
289 else:
290 state_machine.unlink()
291 new_offset = state_machine.abs_line_offset()
292 # No `block.parent` implies disconnected -- lines aren't in sync:
293 if block.parent and (len(block) - block_length) != 0:
294 # Adjustment for block if modified in nested parse:
295 self.state_machine.next_line(len(block) - block_length)
296 return new_offset
298 def nested_list_parse(self, block, input_offset, node, initial_state,
299 blank_finish,
300 blank_finish_state=None,
301 extra_settings={},
302 match_titles=False,
303 state_machine_class=None,
304 state_machine_kwargs=None):
306 Create a new StateMachine rooted at `node` and run it over the input
307 `block`. Also keep track of optional intermediate blank lines and the
308 required final one.
310 if state_machine_class is None:
311 state_machine_class = self.nested_sm
312 if state_machine_kwargs is None:
313 state_machine_kwargs = self.nested_sm_kwargs.copy()
314 state_machine_kwargs['initial_state'] = initial_state
315 state_machine = state_machine_class(debug=self.debug,
316 **state_machine_kwargs)
317 if blank_finish_state is None:
318 blank_finish_state = initial_state
319 state_machine.states[blank_finish_state].blank_finish = blank_finish
320 for key, value in extra_settings.items():
321 setattr(state_machine.states[initial_state], key, value)
322 state_machine.run(block, input_offset, memo=self.memo,
323 node=node, match_titles=match_titles)
324 blank_finish = state_machine.states[blank_finish_state].blank_finish
325 state_machine.unlink()
326 return state_machine.abs_line_offset(), blank_finish
328 def section(self, title, source, style, lineno, messages):
329 """Check for a valid subsection and create one if it checks out."""
330 if self.check_subsection(source, style, lineno):
331 self.new_subsection(title, lineno, messages)
333 def check_subsection(self, source, style, lineno):
335 Check for a valid subsection header. Return 1 (true) or None (false).
337 When a new section is reached that isn't a subsection of the current
338 section, back up the line count (use ``previous_line(-x)``), then
339 ``raise EOFError``. The current StateMachine will finish, then the
340 calling StateMachine can re-examine the title. This will work its way
341 back up the calling chain until the correct section level is reached.
343 @@@ Alternative: Evaluate the title, store the title info & level, and
344 back up the chain until that level is reached. Store in memo? Or
345 return in results?
347 :Exception: `EOFError` when a sibling or supersection is encountered.
349 memo = self.memo
350 title_styles = memo.title_styles
351 mylevel = memo.section_level
352 try: # check for existing title style
353 level = title_styles.index(style) + 1
354 except ValueError: # new title style
355 if len(title_styles) == memo.section_level: # new subsection
356 title_styles.append(style)
357 return 1
358 else: # not at lowest level
359 self.parent += self.title_inconsistent(source, lineno)
360 return None
361 if level <= mylevel: # sibling or supersection
362 memo.section_level = level # bubble up to parent section
363 if len(style) == 2:
364 memo.section_bubble_up_kludge = True
365 # back up 2 lines for underline title, 3 for overline title
366 self.state_machine.previous_line(len(style) + 1)
367 raise EOFError # let parent section re-evaluate
368 if level == mylevel + 1: # immediate subsection
369 return 1
370 else: # invalid subsection
371 self.parent += self.title_inconsistent(source, lineno)
372 return None
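# For illustration (hypothetical values): with memo.title_styles == ['=', '-']
# and memo.section_level == 2, a new title underlined with '-' is a sibling
# (level 2 <= 2), so the parser backs up and raises EOFError for the
# enclosing section to re-evaluate; an unseen style such as '~' is appended
# to title_styles and opens level 3; a '=' title bubbles all the way back up
# to level 1.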
374 def title_inconsistent(self, sourcetext, lineno):
375 error = self.reporter.severe(
376 'Title level inconsistent:', nodes.literal_block('', sourcetext),
377 line=lineno)
378 return error
380 def new_subsection(self, title, lineno, messages):
381 """Append new subsection to document tree. On return, check level."""
382 memo = self.memo
383 mylevel = memo.section_level
384 memo.section_level += 1
385 section_node = nodes.section()
386 self.parent += section_node
387 textnodes, title_messages = self.inline_text(title, lineno)
388 titlenode = nodes.title(title, '', *textnodes)
389 name = normalize_name(titlenode.astext())
390 section_node['names'].append(name)
391 section_node += titlenode
392 section_node += messages
393 section_node += title_messages
394 self.document.note_implicit_target(section_node, section_node)
395 offset = self.state_machine.line_offset + 1
396 absoffset = self.state_machine.abs_line_offset() + 1
397 newabsoffset = self.nested_parse(
398 self.state_machine.input_lines[offset:], input_offset=absoffset,
399 node=section_node, match_titles=True)
400 self.goto_line(newabsoffset)
401 if memo.section_level <= mylevel: # can't handle next section?
402 raise EOFError # bubble up to supersection
403 # reset section_level; next pass will detect it properly
404 memo.section_level = mylevel
406 def paragraph(self, lines, lineno):
408 Return a list (paragraph & messages) & a boolean: literal_block next?
410 data = '\n'.join(lines).rstrip()
411 if re.search(r'(?<!\\)(\\\\)*::$', data):
412 if len(data) == 2:
413 return [], 1
414 elif data[-3] in ' \n':
415 text = data[:-3].rstrip()
416 else:
417 text = data[:-1]
418 literalnext = 1
419 else:
420 text = data
421 literalnext = 0
422 textnodes, messages = self.inline_text(text, lineno)
423 p = nodes.paragraph(data, '', *textnodes)
424 p.source, p.line = self.state_machine.get_source_and_line(lineno)
425 return [p] + messages, literalnext
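# For illustration, the trailing "::" handling above maps (hypothetical
# inputs):
#   "Example::"   -> paragraph text "Example:", literalnext == 1
#   "Example ::"  -> paragraph text "Example",  literalnext == 1
#   "::"          -> no paragraph ([]),         literalnext == 1
#   "No literal." -> text unchanged,            literalnext == 0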
427 def inline_text(self, text, lineno):
429 Return 2 lists: nodes (text and inline elements), and system_messages.
431 return self.inliner.parse(text, lineno, self.memo, self.parent)
433 def unindent_warning(self, node_name):
434 # the actual problem is one line below the current line
435 lineno = self.state_machine.abs_line_number()+1
436 return self.reporter.warning('%s ends without a blank line; '
437 'unexpected unindent.' % node_name,
438 line=lineno)
441 def build_regexp(definition, compile=True):
443 Build, compile and return a regular expression based on `definition`.
445 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
446 where "parts" is a list of regular expressions and/or regular
447 expression definitions to be joined into an or-group.
449 name, prefix, suffix, parts = definition
450 part_strings = []
451 for part in parts:
452 if type(part) is tuple:
453 part_strings.append(build_regexp(part, None))
454 else:
455 part_strings.append(part)
456 or_group = '|'.join(part_strings)
457 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
458 if compile:
459 return re.compile(regexp, re.UNICODE)
460 else:
461 return regexp
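# For illustration, with hypothetical definitions and compile=False (which
# returns the pattern string instead of a compiled object):
#   build_regexp(('marker', '', r'\b', ['foo', 'bar']), compile=False)
#       -> r'(?P<marker>foo|bar)\b'
#   build_regexp(('outer', '', '', ['x', ('inner', '', '', ['y', 'z'])]),
#                compile=False)
#       -> '(?P<outer>x|(?P<inner>y|z))'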
464 class Inliner:
467 Parse inline markup; call the `parse()` method.
470 def __init__(self):
471 self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),]
472 """List of (pattern, bound method) tuples, used by
473 `self.implicit_inline`."""
475 def init_customizations(self, settings):
476 """Setting-based customizations; run when parsing begins."""
477 if settings.pep_references:
478 self.implicit_dispatch.append((self.patterns.pep,
479 self.pep_reference))
480 if settings.rfc_references:
481 self.implicit_dispatch.append((self.patterns.rfc,
482 self.rfc_reference))
484 def parse(self, text, lineno, memo, parent):
485 # Needs to be refactored for nested inline markup.
486 # Add nested_parse() method?
488 Return 2 lists: nodes (text and inline elements), and system_messages.
490 Using `self.patterns.initial`, a pattern which matches start-strings
491 (emphasis, strong, interpreted, phrase reference, literal,
492 substitution reference, and inline target) and complete constructs
493 (simple reference, footnote reference), search for a candidate. When
494 one is found, check for validity (e.g., not a quoted '*' character).
495 If valid, search for the corresponding end string if applicable, and
496 check it for validity. If not found or invalid, generate a warning
497 and ignore the start-string. Implicit inline markup (e.g. standalone
498 URIs) is found last.
500 self.reporter = memo.reporter
501 self.document = memo.document
502 self.language = memo.language
503 self.parent = parent
504 pattern_search = self.patterns.initial.search
505 dispatch = self.dispatch
506 remaining = escape2null(text)
507 processed = []
508 unprocessed = []
509 messages = []
510 while remaining:
511 match = pattern_search(remaining)
512 if match:
513 groups = match.groupdict()
514 method = dispatch[groups['start'] or groups['backquote']
515 or groups['refend'] or groups['fnend']]
516 before, inlines, remaining, sysmessages = method(self, match,
517 lineno)
518 unprocessed.append(before)
519 messages += sysmessages
520 if inlines:
521 processed += self.implicit_inline(''.join(unprocessed),
522 lineno)
523 processed += inlines
524 unprocessed = []
525 else:
526 break
527 remaining = ''.join(unprocessed) + remaining
528 if remaining:
529 processed += self.implicit_inline(remaining, lineno)
530 return processed, messages
532 # Inline object recognition
533 # -------------------------
534 # lookahead and look-behind expressions for inline markup rules
535 start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
536 (punctuation_chars.openers,
537 punctuation_chars.delimiters))
538 end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
539 (punctuation_chars.closing_delimiters,
540 punctuation_chars.delimiters,
541 punctuation_chars.closers))
542 # print start_string_prefix.encode('utf8')
543 # TODO: support non-ASCII whitespace in the following 4 patterns?
544 non_whitespace_before = r'(?<![ \n])'
545 non_whitespace_escape_before = r'(?<![ \n\x00])'
546 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[ \n\x00])'
547 non_whitespace_after = r'(?![ \n])'
548 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
549 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
550 # Valid URI characters (see RFC 2396 & RFC 2732);
551 # final \x00 allows backslash escapes in URIs:
552 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
553 # Delimiter indicating the end of a URI (not part of the URI):
554 uri_end_delim = r"""[>]"""
555 # Last URI character; same as uric but no punctuation:
556 urilast = r"""[_~*/=+a-zA-Z0-9]"""
557 # End of a URI (either 'urilast' or 'uric followed by a
558 # uri_end_delim'):
559 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
560 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
561 email_pattern = r"""
562 %(emailc)s+(?:\.%(emailc)s+)* # name
563 (?<!\x00)@ # at
564 %(emailc)s+(?:\.%(emailc)s*)* # host
565 %(uri_end)s # final URI char
567 parts = ('initial_inline', start_string_prefix, '',
568 [('start', '', non_whitespace_after, # simple start-strings
569 [r'\*\*', # strong
570 r'\*(?!\*)', # emphasis but not strong
571 r'``', # literal
572 r'_`', # inline internal target
573 r'\|(?!\|)'] # substitution reference
575 ('whole', '', end_string_suffix, # whole constructs
576 [# reference name & end-string
577 r'(?P<refname>%s)(?P<refend>__?)' % simplename,
578 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
579 [r'[0-9]+', # manually numbered
580 r'\#(%s)?' % simplename, # auto-numbered (w/ label?)
581 r'\*', # auto-symbol
582 r'(?P<citationlabel>%s)' % simplename] # citation reference
586 ('backquote', # interpreted text or phrase reference
587 '(?P<role>(:%s:)?)' % simplename, # optional role
588 non_whitespace_after,
589 ['`(?!`)'] # but not literal
593 patterns = Struct(
594 initial=build_regexp(parts),
595 emphasis=re.compile(non_whitespace_escape_before
596 + r'(\*)' + end_string_suffix, re.UNICODE),
597 strong=re.compile(non_whitespace_escape_before
598 + r'(\*\*)' + end_string_suffix, re.UNICODE),
599 interpreted_or_phrase_ref=re.compile(
600 r"""
601 %(non_unescaped_whitespace_escape_before)s
604 (?P<suffix>
605 (?P<role>:%(simplename)s:)?
606 (?P<refend>__?)?
609 %(end_string_suffix)s
610 """ % locals(), re.VERBOSE | re.UNICODE),
611 embedded_uri=re.compile(
612 r"""
614 (?:[ \n]+|^) # spaces or beginning of line/string
615 < # open bracket
616 %(non_whitespace_after)s
617 ([^<>\x00]+) # anything but angle brackets & nulls
618 %(non_whitespace_before)s
619 > # close bracket w/o whitespace before
621 $ # end of string
622 """ % locals(), re.VERBOSE | re.UNICODE),
623 literal=re.compile(non_whitespace_before + '(``)'
624 + end_string_suffix),
625 target=re.compile(non_whitespace_escape_before
626 + r'(`)' + end_string_suffix),
627 substitution_ref=re.compile(non_whitespace_escape_before
628 + r'(\|_{0,2})'
629 + end_string_suffix),
630 email=re.compile(email_pattern % locals() + '$',
631 re.VERBOSE | re.UNICODE),
632 uri=re.compile(
633 (r"""
634 %(start_string_prefix)s
635 (?P<whole>
636 (?P<absolute> # absolute URI
637 (?P<scheme> # scheme (http, ftp, mailto)
638 [a-zA-Z][a-zA-Z0-9.+-]*
642 ( # either:
643 (//?)? # hierarchical URI
644 %(uric)s* # URI characters
645 %(uri_end)s # final URI char
647 ( # optional query
648 \?%(uric)s*
649 %(uri_end)s
651 ( # optional fragment
652 \#%(uric)s*
653 %(uri_end)s
657 | # *OR*
658 (?P<email> # email address
659 """ + email_pattern + r"""
662 %(end_string_suffix)s
663 """) % locals(), re.VERBOSE | re.UNICODE),
664 pep=re.compile(
665 r"""
666 %(start_string_prefix)s
668 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
670 (PEP\s+(?P<pepnum2>\d+)) # reference by name
672 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE),
673 rfc=re.compile(
674 r"""
675 %(start_string_prefix)s
676 (RFC(-|\s+)?(?P<rfcnum>\d+))
677 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE))
679 def quoted_start(self, match):
680 """Test if inline markup start-string is 'quoted'.
682 'Quoted' in this context means the start-string is enclosed in a pair
683 of matching opening/closing delimiters (not necessarily quotes)
684 or at the end of the match.
686 string = match.string
687 start = match.start()
688 if start == 0: # start-string at beginning of text
689 return False
690 prestart = string[start - 1]
691 try:
692 poststart = string[match.end()]
693 except IndexError: # start-string at end of text
694 return True # not "quoted" but no markup start-string either
695 return punctuation_chars.match_chars(prestart, poststart)
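# For illustration: in the text  "*" is an asterisk  the "*" start-string is
# preceded and followed by '"', which punctuation_chars.match_chars() treats
# as a matching pair, so the potential emphasis start is considered quoted
# and produces no markup.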
697 def inline_obj(self, match, lineno, end_pattern, nodeclass,
698 restore_backslashes=False):
699 string = match.string
700 matchstart = match.start('start')
701 matchend = match.end('start')
702 if self.quoted_start(match):
703 return (string[:matchend], [], string[matchend:], [], '')
704 endmatch = end_pattern.search(string[matchend:])
705 if endmatch and endmatch.start(1): # 1 or more chars
706 text = unescape(endmatch.string[:endmatch.start(1)],
707 restore_backslashes)
708 textend = matchend + endmatch.end(1)
709 rawsource = unescape(string[matchstart:textend], 1)
710 return (string[:matchstart], [nodeclass(rawsource, text)],
711 string[textend:], [], endmatch.group(1))
712 msg = self.reporter.warning(
713 'Inline %s start-string without end-string.'
714 % nodeclass.__name__, line=lineno)
715 text = unescape(string[matchstart:matchend], 1)
716 rawsource = unescape(string[matchstart:matchend], 1)
717 prb = self.problematic(text, rawsource, msg)
718 return string[:matchstart], [prb], string[matchend:], [msg], ''
720 def problematic(self, text, rawsource, message):
721 msgid = self.document.set_id(message, self.parent)
722 problematic = nodes.problematic(rawsource, text, refid=msgid)
723 prbid = self.document.set_id(problematic)
724 message.add_backref(prbid)
725 return problematic
727 def emphasis(self, match, lineno):
728 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
729 match, lineno, self.patterns.emphasis, nodes.emphasis)
730 return before, inlines, remaining, sysmessages
732 def strong(self, match, lineno):
733 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
734 match, lineno, self.patterns.strong, nodes.strong)
735 return before, inlines, remaining, sysmessages
737 def interpreted_or_phrase_ref(self, match, lineno):
738 end_pattern = self.patterns.interpreted_or_phrase_ref
739 string = match.string
740 matchstart = match.start('backquote')
741 matchend = match.end('backquote')
742 rolestart = match.start('role')
743 role = match.group('role')
744 position = ''
745 if role:
746 role = role[1:-1]
747 position = 'prefix'
748 elif self.quoted_start(match):
749 return (string[:matchend], [], string[matchend:], [])
750 endmatch = end_pattern.search(string[matchend:])
751 if endmatch and endmatch.start(1): # 1 or more chars
752 textend = matchend + endmatch.end()
753 if endmatch.group('role'):
754 if role:
755 msg = self.reporter.warning(
756 'Multiple roles in interpreted text (both '
757 'prefix and suffix present; only one allowed).',
758 line=lineno)
759 text = unescape(string[rolestart:textend], 1)
760 prb = self.problematic(text, text, msg)
761 return string[:rolestart], [prb], string[textend:], [msg]
762 role = endmatch.group('suffix')[1:-1]
763 position = 'suffix'
764 escaped = endmatch.string[:endmatch.start(1)]
765 rawsource = unescape(string[matchstart:textend], 1)
766 if rawsource[-1:] == '_':
767 if role:
768 msg = self.reporter.warning(
769 'Mismatch: both interpreted text role %s and '
770 'reference suffix.' % position, line=lineno)
771 text = unescape(string[rolestart:textend], 1)
772 prb = self.problematic(text, text, msg)
773 return string[:rolestart], [prb], string[textend:], [msg]
774 return self.phrase_ref(string[:matchstart], string[textend:],
775 rawsource, escaped, unescape(escaped))
776 else:
777 rawsource = unescape(string[rolestart:textend], 1)
778 nodelist, messages = self.interpreted(rawsource, escaped, role,
779 lineno)
780 return (string[:rolestart], nodelist,
781 string[textend:], messages)
782 msg = self.reporter.warning(
783 'Inline interpreted text or phrase reference start-string '
784 'without end-string.', line=lineno)
785 text = unescape(string[matchstart:matchend], 1)
786 prb = self.problematic(text, text, msg)
787 return string[:matchstart], [prb], string[matchend:], [msg]
789 def phrase_ref(self, before, after, rawsource, escaped, text):
790 match = self.patterns.embedded_uri.search(escaped)
791 if match:
792 text = unescape(escaped[:match.start(0)])
793 uri_text = match.group(2)
794 uri = ''.join(uri_text.split())
795 uri = self.adjust_uri(uri)
796 if uri:
797 target = nodes.target(match.group(1), refuri=uri)
798 target.referenced = 1
799 else:
800 raise ApplicationError('problem with URI: %r' % uri_text)
801 if not text:
802 text = uri
803 else:
804 target = None
805 refname = normalize_name(text)
806 reference = nodes.reference(rawsource, text,
807 name=whitespace_normalize_name(text))
808 node_list = [reference]
809 if rawsource[-2:] == '__':
810 if target:
811 reference['refuri'] = uri
812 else:
813 reference['anonymous'] = 1
814 else:
815 if target:
816 reference['refuri'] = uri
817 target['names'].append(refname)
818 self.document.note_explicit_target(target, self.parent)
819 node_list.append(target)
820 else:
821 reference['refname'] = refname
822 self.document.note_refname(reference)
823 return before, node_list, after, []
825 def adjust_uri(self, uri):
826 match = self.patterns.email.match(uri)
827 if match:
828 return 'mailto:' + uri
829 else:
830 return uri
832 def interpreted(self, rawsource, text, role, lineno):
833 role_fn, messages = roles.role(role, self.language, lineno,
834 self.reporter)
835 if role_fn:
836 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
837 return nodes, messages + messages2
838 else:
839 msg = self.reporter.error(
840 'Unknown interpreted text role "%s".' % role,
841 line=lineno)
842 return ([self.problematic(rawsource, rawsource, msg)],
843 messages + [msg])
845 def literal(self, match, lineno):
846 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
847 match, lineno, self.patterns.literal, nodes.literal,
848 restore_backslashes=True)
849 return before, inlines, remaining, sysmessages
851 def inline_internal_target(self, match, lineno):
852 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
853 match, lineno, self.patterns.target, nodes.target)
854 if inlines and isinstance(inlines[0], nodes.target):
855 assert len(inlines) == 1
856 target = inlines[0]
857 name = normalize_name(target.astext())
858 target['names'].append(name)
859 self.document.note_explicit_target(target, self.parent)
860 return before, inlines, remaining, sysmessages
862 def substitution_reference(self, match, lineno):
863 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
864 match, lineno, self.patterns.substitution_ref,
865 nodes.substitution_reference)
866 if len(inlines) == 1:
867 subref_node = inlines[0]
868 if isinstance(subref_node, nodes.substitution_reference):
869 subref_text = subref_node.astext()
870 self.document.note_substitution_ref(subref_node, subref_text)
871 if endstring[-1:] == '_':
872 reference_node = nodes.reference(
873 '|%s%s' % (subref_text, endstring), '')
874 if endstring[-2:] == '__':
875 reference_node['anonymous'] = 1
876 else:
877 reference_node['refname'] = normalize_name(subref_text)
878 self.document.note_refname(reference_node)
879 reference_node += subref_node
880 inlines = [reference_node]
881 return before, inlines, remaining, sysmessages
883 def footnote_reference(self, match, lineno):
885 Handles `nodes.footnote_reference` and `nodes.citation_reference`
886 elements.
888 label = match.group('footnotelabel')
889 refname = normalize_name(label)
890 string = match.string
891 before = string[:match.start('whole')]
892 remaining = string[match.end('whole'):]
893 if match.group('citationlabel'):
894 refnode = nodes.citation_reference('[%s]_' % label,
895 refname=refname)
896 refnode += nodes.Text(label)
897 self.document.note_citation_ref(refnode)
898 else:
899 refnode = nodes.footnote_reference('[%s]_' % label)
900 if refname[0] == '#':
901 refname = refname[1:]
902 refnode['auto'] = 1
903 self.document.note_autofootnote_ref(refnode)
904 elif refname == '*':
905 refname = ''
906 refnode['auto'] = '*'
907 self.document.note_symbol_footnote_ref(
908 refnode)
909 else:
910 refnode += nodes.Text(label)
911 if refname:
912 refnode['refname'] = refname
913 self.document.note_footnote_ref(refnode)
914 if utils.get_trim_footnote_ref_space(self.document.settings):
915 before = before.rstrip()
916 return (before, [refnode], remaining, [])
918 def reference(self, match, lineno, anonymous=False):
919 referencename = match.group('refname')
920 refname = normalize_name(referencename)
921 referencenode = nodes.reference(
922 referencename + match.group('refend'), referencename,
923 name=whitespace_normalize_name(referencename))
924 if anonymous:
925 referencenode['anonymous'] = 1
926 else:
927 referencenode['refname'] = refname
928 self.document.note_refname(referencenode)
929 string = match.string
930 matchstart = match.start('whole')
931 matchend = match.end('whole')
932 return (string[:matchstart], [referencenode], string[matchend:], [])
934 def anonymous_reference(self, match, lineno):
935 return self.reference(match, lineno, anonymous=1)
937 def standalone_uri(self, match, lineno):
938 if (not match.group('scheme')
939 or match.group('scheme').lower() in urischemes.schemes):
940 if match.group('email'):
941 addscheme = 'mailto:'
942 else:
943 addscheme = ''
944 text = match.group('whole')
945 unescaped = unescape(text, 0)
946 return [nodes.reference(unescape(text, 1), unescaped,
947 refuri=addscheme + unescaped)]
948 else: # not a valid scheme
949 raise MarkupMismatch
951 def pep_reference(self, match, lineno):
952 text = match.group(0)
953 if text.startswith('pep-'):
954 pepnum = int(match.group('pepnum1'))
955 elif text.startswith('PEP'):
956 pepnum = int(match.group('pepnum2'))
957 else:
958 raise MarkupMismatch
959 ref = (self.document.settings.pep_base_url
960 + self.document.settings.pep_file_url_template % pepnum)
961 unescaped = unescape(text, 0)
962 return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
964 rfc_url = 'rfc%d.html'
966 def rfc_reference(self, match, lineno):
967 text = match.group(0)
968 if text.startswith('RFC'):
969 rfcnum = int(match.group('rfcnum'))
970 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
971 else:
972 raise MarkupMismatch
973 unescaped = unescape(text, 0)
974 return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
976 def implicit_inline(self, text, lineno):
978 Check each of the patterns in `self.implicit_dispatch` for a match,
979 and dispatch to the stored method for the pattern. Recursively check
980 the text before and after the match. Return a list of `nodes.Text`
981 and inline element nodes.
983 if not text:
984 return []
985 for pattern, method in self.implicit_dispatch:
986 match = pattern.search(text)
987 if match:
988 try:
989 # Must recurse on strings before *and* after the match;
990 # there may be multiple patterns.
991 return (self.implicit_inline(text[:match.start()], lineno)
992 + method(match, lineno) +
993 self.implicit_inline(text[match.end():], lineno))
994 except MarkupMismatch:
995 pass
996 return [nodes.Text(unescape(text), rawsource=unescape(text, 1))]
998 dispatch = {'*': emphasis,
999 '**': strong,
1000 '`': interpreted_or_phrase_ref,
1001 '``': literal,
1002 '_`': inline_internal_target,
1003 ']_': footnote_reference,
1004 '|': substitution_reference,
1005 '_': reference,
1006 '__': anonymous_reference}
1009 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1010 return ord(s) - _zero
1012 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1013 return ord(s) - _zero
1015 def _lowerroman_to_int(s):
1016 return roman.fromRoman(s.upper())
1019 class Body(RSTState):
1022 Generic classifier of the first line of a block.
1025 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1026 """Padding character for East Asian double-width text."""
1028 enum = Struct()
1029 """Enumerated list parsing information."""
1031 enum.formatinfo = {
1032 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1033 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1034 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1035 enum.formats = enum.formatinfo.keys()
1036 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1037 'lowerroman', 'upperroman'] # ORDERED!
1038 enum.sequencepats = {'arabic': '[0-9]+',
1039 'loweralpha': '[a-z]',
1040 'upperalpha': '[A-Z]',
1041 'lowerroman': '[ivxlcdm]+',
1042 'upperroman': '[IVXLCDM]+',}
1043 enum.converters = {'arabic': int,
1044 'loweralpha': _loweralpha_to_int,
1045 'upperalpha': _upperalpha_to_int,
1046 'lowerroman': _lowerroman_to_int,
1047 'upperroman': roman.fromRoman}
1049 enum.sequenceregexps = {}
1050 for sequence in enum.sequences:
1051 enum.sequenceregexps[sequence] = re.compile(
1052 enum.sequencepats[sequence] + '$', re.UNICODE)
1054 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1055 """Matches the top (& bottom) of a full table)."""
1057 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1058 """Matches the top of a simple table."""
1060 simple_table_border_pat = re.compile('=+[ =]*$')
1061 """Matches the bottom & header bottom of a simple table."""
1063 pats = {}
1064 """Fragments of patterns used by transitions."""
1066 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1067 pats['alpha'] = '[a-zA-Z]'
1068 pats['alphanum'] = '[a-zA-Z0-9]'
1069 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1070 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1071 '|%(upperroman)s|#)' % enum.sequencepats)
1072 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1073 # @@@ Loosen up the pattern? Allow Unicode?
1074 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1075 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1076 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1077 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1079 for format in enum.formats:
1080 pats[format] = '(?P<%s>%s%s%s)' % (
1081 format, re.escape(enum.formatinfo[format].prefix),
1082 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1084 patterns = {
1085 'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
1086 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1087 'field_marker': r':(?![: ])([^:\\]|\\.)*(?<! ):( +|$)',
1088 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1089 'doctest': r'>>>( +|$)',
1090 'line_block': r'\|( +|$)',
1091 'grid_table_top': grid_table_top_pat,
1092 'simple_table_top': simple_table_top_pat,
1093 'explicit_markup': r'\.\.( +|$)',
1094 'anonymous': r'__( +|$)',
1095 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1096 'text': r''}
1097 initial_transitions = (
1098 'bullet',
1099 'enumerator',
1100 'field_marker',
1101 'option_marker',
1102 'doctest',
1103 'line_block',
1104 'grid_table_top',
1105 'simple_table_top',
1106 'explicit_markup',
1107 'anonymous',
1108 'line',
1109 'text')
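# For illustration, sample first lines and the Body transition whose pattern
# matches them (hypothetical inputs; transitions are tried in the order
# listed above):
#   "- item"         -> 'bullet'
#   "3. item"        -> 'enumerator'
#   ":author: Me"    -> 'field_marker'
#   "-v, --verbose"  -> 'option_marker'
#   ">>> 1 + 1"      -> 'doctest'
#   ".. note:: text" -> 'explicit_markup'
#   "----------"     -> 'line'
#   "anything else"  -> 'text' (the catch-all)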
1111 def indent(self, match, context, next_state):
1112 """Block quote."""
1113 indented, indent, line_offset, blank_finish = \
1114 self.state_machine.get_indented()
1115 elements = self.block_quote(indented, line_offset)
1116 self.parent += elements
1117 if not blank_finish:
1118 self.parent += self.unindent_warning('Block quote')
1119 return context, next_state, []
1121 def block_quote(self, indented, line_offset):
1122 elements = []
1123 while indented:
1124 (blockquote_lines,
1125 attribution_lines,
1126 attribution_offset,
1127 indented,
1128 new_line_offset) = self.split_attribution(indented, line_offset)
1129 blockquote = nodes.block_quote()
1130 self.nested_parse(blockquote_lines, line_offset, blockquote)
1131 elements.append(blockquote)
1132 if attribution_lines:
1133 attribution, messages = self.parse_attribution(
1134 attribution_lines, attribution_offset)
1135 blockquote += attribution
1136 elements += messages
1137 line_offset = new_line_offset
1138 while indented and not indented[0]:
1139 indented = indented[1:]
1140 line_offset += 1
1141 return elements
1143 # U+2014 is an em-dash:
1144 attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
1145 re.UNICODE)
1147 def split_attribution(self, indented, line_offset):
1149 Check for a block quote attribution and split it off:
1151 * First line after a blank line must begin with a dash ("--", "---",
1152 em-dash; matches `self.attribution_pattern`).
1153 * Every line after that must have consistent indentation.
1154 * Attributions must be preceded by block quote content.
1156 Return a tuple of: (block quote content lines, attribution lines,
1157 attribution offset, remaining indented lines, new line offset).
1159 blank = None
1160 nonblank_seen = False
1161 for i in range(len(indented)):
1162 line = indented[i].rstrip()
1163 if line:
1164 if nonblank_seen and blank == i - 1: # last line blank
1165 match = self.attribution_pattern.match(line)
1166 if match:
1167 attribution_end, indent = self.check_attribution(
1168 indented, i)
1169 if attribution_end:
1170 a_lines = indented[i:attribution_end]
1171 a_lines.trim_left(match.end(), end=1)
1172 a_lines.trim_left(indent, start=1)
1173 return (indented[:i], a_lines,
1174 i, indented[attribution_end:],
1175 line_offset + attribution_end)
1176 nonblank_seen = True
1177 else:
1178 blank = i
1179 else:
1180 return (indented, None, None, None, None)
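# For illustration (hypothetical input): given the indented block
#   A quotation.
#   <blank line>
#   -- Anonymous
# the line after the blank matches attribution_pattern, so the lines before
# the blank become the block quote content and ["Anonymous"] becomes the
# attribution (the "-- " prefix is trimmed via trim_left()).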
1182 def check_attribution(self, indented, attribution_start):
1184 Check attribution shape.
1185 Return the index past the end of the attribution, and the indent.
1187 indent = None
1188 i = attribution_start + 1
1189 for i in range(attribution_start + 1, len(indented)):
1190 line = indented[i].rstrip()
1191 if not line:
1192 break
1193 if indent is None:
1194 indent = len(line) - len(line.lstrip())
1195 elif len(line) - len(line.lstrip()) != indent:
1196 return None, None # bad shape; not an attribution
1197 else:
1198 # return index of line after last attribution line:
1199 i += 1
1200 return i, (indent or 0)
1202 def parse_attribution(self, indented, line_offset):
1203 text = '\n'.join(indented).rstrip()
1204 lineno = self.state_machine.abs_line_number() + line_offset
1205 textnodes, messages = self.inline_text(text, lineno)
1206 node = nodes.attribution(text, '', *textnodes)
1207 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1208 return node, messages
1210 def bullet(self, match, context, next_state):
1211 """Bullet list item."""
1212 bulletlist = nodes.bullet_list()
1213 self.parent += bulletlist
1214 bulletlist['bullet'] = match.string[0]
1215 i, blank_finish = self.list_item(match.end())
1216 bulletlist += i
1217 offset = self.state_machine.line_offset + 1 # next line
1218 new_line_offset, blank_finish = self.nested_list_parse(
1219 self.state_machine.input_lines[offset:],
1220 input_offset=self.state_machine.abs_line_offset() + 1,
1221 node=bulletlist, initial_state='BulletList',
1222 blank_finish=blank_finish)
1223 self.goto_line(new_line_offset)
1224 if not blank_finish:
1225 self.parent += self.unindent_warning('Bullet list')
1226 return [], next_state, []
1228 def list_item(self, indent):
1229 if self.state_machine.line[indent:]:
1230 indented, line_offset, blank_finish = (
1231 self.state_machine.get_known_indented(indent))
1232 else:
1233 indented, indent, line_offset, blank_finish = (
1234 self.state_machine.get_first_known_indented(indent))
1235 listitem = nodes.list_item('\n'.join(indented))
1236 if indented:
1237 self.nested_parse(indented, input_offset=line_offset,
1238 node=listitem)
1239 return listitem, blank_finish
1241 def enumerator(self, match, context, next_state):
1242 """Enumerated List Item"""
1243 format, sequence, text, ordinal = self.parse_enumerator(match)
1244 if not self.is_enumerated_list_item(ordinal, sequence, format):
1245 raise statemachine.TransitionCorrection('text')
1246 enumlist = nodes.enumerated_list()
1247 self.parent += enumlist
1248 if sequence == '#':
1249 enumlist['enumtype'] = 'arabic'
1250 else:
1251 enumlist['enumtype'] = sequence
1252 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1253 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1254 if ordinal != 1:
1255 enumlist['start'] = ordinal
1256 msg = self.reporter.info(
1257 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1258 % (text, ordinal))
1259 self.parent += msg
1260 listitem, blank_finish = self.list_item(match.end())
1261 enumlist += listitem
1262 offset = self.state_machine.line_offset + 1 # next line
1263 newline_offset, blank_finish = self.nested_list_parse(
1264 self.state_machine.input_lines[offset:],
1265 input_offset=self.state_machine.abs_line_offset() + 1,
1266 node=enumlist, initial_state='EnumeratedList',
1267 blank_finish=blank_finish,
1268 extra_settings={'lastordinal': ordinal,
1269 'format': format,
1270 'auto': sequence == '#'})
1271 self.goto_line(newline_offset)
1272 if not blank_finish:
1273 self.parent += self.unindent_warning('Enumerated list')
1274 return [], next_state, []
1276 def parse_enumerator(self, match, expected_sequence=None):
1278 Analyze an enumerator and return the results.
1280 :Return:
1281 - the enumerator format ('period', 'parens', or 'rparen'),
1282 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1283 - the text of the enumerator, stripped of formatting, and
1284 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1285 ``None`` is returned for invalid enumerator text).
1287 The enumerator format has already been determined by the regular
1288 expression match. If `expected_sequence` is given, that sequence is
1289 tried first. If not, we check for Roman numeral 1. This way,
1290 single-character Roman numerals (which are also alphabetical) can be
1291 matched. If no sequence has been matched, all sequences are checked in
1292 order.
1294 groupdict = match.groupdict()
1295 sequence = ''
1296 for format in self.enum.formats:
1297 if groupdict[format]: # was this the format matched?
1298 break # yes; keep `format`
1299 else: # shouldn't happen
1300 raise ParserError('enumerator format not matched')
1301 text = groupdict[format][self.enum.formatinfo[format].start
1302 :self.enum.formatinfo[format].end]
1303 if text == '#':
1304 sequence = '#'
1305 elif expected_sequence:
1306 try:
1307 if self.enum.sequenceregexps[expected_sequence].match(text):
1308 sequence = expected_sequence
1309 except KeyError: # shouldn't happen
1310 raise ParserError('unknown enumerator sequence: %s'
1311 % sequence)
1312 elif text == 'i':
1313 sequence = 'lowerroman'
1314 elif text == 'I':
1315 sequence = 'upperroman'
1316 if not sequence:
1317 for sequence in self.enum.sequences:
1318 if self.enum.sequenceregexps[sequence].match(text):
1319 break
1320 else: # shouldn't happen
1321 raise ParserError('enumerator sequence not matched')
1322 if sequence == '#':
1323 ordinal = 1
1324 else:
1325 try:
1326 ordinal = self.enum.converters[sequence](text)
1327 except roman.InvalidRomanNumeralError:
1328 ordinal = None
1329 return format, sequence, text, ordinal
1331 def is_enumerated_list_item(self, ordinal, sequence, format):
1333 Check validity based on the ordinal value and the second line.
1335 Return true if the ordinal is valid and the second line is blank,
1336 indented, or starts with the next enumerator or an auto-enumerator.
1338 if ordinal is None:
1339 return None
1340 try:
1341 next_line = self.state_machine.next_line()
1342 except EOFError: # end of input lines
1343 self.state_machine.previous_line()
1344 return 1
1345 else:
1346 self.state_machine.previous_line()
1347 if not next_line[:1].strip(): # blank or indented
1348 return 1
1349 result = self.make_enumerator(ordinal + 1, sequence, format)
1350 if result:
1351 next_enumerator, auto_enumerator = result
1352 try:
1353 if ( next_line.startswith(next_enumerator) or
1354 next_line.startswith(auto_enumerator) ):
1355 return 1
1356 except TypeError:
1357 pass
1358 return None
1360 def make_enumerator(self, ordinal, sequence, format):
1362 Construct and return the next enumerated list item marker, and an
1363 auto-enumerator ("#" instead of the regular enumerator).
1365 Return ``None`` for invalid (out of range) ordinals.
1366 """ #"
1367 if sequence == '#':
1368 enumerator = '#'
1369 elif sequence == 'arabic':
1370 enumerator = str(ordinal)
1371 else:
1372 if sequence.endswith('alpha'):
1373 if ordinal > 26:
1374 return None
1375 enumerator = chr(ordinal + ord('a') - 1)
1376 elif sequence.endswith('roman'):
1377 try:
1378 enumerator = roman.toRoman(ordinal)
1379 except roman.RomanError:
1380 return None
1381 else: # shouldn't happen
1382 raise ParserError('unknown enumerator sequence: "%s"'
1383 % sequence)
1384 if sequence.startswith('lower'):
1385 enumerator = enumerator.lower()
1386 elif sequence.startswith('upper'):
1387 enumerator = enumerator.upper()
1388 else: # shouldn't happen
1389 raise ParserError('unknown enumerator sequence: "%s"'
1390 % sequence)
1391 formatinfo = self.enum.formatinfo[format]
1392 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1393 + ' ')
1394 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1395 return next_enumerator, auto_enumerator
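# For illustration (hypothetical calls):
#   make_enumerator(3, 'loweralpha', 'parens')  -> ('(c) ', '(#) ')
#   make_enumerator(4, 'upperroman', 'period')  -> ('IV. ', '#. ')
#   make_enumerator(27, 'loweralpha', 'period') -> None  (out of range)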
1397 def field_marker(self, match, context, next_state):
1398 """Field list item."""
1399 field_list = nodes.field_list()
1400 self.parent += field_list
1401 field, blank_finish = self.field(match)
1402 field_list += field
1403 offset = self.state_machine.line_offset + 1 # next line
1404 newline_offset, blank_finish = self.nested_list_parse(
1405 self.state_machine.input_lines[offset:],
1406 input_offset=self.state_machine.abs_line_offset() + 1,
1407 node=field_list, initial_state='FieldList',
1408 blank_finish=blank_finish)
1409 self.goto_line(newline_offset)
1410 if not blank_finish:
1411 self.parent += self.unindent_warning('Field list')
1412 return [], next_state, []
1414 def field(self, match):
1415 name = self.parse_field_marker(match)
1416 src, srcline = self.state_machine.get_source_and_line()
1417 lineno = self.state_machine.abs_line_number()
1418 indented, indent, line_offset, blank_finish = \
1419 self.state_machine.get_first_known_indented(match.end())
1420 field_node = nodes.field()
1421 field_node.source = src
1422 field_node.line = srcline
1423 name_nodes, name_messages = self.inline_text(name, lineno)
1424 field_node += nodes.field_name(name, '', *name_nodes)
1425 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1426 field_node += field_body
1427 if indented:
1428 self.parse_field_body(indented, line_offset, field_body)
1429 return field_node, blank_finish
1431 def parse_field_marker(self, match):
1432 """Extract & return field name from a field marker match."""
1433 field = match.group()[1:] # strip off leading ':'
1434 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1435 return field
1437 def parse_field_body(self, indented, offset, node):
1438 self.nested_parse(indented, input_offset=offset, node=node)
1440 def option_marker(self, match, context, next_state):
1441 """Option list item."""
1442 optionlist = nodes.option_list()
1443 try:
1444 listitem, blank_finish = self.option_list_item(match)
1445 except MarkupError, error:
1446 # This shouldn't happen; pattern won't match.
1447 msg = self.reporter.error(u'Invalid option list marker: %s' %
1448 error)
1449 self.parent += msg
1450 indented, indent, line_offset, blank_finish = \
1451 self.state_machine.get_first_known_indented(match.end())
1452 elements = self.block_quote(indented, line_offset)
1453 self.parent += elements
1454 if not blank_finish:
1455 self.parent += self.unindent_warning('Option list')
1456 return [], next_state, []
1457 self.parent += optionlist
1458 optionlist += listitem
1459 offset = self.state_machine.line_offset + 1 # next line
1460 newline_offset, blank_finish = self.nested_list_parse(
1461 self.state_machine.input_lines[offset:],
1462 input_offset=self.state_machine.abs_line_offset() + 1,
1463 node=optionlist, initial_state='OptionList',
1464 blank_finish=blank_finish)
1465 self.goto_line(newline_offset)
1466 if not blank_finish:
1467 self.parent += self.unindent_warning('Option list')
1468 return [], next_state, []
1470 def option_list_item(self, match):
1471 offset = self.state_machine.abs_line_offset()
1472 options = self.parse_option_marker(match)
1473 indented, indent, line_offset, blank_finish = \
1474 self.state_machine.get_first_known_indented(match.end())
1475 if not indented: # not an option list item
1476 self.goto_line(offset)
1477 raise statemachine.TransitionCorrection('text')
1478 option_group = nodes.option_group('', *options)
1479 description = nodes.description('\n'.join(indented))
1480 option_list_item = nodes.option_list_item('', option_group,
1481 description)
1482 if indented:
1483 self.nested_parse(indented, input_offset=line_offset,
1484 node=description)
1485 return option_list_item, blank_finish
1487 def parse_option_marker(self, match):
1489 Return a list of `node.option` and `node.option_argument` objects,
1490 parsed from an option marker match.
1492 :Exception: `MarkupError` for invalid option markers.
1494 optlist = []
1495 optionstrings = match.group().rstrip().split(', ')
1496 for optionstring in optionstrings:
1497 tokens = optionstring.split()
1498 delimiter = ' '
1499 firstopt = tokens[0].split('=', 1)
1500 if len(firstopt) > 1:
1501 # "--opt=value" form
1502 tokens[:1] = firstopt
1503 delimiter = '='
1504 elif (len(tokens[0]) > 2
1505 and ((tokens[0].startswith('-')
1506 and not tokens[0].startswith('--'))
1507 or tokens[0].startswith('+'))):
1508 # "-ovalue" form
1509 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1510 delimiter = ''
1511 if len(tokens) > 1 and (tokens[1].startswith('<')
1512 and tokens[-1].endswith('>')):
1513 # "-o <value1 value2>" form; join all values into one token
1514 tokens[1:] = [' '.join(tokens[1:])]
1515 if 0 < len(tokens) <= 2:
1516 option = nodes.option(optionstring)
1517 option += nodes.option_string(tokens[0], tokens[0])
1518 if len(tokens) > 1:
1519 option += nodes.option_argument(tokens[1], tokens[1],
1520 delimiter=delimiter)
1521 optlist.append(option)
1522 else:
1523 raise MarkupError(
1524 'wrong number of option tokens (=%s), should be 1 or 2: '
1525 '"%s"' % (len(tokens), optionstring))
1526 return optlist
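# For illustration (hypothetical input): the marker "-o FILE, --output=FILE"
# yields two option nodes: the first with option_string "-o" and
# option_argument "FILE" (delimiter " "), the second with option_string
# "--output" and option_argument "FILE" (delimiter "=").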
1528 def doctest(self, match, context, next_state):
1529 data = '\n'.join(self.state_machine.get_text_block())
1530 self.parent += nodes.doctest_block(data, data)
1531 return [], next_state, []
1533 def line_block(self, match, context, next_state):
1534 """First line of a line block."""
1535 block = nodes.line_block()
1536 self.parent += block
1537 lineno = self.state_machine.abs_line_number()
1538 line, messages, blank_finish = self.line_block_line(match, lineno)
1539 block += line
1540 self.parent += messages
1541 if not blank_finish:
1542 offset = self.state_machine.line_offset + 1 # next line
1543 new_line_offset, blank_finish = self.nested_list_parse(
1544 self.state_machine.input_lines[offset:],
1545 input_offset=self.state_machine.abs_line_offset() + 1,
1546 node=block, initial_state='LineBlock',
1547 blank_finish=0)
1548 self.goto_line(new_line_offset)
1549 if not blank_finish:
1550 self.parent += self.reporter.warning(
1551 'Line block ends without a blank line.',
1552 line=lineno+1)
1553 if len(block):
1554 if block[0].indent is None:
1555 block[0].indent = 0
1556 self.nest_line_block_lines(block)
1557 return [], next_state, []
1559 def line_block_line(self, match, lineno):
1560 """Return one line element of a line_block."""
1561 indented, indent, line_offset, blank_finish = \
1562 self.state_machine.get_first_known_indented(match.end(),
1563 until_blank=True)
1564 text = u'\n'.join(indented)
1565 text_nodes, messages = self.inline_text(text, lineno)
1566 line = nodes.line(text, '', *text_nodes)
1567 if match.string.rstrip() != '|': # not empty
1568 line.indent = len(match.group(1)) - 1
1569 return line, messages, blank_finish
1571 def nest_line_block_lines(self, block):
1572 for index in range(1, len(block)):
1573 if block[index].indent is None:
1574 block[index].indent = block[index - 1].indent
1575 self.nest_line_block_segment(block)
1577 def nest_line_block_segment(self, block):
1578 indents = [item.indent for item in block]
1579 least = min(indents)
1580 new_items = []
1581 new_block = nodes.line_block()
1582 for item in block:
1583 if item.indent > least:
1584 new_block.append(item)
1585 else:
1586 if len(new_block):
1587 self.nest_line_block_segment(new_block)
1588 new_items.append(new_block)
1589 new_block = nodes.line_block()
1590 new_items.append(item)
1591 if len(new_block):
1592 self.nest_line_block_segment(new_block)
1593 new_items.append(new_block)
1594 block[:] = new_items
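# Illustrative sketch (not part of the original source): given the
# reStructuredText line block below, line_block_line() assigns indents
# 0, 2, 2, 0 and nest_line_block_segment() folds the two more-indented lines
# into a nested line_block inside the outer one:
#
#     | outer line
#     |   nested line
#     |   another nested line
#     | back at the outer level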
1596 def grid_table_top(self, match, context, next_state):
1597 """Top border of a full table."""
1598 return self.table_top(match, context, next_state,
1599 self.isolate_grid_table,
1600 tableparser.GridTableParser)
1602 def simple_table_top(self, match, context, next_state):
1603 """Top border of a simple table."""
1604 return self.table_top(match, context, next_state,
1605 self.isolate_simple_table,
1606 tableparser.SimpleTableParser)
1608 def table_top(self, match, context, next_state,
1609 isolate_function, parser_class):
1610 """Top border of a generic table."""
1611 nodelist, blank_finish = self.table(isolate_function, parser_class)
1612 self.parent += nodelist
1613 if not blank_finish:
1614 msg = self.reporter.warning(
1615 'Blank line required after table.',
1616 line=self.state_machine.abs_line_number()+1)
1617 self.parent += msg
1618 return [], next_state, []
1620 def table(self, isolate_function, parser_class):
1621 """Parse a table."""
1622 block, messages, blank_finish = isolate_function()
1623 if block:
1624 try:
1625 parser = parser_class()
1626 tabledata = parser.parse(block)
1627 tableline = (self.state_machine.abs_line_number() - len(block)
1628 + 1)
1629 table = self.build_table(tabledata, tableline)
1630 nodelist = [table] + messages
1631 except tableparser.TableMarkupError, err:
1632 nodelist = self.malformed_table(block, ' '.join(err.args),
1633 offset=err.offset) + messages
1634 else:
1635 nodelist = messages
1636 return nodelist, blank_finish
1638 def isolate_grid_table(self):
1639 messages = []
1640 blank_finish = 1
1641 try:
1642 block = self.state_machine.get_text_block(flush_left=True)
1643 except statemachine.UnexpectedIndentationError, err:
1644 block, src, srcline = err.args
1645 messages.append(self.reporter.error('Unexpected indentation.',
1646 source=src, line=srcline))
1647 blank_finish = 0
1648 block.disconnect()
1649 # for East Asian chars:
1650 block.pad_double_width(self.double_width_pad_char)
1651 width = len(block[0].strip())
1652 for i in range(len(block)):
1653 block[i] = block[i].strip()
1654 if block[i][0] not in '+|': # check left edge
1655 blank_finish = 0
1656 self.state_machine.previous_line(len(block) - i)
1657 del block[i:]
1658 break
1659 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1660 blank_finish = 0
1661 # from second-last to third line of table:
1662 for i in range(len(block) - 2, 1, -1):
1663 if self.grid_table_top_pat.match(block[i]):
1664 self.state_machine.previous_line(len(block) - i + 1)
1665 del block[i+1:]
1666 break
1667 else:
1668 messages.extend(self.malformed_table(block))
1669 return [], messages, blank_finish
1670 for i in range(len(block)): # check right edge
1671 if len(block[i]) != width or block[i][-1] not in '+|':
1672 messages.extend(self.malformed_table(block))
1673 return [], messages, blank_finish
1674 return block, messages, blank_finish
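# Illustrative sketch (not part of the original source): a text block that
# passes the checks above, i.e. every line starts and ends with '+' or '|',
# every line has the same width as the top border, and the bottom line
# matches grid_table_top_pat:
#
#     +------------+------------+
#     | header 1   | header 2   |
#     +============+============+
#     | body 1     | body 2     |
#     +------------+------------+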
1676 def isolate_simple_table(self):
1677 start = self.state_machine.line_offset
1678 lines = self.state_machine.input_lines
1679 limit = len(lines) - 1
1680 toplen = len(lines[start].strip())
1681 pattern_match = self.simple_table_border_pat.match
1682 found = 0
1683 found_at = None
1684 i = start + 1
1685 while i <= limit:
1686 line = lines[i]
1687 match = pattern_match(line)
1688 if match:
1689 if len(line.strip()) != toplen:
1690 self.state_machine.next_line(i - start)
1691 messages = self.malformed_table(
1692 lines[start:i+1], 'Bottom/header table border does '
1693 'not match top border.')
1694 return [], messages, i == limit or not lines[i+1].strip()
1695 found += 1
1696 found_at = i
1697 if found == 2 or i == limit or not lines[i+1].strip():
1698 end = i
1699 break
1700 i += 1
1701 else: # reached end of input_lines
1702 if found:
1703 extra = ' or no blank line after table bottom'
1704 self.state_machine.next_line(found_at - start)
1705 block = lines[start:found_at+1]
1706 else:
1707 extra = ''
1708 self.state_machine.next_line(i - start - 1)
1709 block = lines[start:]
1710 messages = self.malformed_table(
1711 block, 'No bottom table border found%s.' % extra)
1712 return [], messages, not extra
1713 self.state_machine.next_line(end - start)
1714 block = lines[start:end+1]
1715 # for East Asian chars:
1716 block.pad_double_width(self.double_width_pad_char)
1717 return block, [], end == limit or not lines[end+1].strip()
1719 def malformed_table(self, block, detail='', offset=0):
1720 block.replace(self.double_width_pad_char, '')
1721 data = '\n'.join(block)
1722 message = 'Malformed table.'
1723 startline = self.state_machine.abs_line_number() - len(block) + 1
1724 if detail:
1725 message += '\n' + detail
1726 error = self.reporter.error(message, nodes.literal_block(data, data),
1727 line=startline+offset)
1728 return [error]
1730 def build_table(self, tabledata, tableline, stub_columns=0):
1731 colwidths, headrows, bodyrows = tabledata
1732 table = nodes.table()
1733 tgroup = nodes.tgroup(cols=len(colwidths))
1734 table += tgroup
1735 for colwidth in colwidths:
1736 colspec = nodes.colspec(colwidth=colwidth)
1737 if stub_columns:
1738 colspec.attributes['stub'] = 1
1739 stub_columns -= 1
1740 tgroup += colspec
1741 if headrows:
1742 thead = nodes.thead()
1743 tgroup += thead
1744 for row in headrows:
1745 thead += self.build_table_row(row, tableline)
1746 tbody = nodes.tbody()
1747 tgroup += tbody
1748 for row in bodyrows:
1749 tbody += self.build_table_row(row, tableline)
1750 return table
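# Illustrative sketch (not part of the original source): the rough shape of
# the `tabledata` triple that the table parsers hand to build_table(), here
# for a simple two-column table with one header row.  Each cell is either
# None (covered by a span) or a
# (morerows, morecols, content line offset, cell text lines) tuple:
#
#     colwidths = [12, 12]
#     headrows  = [[(0, 0, 1, ['header 1']), (0, 0, 1, ['header 2'])]]
#     bodyrows  = [[(0, 0, 3, ['body 1']), (0, 0, 3, ['body 2'])]]
#     tabledata = (colwidths, headrows, bodyrows)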
1752 def build_table_row(self, rowdata, tableline):
1753 row = nodes.row()
1754 for cell in rowdata:
1755 if cell is None:
1756 continue
1757 morerows, morecols, offset, cellblock = cell
1758 attributes = {}
1759 if morerows:
1760 attributes['morerows'] = morerows
1761 if morecols:
1762 attributes['morecols'] = morecols
1763 entry = nodes.entry(**attributes)
1764 row += entry
1765 if ''.join(cellblock):
1766 self.nested_parse(cellblock, input_offset=tableline+offset,
1767 node=entry)
1768 return row
1771 explicit = Struct()
1772 """Patterns and constants used for explicit markup recognition."""
1774 explicit.patterns = Struct(
1775 target=re.compile(r"""
1776 (
1777 _ # anonymous target
1778 | # *OR*
1779 (?!_) # no underscore at the beginning
1780 (?P<quote>`?) # optional open quote
1781 (?![ `]) # first char. not space or
1782 # backquote
1783 (?P<name> # reference name
1784 .+?
1785 )
1786 %(non_whitespace_escape_before)s
1787 (?P=quote) # close quote if open quote used
1788 )
1789 (?<!(?<!\x00):) # no unescaped colon at end
1790 %(non_whitespace_escape_before)s
1791 [ ]? # optional space
1792 : # end of reference name
1793 ([ ]+|$) # followed by whitespace
1794 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1795 reference=re.compile(r"""
1796 (
1797 (?P<simple>%(simplename)s)_
1798 | # *OR*
1799 ` # open backquote
1800 (?![ ]) # not space
1801 (?P<phrase>.+?) # hyperlink phrase
1802 %(non_whitespace_escape_before)s
1803 `_ # close backquote,
1804 # reference mark
1805 )
1806 $ # end of string
1807 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1808 substitution=re.compile(r"""
1809 (
1810 (?![ ]) # first char. not space
1811 (?P<name>.+?) # substitution text
1812 %(non_whitespace_escape_before)s
1813 \| # close delimiter
1814 )
1815 ([ ]+|$) # followed by whitespace
1816 """ % vars(Inliner),
1817 re.VERBOSE | re.UNICODE),)
1819 def footnote(self, match):
1820 src, srcline = self.state_machine.get_source_and_line()
1821 indented, indent, offset, blank_finish = \
1822 self.state_machine.get_first_known_indented(match.end())
1823 label = match.group(1)
1824 name = normalize_name(label)
1825 footnote = nodes.footnote('\n'.join(indented))
1826 footnote.source = src
1827 footnote.line = srcline
1828 if name[0] == '#': # auto-numbered
1829 name = name[1:] # autonumber label
1830 footnote['auto'] = 1
1831 if name:
1832 footnote['names'].append(name)
1833 self.document.note_autofootnote(footnote)
1834 elif name == '*': # auto-symbol
1835 name = ''
1836 footnote['auto'] = '*'
1837 self.document.note_symbol_footnote(footnote)
1838 else: # manually numbered
1839 footnote += nodes.label('', label)
1840 footnote['names'].append(name)
1841 self.document.note_footnote(footnote)
1842 if name:
1843 self.document.note_explicit_target(footnote, footnote)
1844 else:
1845 self.document.set_id(footnote, footnote)
1846 if indented:
1847 self.nested_parse(indented, input_offset=offset, node=footnote)
1848 return [footnote], blank_finish
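# Illustrative sketch (not part of the original source): the four footnote
# label forms handled above, as written in reStructuredText source:
#
#     .. [1] manually numbered
#     .. [#] auto-numbered, unnamed
#     .. [#note] auto-numbered, named "note"
#     .. [*] auto-symbol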
1850 def citation(self, match):
1851 src, srcline = self.state_machine.get_source_and_line()
1852 indented, indent, offset, blank_finish = \
1853 self.state_machine.get_first_known_indented(match.end())
1854 label = match.group(1)
1855 name = normalize_name(label)
1856 citation = nodes.citation('\n'.join(indented))
1857 citation.source = src
1858 citation.line = srcline
1859 citation += nodes.label('', label)
1860 citation['names'].append(name)
1861 self.document.note_citation(citation)
1862 self.document.note_explicit_target(citation, citation)
1863 if indented:
1864 self.nested_parse(indented, input_offset=offset, node=citation)
1865 return [citation], blank_finish
1867 def hyperlink_target(self, match):
1868 pattern = self.explicit.patterns.target
1869 lineno = self.state_machine.abs_line_number()
1870 block, indent, offset, blank_finish = \
1871 self.state_machine.get_first_known_indented(
1872 match.end(), until_blank=True, strip_indent=False)
1873 blocktext = match.string[:match.end()] + '\n'.join(block)
1874 block = [escape2null(line) for line in block]
1875 escaped = block[0]
1876 blockindex = 0
1877 while True:
1878 targetmatch = pattern.match(escaped)
1879 if targetmatch:
1880 break
1881 blockindex += 1
1882 try:
1883 escaped += block[blockindex]
1884 except IndexError:
1885 raise MarkupError('malformed hyperlink target.')
1886 del block[:blockindex]
1887 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1888 target = self.make_target(block, blocktext, lineno,
1889 targetmatch.group('name'))
1890 return [target], blank_finish
1892 def make_target(self, block, block_text, lineno, target_name):
1893 target_type, data = self.parse_target(block, block_text, lineno)
1894 if target_type == 'refname':
1895 target = nodes.target(block_text, '', refname=normalize_name(data))
1896 target.indirect_reference_name = data
1897 self.add_target(target_name, '', target, lineno)
1898 self.document.note_indirect_target(target)
1899 return target
1900 elif target_type == 'refuri':
1901 target = nodes.target(block_text, '')
1902 self.add_target(target_name, data, target, lineno)
1903 return target
1904 else:
1905 return data
1907 def parse_target(self, block, block_text, lineno):
1909 Determine the type of reference of a target.
1911 :Return: A 2-tuple, one of:
1913 - 'refname' and the indirect reference name
1914 - 'refuri' and the URI
1915 - 'malformed' and a system_message node
1917 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1918 reference = ' '.join([line.strip() for line in block])
1919 refname = self.is_reference(reference)
1920 if refname:
1921 return 'refname', refname
1922 reference = ''.join([''.join(line.split()) for line in block])
1923 return 'refuri', unescape(reference)
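# Illustrative sketch (not part of the original source): how parse_target()
# classifies a target body.  A body ending in "_" is an indirect target and
# yields a reference name; anything else is treated as a URI, with internal
# whitespace removed so long URIs may be wrapped in the source:
#
#     ['other-target_']                      -> ('refname', 'other-target')
#     ['http://example.org/a', '/long/path'] -> ('refuri',
#                                                'http://example.org/a/long/path')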
1925 def is_reference(self, reference):
1926 match = self.explicit.patterns.reference.match(
1927 whitespace_normalize_name(reference))
1928 if not match:
1929 return None
1930 return unescape(match.group('simple') or match.group('phrase'))
1932 def add_target(self, targetname, refuri, target, lineno):
1933 target.line = lineno
1934 if targetname:
1935 name = normalize_name(unescape(targetname))
1936 target['names'].append(name)
1937 if refuri:
1938 uri = self.inliner.adjust_uri(refuri)
1939 if uri:
1940 target['refuri'] = uri
1941 else:
1942 raise ApplicationError('problem with URI: %r' % refuri)
1943 self.document.note_explicit_target(target, self.parent)
1944 else: # anonymous target
1945 if refuri:
1946 target['refuri'] = refuri
1947 target['anonymous'] = 1
1948 self.document.note_anonymous_target(target)
1950 def substitution_def(self, match):
1951 pattern = self.explicit.patterns.substitution
1952 src, srcline = self.state_machine.get_source_and_line()
1953 block, indent, offset, blank_finish = \
1954 self.state_machine.get_first_known_indented(match.end(),
1955 strip_indent=False)
1956 blocktext = (match.string[:match.end()] + '\n'.join(block))
1957 block.disconnect()
1958 escaped = escape2null(block[0].rstrip())
1959 blockindex = 0
1960 while True:
1961 subdefmatch = pattern.match(escaped)
1962 if subdefmatch:
1963 break
1964 blockindex += 1
1965 try:
1966 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
1967 except IndexError:
1968 raise MarkupError('malformed substitution definition.')
1969 del block[:blockindex] # strip out the substitution marker
1970 block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
1971 if not block[0]:
1972 del block[0]
1973 offset += 1
1974 while block and not block[-1].strip():
1975 block.pop()
1976 subname = subdefmatch.group('name')
1977 substitution_node = nodes.substitution_definition(blocktext)
1978 substitution_node.source = src
1979 substitution_node.line = srcline
1980 if not block:
1981 msg = self.reporter.warning(
1982 'Substitution definition "%s" missing contents.' % subname,
1983 nodes.literal_block(blocktext, blocktext),
1984 source=src, line=srcline)
1985 return [msg], blank_finish
1986 block[0] = block[0].strip()
1987 substitution_node['names'].append(
1988 nodes.whitespace_normalize_name(subname))
1989 new_abs_offset, blank_finish = self.nested_list_parse(
1990 block, input_offset=offset, node=substitution_node,
1991 initial_state='SubstitutionDef', blank_finish=blank_finish)
1992 i = 0
1993 for node in substitution_node[:]:
1994 if not (isinstance(node, nodes.Inline) or
1995 isinstance(node, nodes.Text)):
1996 self.parent += substitution_node[i]
1997 del substitution_node[i]
1998 else:
1999 i += 1
2000 for node in substitution_node.traverse(nodes.Element):
2001 if self.disallowed_inside_substitution_definitions(node):
2002 pformat = nodes.literal_block('', node.pformat().rstrip())
2003 msg = self.reporter.error(
2004 'Substitution definition contains illegal element:',
2005 pformat, nodes.literal_block(blocktext, blocktext),
2006 source=src, line=srcline)
2007 return [msg], blank_finish
2008 if len(substitution_node) == 0:
2009 msg = self.reporter.warning(
2010 'Substitution definition "%s" empty or invalid.' % subname,
2011 nodes.literal_block(blocktext, blocktext),
2012 source=src, line=srcline)
2013 return [msg], blank_finish
2014 self.document.note_substitution_def(
2015 substitution_node, subname, self.parent)
2016 return [substitution_node], blank_finish
2018 def disallowed_inside_substitution_definitions(self, node):
2019 if (node['ids'] or
2020 isinstance(node, nodes.reference) and node.get('anonymous') or
2021 isinstance(node, nodes.footnote_reference) and node.get('auto')):
2022 return 1
2023 else:
2024 return 0
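# Illustrative sketch (not part of the original source): a typical
# substitution definition as handled by substitution_def() above.  The
# "|logo|" marker is stripped and the remainder is parsed in the
# SubstitutionDef state; the substitution name is also passed to an embedded
# image directive as an "alt" option preset:
#
#     .. |logo| image:: logo.png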
2026 def directive(self, match, **option_presets):
2027 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2028 type_name = match.group(1)
2029 directive_class, messages = directives.directive(
2030 type_name, self.memo.language, self.document)
2031 self.parent += messages
2032 if directive_class:
2033 return self.run_directive(
2034 directive_class, match, type_name, option_presets)
2035 else:
2036 return self.unknown_directive(type_name)
2038 def run_directive(self, directive, match, type_name, option_presets):
2040 Parse a directive then run its directive function.
2042 Parameters:
2044 - `directive`: The class implementing the directive. Must be
2045 a subclass of `rst.Directive`.
2047 - `match`: A regular expression match object which matched the first
2048 line of the directive.
2050 - `type_name`: The directive name, as used in the source text.
2052 - `option_presets`: A dictionary of preset options, defaults for the
2053 directive options. Currently, only an "alt" option is passed by
2054 substitution definitions (value: the substitution name), which may
2055 be used by an embedded image directive.
2057 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2059 if isinstance(directive, (FunctionType, MethodType)):
2060 from docutils.parsers.rst import convert_directive_function
2061 directive = convert_directive_function(directive)
2062 lineno = self.state_machine.abs_line_number()
2063 initial_line_offset = self.state_machine.line_offset
2064 indented, indent, line_offset, blank_finish \
2065 = self.state_machine.get_first_known_indented(match.end(),
2066 strip_top=0)
2067 block_text = '\n'.join(self.state_machine.input_lines[
2068 initial_line_offset : self.state_machine.line_offset + 1])
2069 try:
2070 arguments, options, content, content_offset = (
2071 self.parse_directive_block(indented, line_offset,
2072 directive, option_presets))
2073 except MarkupError, detail:
2074 error = self.reporter.error(
2075 'Error in "%s" directive:\n%s.' % (type_name,
2076 ' '.join(detail.args)),
2077 nodes.literal_block(block_text, block_text), line=lineno)
2078 return [error], blank_finish
2079 directive_instance = directive(
2080 type_name, arguments, options, content, lineno,
2081 content_offset, block_text, self, self.state_machine)
2082 try:
2083 result = directive_instance.run()
2084 except docutils.parsers.rst.DirectiveError, error:
2085 msg_node = self.reporter.system_message(error.level, error.msg,
2086 line=lineno)
2087 msg_node += nodes.literal_block(block_text, block_text)
2088 result = [msg_node]
2089 assert isinstance(result, list), \
2090 'Directive "%s" must return a list of nodes.' % type_name
2091 for i in range(len(result)):
2092 assert isinstance(result[i], nodes.Node), \
2093 ('Directive "%s" returned non-Node object (index %s): %r'
2094 % (type_name, i, result[i]))
2095 return (result,
2096 blank_finish or self.state_machine.is_next_line_blank())
2098 def parse_directive_block(self, indented, line_offset, directive,
2099 option_presets):
2100 option_spec = directive.option_spec
2101 has_content = directive.has_content
2102 if indented and not indented[0].strip():
2103 indented.trim_start()
2104 line_offset += 1
2105 while indented and not indented[-1].strip():
2106 indented.trim_end()
2107 if indented and (directive.required_arguments
2108 or directive.optional_arguments
2109 or option_spec):
2110 for i, line in enumerate(indented):
2111 if not line.strip():
2112 break
2113 else:
2114 i += 1
2115 arg_block = indented[:i]
2116 content = indented[i+1:]
2117 content_offset = line_offset + i + 1
2118 else:
2119 content = indented
2120 content_offset = line_offset
2121 arg_block = []
2122 if option_spec:
2123 options, arg_block = self.parse_directive_options(
2124 option_presets, option_spec, arg_block)
2125 else:
2126 options = {}
2127 if arg_block and not (directive.required_arguments
2128 or directive.optional_arguments):
2129 content = arg_block + indented[i:]
2130 content_offset = line_offset
2131 arg_block = []
2132 while content and not content[0].strip():
2133 content.trim_start()
2134 content_offset += 1
2135 if directive.required_arguments or directive.optional_arguments:
2136 arguments = self.parse_directive_arguments(
2137 directive, arg_block)
2138 else:
2139 arguments = []
2140 if content and not has_content:
2141 raise MarkupError('no content permitted')
2142 return (arguments, options, content, content_offset)
2144 def parse_directive_options(self, option_presets, option_spec, arg_block):
2145 options = option_presets.copy()
2146 for i, line in enumerate(arg_block):
2147 if re.match(Body.patterns['field_marker'], line):
2148 opt_block = arg_block[i:]
2149 arg_block = arg_block[:i]
2150 break
2151 else:
2152 opt_block = []
2153 if opt_block:
2154 success, data = self.parse_extension_options(option_spec,
2155 opt_block)
2156 if success: # data is a dict of options
2157 options.update(data)
2158 else: # data is an error string
2159 raise MarkupError(data)
2160 return options, arg_block
2162 def parse_directive_arguments(self, directive, arg_block):
2163 required = directive.required_arguments
2164 optional = directive.optional_arguments
2165 arg_text = '\n'.join(arg_block)
2166 arguments = arg_text.split()
2167 if len(arguments) < required:
2168 raise MarkupError('%s argument(s) required, %s supplied'
2169 % (required, len(arguments)))
2170 elif len(arguments) > required + optional:
2171 if directive.final_argument_whitespace:
2172 arguments = arg_text.split(None, required + optional - 1)
2173 else:
2174 raise MarkupError(
2175 'maximum %s argument(s) allowed, %s supplied'
2176 % (required + optional, len(arguments)))
2177 return arguments
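# Illustrative sketch (not part of the original source): the argument
# splitting above for a hypothetical directive declaring
# required_arguments=1, optional_arguments=0 and
# final_argument_whitespace=True; surplus whitespace-separated words are
# folded into the final argument instead of raising MarkupError:
#
#     >>> 'A title with several words'.split(None, 1 + 0 - 1)
#     ['A title with several words']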
2179 def parse_extension_options(self, option_spec, datalines):
2181 Parse `datalines` for a field list containing extension options
2182 matching `option_spec`.
2184 :Parameters:
2185 - `option_spec`: a mapping of option name to conversion
2186 function, which should raise an exception on bad input.
2187 - `datalines`: a list of input strings.
2189 :Return:
2190 - Success value, 1 or 0.
2191 - An option dictionary on success, an error string on failure.
2193 node = nodes.field_list()
2194 newline_offset, blank_finish = self.nested_list_parse(
2195 datalines, 0, node, initial_state='ExtensionOptions',
2196 blank_finish=True)
2197 if newline_offset != len(datalines): # incomplete parse of block
2198 return 0, 'invalid option block'
2199 try:
2200 options = utils.extract_extension_options(node, option_spec)
2201 except KeyError, detail:
2202 return 0, ('unknown option: "%s"' % detail.args[0])
2203 except (ValueError, TypeError), detail:
2204 return 0, ('invalid option value: %s' % ' '.join(detail.args))
2205 except utils.ExtensionOptionError, detail:
2206 return 0, ('invalid option data: %s' % ' '.join(detail.args))
2207 if blank_finish:
2208 return 1, options
2209 else:
2210 return 0, 'option data incompletely parsed'
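# Illustrative sketch (not part of the original source): a hypothetical
# option_spec and the field-list input that parse_extension_options() would
# convert with it (the validators exist in docutils.parsers.rst.directives):
#
#     option_spec = {'width': directives.positive_int,
#                    'alt': directives.unchanged}
#
#     :width: 200
#     :alt: a short description
#
# would yield (1, {'width': 200, 'alt': 'a short description'}).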
2212 def unknown_directive(self, type_name):
2213 lineno = self.state_machine.abs_line_number()
2214 indented, indent, offset, blank_finish = \
2215 self.state_machine.get_first_known_indented(0, strip_indent=False)
2216 text = '\n'.join(indented)
2217 error = self.reporter.error(
2218 'Unknown directive type "%s".' % type_name,
2219 nodes.literal_block(text, text), line=lineno)
2220 return [error], blank_finish
2222 def comment(self, match):
2223 if not match.string[match.end():].strip() \
2224 and self.state_machine.is_next_line_blank(): # an empty comment?
2225 return [nodes.comment()], 1 # "A tiny but practical wart."
2226 indented, indent, offset, blank_finish = \
2227 self.state_machine.get_first_known_indented(match.end())
2228 while indented and not indented[-1].strip():
2229 indented.trim_end()
2230 text = '\n'.join(indented)
2231 return [nodes.comment(text, text)], blank_finish
2233 explicit.constructs = [
2234 (footnote,
2235 re.compile(r"""
2236 \.\.[ ]+ # explicit markup start
2237 \[
2238 ( # footnote label:
2239 [0-9]+ # manually numbered footnote
2240 | # *OR*
2241 \# # anonymous auto-numbered footnote
2242 | # *OR*
2243 \#%s # auto-numbered footnote with a label
2244 | # *OR*
2245 \* # auto-symbol footnote
2246 )
2247 \]
2248 ([ ]+|$) # whitespace or end of line
2249 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2250 (citation,
2251 re.compile(r"""
2252 \.\.[ ]+ # explicit markup start
2253 \[(%s)\] # citation label
2254 ([ ]+|$) # whitespace or end of line
2255 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2256 (hyperlink_target,
2257 re.compile(r"""
2258 \.\.[ ]+ # explicit markup start
2259 _ # target indicator
2260 (?![ ]|$) # first char. not space or EOL
2261 """, re.VERBOSE | re.UNICODE)),
2262 (substitution_def,
2263 re.compile(r"""
2264 \.\.[ ]+ # explicit markup start
2265 \| # substitution indicator
2266 (?![ ]|$) # first char. not space or EOL
2267 """, re.VERBOSE | re.UNICODE)),
2268 (directive,
2269 re.compile(r"""
2270 \.\.[ ]+ # explicit markup start
2271 (%s) # directive name
2272 [ ]? # optional space
2273 :: # directive delimiter
2274 ([ ]+|$) # whitespace or end of line
2275 """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
2277 def explicit_markup(self, match, context, next_state):
2278 """Footnotes, hyperlink targets, directives, comments."""
2279 nodelist, blank_finish = self.explicit_construct(match)
2280 self.parent += nodelist
2281 self.explicit_list(blank_finish)
2282 return [], next_state, []
2284 def explicit_construct(self, match):
2285 """Determine which explicit construct this is, parse & return it."""
2286 errors = []
2287 for method, pattern in self.explicit.constructs:
2288 expmatch = pattern.match(match.string)
2289 if expmatch:
2290 try:
2291 return method(self, expmatch)
2292 except MarkupError, error:
2293 lineno = self.state_machine.abs_line_number()
2294 message = ' '.join(error.args)
2295 errors.append(self.reporter.warning(message, line=lineno))
2296 break
2297 nodelist, blank_finish = self.comment(match)
2298 return nodelist + errors, blank_finish
2300 def explicit_list(self, blank_finish):
2302 Create a nested state machine for a series of explicit markup
2303 constructs (including anonymous hyperlink targets).
2305 offset = self.state_machine.line_offset + 1 # next line
2306 newline_offset, blank_finish = self.nested_list_parse(
2307 self.state_machine.input_lines[offset:],
2308 input_offset=self.state_machine.abs_line_offset() + 1,
2309 node=self.parent, initial_state='Explicit',
2310 blank_finish=blank_finish,
2311 match_titles=self.state_machine.match_titles)
2312 self.goto_line(newline_offset)
2313 if not blank_finish:
2314 self.parent += self.unindent_warning('Explicit markup')
2316 def anonymous(self, match, context, next_state):
2317 """Anonymous hyperlink targets."""
2318 nodelist, blank_finish = self.anonymous_target(match)
2319 self.parent += nodelist
2320 self.explicit_list(blank_finish)
2321 return [], next_state, []
2323 def anonymous_target(self, match):
2324 lineno = self.state_machine.abs_line_number()
2325 block, indent, offset, blank_finish \
2326 = self.state_machine.get_first_known_indented(match.end(),
2327 until_blank=True)
2328 blocktext = match.string[:match.end()] + '\n'.join(block)
2329 block = [escape2null(line) for line in block]
2330 target = self.make_target(block, blocktext, lineno, '')
2331 return [target], blank_finish
2333 def line(self, match, context, next_state):
2334 """Section title overline or transition marker."""
2335 if self.state_machine.match_titles:
2336 return [match.string], 'Line', []
2337 elif match.string.strip() == '::':
2338 raise statemachine.TransitionCorrection('text')
2339 elif len(match.string.strip()) < 4:
2340 msg = self.reporter.info(
2341 'Unexpected possible title overline or transition.\n'
2342 "Treating it as ordinary text because it's so short.",
2343 line=self.state_machine.abs_line_number())
2344 self.parent += msg
2345 raise statemachine.TransitionCorrection('text')
2346 else:
2347 blocktext = self.state_machine.line
2348 msg = self.reporter.severe(
2349 'Unexpected section title or transition.',
2350 nodes.literal_block(blocktext, blocktext),
2351 line=self.state_machine.abs_line_number())
2352 self.parent += msg
2353 return [], next_state, []
2355 def text(self, match, context, next_state):
2356 """Titles, definition lists, paragraphs."""
2357 return [match.string], 'Text', []
2360 class RFC2822Body(Body):
2363 RFC2822 headers are only valid as the first constructs in documents. As
2364 soon as anything else appears, the `Body` state should take over.
2367 patterns = Body.patterns.copy() # can't modify the original
2368 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2369 initial_transitions = [(name, 'Body')
2370 for name in Body.initial_transitions]
2371 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2373 def rfc2822(self, match, context, next_state):
2374 """RFC2822-style field list item."""
2375 fieldlist = nodes.field_list(classes=['rfc2822'])
2376 self.parent += fieldlist
2377 field, blank_finish = self.rfc2822_field(match)
2378 fieldlist += field
2379 offset = self.state_machine.line_offset + 1 # next line
2380 newline_offset, blank_finish = self.nested_list_parse(
2381 self.state_machine.input_lines[offset:],
2382 input_offset=self.state_machine.abs_line_offset() + 1,
2383 node=fieldlist, initial_state='RFC2822List',
2384 blank_finish=blank_finish)
2385 self.goto_line(newline_offset)
2386 if not blank_finish:
2387 self.parent += self.unindent_warning(
2388 'RFC2822-style field list')
2389 return [], next_state, []
2391 def rfc2822_field(self, match):
2392 name = match.string[:match.string.find(':')]
2393 indented, indent, line_offset, blank_finish = \
2394 self.state_machine.get_first_known_indented(match.end(),
2395 until_blank=True)
2396 fieldnode = nodes.field()
2397 fieldnode += nodes.field_name(name, name)
2398 fieldbody = nodes.field_body('\n'.join(indented))
2399 fieldnode += fieldbody
2400 if indented:
2401 self.nested_parse(indented, input_offset=line_offset,
2402 node=fieldbody)
2403 return fieldnode, blank_finish
2406 class SpecializedBody(Body):
2409 Superclass for second and subsequent compound element members. Compound
2410 elements are lists and list-like constructs.
2412 All transition methods are disabled (redefined as `invalid_input`).
2413 Override individual methods in subclasses to re-enable.
2415 For example, once an initial bullet list item, say, is recognized, the
2416 `BulletList` subclass takes over, with a "bullet_list" node as its
2417 container. Upon encountering the initial bullet list item, `Body.bullet`
2418 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2419 starts up a nested parsing session with `BulletList` as the initial state.
2420 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2421 as only bullet list items are encountered, they are parsed and inserted
2422 into the container. The first construct which is *not* a bullet list item
2423 triggers the `invalid_input` method, which ends the nested parse and
2424 closes the container. `BulletList` needs to recognize input that is
2425 invalid in the context of a bullet list, which means everything *other
2426 than* bullet list items, so it inherits the transition list created in
2427 `Body`.
2430 def invalid_input(self, match=None, context=None, next_state=None):
2431 """Not a compound element member. Abort this state machine."""
2432 self.state_machine.previous_line() # back up so parent SM can reassess
2433 raise EOFError
2435 indent = invalid_input
2436 bullet = invalid_input
2437 enumerator = invalid_input
2438 field_marker = invalid_input
2439 option_marker = invalid_input
2440 doctest = invalid_input
2441 line_block = invalid_input
2442 grid_table_top = invalid_input
2443 simple_table_top = invalid_input
2444 explicit_markup = invalid_input
2445 anonymous = invalid_input
2446 line = invalid_input
2447 text = invalid_input
2450 class BulletList(SpecializedBody):
2452 """Second and subsequent bullet_list list_items."""
2454 def bullet(self, match, context, next_state):
2455 """Bullet list item."""
2456 if match.string[0] != self.parent['bullet']:
2457 # different bullet: new list
2458 self.invalid_input()
2459 listitem, blank_finish = self.list_item(match.end())
2460 self.parent += listitem
2461 self.blank_finish = blank_finish
2462 return [], next_state, []
2465 class DefinitionList(SpecializedBody):
2467 """Second and subsequent definition_list_items."""
2469 def text(self, match, context, next_state):
2470 """Definition lists."""
2471 return [match.string], 'Definition', []
2474 class EnumeratedList(SpecializedBody):
2476 """Second and subsequent enumerated_list list_items."""
2478 def enumerator(self, match, context, next_state):
2479 """Enumerated list item."""
2480 format, sequence, text, ordinal = self.parse_enumerator(
2481 match, self.parent['enumtype'])
2482 if ( format != self.format
2483 or (sequence != '#' and (sequence != self.parent['enumtype']
2484 or self.auto
2485 or ordinal != (self.lastordinal + 1)))
2486 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2487 # different enumeration: new list
2488 self.invalid_input()
2489 if sequence == '#':
2490 self.auto = 1
2491 listitem, blank_finish = self.list_item(match.end())
2492 self.parent += listitem
2493 self.blank_finish = blank_finish
2494 self.lastordinal = ordinal
2495 return [], next_state, []
2498 class FieldList(SpecializedBody):
2500 """Second and subsequent field_list fields."""
2502 def field_marker(self, match, context, next_state):
2503 """Field list field."""
2504 field, blank_finish = self.field(match)
2505 self.parent += field
2506 self.blank_finish = blank_finish
2507 return [], next_state, []
2510 class OptionList(SpecializedBody):
2512 """Second and subsequent option_list option_list_items."""
2514 def option_marker(self, match, context, next_state):
2515 """Option list item."""
2516 try:
2517 option_list_item, blank_finish = self.option_list_item(match)
2518 except MarkupError:
2519 self.invalid_input()
2520 self.parent += option_list_item
2521 self.blank_finish = blank_finish
2522 return [], next_state, []
2525 class RFC2822List(SpecializedBody, RFC2822Body):
2527 """Second and subsequent RFC2822-style field_list fields."""
2529 patterns = RFC2822Body.patterns
2530 initial_transitions = RFC2822Body.initial_transitions
2532 def rfc2822(self, match, context, next_state):
2533 """RFC2822-style field list item."""
2534 field, blank_finish = self.rfc2822_field(match)
2535 self.parent += field
2536 self.blank_finish = blank_finish
2537 return [], 'RFC2822List', []
2539 blank = SpecializedBody.invalid_input
2542 class ExtensionOptions(FieldList):
2545 Parse field_list fields for extension options.
2547 No nested parsing is done (including inline markup parsing).
2550 def parse_field_body(self, indented, offset, node):
2551 """Override `Body.parse_field_body` for simpler parsing."""
2552 lines = []
2553 for line in list(indented) + ['']:
2554 if line.strip():
2555 lines.append(line)
2556 elif lines:
2557 text = '\n'.join(lines)
2558 node += nodes.paragraph(text, text)
2559 lines = []
2562 class LineBlock(SpecializedBody):
2564 """Second and subsequent lines of a line_block."""
2566 blank = SpecializedBody.invalid_input
2568 def line_block(self, match, context, next_state):
2569 """New line of line block."""
2570 lineno = self.state_machine.abs_line_number()
2571 line, messages, blank_finish = self.line_block_line(match, lineno)
2572 self.parent += line
2573 self.parent.parent += messages
2574 self.blank_finish = blank_finish
2575 return [], next_state, []
2578 class Explicit(SpecializedBody):
2580 """Second and subsequent explicit markup construct."""
2582 def explicit_markup(self, match, context, next_state):
2583 """Footnotes, hyperlink targets, directives, comments."""
2584 nodelist, blank_finish = self.explicit_construct(match)
2585 self.parent += nodelist
2586 self.blank_finish = blank_finish
2587 return [], next_state, []
2589 def anonymous(self, match, context, next_state):
2590 """Anonymous hyperlink targets."""
2591 nodelist, blank_finish = self.anonymous_target(match)
2592 self.parent += nodelist
2593 self.blank_finish = blank_finish
2594 return [], next_state, []
2596 blank = SpecializedBody.invalid_input
2599 class SubstitutionDef(Body):
2602 Parser for the contents of a substitution_definition element.
2605 patterns = {
2606 'embedded_directive': re.compile(r'(%s)::( +|$)'
2607 % Inliner.simplename, re.UNICODE),
2608 'text': r''}
2609 initial_transitions = ['embedded_directive', 'text']
2611 def embedded_directive(self, match, context, next_state):
2612 nodelist, blank_finish = self.directive(match,
2613 alt=self.parent['names'][0])
2614 self.parent += nodelist
2615 if not self.state_machine.at_eof():
2616 self.blank_finish = blank_finish
2617 raise EOFError
2619 def text(self, match, context, next_state):
2620 if not self.state_machine.at_eof():
2621 self.blank_finish = self.state_machine.is_next_line_blank()
2622 raise EOFError
2625 class Text(RSTState):
2628 Classifier of second line of a text block.
2630 Could be a paragraph, a definition list item, or a title.
2633 patterns = {'underline': Body.patterns['line'],
2634 'text': r''}
2635 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2637 def blank(self, match, context, next_state):
2638 """End of paragraph."""
2639 # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2640 paragraph, literalnext = self.paragraph(
2641 context, self.state_machine.abs_line_number() - 1)
2642 self.parent += paragraph
2643 if literalnext:
2644 self.parent += self.literal_block()
2645 return [], 'Body', []
2647 def eof(self, context):
2648 if context:
2649 self.blank(None, context, None)
2650 return []
2652 def indent(self, match, context, next_state):
2653 """Definition list item."""
2654 definitionlist = nodes.definition_list()
2655 definitionlistitem, blank_finish = self.definition_list_item(context)
2656 definitionlist += definitionlistitem
2657 self.parent += definitionlist
2658 offset = self.state_machine.line_offset + 1 # next line
2659 newline_offset, blank_finish = self.nested_list_parse(
2660 self.state_machine.input_lines[offset:],
2661 input_offset=self.state_machine.abs_line_offset() + 1,
2662 node=definitionlist, initial_state='DefinitionList',
2663 blank_finish=blank_finish, blank_finish_state='Definition')
2664 self.goto_line(newline_offset)
2665 if not blank_finish:
2666 self.parent += self.unindent_warning('Definition list')
2667 return [], 'Body', []
2669 def underline(self, match, context, next_state):
2670 """Section title."""
2671 lineno = self.state_machine.abs_line_number()
2672 title = context[0].rstrip()
2673 underline = match.string.rstrip()
2674 source = title + '\n' + underline
2675 messages = []
2676 if column_width(title) > len(underline):
2677 if len(underline) < 4:
2678 if self.state_machine.match_titles:
2679 msg = self.reporter.info(
2680 'Possible title underline, too short for the title.\n'
2681 "Treating it as ordinary text because it's so short.",
2682 line=lineno)
2683 self.parent += msg
2684 raise statemachine.TransitionCorrection('text')
2685 else:
2686 blocktext = context[0] + '\n' + self.state_machine.line
2687 msg = self.reporter.warning('Title underline too short.',
2688 nodes.literal_block(blocktext, blocktext), line=lineno)
2689 messages.append(msg)
2690 if not self.state_machine.match_titles:
2691 blocktext = context[0] + '\n' + self.state_machine.line
2692 # We need get_source_and_line() here to report correctly
2693 src, srcline = self.state_machine.get_source_and_line()
2694 # TODO: why is abs_line_number() == srcline+1
2695 # if the error is in a table (try with test_tables.py)?
2696 # print "get_source_and_line", srcline
2697 # print "abs_line_number", self.state_machine.abs_line_number()
2698 msg = self.reporter.severe('Unexpected section title.',
2699 nodes.literal_block(blocktext, blocktext),
2700 source=src, line=srcline)
2701 self.parent += messages
2702 self.parent += msg
2703 return [], next_state, []
2704 style = underline[0]
2705 context[:] = []
2706 self.section(title, source, style, lineno - 1, messages)
2707 return [], next_state, []
2709 def text(self, match, context, next_state):
2710 """Paragraph."""
2711 startline = self.state_machine.abs_line_number() - 1
2712 msg = None
2713 try:
2714 block = self.state_machine.get_text_block(flush_left=True)
2715 except statemachine.UnexpectedIndentationError, err:
2716 block, src, srcline = err.args
2717 msg = self.reporter.error('Unexpected indentation.',
2718 source=src, line=srcline)
2719 lines = context + list(block)
2720 paragraph, literalnext = self.paragraph(lines, startline)
2721 self.parent += paragraph
2722 self.parent += msg
2723 if literalnext:
2724 try:
2725 self.state_machine.next_line()
2726 except EOFError:
2727 pass
2728 self.parent += self.literal_block()
2729 return [], next_state, []
2731 def literal_block(self):
2732 """Return a list of nodes."""
2733 indented, indent, offset, blank_finish = \
2734 self.state_machine.get_indented()
2735 while indented and not indented[-1].strip():
2736 indented.trim_end()
2737 if not indented:
2738 return self.quoted_literal_block()
2739 data = '\n'.join(indented)
2740 literal_block = nodes.literal_block(data, data)
2741 literal_block.line = offset + 1
2742 nodelist = [literal_block]
2743 if not blank_finish:
2744 nodelist.append(self.unindent_warning('Literal block'))
2745 return nodelist
2747 def quoted_literal_block(self):
2748 abs_line_offset = self.state_machine.abs_line_offset()
2749 offset = self.state_machine.line_offset
2750 parent_node = nodes.Element()
2751 new_abs_offset = self.nested_parse(
2752 self.state_machine.input_lines[offset:],
2753 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2754 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2755 'initial_state': 'QuotedLiteralBlock'})
2756 self.goto_line(new_abs_offset)
2757 return parent_node.children
2759 def definition_list_item(self, termline):
2760 indented, indent, line_offset, blank_finish = \
2761 self.state_machine.get_indented()
2762 itemnode = nodes.definition_list_item(
2763 '\n'.join(termline + list(indented)))
2764 lineno = self.state_machine.abs_line_number() - 1
2765 (itemnode.source,
2766 itemnode.line) = self.state_machine.get_source_and_line(lineno)
2767 termlist, messages = self.term(termline, lineno)
2768 itemnode += termlist
2769 definition = nodes.definition('', *messages)
2770 itemnode += definition
2771 if termline[0][-2:] == '::':
2772 definition += self.reporter.info(
2773 'Blank line missing before literal block (after the "::")? '
2774 'Interpreted as a definition list item.',
2775 line=lineno+1)
2776 self.nested_parse(indented, input_offset=line_offset, node=definition)
2777 return itemnode, blank_finish
2779 classifier_delimiter = re.compile(' +: +')
2781 def term(self, lines, lineno):
2782 """Return a definition_list's term and optional classifiers."""
2783 assert len(lines) == 1
2784 text_nodes, messages = self.inline_text(lines[0], lineno)
2785 term_node = nodes.term()
2786 node_list = [term_node]
2787 for i in range(len(text_nodes)):
2788 node = text_nodes[i]
2789 if isinstance(node, nodes.Text):
2790 parts = self.classifier_delimiter.split(node.rawsource)
2791 if len(parts) == 1:
2792 node_list[-1] += node
2793 else:
2795 node_list[-1] += nodes.Text(parts[0].rstrip())
2796 for part in parts[1:]:
2797 classifier_node = nodes.classifier('', part)
2798 node_list.append(classifier_node)
2799 else:
2800 node_list[-1] += node
2801 return node_list, messages
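# Illustrative sketch (not part of the original source): the classifier
# delimiter is one or more spaces, a colon, and one or more spaces, so a
# definition-list term line like
#
#     term : classifier one : classifier two
#         Definition body.
#
# yields a term node "term" followed by two classifier nodes.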
2804 class SpecializedText(Text):
2807 Superclass for second and subsequent lines of Text-variants.
2809 All transition methods are disabled. Override individual methods in
2810 subclasses to re-enable.
2813 def eof(self, context):
2814 """Incomplete construct."""
2815 return []
2817 def invalid_input(self, match=None, context=None, next_state=None):
2818 """Not a compound element member. Abort this state machine."""
2819 raise EOFError
2821 blank = invalid_input
2822 indent = invalid_input
2823 underline = invalid_input
2824 text = invalid_input
2827 class Definition(SpecializedText):
2829 """Second line of potential definition_list_item."""
2831 def eof(self, context):
2832 """Not a definition."""
2833 self.state_machine.previous_line(2) # so parent SM can reassess
2834 return []
2836 def indent(self, match, context, next_state):
2837 """Definition list item."""
2838 itemnode, blank_finish = self.definition_list_item(context)
2839 self.parent += itemnode
2840 self.blank_finish = blank_finish
2841 return [], 'DefinitionList', []
2844 class Line(SpecializedText):
2847 Second line of over- & underlined section title or transition marker.
2850 eofcheck = 1 # @@@ ???
2851 """Set to 0 while parsing sections, so that we don't catch the EOF."""
2853 def eof(self, context):
2854 """Transition marker at end of section or document."""
2855 marker = context[0].strip()
2856 if self.memo.section_bubble_up_kludge:
2857 self.memo.section_bubble_up_kludge = False
2858 elif len(marker) < 4:
2859 self.state_correction(context)
2860 if self.eofcheck: # ignore EOFError with sections
2861 lineno = self.state_machine.abs_line_number() - 1
2862 transition = nodes.transition(rawsource=context[0])
2863 transition.line = lineno
2864 self.parent += transition
2865 self.eofcheck = 1
2866 return []
2868 def blank(self, match, context, next_state):
2869 """Transition marker."""
2870 src, srcline = self.state_machine.get_source_and_line()
2871 marker = context[0].strip()
2872 if len(marker) < 4:
2873 self.state_correction(context)
2874 transition = nodes.transition(rawsource=marker)
2875 transition.source = src
2876 transition.line = srcline - 1
2877 self.parent += transition
2878 return [], 'Body', []
2880 def text(self, match, context, next_state):
2881 """Potential over- & underlined title."""
2882 lineno = self.state_machine.abs_line_number() - 1
2883 overline = context[0]
2884 title = match.string
2885 underline = ''
2886 try:
2887 underline = self.state_machine.next_line()
2888 except EOFError:
2889 blocktext = overline + '\n' + title
2890 if len(overline.rstrip()) < 4:
2891 self.short_overline(context, blocktext, lineno, 2)
2892 else:
2893 msg = self.reporter.severe(
2894 'Incomplete section title.',
2895 nodes.literal_block(blocktext, blocktext),
2896 line=lineno)
2897 self.parent += msg
2898 return [], 'Body', []
2899 source = '%s\n%s\n%s' % (overline, title, underline)
2900 overline = overline.rstrip()
2901 underline = underline.rstrip()
2902 if not self.transitions['underline'][0].match(underline):
2903 blocktext = overline + '\n' + title + '\n' + underline
2904 if len(overline.rstrip()) < 4:
2905 self.short_overline(context, blocktext, lineno, 2)
2906 else:
2907 msg = self.reporter.severe(
2908 'Missing matching underline for section title overline.',
2909 nodes.literal_block(source, source),
2910 line=lineno)
2911 self.parent += msg
2912 return [], 'Body', []
2913 elif overline != underline:
2914 blocktext = overline + '\n' + title + '\n' + underline
2915 if len(overline.rstrip()) < 4:
2916 self.short_overline(context, blocktext, lineno, 2)
2917 else:
2918 msg = self.reporter.severe(
2919 'Title overline & underline mismatch.',
2920 nodes.literal_block(source, source),
2921 line=lineno)
2922 self.parent += msg
2923 return [], 'Body', []
2924 title = title.rstrip()
2925 messages = []
2926 if column_width(title) > len(overline):
2927 blocktext = overline + '\n' + title + '\n' + underline
2928 if len(overline.rstrip()) < 4:
2929 self.short_overline(context, blocktext, lineno, 2)
2930 else:
2931 msg = self.reporter.warning(
2932 'Title overline too short.',
2933 nodes.literal_block(source, source),
2934 line=lineno)
2935 messages.append(msg)
2936 style = (overline[0], underline[0])
2937 self.eofcheck = 0 # @@@ not sure this is correct
2938 self.section(title.lstrip(), source, style, lineno + 1, messages)
2939 self.eofcheck = 1
2940 return [], 'Body', []
2942 indent = text # indented title
2944 def underline(self, match, context, next_state):
2945 overline = context[0]
2946 blocktext = overline + '\n' + self.state_machine.line
2947 lineno = self.state_machine.abs_line_number() - 1
2948 if len(overline.rstrip()) < 4:
2949 self.short_overline(context, blocktext, lineno, 1)
2950 msg = self.reporter.error(
2951 'Invalid section title or transition marker.',
2952 nodes.literal_block(blocktext, blocktext),
2953 line=lineno)
2954 self.parent += msg
2955 return [], 'Body', []
2957 def short_overline(self, context, blocktext, lineno, lines=1):
2958 msg = self.reporter.info(
2959 'Possible incomplete section title.\nTreating the overline as '
2960 "ordinary text because it's so short.",
2961 line=lineno)
2962 self.parent += msg
2963 self.state_correction(context, lines)
2965 def state_correction(self, context, lines=1):
2966 self.state_machine.previous_line(lines)
2967 context[:] = []
2968 raise statemachine.StateCorrection('Body', 'text')
2971 class QuotedLiteralBlock(RSTState):
2974 Nested parse handler for quoted (unindented) literal blocks.
2976 Special-purpose. Not for inclusion in `state_classes`.
2979 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
2980 'text': r''}
2981 initial_transitions = ('initial_quoted', 'text')
2983 def __init__(self, state_machine, debug=False):
2984 RSTState.__init__(self, state_machine, debug)
2985 self.messages = []
2986 self.initial_lineno = None
2988 def blank(self, match, context, next_state):
2989 if context:
2990 raise EOFError
2991 else:
2992 return context, next_state, []
2994 def eof(self, context):
2995 if context:
2996 src, srcline = self.state_machine.get_source_and_line(
2997 self.initial_lineno)
2998 text = '\n'.join(context)
2999 literal_block = nodes.literal_block(text, text)
3000 literal_block.source = src
3001 literal_block.line = srcline
3002 self.parent += literal_block
3003 else:
3004 self.parent += self.reporter.warning(
3005 'Literal block expected; none found.',
3006 line=self.state_machine.abs_line_number())
3007 # src not available, because statemachine.input_lines is empty
3008 self.state_machine.previous_line()
3009 self.parent += self.messages
3010 return []
3012 def indent(self, match, context, next_state):
3013 assert context, ('QuotedLiteralBlock.indent: context should not '
3014 'be empty!')
3015 self.messages.append(
3016 self.reporter.error('Unexpected indentation.',
3017 line=self.state_machine.abs_line_number()))
3018 self.state_machine.previous_line()
3019 raise EOFError
3021 def initial_quoted(self, match, context, next_state):
3022 """Match arbitrary quote character on the first line only."""
3023 self.remove_transition('initial_quoted')
3024 quote = match.string[0]
3025 pattern = re.compile(re.escape(quote), re.UNICODE)
3026 # New transition matches consistent quotes only:
3027 self.add_transition('quoted',
3028 (pattern, self.quoted, self.__class__.__name__))
3029 self.initial_lineno = self.state_machine.abs_line_number()
3030 return [match.string], next_state, []
3032 def quoted(self, match, context, next_state):
3033 """Match consistent quotes on subsequent lines."""
3034 context.append(match.string)
3035 return context, next_state, []
3037 def text(self, match, context, next_state):
3038 if context:
3039 self.messages.append(
3040 self.reporter.error('Inconsistent literal block quoting.',
3041 line=self.state_machine.abs_line_number()))
3042 self.state_machine.previous_line()
3043 raise EOFError
3046 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3047 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3048 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3049 """Standard set of State classes used to start `RSTStateMachine`."""