docutils/parsers/rst/states.py

   1 # $Id$
   2 # Author: David Goodger <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 This is the ``docutils.parsers.rst.states`` module, the core of
   7 the reStructuredText parser.  It defines the following:
   8
   9 :Classes:
  10     - `RSTStateMachine`: reStructuredText parser's entry point.
  11     - `NestedStateMachine`: recursive StateMachine.
  12     - `RSTState`: reStructuredText State superclass.
  13     - `Inliner`: For parsing inline markup.
  14     - `Body`: Generic classifier of the first line of a block.
  15     - `SpecializedBody`: Superclass for compound element members.
  16     - `BulletList`: Second and subsequent bullet_list list_items
  17     - `DefinitionList`: Second+ definition_list_items.
  18     - `EnumeratedList`: Second+ enumerated_list list_items.
  19     - `FieldList`: Second+ fields.
  20     - `OptionList`: Second+ option_list_items.
  21     - `RFC2822List`: Second+ RFC2822-style fields.
  22     - `ExtensionOptions`: Parses directive option fields.
  23     - `Explicit`: Second+ explicit markup constructs.
  24     - `SubstitutionDef`: For embedded directives in substitution definitions.
  25     - `Text`: Classifier of second line of a text block.
  26     - `SpecializedText`: Superclass for continuation lines of Text-variants.
  27     - `Definition`: Second line of potential definition_list_item.
  28     - `Line`: Second line of overlined section title or transition marker.
  29     - `Struct`: An auxiliary collection class.
  30
  31 :Exception classes:
  32     - `MarkupError`
  33     - `ParserError`
  34     - `MarkupMismatch`
  35
  36 :Functions:
  37     - `escape2null()`: Return a string, escape-backslashes converted to nulls.
  38     - `unescape()`: Return a string, nulls removed or restored to backslashes.
  39
  40 :Attributes:
  41     - `state_classes`: set of State classes used with `RSTStateMachine`.
  42
  43 Parser Overview
  44 ===============
  45
  46 The reStructuredText parser is implemented as a recursive state machine,
  47 examining its input one line at a time.  To understand how the parser works,
  48 please first become familiar with the `docutils.statemachine` module.  In the
  49 description below, references are made to classes defined in this module;
  50 please see the individual classes for details.
  51
  52 Parsing proceeds as follows:
  53
  54 1. The state machine examines each line of input, checking each of the
  55    transition patterns of the state `Body`, in order, looking for a match.
  56    The implicit transitions (blank lines and indentation) are checked before
  57    any others.  The 'text' transition is a catch-all (matches anything).
  58
  59 2. The method associated with the matched transition pattern is called.
  60
  61    A. Some transition methods are self-contained, appending elements to the
  62       document tree (`Body.doctest` parses a doctest block).  The parser's
  63       current line index is advanced to the end of the element, and parsing
  64       continues with step 1.
  65
  66    B. Other transition methods trigger the creation of a nested state machine,
  67       whose job is to parse a compound construct ('indent' does a block quote,
  68       'bullet' does a bullet list, 'overline' does a section [first checking
  69       for a valid section header], etc.).
  70
  71       - In the case of lists and explicit markup, a one-off state machine is
  72         created and run to parse contents of the first item.
  73
  74       - A new state machine is created and its initial state is set to the
  75         appropriate specialized state (`BulletList` in the case of the
  76         'bullet' transition; see `SpecializedBody` for more detail).  This
  77         state machine is run to parse the compound element (or series of
  78         explicit markup elements), and returns as soon as a non-member element
  79         is encountered.  For example, the `BulletList` state machine ends as
  80         soon as it encounters an element which is not a list item of that
  81         bullet list.  The optional omission of inter-element blank lines is
  82         enabled by this nested state machine.
  83
  84       - The current line index is advanced to the end of the elements parsed,
  85         and parsing continues with step 1.
  86
  87    C. The result of the 'text' transition depends on the next line of text.
  88       The current state is changed to `Text`, under which the second line is
  89       examined.  If the second line is:
  90
  91       - Indented: The element is a definition list item, and parsing proceeds
  92         similarly to step 2.B, using the `DefinitionList` state.
  93
  94       - A line of uniform punctuation characters: The element is a section
  95         header; again, parsing proceeds as in step 2.B, and `Body` is still
  96         used.
  97
  98       - Anything else: The element is a paragraph, which is examined for
  99         inline markup and appended to the parent element.  Processing
 100         continues with step 1.
 101 """
 102
 103 __docformat__ = 'reStructuredText'
 104
 105
 106 import sys
 107 import re
 108 try:
 109     import roman
 110 except ImportError:
 111     import docutils.utils.roman as roman
 112 from types import FunctionType, MethodType
 113
 114 from docutils import nodes, statemachine, utils, urischemes
 115 from docutils import ApplicationError, DataError
 116 from docutils.statemachine import StateMachineWS, StateWS
 117 from docutils.nodes import fully_normalize_name as normalize_name
 118 from docutils.nodes import whitespace_normalize_name
 119 import docutils.parsers.rst
 120 from docutils.parsers.rst import directives, languages, tableparser, roles
 121 from docutils.parsers.rst.languages import en as _fallback_language_module
 122 from docutils.utils import escape2null, unescape, column_width
 123 from docutils.utils import punctuation_chars
 124
 125 class MarkupError(DataError): pass
 126 class UnknownInterpretedRoleError(DataError): pass
 127 class InterpretedRoleNotImplementedError(DataError): pass
 128 class ParserError(ApplicationError): pass
 129 class MarkupMismatch(Exception): pass
 130
 131
 132 class Struct:
 133
 134     """Stores data attributes for dotted-attribute access."""
 135
 136     def __init__(self, **keywordargs):
 137         self.__dict__.update(keywordargs)
 138
 139
 140 class RSTStateMachine(StateMachineWS):
 141
 142     """
 143     reStructuredText's master StateMachine.
 144
 145     The entry point to reStructuredText parsing is the `run()` method.
 146     """
 147
 148     def run(self, input_lines, document, input_offset=0, match_titles=1,
 149             inliner=None):
 150         """
 151         Parse `input_lines` and modify the `document` node in place.
 152
 153         Extend `StateMachineWS.run()`: set up parse-global data and
 154         run the StateMachine.
 155         """
 156         self.language = languages.get_language(
 157             document.settings.language_code)
 158         self.match_titles = match_titles
 159         if inliner is None:
 160             inliner = Inliner()
 161         inliner.init_customizations(document.settings)
 162         self.memo = Struct(document=document,
 163                            reporter=document.reporter,
 164                            language=self.language,
 165                            title_styles=[],
 166                            section_level=0,
 167                            section_bubble_up_kludge=0,
 168                            inliner=inliner)
 169         self.document = document
 170         self.attach_observer(document.note_source)
 171         self.reporter = self.memo.reporter
 172         self.node = document
 173         results = StateMachineWS.run(self, input_lines, input_offset,
 174                                      input_source=document['source'])
 175         assert results == [], 'RSTStateMachine.run() results should be empty!'
 176         self.node = self.memo = None    # remove unneeded references
 177
 178
 179 class NestedStateMachine(StateMachineWS):
 180
 181     """
 182     StateMachine run from within other StateMachine runs, to parse nested
 183     document structures.
 184     """
 185
 186     def run(self, input_lines, input_offset, memo, node, match_titles=1):
 187         """
 188         Parse `input_lines` and populate a `docutils.nodes.document` instance.
 189
 190         Extend `StateMachineWS.run()`: set up document-wide data.
 191         """
 192         self.match_titles = match_titles
 193         self.memo = memo
 194         self.document = memo.document
 195         self.attach_observer(self.document.note_source)
 196         self.reporter = memo.reporter
 197         self.language = memo.language
 198         self.node = node
 199         results = StateMachineWS.run(self, input_lines, input_offset)
 200         assert results == [], ('NestedStateMachine.run() results should be '
 201                                'empty!')
 202         return results
 203
 204
 205 class RSTState(StateWS):
 206
 207     """
 208     reStructuredText State superclass.
 209
 210     Contains methods used by all State subclasses.
 211     """
 212
 213     nested_sm = NestedStateMachine
 214     nested_sm_cache = []
 215
 216     def __init__(self, state_machine, debug=0):
 217         self.nested_sm_kwargs = {'state_classes': state_classes,
 218                                  'initial_state': 'Body'}
 219         StateWS.__init__(self, state_machine, debug)
 220
 221     def runtime_init(self):
 222         StateWS.runtime_init(self)
 223         memo = self.state_machine.memo
 224         self.memo = memo
 225         self.reporter = memo.reporter
 226         self.inliner = memo.inliner
 227         self.document = memo.document
 228         self.parent = self.state_machine.node
 229         # enable the reporter to determine source and source-line
 230         if not hasattr(self.reporter, 'get_source_and_line'):
 231             self.reporter.get_source_and_line = self.state_machine.get_source_and_line
 232             # print "adding get_source_and_line to reporter", self.state_machine.input_offset
 233
 234
 235     def goto_line(self, abs_line_offset):
 236         """
 237         Jump to input line `abs_line_offset`, ignoring jumps past the end.
 238         """
 239         try:
 240             self.state_machine.goto_line(abs_line_offset)
 241         except EOFError:
 242             pass
 243
 244     def no_match(self, context, transitions):
 245         """
 246         Override `StateWS.no_match` to generate a system message.
 247
 248         This code should never be run.
 249         """
 250         src, srcline = self.state_machine.get_source_and_line()
 251         self.reporter.severe(
 252             'Internal error: no transition pattern match.  State: "%s"; '
 253             'transitions: %s; context: %s; current line: %r.'
 254             % (self.__class__.__name__, transitions, context,
 255                self.state_machine.line),
 256             source=src, line=srcline)
 257         return context, None, []
 258
 259     def bof(self, context):
 260         """Called at beginning of file."""
 261         return [], []
 262
 263     def nested_parse(self, block, input_offset, node, match_titles=0,
 264                      state_machine_class=None, state_machine_kwargs=None):
 265         """
 266         Create a new StateMachine rooted at `node` and run it over the input
 267         `block`.
 268         """
 269         use_default = 0
 270         if state_machine_class is None:
 271             state_machine_class = self.nested_sm
 272             use_default += 1
 273         if state_machine_kwargs is None:
 274             state_machine_kwargs = self.nested_sm_kwargs
 275             use_default += 1
 276         block_length = len(block)
 277
 278         state_machine = None
 279         if use_default == 2:
 280             try:
 281                 state_machine = self.nested_sm_cache.pop()
 282             except IndexError:
 283                 pass
 284         if not state_machine:
 285             state_machine = state_machine_class(debug=self.debug,
 286                                                 **state_machine_kwargs)
 287         state_machine.run(block, input_offset, memo=self.memo,
 288                           node=node, match_titles=match_titles)
 289         if use_default == 2:
 290             self.nested_sm_cache.append(state_machine)
 291         else:
 292             state_machine.unlink()
 293         new_offset = state_machine.abs_line_offset()
 294         # No `block.parent` implies disconnected -- lines aren't in sync:
 295         if block.parent and (len(block) - block_length) != 0:
 296             # Adjustment for block if modified in nested parse:
 297             self.state_machine.next_line(len(block) - block_length)
 298         return new_offset
 299
 300     def nested_list_parse(self, block, input_offset, node, initial_state,
 301                           blank_finish,
 302                           blank_finish_state=None,
 303                           extra_settings={},
 304                           match_titles=0,
 305                           state_machine_class=None,
 306                           state_machine_kwargs=None):
 307         """
 308         Create a new StateMachine rooted at `node` and run it over the input
 309         `block`. Also keep track of optional intermediate blank lines and the
 310         required final one.
 311         """
 312         if state_machine_class is None:
 313             state_machine_class = self.nested_sm
 314         if state_machine_kwargs is None:
 315             state_machine_kwargs = self.nested_sm_kwargs.copy()
 316         state_machine_kwargs['initial_state'] = initial_state
 317         state_machine = state_machine_class(debug=self.debug,
 318                                             **state_machine_kwargs)
 319         if blank_finish_state is None:
 320             blank_finish_state = initial_state
 321         state_machine.states[blank_finish_state].blank_finish = blank_finish
 322         for key, value in extra_settings.items():
 323             setattr(state_machine.states[initial_state], key, value)
 324         state_machine.run(block, input_offset, memo=self.memo,
 325                           node=node, match_titles=match_titles)
 326         blank_finish = state_machine.states[blank_finish_state].blank_finish
 327         state_machine.unlink()
 328         return state_machine.abs_line_offset(), blank_finish
 329
 330     def section(self, title, source, style, lineno, messages):
 331         """Check for a valid subsection and create one if it checks out."""
 332         if self.check_subsection(source, style, lineno):
 333             self.new_subsection(title, lineno, messages)
 334
 335     def check_subsection(self, source, style, lineno):
 336         """
 337         Check for a valid subsection header.  Return 1 (true) or None (false).
 338
 339         When a new section is reached that isn't a subsection of the current
 340         section, back up the line count (use ``previous_line(-x)``), then
 341         ``raise EOFError``.  The current StateMachine will finish, then the
 342         calling StateMachine can re-examine the title.  This will work its way
 343         back up the calling chain until the correct section level isreached.
 344
 345         @@@ Alternative: Evaluate the title, store the title info & level, and
 346         back up the chain until that level is reached.  Store in memo? Or
 347         return in results?
 348
 349         :Exception: `EOFError` when a sibling or supersection encountered.
 350         """
 351         memo = self.memo
 352         title_styles = memo.title_styles
 353         mylevel = memo.section_level
 354         try:                            # check for existing title style
 355             level = title_styles.index(style) + 1
 356         except ValueError:              # new title style
 357             if len(title_styles) == memo.section_level: # new subsection
 358                 title_styles.append(style)
 359                 return 1
 360             else:                       # not at lowest level
 361                 self.parent += self.title_inconsistent(source, lineno)
 362                 return None
 363         if level <= mylevel:            # sibling or supersection
 364             memo.section_level = level   # bubble up to parent section
 365             if len(style) == 2:
 366                 memo.section_bubble_up_kludge = 1
 367             # back up 2 lines for underline title, 3 for overline title
 368             self.state_machine.previous_line(len(style) + 1)
 369             raise EOFError              # let parent section re-evaluate
 370         if level == mylevel + 1:        # immediate subsection
 371             return 1
 372         else:                           # invalid subsection
 373             self.parent += self.title_inconsistent(source, lineno)
 374             return None
 375
 376     def title_inconsistent(self, sourcetext, lineno):
 377         src, srcline = self.state_machine.get_source_and_line(lineno)
 378         error = self.reporter.severe(
 379             'Title level inconsistent:', nodes.literal_block('', sourcetext),
 380             source=src, line=srcline)
 381         return error
 382
 383     def new_subsection(self, title, lineno, messages):
 384         """Append new subsection to document tree. On return, check level."""
 385         memo = self.memo
 386         mylevel = memo.section_level
 387         memo.section_level += 1
 388         section_node = nodes.section()
 389         self.parent += section_node
 390         textnodes, title_messages = self.inline_text(title, lineno)
 391         titlenode = nodes.title(title, '', *textnodes)
 392         name = normalize_name(titlenode.astext())
 393         section_node['names'].append(name)
 394         section_node += titlenode
 395         section_node += messages
 396         section_node += title_messages
 397         self.document.note_implicit_target(section_node, section_node)
 398         offset = self.state_machine.line_offset + 1
 399         absoffset = self.state_machine.abs_line_offset() + 1
 400         newabsoffset = self.nested_parse(
 401               self.state_machine.input_lines[offset:], input_offset=absoffset,
 402               node=section_node, match_titles=1)
 403         self.goto_line(newabsoffset)
 404         if memo.section_level <= mylevel: # can't handle next section?
 405             raise EOFError              # bubble up to supersection
 406         # reset section_level; next pass will detect it properly
 407         memo.section_level = mylevel
 408
 409     def paragraph(self, lines, lineno):
 410         """
 411         Return a list (paragraph & messages) & a boolean: literal_block next?
 412         """
 413         data = '\n'.join(lines).rstrip()
 414         if re.search(r'(?<!\\)(\\\\)*::$', data):
 415             if len(data) == 2:
 416                 return [], 1
 417             elif data[-3] in ' \n':
 418                 text = data[:-3].rstrip()
 419             else:
 420                 text = data[:-1]
 421             literalnext = 1
 422         else:
 423             text = data
 424             literalnext = 0
 425         textnodes, messages = self.inline_text(text, lineno)
 426         p = nodes.paragraph(data, '', *textnodes)
 427         p.source, p.line = self.state_machine.get_source_and_line(lineno)
 428         return [p] + messages, literalnext
 429
 430     def inline_text(self, text, lineno):
 431         """
 432         Return 2 lists: nodes (text and inline elements), and system_messages.
 433         """
 434         return self.inliner.parse(text, lineno, self.memo, self.parent)
 435
 436     def unindent_warning(self, node_name):
 437         # the actual problem is one line below the current line
 438         src, srcline = self.state_machine.get_source_and_line()
 439         return self.reporter.warning('%s ends without a blank line; '
 440                                      'unexpected unindent.' % node_name,
 441                                      source=src, line=srcline+1)
 442
 443
 444 def build_regexp(definition, compile=1):
 445     """
 446     Build, compile and return a regular expression based on `definition`.
 447
 448     :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
 449         where "parts" is a list of regular expressions and/or regular
 450         expression definitions to be joined into an or-group.
 451     """
 452     name, prefix, suffix, parts = definition
 453     part_strings = []
 454     for part in parts:
 455         if type(part) is tuple:
 456             part_strings.append(build_regexp(part, None))
 457         else:
 458             part_strings.append(part)
 459     or_group = '|'.join(part_strings)
 460     regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
 461     if compile:
 462         return re.compile(regexp, re.UNICODE)
 463     else:
 464         return regexp
 465
 466
 467 class Inliner:
 468
 469     """
 470     Parse inline markup; call the `parse()` method.
 471     """
 472
 473     def __init__(self):
 474         self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),]
 475         """List of (pattern, bound method) tuples, used by
 476         `self.implicit_inline`."""
 477
 478     def init_customizations(self, settings):
 479         """Setting-based customizations; run when parsing begins."""
 480         if settings.pep_references:
 481             self.implicit_dispatch.append((self.patterns.pep,
 482                                            self.pep_reference))
 483         if settings.rfc_references:
 484             self.implicit_dispatch.append((self.patterns.rfc,
 485                                            self.rfc_reference))
 486
 487     def parse(self, text, lineno, memo, parent):
 488         # Needs to be refactored for nested inline markup.
 489         # Add nested_parse() method?
 490         """
 491         Return 2 lists: nodes (text and inline elements), and system_messages.
 492
 493         Using `self.patterns.initial`, a pattern which matches start-strings
 494         (emphasis, strong, interpreted, phrase reference, literal,
 495         substitution reference, and inline target) and complete constructs
 496         (simple reference, footnote reference), search for a candidate.  When
 497         one is found, check for validity (e.g., not a quoted '*' character).
 498         If valid, search for the corresponding end string if applicable, and
 499         check it for validity.  If not found or invalid, generate a warning
 500         and ignore the start-string.  Implicit inline markup (e.g. standalone
 501         URIs) is found last.
 502         """
 503         self.reporter = memo.reporter
 504         self.document = memo.document
 505         self.language = memo.language
 506         self.parent = parent
 507         pattern_search = self.patterns.initial.search
 508         dispatch = self.dispatch
 509         remaining = escape2null(text)
 510         processed = []
 511         unprocessed = []
 512         messages = []
 513         while remaining:
 514             match = pattern_search(remaining)
 515             if match:
 516                 groups = match.groupdict()
 517                 method = dispatch[groups['start'] or groups['backquote']
 518                                   or groups['refend'] or groups['fnend']]
 519                 before, inlines, remaining, sysmessages = method(self, match,
 520                                                                  lineno)
 521                 unprocessed.append(before)
 522                 messages += sysmessages
 523                 if inlines:
 524                     processed += self.implicit_inline(''.join(unprocessed),
 525                                                       lineno)
 526                     processed += inlines
 527                     unprocessed = []
 528             else:
 529                 break
 530         remaining = ''.join(unprocessed) + remaining
 531         if remaining:
 532             processed += self.implicit_inline(remaining, lineno)
 533         return processed, messages
 534
 535     # Inline object recognition
 536     # -------------------------
 537     # lookahead and look-behind expressions for inline markup rules
 538     start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
 539                            (punctuation_chars.openers,
 540                             punctuation_chars.delimiters))
 541     end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
 542                          (punctuation_chars.closing_delimiters,
 543                           punctuation_chars.delimiters,
 544                           punctuation_chars.closers))
 545     # print start_string_prefix.encode('utf8')
 546     # TODO: support non-ASCII whitespace in the following 4 patterns?
 547     non_whitespace_before = r'(?<![ \n])'
 548     non_whitespace_escape_before = r'(?<![ \n\x00])'
 549     non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[ \n\x00])'
 550     non_whitespace_after = r'(?![ \n])'
 551     # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
 552     simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
 553     # Valid URI characters (see RFC 2396 & RFC 2732);
 554     # final \x00 allows backslash escapes in URIs:
 555     uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
 556     # Delimiter indicating the end of a URI (not part of the URI):
 557     uri_end_delim = r"""[>]"""
 558     # Last URI character; same as uric but no punctuation:
 559     urilast = r"""[_~*/=+a-zA-Z0-9]"""
 560     # End of a URI (either 'urilast' or 'uric followed by a
 561     # uri_end_delim'):
 562     uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
 563     emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
 564     email_pattern = r"""
 565           %(emailc)s+(?:\.%(emailc)s+)*   # name
 566           (?<!\x00)@                      # at
 567           %(emailc)s+(?:\.%(emailc)s*)*   # host
 568           %(uri_end)s                     # final URI char
 569           """
 570     parts = ('initial_inline', start_string_prefix, '',
 571              [('start', '', non_whitespace_after,  # simple start-strings
 572                [r'\*\*',                # strong
 573                 r'\*(?!\*)',            # emphasis but not strong
 574                 r'``',                  # literal
 575                 r'_`',                  # inline internal target
 576                 r'\|(?!\|)']            # substitution reference
 577                ),
 578               ('whole', '', end_string_suffix, # whole constructs
 579                [# reference name & end-string
 580                 r'(?P<refname>%s)(?P<refend>__?)' % simplename,
 581                 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
 582                  [r'[0-9]+',               # manually numbered
 583                   r'\#(%s)?' % simplename, # auto-numbered (w/ label?)
 584                   r'\*',                   # auto-symbol
 585                   r'(?P<citationlabel>%s)' % simplename] # citation reference
 586                  )
 587                 ]
 588                ),
 589               ('backquote',             # interpreted text or phrase reference
 590                '(?P<role>(:%s:)?)' % simplename, # optional role
 591                non_whitespace_after,
 592                ['`(?!`)']               # but not literal
 593                )
 594               ]
 595              )
 596     patterns = Struct(
 597           initial=build_regexp(parts),
 598           emphasis=re.compile(non_whitespace_escape_before
 599                               + r'(\*)' + end_string_suffix, re.UNICODE),
 600           strong=re.compile(non_whitespace_escape_before
 601                             + r'(\*\*)' + end_string_suffix, re.UNICODE),
 602           interpreted_or_phrase_ref=re.compile(
 603               r"""
 604               %(non_unescaped_whitespace_escape_before)s
 605               (
 606                 `
 607                 (?P<suffix>
 608                   (?P<role>:%(simplename)s:)?
 609                   (?P<refend>__?)?
 610                 )
 611               )
 612               %(end_string_suffix)s
 613               """ % locals(), re.VERBOSE | re.UNICODE),
 614           embedded_uri=re.compile(
 615               r"""
 616               (
 617                 (?:[ \n]+|^)            # spaces or beginning of line/string
 618                 <                       # open bracket
 619                 %(non_whitespace_after)s
 620                 ([^<>\x00]+)            # anything but angle brackets & nulls
 621                 %(non_whitespace_before)s
 622                 >                       # close bracket w/o whitespace before
 623               )
 624               $                         # end of string
 625               """ % locals(), re.VERBOSE | re.UNICODE),
 626           literal=re.compile(non_whitespace_before + '(``)'
 627                              + end_string_suffix),
 628           target=re.compile(non_whitespace_escape_before
 629                             + r'(`)' + end_string_suffix),
 630           substitution_ref=re.compile(non_whitespace_escape_before
 631                                       + r'(\|_{0,2})'
 632                                       + end_string_suffix),
 633           email=re.compile(email_pattern % locals() + '$',
 634                            re.VERBOSE | re.UNICODE),
 635           uri=re.compile(
 636                 (r"""
 637                 %(start_string_prefix)s
 638                 (?P<whole>
 639                   (?P<absolute>           # absolute URI
 640                     (?P<scheme>             # scheme (http, ftp, mailto)
 641                       [a-zA-Z][a-zA-Z0-9.+-]*
 642                     )
 643                     :
 644                     (
 645                       (                       # either:
 646                         (//?)?                  # hierarchical URI
 647                         %(uric)s*               # URI characters
 648                         %(uri_end)s             # final URI char
 649                       )
 650                       (                       # optional query
 651                         \?%(uric)s*
 652                         %(uri_end)s
 653                       )?
 654                       (                       # optional fragment
 655                         \#%(uric)s*
 656                         %(uri_end)s
 657                       )?
 658                     )
 659                   )
 660                 |                       # *OR*
 661                   (?P<email>              # email address
 662                     """ + email_pattern + r"""
 663                   )
 664                 )
 665                 %(end_string_suffix)s
 666                 """) % locals(), re.VERBOSE | re.UNICODE),
 667           pep=re.compile(
 668                 r"""
 669                 %(start_string_prefix)s
 670                 (
 671                   (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
 672                 |
 673                   (PEP\s+(?P<pepnum2>\d+))      # reference by name
 674                 )
 675                 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE),
 676           rfc=re.compile(
 677                 r"""
 678                 %(start_string_prefix)s
 679                 (RFC(-|\s+)?(?P<rfcnum>\d+))
 680                 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE))
 681
 682     def quoted_start(self, match):
 683         """Test if inline markup start-string is 'quoted'.
 684
 685         'Quoted' in this context means the start-string is enclosed in a pair
 686         of matching opening/closing delimiters (not necessarily quotes)
 687         or at the end of the match.
 688         """
 689         string = match.string
 690         start = match.start()
 691         if start == 0:                  # start-string at beginning of text
 692             return False
 693         prestart = string[start - 1]
 694         try:
 695             poststart = string[match.end()]
 696         except IndexError:          # start-string at end of text
 697             return True  # not "quoted" but no markup start-string either
 698         return punctuation_chars.match_chars(prestart, poststart)
 699
 700     def inline_obj(self, match, lineno, end_pattern, nodeclass,
 701                    restore_backslashes=0):
 702         string = match.string
 703         matchstart = match.start('start')
 704         matchend = match.end('start')
 705         if self.quoted_start(match):
 706             return (string[:matchend], [], string[matchend:], [], '')
 707         endmatch = end_pattern.search(string[matchend:])
 708         if endmatch and endmatch.start(1):  # 1 or more chars
 709             text = unescape(endmatch.string[:endmatch.start(1)],
 710                             restore_backslashes)
 711             textend = matchend + endmatch.end(1)
 712             rawsource = unescape(string[matchstart:textend], 1)
 713             return (string[:matchstart], [nodeclass(rawsource, text)],
 714                     string[textend:], [], endmatch.group(1))
 715         msg = self.reporter.warning(
 716               'Inline %s start-string without end-string.'
 717               % nodeclass.__name__, line=lineno)
 718         text = unescape(string[matchstart:matchend], 1)
 719         rawsource = unescape(string[matchstart:matchend], 1)
 720         prb = self.problematic(text, rawsource, msg)
 721         return string[:matchstart], [prb], string[matchend:], [msg], ''
 722
 723     def problematic(self, text, rawsource, message):
 724         msgid = self.document.set_id(message, self.parent)
 725         problematic = nodes.problematic(rawsource, text, refid=msgid)
 726         prbid = self.document.set_id(problematic)
 727         message.add_backref(prbid)
 728         return problematic
 729
 730     def emphasis(self, match, lineno):
 731         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 732               match, lineno, self.patterns.emphasis, nodes.emphasis)
 733         return before, inlines, remaining, sysmessages
 734
 735     def strong(self, match, lineno):
 736         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 737               match, lineno, self.patterns.strong, nodes.strong)
 738         return before, inlines, remaining, sysmessages
 739
 740     def interpreted_or_phrase_ref(self, match, lineno):
 741         end_pattern = self.patterns.interpreted_or_phrase_ref
 742         string = match.string
 743         matchstart = match.start('backquote')
 744         matchend = match.end('backquote')
 745         rolestart = match.start('role')
 746         role = match.group('role')
 747         position = ''
 748         if role:
 749             role = role[1:-1]
 750             position = 'prefix'
 751         elif self.quoted_start(match):
 752             return (string[:matchend], [], string[matchend:], [])
 753         endmatch = end_pattern.search(string[matchend:])
 754         if endmatch and endmatch.start(1):  # 1 or more chars
 755             textend = matchend + endmatch.end()
 756             if endmatch.group('role'):
 757                 if role:
 758                     msg = self.reporter.warning(
 759                         'Multiple roles in interpreted text (both '
 760                         'prefix and suffix present; only one allowed).',
 761                         line=lineno)
 762                     text = unescape(string[rolestart:textend], 1)
 763                     prb = self.problematic(text, text, msg)
 764                     return string[:rolestart], [prb], string[textend:], [msg]
 765                 role = endmatch.group('suffix')[1:-1]
 766                 position = 'suffix'
 767             escaped = endmatch.string[:endmatch.start(1)]
 768             rawsource = unescape(string[matchstart:textend], 1)
 769             if rawsource[-1:] == '_':
 770                 if role:
 771                     msg = self.reporter.warning(
 772                           'Mismatch: both interpreted text role %s and '
 773                           'reference suffix.' % position, line=lineno)
 774                     text = unescape(string[rolestart:textend], 1)
 775                     prb = self.problematic(text, text, msg)
 776                     return string[:rolestart], [prb], string[textend:], [msg]
 777                 return self.phrase_ref(string[:matchstart], string[textend:],
 778                                        rawsource, escaped, unescape(escaped))
 779             else:
 780                 rawsource = unescape(string[rolestart:textend], 1)
 781                 nodelist, messages = self.interpreted(rawsource, escaped, role,
 782                                                       lineno)
 783                 return (string[:rolestart], nodelist,
 784                         string[textend:], messages)
 785         msg = self.reporter.warning(
 786               'Inline interpreted text or phrase reference start-string '
 787               'without end-string.', line=lineno)
 788         text = unescape(string[matchstart:matchend], 1)
 789         prb = self.problematic(text, text, msg)
 790         return string[:matchstart], [prb], string[matchend:], [msg]
 791
 792     def phrase_ref(self, before, after, rawsource, escaped, text):
 793         match = self.patterns.embedded_uri.search(escaped)
 794         if match:
 795             text = unescape(escaped[:match.start(0)])
 796             uri_text = match.group(2)
 797             uri = ''.join(uri_text.split())
 798             uri = self.adjust_uri(uri)
 799             if uri:
 800                 target = nodes.target(match.group(1), refuri=uri)
 801             else:
 802                 raise ApplicationError('problem with URI: %r' % uri_text)
 803             if not text:
 804                 text = uri
 805         else:
 806             target = None
 807         refname = normalize_name(text)
 808         reference = nodes.reference(rawsource, text,
 809                                     name=whitespace_normalize_name(text))
 810         node_list = [reference]
 811         if rawsource[-2:] == '__':
 812             if target:
 813                 reference['refuri'] = uri
 814             else:
 815                 reference['anonymous'] = 1
 816         else:
 817             if target:
 818                 reference['refuri'] = uri
 819                 target['names'].append(refname)
 820                 self.document.note_explicit_target(target, self.parent)
 821                 node_list.append(target)
 822             else:
 823                 reference['refname'] = refname
 824                 self.document.note_refname(reference)
 825         return before, node_list, after, []
 826
 827     def adjust_uri(self, uri):
 828         match = self.patterns.email.match(uri)
 829         if match:
 830             return 'mailto:' + uri
 831         else:
 832             return uri
 833
 834     def interpreted(self, rawsource, text, role, lineno):
 835         role_fn, messages = roles.role(role, self.language, lineno,
 836                                        self.reporter)
 837         if role_fn:
 838             nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
 839             return nodes, messages + messages2
 840         else:
 841             msg = self.reporter.error(
 842                 'Unknown interpreted text role "%s".' % role,
 843                 line=lineno)
 844             return ([self.problematic(rawsource, rawsource, msg)],
 845                     messages + [msg])
 846
 847     def literal(self, match, lineno):
 848         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 849               match, lineno, self.patterns.literal, nodes.literal,
 850               restore_backslashes=1)
 851         return before, inlines, remaining, sysmessages
 852
 853     def inline_internal_target(self, match, lineno):
 854         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 855               match, lineno, self.patterns.target, nodes.target)
 856         if inlines and isinstance(inlines[0], nodes.target):
 857             assert len(inlines) == 1
 858             target = inlines[0]
 859             name = normalize_name(target.astext())
 860             target['names'].append(name)
 861             self.document.note_explicit_target(target, self.parent)
 862         return before, inlines, remaining, sysmessages
 863
 864     def substitution_reference(self, match, lineno):
 865         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 866               match, lineno, self.patterns.substitution_ref,
 867               nodes.substitution_reference)
 868         if len(inlines) == 1:
 869             subref_node = inlines[0]
 870             if isinstance(subref_node, nodes.substitution_reference):
 871                 subref_text = subref_node.astext()
 872                 self.document.note_substitution_ref(subref_node, subref_text)
 873                 if endstring[-1:] == '_':
 874                     reference_node = nodes.reference(
 875                         '|%s%s' % (subref_text, endstring), '')
 876                     if endstring[-2:] == '__':
 877                         reference_node['anonymous'] = 1
 878                     else:
 879                         reference_node['refname'] = normalize_name(subref_text)
 880                         self.document.note_refname(reference_node)
 881                     reference_node += subref_node
 882                     inlines = [reference_node]
 883         return before, inlines, remaining, sysmessages
 884
 885     def footnote_reference(self, match, lineno):
 886         """
 887         Handles `nodes.footnote_reference` and `nodes.citation_reference`
 888         elements.
 889         """
 890         label = match.group('footnotelabel')
 891         refname = normalize_name(label)
 892         string = match.string
 893         before = string[:match.start('whole')]
 894         remaining = string[match.end('whole'):]
 895         if match.group('citationlabel'):
 896             refnode = nodes.citation_reference('[%s]_' % label,
 897                                                refname=refname)
 898             refnode += nodes.Text(label)
 899             self.document.note_citation_ref(refnode)
 900         else:
 901             refnode = nodes.footnote_reference('[%s]_' % label)
 902             if refname[0] == '#':
 903                 refname = refname[1:]
 904                 refnode['auto'] = 1
 905                 self.document.note_autofootnote_ref(refnode)
 906             elif refname == '*':
 907                 refname = ''
 908                 refnode['auto'] = '*'
 909                 self.document.note_symbol_footnote_ref(
 910                       refnode)
 911             else:
 912                 refnode += nodes.Text(label)
 913             if refname:
 914                 refnode['refname'] = refname
 915                 self.document.note_footnote_ref(refnode)
 916             if utils.get_trim_footnote_ref_space(self.document.settings):
 917                 before = before.rstrip()
 918         return (before, [refnode], remaining, [])
 919
 920     def reference(self, match, lineno, anonymous=None):
 921         referencename = match.group('refname')
 922         refname = normalize_name(referencename)
 923         referencenode = nodes.reference(
 924             referencename + match.group('refend'), referencename,
 925             name=whitespace_normalize_name(referencename))
 926         if anonymous:
 927             referencenode['anonymous'] = 1
 928         else:
 929             referencenode['refname'] = refname
 930             self.document.note_refname(referencenode)
 931         string = match.string
 932         matchstart = match.start('whole')
 933         matchend = match.end('whole')
 934         return (string[:matchstart], [referencenode], string[matchend:], [])
 935
 936     def anonymous_reference(self, match, lineno):
 937         return self.reference(match, lineno, anonymous=1)
 938
 939     def standalone_uri(self, match, lineno):
 940         if (not match.group('scheme')
 941                 or match.group('scheme').lower() in urischemes.schemes):
 942             if match.group('email'):
 943                 addscheme = 'mailto:'
 944             else:
 945                 addscheme = ''
 946             text = match.group('whole')
 947             unescaped = unescape(text, 0)
 948             return [nodes.reference(unescape(text, 1), unescaped,
 949                                     refuri=addscheme + unescaped)]
 950         else:                   # not a valid scheme
 951             raise MarkupMismatch
 952
 953     def pep_reference(self, match, lineno):
 954         text = match.group(0)
 955         if text.startswith('pep-'):
 956             pepnum = int(match.group('pepnum1'))
 957         elif text.startswith('PEP'):
 958             pepnum = int(match.group('pepnum2'))
 959         else:
 960             raise MarkupMismatch
 961         ref = (self.document.settings.pep_base_url
 962                + self.document.settings.pep_file_url_template % pepnum)
 963         unescaped = unescape(text, 0)
 964         return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
 965
 966     rfc_url = 'rfc%d.html'
 967
 968     def rfc_reference(self, match, lineno):
 969         text = match.group(0)
 970         if text.startswith('RFC'):
 971             rfcnum = int(match.group('rfcnum'))
 972             ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
 973         else:
 974             raise MarkupMismatch
 975         unescaped = unescape(text, 0)
 976         return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
 977
 978     def implicit_inline(self, text, lineno):
 979         """
 980         Check each of the patterns in `self.implicit_dispatch` for a match,
 981         and dispatch to the stored method for the pattern.  Recursively check
 982         the text before and after the match.  Return a list of `nodes.Text`
 983         and inline element nodes.
 984         """
 985         if not text:
 986             return []
 987         for pattern, method in self.implicit_dispatch:
 988             match = pattern.search(text)
 989             if match:
 990                 try:
 991                     # Must recurse on strings before *and* after the match;
 992                     # there may be multiple patterns.
 993                     return (self.implicit_inline(text[:match.start()], lineno)
 994                             + method(match, lineno) +
 995                             self.implicit_inline(text[match.end():], lineno))
 996                 except MarkupMismatch:
 997                     pass
 998         return [nodes.Text(unescape(text), rawsource=unescape(text, 1))]
 999
1000     dispatch = {'*': emphasis,
1001                 '**': strong,
1002                 '`': interpreted_or_phrase_ref,
1003                 '``': literal,
1004                 '_`': inline_internal_target,
1005                 ']_': footnote_reference,
1006                 '|': substitution_reference,
1007                 '_': reference,
1008                 '__': anonymous_reference}
1009
1010
1011 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1012     return ord(s) - _zero
1013
1014 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1015     return ord(s) - _zero
1016
1017 def _lowerroman_to_int(s):
1018     return roman.fromRoman(s.upper())
1019
1020
1021 class Body(RSTState):
1022
1023     """
1024     Generic classifier of the first line of a block.
1025     """
1026
1027     double_width_pad_char = tableparser.TableParser.double_width_pad_char
1028     """Padding character for East Asian double-width text."""
1029
1030     enum = Struct()
1031     """Enumerated list parsing information."""
1032
1033     enum.formatinfo = {
1034           'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1035           'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1036           'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1037     enum.formats = enum.formatinfo.keys()
1038     enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1039                       'lowerroman', 'upperroman'] # ORDERED!
1040     enum.sequencepats = {'arabic': '[0-9]+',
1041                          'loweralpha': '[a-z]',
1042                          'upperalpha': '[A-Z]',
1043                          'lowerroman': '[ivxlcdm]+',
1044                          'upperroman': '[IVXLCDM]+',}
1045     enum.converters = {'arabic': int,
1046                        'loweralpha': _loweralpha_to_int,
1047                        'upperalpha': _upperalpha_to_int,
1048                        'lowerroman': _lowerroman_to_int,
1049                        'upperroman': roman.fromRoman}
1050
1051     enum.sequenceregexps = {}
1052     for sequence in enum.sequences:
1053         enum.sequenceregexps[sequence] = re.compile(
1054               enum.sequencepats[sequence] + '$', re.UNICODE)
1055
1056     grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1057     """Matches the top (& bottom) of a full table)."""
1058
1059     simple_table_top_pat = re.compile('=+( +=+)+ *$')
1060     """Matches the top of a simple table."""
1061
1062     simple_table_border_pat = re.compile('=+[ =]*$')
1063     """Matches the bottom & header bottom of a simple table."""
1064
1065     pats = {}
1066     """Fragments of patterns used by transitions."""
1067
1068     pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1069     pats['alpha'] = '[a-zA-Z]'
1070     pats['alphanum'] = '[a-zA-Z0-9]'
1071     pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1072     pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1073                     '|%(upperroman)s|#)' % enum.sequencepats)
1074     pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1075     # @@@ Loosen up the pattern?  Allow Unicode?
1076     pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1077     pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1078     pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1079     pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1080
1081     for format in enum.formats:
1082         pats[format] = '(?P<%s>%s%s%s)' % (
1083               format, re.escape(enum.formatinfo[format].prefix),
1084               pats['enum'], re.escape(enum.formatinfo[format].suffix))
1085
1086     patterns = {
1087           'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
1088           'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1089           'field_marker': r':(?![: ])([^:\\]|\\.)*(?<! ):( +|$)',
1090           'option_marker': r'%(option)s(, %(option)s)*(  +| ?$)' % pats,
1091           'doctest': r'>>>( +|$)',
1092           'line_block': r'\|( +|$)',
1093           'grid_table_top': grid_table_top_pat,
1094           'simple_table_top': simple_table_top_pat,
1095           'explicit_markup': r'\.\.( +|$)',
1096           'anonymous': r'__( +|$)',
1097           'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1098           'text': r''}
1099     initial_transitions = (
1100           'bullet',
1101           'enumerator',
1102           'field_marker',
1103           'option_marker',
1104           'doctest',
1105           'line_block',
1106           'grid_table_top',
1107           'simple_table_top',
1108           'explicit_markup',
1109           'anonymous',
1110           'line',
1111           'text')
1112
1113     def indent(self, match, context, next_state):
1114         """Block quote."""
1115         indented, indent, line_offset, blank_finish = \
1116               self.state_machine.get_indented()
1117         elements = self.block_quote(indented, line_offset)
1118         self.parent += elements
1119         if not blank_finish:
1120             self.parent += self.unindent_warning('Block quote')
1121         return context, next_state, []
1122
1123     def block_quote(self, indented, line_offset):
1124         elements = []
1125         while indented:
1126             (blockquote_lines,
1127              attribution_lines,
1128              attribution_offset,
1129              indented,
1130              new_line_offset) = self.split_attribution(indented, line_offset)
1131             blockquote = nodes.block_quote()
1132             self.nested_parse(blockquote_lines, line_offset, blockquote)
1133             elements.append(blockquote)
1134             if attribution_lines:
1135                 attribution, messages = self.parse_attribution(
1136                     attribution_lines, attribution_offset)
1137                 blockquote += attribution
1138                 elements += messages
1139             line_offset = new_line_offset
1140             while indented and not indented[0]:
1141                 indented = indented[1:]
1142                 line_offset += 1
1143         return elements
1144
1145     # U+2014 is an em-dash:
1146     attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
1147                                      re.UNICODE)
1148
1149     def split_attribution(self, indented, line_offset):
1150         """
1151         Check for a block quote attribution and split it off:
1152
1153         * First line after a blank line must begin with a dash ("--", "---",
1154           em-dash; matches `self.attribution_pattern`).
1155         * Every line after that must have consistent indentation.
1156         * Attributions must be preceded by block quote content.
1157
1158         Return a tuple of: (block quote content lines, content offset,
1159         attribution lines, attribution offset, remaining indented lines).
1160         """
1161         blank = None
1162         nonblank_seen = False
1163         for i in range(len(indented)):
1164             line = indented[i].rstrip()
1165             if line:
1166                 if nonblank_seen and blank == i - 1: # last line blank
1167                     match = self.attribution_pattern.match(line)
1168                     if match:
1169                         attribution_end, indent = self.check_attribution(
1170                             indented, i)
1171                         if attribution_end:
1172                             a_lines = indented[i:attribution_end]
1173                             a_lines.trim_left(match.end(), end=1)
1174                             a_lines.trim_left(indent, start=1)
1175                             return (indented[:i], a_lines,
1176                                     i, indented[attribution_end:],
1177                                     line_offset + attribution_end)
1178                 nonblank_seen = True
1179             else:
1180                 blank = i
1181         else:
1182             return (indented, None, None, None, None)
1183
1184     def check_attribution(self, indented, attribution_start):
1185         """
1186         Check attribution shape.
1187         Return the index past the end of the attribution, and the indent.
1188         """
1189         indent = None
1190         i = attribution_start + 1
1191         for i in range(attribution_start + 1, len(indented)):
1192             line = indented[i].rstrip()
1193             if not line:
1194                 break
1195             if indent is None:
1196                 indent = len(line) - len(line.lstrip())
1197             elif len(line) - len(line.lstrip()) != indent:
1198                 return None, None       # bad shape; not an attribution
1199         else:
1200             # return index of line after last attribution line:
1201             i += 1
1202         return i, (indent or 0)
1203
1204     def parse_attribution(self, indented, line_offset):
1205         text = '\n'.join(indented).rstrip()
1206         lineno = self.state_machine.abs_line_number() + line_offset
1207         textnodes, messages = self.inline_text(text, lineno)
1208         node = nodes.attribution(text, '', *textnodes)
1209         node.line = lineno
1210         # report with source and source-line results in
1211         # ``IndexError: list index out of range``
1212         # node.source, node.line = self.state_machine.get_source_and_line(lineno)
1213         return node, messages
1214
1215     def bullet(self, match, context, next_state):
1216         """Bullet list item."""
1217         bulletlist = nodes.bullet_list()
1218         self.parent += bulletlist
1219         bulletlist['bullet'] = match.string[0]
1220         i, blank_finish = self.list_item(match.end())
1221         bulletlist += i
1222         offset = self.state_machine.line_offset + 1   # next line
1223         new_line_offset, blank_finish = self.nested_list_parse(
1224               self.state_machine.input_lines[offset:],
1225               input_offset=self.state_machine.abs_line_offset() + 1,
1226               node=bulletlist, initial_state='BulletList',
1227               blank_finish=blank_finish)
1228         self.goto_line(new_line_offset)
1229         if not blank_finish:
1230             self.parent += self.unindent_warning('Bullet list')
1231         return [], next_state, []
1232
1233     def list_item(self, indent):
1234         if self.state_machine.line[indent:]:
1235             indented, line_offset, blank_finish = (
1236                 self.state_machine.get_known_indented(indent))
1237         else:
1238             indented, indent, line_offset, blank_finish = (
1239                 self.state_machine.get_first_known_indented(indent))
1240         listitem = nodes.list_item('\n'.join(indented))
1241         if indented:
1242             self.nested_parse(indented, input_offset=line_offset,
1243                               node=listitem)
1244         return listitem, blank_finish
1245
1246     def enumerator(self, match, context, next_state):
1247         """Enumerated List Item"""
1248         format, sequence, text, ordinal = self.parse_enumerator(match)
1249         if not self.is_enumerated_list_item(ordinal, sequence, format):
1250             raise statemachine.TransitionCorrection('text')
1251         enumlist = nodes.enumerated_list()
1252         self.parent += enumlist
1253         if sequence == '#':
1254             enumlist['enumtype'] = 'arabic'
1255         else:
1256             enumlist['enumtype'] = sequence
1257         enumlist['prefix'] = self.enum.formatinfo[format].prefix
1258         enumlist['suffix'] = self.enum.formatinfo[format].suffix
1259         if ordinal != 1:
1260             enumlist['start'] = ordinal
1261             src, srcline = self.state_machine.get_source_and_line()
1262             msg = self.reporter.info(
1263                 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1264                 % (text, ordinal), source=src, line=srcline)
1265             self.parent += msg
1266         listitem, blank_finish = self.list_item(match.end())
1267         enumlist += listitem
1268         offset = self.state_machine.line_offset + 1   # next line
1269         newline_offset, blank_finish = self.nested_list_parse(
1270               self.state_machine.input_lines[offset:],
1271               input_offset=self.state_machine.abs_line_offset() + 1,
1272               node=enumlist, initial_state='EnumeratedList',
1273               blank_finish=blank_finish,
1274               extra_settings={'lastordinal': ordinal,
1275                               'format': format,
1276                               'auto': sequence == '#'})
1277         self.goto_line(newline_offset)
1278         if not blank_finish:
1279             self.parent += self.unindent_warning('Enumerated list')
1280         return [], next_state, []
1281
1282     def parse_enumerator(self, match, expected_sequence=None):
1283         """
1284         Analyze an enumerator and return the results.
1285
1286         :Return:
1287             - the enumerator format ('period', 'parens', or 'rparen'),
1288             - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1289             - the text of the enumerator, stripped of formatting, and
1290             - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1291               ``None`` is returned for invalid enumerator text).
1292
1293         The enumerator format has already been determined by the regular
1294         expression match. If `expected_sequence` is given, that sequence is
1295         tried first. If not, we check for Roman numeral 1. This way,
1296         single-character Roman numerals (which are also alphabetical) can be
1297         matched. If no sequence has been matched, all sequences are checked in
1298         order.
1299         """
1300         groupdict = match.groupdict()
1301         sequence = ''
1302         for format in self.enum.formats:
1303             if groupdict[format]:       # was this the format matched?
1304                 break                   # yes; keep `format`
1305         else:                           # shouldn't happen
1306             raise ParserError('enumerator format not matched')
1307         text = groupdict[format][self.enum.formatinfo[format].start
1308                                  :self.enum.formatinfo[format].end]
1309         if text == '#':
1310             sequence = '#'
1311         elif expected_sequence:
1312             try:
1313                 if self.enum.sequenceregexps[expected_sequence].match(text):
1314                     sequence = expected_sequence
1315             except KeyError:            # shouldn't happen
1316                 raise ParserError('unknown enumerator sequence: %s'
1317                                   % sequence)
1318         elif text == 'i':
1319             sequence = 'lowerroman'
1320         elif text == 'I':
1321             sequence = 'upperroman'
1322         if not sequence:
1323             for sequence in self.enum.sequences:
1324                 if self.enum.sequenceregexps[sequence].match(text):
1325                     break
1326             else:                       # shouldn't happen
1327                 raise ParserError('enumerator sequence not matched')
1328         if sequence == '#':
1329             ordinal = 1
1330         else:
1331             try:
1332                 ordinal = self.enum.converters[sequence](text)
1333             except roman.InvalidRomanNumeralError:
1334                 ordinal = None
1335         return format, sequence, text, ordinal
1336
1337     def is_enumerated_list_item(self, ordinal, sequence, format):
1338         """
1339         Check validity based on the ordinal value and the second line.
1340
1341         Return true if the ordinal is valid and the second line is blank,
1342         indented, or starts with the next enumerator or an auto-enumerator.
1343         """
1344         if ordinal is None:
1345             return None
1346         try:
1347             next_line = self.state_machine.next_line()
1348         except EOFError:              # end of input lines
1349             self.state_machine.previous_line()
1350             return 1
1351         else:
1352             self.state_machine.previous_line()
1353         if not next_line[:1].strip():   # blank or indented
1354             return 1
1355         result = self.make_enumerator(ordinal + 1, sequence, format)
1356         if result:
1357             next_enumerator, auto_enumerator = result
1358             try:
1359                 if ( next_line.startswith(next_enumerator) or
1360                      next_line.startswith(auto_enumerator) ):
1361                     return 1
1362             except TypeError:
1363                 pass
1364         return None
1365
1366     def make_enumerator(self, ordinal, sequence, format):
1367         """
1368         Construct and return the next enumerated list item marker, and an
1369         auto-enumerator ("#" instead of the regular enumerator).
1370
1371         Return ``None`` for invalid (out of range) ordinals.
1372         """ #"
1373         if sequence == '#':
1374             enumerator = '#'
1375         elif sequence == 'arabic':
1376             enumerator = str(ordinal)
1377         else:
1378             if sequence.endswith('alpha'):
1379                 if ordinal > 26:
1380                     return None
1381                 enumerator = chr(ordinal + ord('a') - 1)
1382             elif sequence.endswith('roman'):
1383                 try:
1384                     enumerator = roman.toRoman(ordinal)
1385                 except roman.RomanError:
1386                     return None
1387             else:                       # shouldn't happen
1388                 raise ParserError('unknown enumerator sequence: "%s"'
1389                                   % sequence)
1390             if sequence.startswith('lower'):
1391                 enumerator = enumerator.lower()
1392             elif sequence.startswith('upper'):
1393                 enumerator = enumerator.upper()
1394             else:                       # shouldn't happen
1395                 raise ParserError('unknown enumerator sequence: "%s"'
1396                                   % sequence)
1397         formatinfo = self.enum.formatinfo[format]
1398         next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1399                            + ' ')
1400         auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1401         return next_enumerator, auto_enumerator
1402
1403     def field_marker(self, match, context, next_state):
1404         """Field list item."""
1405         field_list = nodes.field_list()
1406         self.parent += field_list
1407         field, blank_finish = self.field(match)
1408         field_list += field
1409         offset = self.state_machine.line_offset + 1   # next line
1410         newline_offset, blank_finish = self.nested_list_parse(
1411               self.state_machine.input_lines[offset:],
1412               input_offset=self.state_machine.abs_line_offset() + 1,
1413               node=field_list, initial_state='FieldList',
1414               blank_finish=blank_finish)
1415         self.goto_line(newline_offset)
1416         if not blank_finish:
1417             self.parent += self.unindent_warning('Field list')
1418         return [], next_state, []
1419
1420     def field(self, match):
1421         name = self.parse_field_marker(match)
1422         src, srcline = self.state_machine.get_source_and_line()
1423         lineno = self.state_machine.abs_line_number()
1424         indented, indent, line_offset, blank_finish = \
1425               self.state_machine.get_first_known_indented(match.end())
1426         field_node = nodes.field()
1427         field_node.source = src
1428         field_node.line = srcline
1429         name_nodes, name_messages = self.inline_text(name, lineno)
1430         field_node += nodes.field_name(name, '', *name_nodes)
1431         field_body = nodes.field_body('\n'.join(indented), *name_messages)
1432         field_node += field_body
1433         if indented:
1434             self.parse_field_body(indented, line_offset, field_body)
1435         return field_node, blank_finish
1436
1437     def parse_field_marker(self, match):
1438         """Extract & return field name from a field marker match."""
1439         field = match.group()[1:]        # strip off leading ':'
1440         field = field[:field.rfind(':')] # strip off trailing ':' etc.
1441         return field
1442
1443     def parse_field_body(self, indented, offset, node):
1444         self.nested_parse(indented, input_offset=offset, node=node)
1445
1446     def option_marker(self, match, context, next_state):
1447         """Option list item."""
1448         optionlist = nodes.option_list()
1449         try:
1450             listitem, blank_finish = self.option_list_item(match)
1451         except MarkupError, error:
1452             # This shouldn't happen; pattern won't match.
1453             src, srcline = self.state_machine.get_source_and_line()
1454             msg = self.reporter.error(u'Invalid option list marker: %s' %
1455                 error, source=src, line=srcline)
1456             self.parent += msg
1457             indented, indent, line_offset, blank_finish = \
1458                   self.state_machine.get_first_known_indented(match.end())
1459             elements = self.block_quote(indented, line_offset)
1460             self.parent += elements
1461             if not blank_finish:
1462                 self.parent += self.unindent_warning('Option list')
1463             return [], next_state, []
1464         self.parent += optionlist
1465         optionlist += listitem
1466         offset = self.state_machine.line_offset + 1   # next line
1467         newline_offset, blank_finish = self.nested_list_parse(
1468               self.state_machine.input_lines[offset:],
1469               input_offset=self.state_machine.abs_line_offset() + 1,
1470               node=optionlist, initial_state='OptionList',
1471               blank_finish=blank_finish)
1472         self.goto_line(newline_offset)
1473         if not blank_finish:
1474             self.parent += self.unindent_warning('Option list')
1475         return [], next_state, []
1476
1477     def option_list_item(self, match):
1478         offset = self.state_machine.abs_line_offset()
1479         options = self.parse_option_marker(match)
1480         indented, indent, line_offset, blank_finish = \
1481               self.state_machine.get_first_known_indented(match.end())
1482         if not indented:                # not an option list item
1483             self.goto_line(offset)
1484             raise statemachine.TransitionCorrection('text')
1485         option_group = nodes.option_group('', *options)
1486         description = nodes.description('\n'.join(indented))
1487         option_list_item = nodes.option_list_item('', option_group,
1488                                                   description)
1489         if indented:
1490             self.nested_parse(indented, input_offset=line_offset,
1491                               node=description)
1492         return option_list_item, blank_finish
1493
1494     def parse_option_marker(self, match):
1495         """
1496         Return a list of `node.option` and `node.option_argument` objects,
1497         parsed from an option marker match.
1498
1499         :Exception: `MarkupError` for invalid option markers.
1500         """
1501         optlist = []
1502         optionstrings = match.group().rstrip().split(', ')
1503         for optionstring in optionstrings:
1504             tokens = optionstring.split()
1505             delimiter = ' '
1506             firstopt = tokens[0].split('=', 1)
1507             if len(firstopt) > 1:
1508                 # "--opt=value" form
1509                 tokens[:1] = firstopt
1510                 delimiter = '='
1511             elif (len(tokens[0]) > 2
1512                   and ((tokens[0].startswith('-')
1513                         and not tokens[0].startswith('--'))
1514                        or tokens[0].startswith('+'))):
1515                 # "-ovalue" form
1516                 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1517                 delimiter = ''
1518             if len(tokens) > 1 and (tokens[1].startswith('<')
1519                                     and tokens[-1].endswith('>')):
1520                 # "-o <value1 value2>" form; join all values into one token
1521                 tokens[1:] = [' '.join(tokens[1:])]
1522             if 0 < len(tokens) <= 2:
1523                 option = nodes.option(optionstring)
1524                 option += nodes.option_string(tokens[0], tokens[0])
1525                 if len(tokens) > 1:
1526                     option += nodes.option_argument(tokens[1], tokens[1],
1527                                                     delimiter=delimiter)
1528                 optlist.append(option)
1529             else:
1530                 raise MarkupError(
1531                     'wrong number of option tokens (=%s), should be 1 or 2: '
1532                     '"%s"' % (len(tokens), optionstring))
1533         return optlist
1534
1535     def doctest(self, match, context, next_state):
1536         data = '\n'.join(self.state_machine.get_text_block())
1537         self.parent += nodes.doctest_block(data, data)
1538         return [], next_state, []
1539
1540     def line_block(self, match, context, next_state):
1541         """First line of a line block."""
1542         block = nodes.line_block()
1543         self.parent += block
1544         lineno = self.state_machine.abs_line_number()
1545         line, messages, blank_finish = self.line_block_line(match, lineno)
1546         block += line
1547         self.parent += messages
1548         if not blank_finish:
1549             offset = self.state_machine.line_offset + 1   # next line
1550             new_line_offset, blank_finish = self.nested_list_parse(
1551                   self.state_machine.input_lines[offset:],
1552                   input_offset=self.state_machine.abs_line_offset() + 1,
1553                   node=block, initial_state='LineBlock',
1554                   blank_finish=0)
1555             self.goto_line(new_line_offset)
1556         if not blank_finish:
1557             src, srcline = self.state_machine.get_source_and_line()
1558             self.parent += self.reporter.warning(
1559                 'Line block ends without a blank line.',
1560                 source=src, line=srcline+1)
1561         if len(block):
1562             if block[0].indent is None:
1563                 block[0].indent = 0
1564             self.nest_line_block_lines(block)
1565         return [], next_state, []
1566
1567     def line_block_line(self, match, lineno):
1568         """Return one line element of a line_block."""
1569         indented, indent, line_offset, blank_finish = \
1570               self.state_machine.get_first_known_indented(match.end(),
1571                                                           until_blank=1)
1572         text = u'\n'.join(indented)
1573         text_nodes, messages = self.inline_text(text, lineno)
1574         line = nodes.line(text, '', *text_nodes)
1575         if match.string.rstrip() != '|': # not empty
1576             line.indent = len(match.group(1)) - 1
1577         return line, messages, blank_finish
1578
1579     def nest_line_block_lines(self, block):
1580         for index in range(1, len(block)):
1581             if block[index].indent is None:
1582                 block[index].indent = block[index - 1].indent
1583         self.nest_line_block_segment(block)
1584
1585     def nest_line_block_segment(self, block):
1586         indents = [item.indent for item in block]
1587         least = min(indents)
1588         new_items = []
1589         new_block = nodes.line_block()
1590         for item in block:
1591             if item.indent > least:
1592                 new_block.append(item)
1593             else:
1594                 if len(new_block):
1595                     self.nest_line_block_segment(new_block)
1596                     new_items.append(new_block)
1597                     new_block = nodes.line_block()
1598                 new_items.append(item)
1599         if len(new_block):
1600             self.nest_line_block_segment(new_block)
1601             new_items.append(new_block)
1602         block[:] = new_items
1603
1604     def grid_table_top(self, match, context, next_state):
1605         """Top border of a full table."""
1606         return self.table_top(match, context, next_state,
1607                               self.isolate_grid_table,
1608                               tableparser.GridTableParser)
1609
1610     def simple_table_top(self, match, context, next_state):
1611         """Top border of a simple table."""
1612         return self.table_top(match, context, next_state,
1613                               self.isolate_simple_table,
1614                               tableparser.SimpleTableParser)
1615
1616     def table_top(self, match, context, next_state,
1617                   isolate_function, parser_class):
1618         """Top border of a generic table."""
1619         nodelist, blank_finish = self.table(isolate_function, parser_class)
1620         self.parent += nodelist
1621         if not blank_finish:
1622             src, srcline = self.state_machine.get_source_and_line()
1623             msg = self.reporter.warning(
1624                 'Blank line required after table.',
1625                 source=src, line=srcline+1)
1626             self.parent += msg
1627         return [], next_state, []
1628
1629     def table(self, isolate_function, parser_class):
1630         """Parse a table."""
1631         block, messages, blank_finish = isolate_function()
1632         if block:
1633             try:
1634                 parser = parser_class()
1635                 tabledata = parser.parse(block)
1636                 tableline = (self.state_machine.abs_line_number() - len(block)
1637                              + 1)
1638                 table = self.build_table(tabledata, tableline)
1639                 nodelist = [table] + messages
1640             except tableparser.TableMarkupError, detail:
1641                 nodelist = self.malformed_table(
1642                     block, ' '.join(detail.args)) + messages
1643         else:
1644             nodelist = messages
1645         return nodelist, blank_finish
1646
1647     def isolate_grid_table(self):
1648         messages = []
1649         blank_finish = 1
1650         try:
1651             block = self.state_machine.get_text_block(flush_left=1)
1652         except statemachine.UnexpectedIndentationError, instance:
1653             block, src, srcline = instance.args
1654             messages.append(self.reporter.error('Unexpected indentation.',
1655                                                 source=src, line=srcline))
1656             blank_finish = 0
1657         block.disconnect()
1658         # for East Asian chars:
1659         block.pad_double_width(self.double_width_pad_char)
1660         width = len(block[0].strip())
1661         for i in range(len(block)):
1662             block[i] = block[i].strip()
1663             if block[i][0] not in '+|': # check left edge
1664                 blank_finish = 0
1665                 self.state_machine.previous_line(len(block) - i)
1666                 del block[i:]
1667                 break
1668         if not self.grid_table_top_pat.match(block[-1]): # find bottom
1669             blank_finish = 0
1670             # from second-last to third line of table:
1671             for i in range(len(block) - 2, 1, -1):
1672                 if self.grid_table_top_pat.match(block[i]):
1673                     self.state_machine.previous_line(len(block) - i + 1)
1674                     del block[i+1:]
1675                     break
1676             else:
1677                 messages.extend(self.malformed_table(block))
1678                 return [], messages, blank_finish
1679         for i in range(len(block)):     # check right edge
1680             if len(block[i]) != width or block[i][-1] not in '+|':
1681                 messages.extend(self.malformed_table(block))
1682                 return [], messages, blank_finish
1683         return block, messages, blank_finish
1684
1685     def isolate_simple_table(self):
1686         start = self.state_machine.line_offset
1687         lines = self.state_machine.input_lines
1688         limit = len(lines) - 1
1689         toplen = len(lines[start].strip())
1690         pattern_match = self.simple_table_border_pat.match
1691         found = 0
1692         found_at = None
1693         i = start + 1
1694         while i <= limit:
1695             line = lines[i]
1696             match = pattern_match(line)
1697             if match:
1698                 if len(line.strip()) != toplen:
1699                     self.state_machine.next_line(i - start)
1700                     messages = self.malformed_table(
1701                         lines[start:i+1], 'Bottom/header table border does '
1702                         'not match top border.')
1703                     return [], messages, i == limit or not lines[i+1].strip()
1704                 found += 1
1705                 found_at = i
1706                 if found == 2 or i == limit or not lines[i+1].strip():
1707                     end = i
1708                     break
1709             i += 1
1710         else:                           # reached end of input_lines
1711             if found:
1712                 extra = ' or no blank line after table bottom'
1713                 self.state_machine.next_line(found_at - start)
1714                 block = lines[start:found_at+1]
1715             else:
1716                 extra = ''
1717                 self.state_machine.next_line(i - start - 1)
1718                 block = lines[start:]
1719             messages = self.malformed_table(
1720                 block, 'No bottom table border found%s.' % extra)
1721             return [], messages, not extra
1722         self.state_machine.next_line(end - start)
1723         block = lines[start:end+1]
1724         # for East Asian chars:
1725         block.pad_double_width(self.double_width_pad_char)
1726         return block, [], end == limit or not lines[end+1].strip()
1727
1728     def malformed_table(self, block, detail=''):
1729         block.replace(self.double_width_pad_char, '')
1730         data = '\n'.join(block)
1731         message = 'Malformed table.'
1732         startline = self.state_machine.abs_line_number() - len(block) + 1
1733         src, srcline = self.state_machine.get_source_and_line(startline)
1734         if detail:
1735             message += '\n' + detail
1736         error = self.reporter.error(message, nodes.literal_block(data, data),
1737                                     source=src, line=srcline)
1738         return [error]
1739
1740     def build_table(self, tabledata, tableline, stub_columns=0):
1741         colwidths, headrows, bodyrows = tabledata
1742         table = nodes.table()
1743         tgroup = nodes.tgroup(cols=len(colwidths))
1744         table += tgroup
1745         for colwidth in colwidths:
1746             colspec = nodes.colspec(colwidth=colwidth)
1747             if stub_columns:
1748                 colspec.attributes['stub'] = 1
1749                 stub_columns -= 1
1750             tgroup += colspec
1751         if headrows:
1752             thead = nodes.thead()
1753             tgroup += thead
1754             for row in headrows:
1755                 thead += self.build_table_row(row, tableline)
1756         tbody = nodes.tbody()
1757         tgroup += tbody
1758         for row in bodyrows:
1759             tbody += self.build_table_row(row, tableline)
1760         return table
1761
1762     def build_table_row(self, rowdata, tableline):
1763         row = nodes.row()
1764         for cell in rowdata:
1765             if cell is None:
1766                 continue
1767             morerows, morecols, offset, cellblock = cell
1768             attributes = {}
1769             if morerows:
1770                 attributes['morerows'] = morerows
1771             if morecols:
1772                 attributes['morecols'] = morecols
1773             entry = nodes.entry(**attributes)
1774             row += entry
1775             if ''.join(cellblock):
1776                 self.nested_parse(cellblock, input_offset=tableline+offset,
1777                                   node=entry)
1778         return row
1779
1780
1781     explicit = Struct()
1782     """Patterns and constants used for explicit markup recognition."""
1783
1784     explicit.patterns = Struct(
1785           target=re.compile(r"""
1786                             (
1787                               _               # anonymous target
1788                             |               # *OR*
1789                               (?!_)           # no underscore at the beginning
1790                               (?P<quote>`?)   # optional open quote
1791                               (?![ `])        # first char. not space or
1792                                               # backquote
1793                               (?P<name>       # reference name
1794                                 .+?
1795                               )
1796                               %(non_whitespace_escape_before)s
1797                               (?P=quote)      # close quote if open quote used
1798                             )
1799                             (?<!(?<!\x00):) # no unescaped colon at end
1800                             %(non_whitespace_escape_before)s
1801                             [ ]?            # optional space
1802                             :               # end of reference name
1803                             ([ ]+|$)        # followed by whitespace
1804                             """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1805           reference=re.compile(r"""
1806                                (
1807                                  (?P<simple>%(simplename)s)_
1808                                |                  # *OR*
1809                                  `                  # open backquote
1810                                  (?![ ])            # not space
1811                                  (?P<phrase>.+?)    # hyperlink phrase
1812                                  %(non_whitespace_escape_before)s
1813                                  `_                 # close backquote,
1814                                                     # reference mark
1815                                )
1816                                $                  # end of string
1817                                """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1818           substitution=re.compile(r"""
1819                                   (
1820                                     (?![ ])          # first char. not space
1821                                     (?P<name>.+?)    # substitution text
1822                                     %(non_whitespace_escape_before)s
1823                                     \|               # close delimiter
1824                                   )
1825                                   ([ ]+|$)           # followed by whitespace
1826                                   """ % vars(Inliner),
1827                                   re.VERBOSE | re.UNICODE),)
1828
1829     def footnote(self, match):
1830         src, srcline = self.state_machine.get_source_and_line()
1831         indented, indent, offset, blank_finish = \
1832               self.state_machine.get_first_known_indented(match.end())
1833         label = match.group(1)
1834         name = normalize_name(label)
1835         footnote = nodes.footnote('\n'.join(indented))
1836         footnote.source = src
1837         footnote.line = srcline
1838         if name[0] == '#':              # auto-numbered
1839             name = name[1:]             # autonumber label
1840             footnote['auto'] = 1
1841             if name:
1842                 footnote['names'].append(name)
1843             self.document.note_autofootnote(footnote)
1844         elif name == '*':               # auto-symbol
1845             name = ''
1846             footnote['auto'] = '*'
1847             self.document.note_symbol_footnote(footnote)
1848         else:                           # manually numbered
1849             footnote += nodes.label('', label)
1850             footnote['names'].append(name)
1851             self.document.note_footnote(footnote)
1852         if name:
1853             self.document.note_explicit_target(footnote, footnote)
1854         else:
1855             self.document.set_id(footnote, footnote)
1856         if indented:
1857             self.nested_parse(indented, input_offset=offset, node=footnote)
1858         return [footnote], blank_finish
1859
1860     def citation(self, match):
1861         src, srcline = self.state_machine.get_source_and_line()
1862         indented, indent, offset, blank_finish = \
1863               self.state_machine.get_first_known_indented(match.end())
1864         label = match.group(1)
1865         name = normalize_name(label)
1866         citation = nodes.citation('\n'.join(indented))
1867         citation.source = src
1868         citation.line = srcline
1869         citation += nodes.label('', label)
1870         citation['names'].append(name)
1871         self.document.note_citation(citation)
1872         self.document.note_explicit_target(citation, citation)
1873         if indented:
1874             self.nested_parse(indented, input_offset=offset, node=citation)
1875         return [citation], blank_finish
1876
1877     def hyperlink_target(self, match):
1878         pattern = self.explicit.patterns.target
1879         lineno = self.state_machine.abs_line_number()
1880         src, srcline = self.state_machine.get_source_and_line()
1881         block, indent, offset, blank_finish = \
1882               self.state_machine.get_first_known_indented(
1883               match.end(), until_blank=1, strip_indent=0)
1884         blocktext = match.string[:match.end()] + '\n'.join(block)
1885         block = [escape2null(line) for line in block]
1886         escaped = block[0]
1887         blockindex = 0
1888         while 1:
1889             targetmatch = pattern.match(escaped)
1890             if targetmatch:
1891                 break
1892             blockindex += 1
1893             try:
1894                 escaped += block[blockindex]
1895             except IndexError:
1896                 raise MarkupError('malformed hyperlink target.')
1897         del block[:blockindex]
1898         block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1899         target = self.make_target(block, blocktext, lineno,
1900                                   targetmatch.group('name'))
1901         return [target], blank_finish
1902
1903     def make_target(self, block, block_text, lineno, target_name):
1904         target_type, data = self.parse_target(block, block_text, lineno)
1905         if target_type == 'refname':
1906             target = nodes.target(block_text, '', refname=normalize_name(data))
1907             target.indirect_reference_name = data
1908             self.add_target(target_name, '', target, lineno)
1909             self.document.note_indirect_target(target)
1910             return target
1911         elif target_type == 'refuri':
1912             target = nodes.target(block_text, '')
1913             self.add_target(target_name, data, target, lineno)
1914             return target
1915         else:
1916             return data
1917
1918     def parse_target(self, block, block_text, lineno):
1919         """
1920         Determine the type of reference of a target.
1921
1922         :Return: A 2-tuple, one of:
1923
1924             - 'refname' and the indirect reference name
1925             - 'refuri' and the URI
1926             - 'malformed' and a system_message node
1927         """
1928         if block and block[-1].strip()[-1:] == '_': # possible indirect target
1929             reference = ' '.join([line.strip() for line in block])
1930             refname = self.is_reference(reference)
1931             if refname:
1932                 return 'refname', refname
1933         reference = ''.join([''.join(line.split()) for line in block])
1934         return 'refuri', unescape(reference)
1935
1936     def is_reference(self, reference):
1937         match = self.explicit.patterns.reference.match(
1938             whitespace_normalize_name(reference))
1939         if not match:
1940             return None
1941         return unescape(match.group('simple') or match.group('phrase'))
1942
1943     def add_target(self, targetname, refuri, target, lineno):
1944         target.line = lineno
1945         if targetname:
1946             name = normalize_name(unescape(targetname))
1947             target['names'].append(name)
1948             if refuri:
1949                 uri = self.inliner.adjust_uri(refuri)
1950                 if uri:
1951                     target['refuri'] = uri
1952                 else:
1953                     raise ApplicationError('problem with URI: %r' % refuri)
1954             self.document.note_explicit_target(target, self.parent)
1955         else:                       # anonymous target
1956             if refuri:
1957                 target['refuri'] = refuri
1958             target['anonymous'] = 1
1959             self.document.note_anonymous_target(target)
1960
1961     def substitution_def(self, match):
1962         pattern = self.explicit.patterns.substitution
1963         src, srcline = self.state_machine.get_source_and_line()
1964         block, indent, offset, blank_finish = \
1965               self.state_machine.get_first_known_indented(match.end(),
1966                                                           strip_indent=0)
1967         blocktext = (match.string[:match.end()] + '\n'.join(block))
1968         block.disconnect()
1969         escaped = escape2null(block[0].rstrip())
1970         blockindex = 0
1971         while 1:
1972             subdefmatch = pattern.match(escaped)
1973             if subdefmatch:
1974                 break
1975             blockindex += 1
1976             try:
1977                 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
1978             except IndexError:
1979                 raise MarkupError('malformed substitution definition.')
1980         del block[:blockindex]          # strip out the substitution marker
1981         block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
1982         if not block[0]:
1983             del block[0]
1984             offset += 1
1985         while block and not block[-1].strip():
1986             block.pop()
1987         subname = subdefmatch.group('name')
1988         substitution_node = nodes.substitution_definition(blocktext)
1989         substitution_node.source = src
1990         substitution_node.line = srcline
1991         if not block:
1992             msg = self.reporter.warning(
1993                 'Substitution definition "%s" missing contents.' % subname,
1994                 nodes.literal_block(blocktext, blocktext),
1995                 source=src, line=srcline)
1996             return [msg], blank_finish
1997         block[0] = block[0].strip()
1998         substitution_node['names'].append(
1999             nodes.whitespace_normalize_name(subname))
2000         new_abs_offset, blank_finish = self.nested_list_parse(
2001               block, input_offset=offset, node=substitution_node,
2002               initial_state='SubstitutionDef', blank_finish=blank_finish)
2003         i = 0
2004         for node in substitution_node[:]:
2005             if not (isinstance(node, nodes.Inline) or
2006                     isinstance(node, nodes.Text)):
2007                 self.parent += substitution_node[i]
2008                 del substitution_node[i]
2009             else:
2010                 i += 1
2011         for node in substitution_node.traverse(nodes.Element):
2012             if self.disallowed_inside_substitution_definitions(node):
2013                 pformat = nodes.literal_block('', node.pformat().rstrip())
2014                 msg = self.reporter.error(
2015                     'Substitution definition contains illegal element:',
2016                     pformat, nodes.literal_block(blocktext, blocktext),
2017                     source=src, line=srcline)
2018                 return [msg], blank_finish
2019         if len(substitution_node) == 0:
2020             msg = self.reporter.warning(
2021                   'Substitution definition "%s" empty or invalid.' % subname,
2022                   nodes.literal_block(blocktext, blocktext),
2023                   source=src, line=srcline)
2024             return [msg], blank_finish
2025         self.document.note_substitution_def(
2026             substitution_node, subname, self.parent)
2027         return [substitution_node], blank_finish
2028
2029     def disallowed_inside_substitution_definitions(self, node):
2030         if (node['ids'] or
2031             isinstance(node, nodes.reference) and node.get('anonymous') or
2032             isinstance(node, nodes.footnote_reference) and node.get('auto')):
2033             return 1
2034         else:
2035             return 0
2036
2037     def directive(self, match, **option_presets):
2038         """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2039         type_name = match.group(1)
2040         directive_class, messages = directives.directive(
2041             type_name, self.memo.language, self.document)
2042         self.parent += messages
2043         if directive_class:
2044             return self.run_directive(
2045                 directive_class, match, type_name, option_presets)
2046         else:
2047             return self.unknown_directive(type_name)
2048
2049     def run_directive(self, directive, match, type_name, option_presets):
2050         """
2051         Parse a directive then run its directive function.
2052
2053         Parameters:
2054
2055         - `directive`: The class implementing the directive.  Must be
2056           a subclass of `rst.Directive`.
2057
2058         - `match`: A regular expression match object which matched the first
2059           line of the directive.
2060
2061         - `type_name`: The directive name, as used in the source text.
2062
2063         - `option_presets`: A dictionary of preset options, defaults for the
2064           directive options.  Currently, only an "alt" option is passed by
2065           substitution definitions (value: the substitution name), which may
2066           be used by an embedded image directive.
2067
2068         Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2069         """
2070         if isinstance(directive, (FunctionType, MethodType)):
2071             from docutils.parsers.rst import convert_directive_function
2072             directive = convert_directive_function(directive)
2073         lineno = self.state_machine.abs_line_number()
2074         src, srcline = self.state_machine.get_source_and_line()
2075         initial_line_offset = self.state_machine.line_offset
2076         indented, indent, line_offset, blank_finish \
2077                   = self.state_machine.get_first_known_indented(match.end(),
2078                                                                 strip_top=0)
2079         block_text = '\n'.join(self.state_machine.input_lines[
2080             initial_line_offset : self.state_machine.line_offset + 1])
2081         try:
2082             arguments, options, content, content_offset = (
2083                 self.parse_directive_block(indented, line_offset,
2084                                            directive, option_presets))
2085         except MarkupError, detail:
2086             error = self.reporter.error(
2087                 'Error in "%s" directive:\n%s.' % (type_name,
2088                                                    ' '.join(detail.args)),
2089                 nodes.literal_block(block_text, block_text),
2090                 source=src, line=srcline)
2091             return [error], blank_finish
2092         directive_instance = directive(
2093             type_name, arguments, options, content, lineno,
2094             content_offset, block_text, self, self.state_machine)
2095         try:
2096             result = directive_instance.run()
2097         except docutils.parsers.rst.DirectiveError, error:
2098             msg_node = self.reporter.system_message(error.level, error.msg,
2099                 source=src, line=srcline)
2100             msg_node += nodes.literal_block(block_text, block_text)
2101             result = [msg_node]
2102         assert isinstance(result, list), \
2103                'Directive "%s" must return a list of nodes.' % type_name
2104         for i in range(len(result)):
2105             assert isinstance(result[i], nodes.Node), \
2106                    ('Directive "%s" returned non-Node object (index %s): %r'
2107                     % (type_name, i, result[i]))
2108         return (result,
2109                 blank_finish or self.state_machine.is_next_line_blank())
2110
2111     def parse_directive_block(self, indented, line_offset, directive,
2112                               option_presets):
2113         option_spec = directive.option_spec
2114         has_content = directive.has_content
2115         if indented and not indented[0].strip():
2116             indented.trim_start()
2117             line_offset += 1
2118         while indented and not indented[-1].strip():
2119             indented.trim_end()
2120         if indented and (directive.required_arguments
2121                          or directive.optional_arguments
2122                          or option_spec):
2123             for i, line in enumerate(indented):
2124                 if not line.strip():
2125                     break
2126             else:
2127                 i += 1
2128             arg_block = indented[:i]
2129             content = indented[i+1:]
2130             content_offset = line_offset + i + 1
2131         else:
2132             content = indented
2133             content_offset = line_offset
2134             arg_block = []
2135         if option_spec:
2136             options, arg_block = self.parse_directive_options(
2137                 option_presets, option_spec, arg_block)
2138         else:
2139             options = {}
2140         if arg_block and not (directive.required_arguments
2141                               or directive.optional_arguments):
2142             content = arg_block + indented[i:]
2143             content_offset = line_offset
2144             arg_block = []
2145         while content and not content[0].strip():
2146             content.trim_start()
2147             content_offset += 1
2148         if directive.required_arguments or directive.optional_arguments:
2149             arguments = self.parse_directive_arguments(
2150                 directive, arg_block)
2151         else:
2152             arguments = []
2153         if content and not has_content:
2154             raise MarkupError('no content permitted')
2155         return (arguments, options, content, content_offset)
2156
2157     def parse_directive_options(self, option_presets, option_spec, arg_block):
2158         options = option_presets.copy()
2159         for i in range(len(arg_block)):
2160             if arg_block[i][:1] == ':':
2161                 opt_block = arg_block[i:]
2162                 arg_block = arg_block[:i]
2163                 break
2164         else:
2165             opt_block = []
2166         if opt_block:
2167             success, data = self.parse_extension_options(option_spec,
2168                                                          opt_block)
2169             if success:                 # data is a dict of options
2170                 options.update(data)
2171             else:                       # data is an error string
2172                 raise MarkupError(data)
2173         return options, arg_block
2174
2175     def parse_directive_arguments(self, directive, arg_block):
2176         required = directive.required_arguments
2177         optional = directive.optional_arguments
2178         arg_text = '\n'.join(arg_block)
2179         arguments = arg_text.split()
2180         if len(arguments) < required:
2181             raise MarkupError('%s argument(s) required, %s supplied'
2182                               % (required, len(arguments)))
2183         elif len(arguments) > required + optional:
2184             if directive.final_argument_whitespace:
2185                 arguments = arg_text.split(None, required + optional - 1)
2186             else:
2187                 raise MarkupError(
2188                     'maximum %s argument(s) allowed, %s supplied'
2189                     % (required + optional, len(arguments)))
2190         return arguments
2191
2192     def parse_extension_options(self, option_spec, datalines):
2193         """
2194         Parse `datalines` for a field list containing extension options
2195         matching `option_spec`.
2196
2197         :Parameters:
2198             - `option_spec`: a mapping of option name to conversion
2199               function, which should raise an exception on bad input.
2200             - `datalines`: a list of input strings.
2201
2202         :Return:
2203             - Success value, 1 or 0.
2204             - An option dictionary on success, an error string on failure.
2205         """
2206         node = nodes.field_list()
2207         newline_offset, blank_finish = self.nested_list_parse(
2208               datalines, 0, node, initial_state='ExtensionOptions',
2209               blank_finish=1)
2210         if newline_offset != len(datalines): # incomplete parse of block
2211             return 0, 'invalid option block'
2212         try:
2213             options = utils.extract_extension_options(node, option_spec)
2214         except KeyError, detail:
2215             return 0, ('unknown option: "%s"' % detail.args[0])
2216         except (ValueError, TypeError), detail:
2217             return 0, ('invalid option value: %s' % ' '.join(detail.args))
2218         except utils.ExtensionOptionError, detail:
2219             return 0, ('invalid option data: %s' % ' '.join(detail.args))
2220         if blank_finish:
2221             return 1, options
2222         else:
2223             return 0, 'option data incompletely parsed'
2224
2225     def unknown_directive(self, type_name):
2226         src, srcline = self.state_machine.get_source_and_line()
2227         indented, indent, offset, blank_finish = \
2228               self.state_machine.get_first_known_indented(0, strip_indent=0)
2229         text = '\n'.join(indented)
2230         error = self.reporter.error(
2231               'Unknown directive type "%s".' % type_name,
2232               nodes.literal_block(text, text), source=src, line=srcline)
2233         return [error], blank_finish
2234
2235     def comment(self, match):
2236         if not match.string[match.end():].strip() \
2237               and self.state_machine.is_next_line_blank(): # an empty comment?
2238             return [nodes.comment()], 1 # "A tiny but practical wart."
2239         indented, indent, offset, blank_finish = \
2240               self.state_machine.get_first_known_indented(match.end())
2241         while indented and not indented[-1].strip():
2242             indented.trim_end()
2243         text = '\n'.join(indented)
2244         return [nodes.comment(text, text)], blank_finish
2245
2246     explicit.constructs = [
2247           (footnote,
2248            re.compile(r"""
2249                       \.\.[ ]+          # explicit markup start
2250                       \[
2251                       (                 # footnote label:
2252                           [0-9]+          # manually numbered footnote
2253                         |               # *OR*
2254                           \#              # anonymous auto-numbered footnote
2255                         |               # *OR*
2256                           \#%s            # auto-number ed?) footnote label
2257                         |               # *OR*
2258                           \*              # auto-symbol footnote
2259                       )
2260                       \]
2261                       ([ ]+|$)          # whitespace or end of line
2262                       """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2263           (citation,
2264            re.compile(r"""
2265                       \.\.[ ]+          # explicit markup start
2266                       \[(%s)\]          # citation label
2267                       ([ ]+|$)          # whitespace or end of line
2268                       """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2269           (hyperlink_target,
2270            re.compile(r"""
2271                       \.\.[ ]+          # explicit markup start
2272                       _                 # target indicator
2273                       (?![ ]|$)         # first char. not space or EOL
2274                       """, re.VERBOSE | re.UNICODE)),
2275           (substitution_def,
2276            re.compile(r"""
2277                       \.\.[ ]+          # explicit markup start
2278                       \|                # substitution indicator
2279                       (?![ ]|$)         # first char. not space or EOL
2280                       """, re.VERBOSE | re.UNICODE)),
2281           (directive,
2282            re.compile(r"""
2283                       \.\.[ ]+          # explicit markup start
2284                       (%s)              # directive name
2285                       [ ]?              # optional space
2286                       ::                # directive delimiter
2287                       ([ ]+|$)          # whitespace or end of line
2288                       """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
2289
2290     def explicit_markup(self, match, context, next_state):
2291         """Footnotes, hyperlink targets, directives, comments."""
2292         nodelist, blank_finish = self.explicit_construct(match)
2293         self.parent += nodelist
2294         self.explicit_list(blank_finish)
2295         return [], next_state, []
2296
2297     def explicit_construct(self, match):
2298         """Determine which explicit construct this is, parse & return it."""
2299         errors = []
2300         for method, pattern in self.explicit.constructs:
2301             expmatch = pattern.match(match.string)
2302             if expmatch:
2303                 try:
2304                     return method(self, expmatch)
2305                 except MarkupError, error: # never reached?
2306                     message = ' '.join(error.args)
2307                     src, srcline = self.state_machine.get_source_and_line()
2308                     errors.append(self.reporter.warning(
2309                                       message, source=src, line=srcline))
2310                     break
2311         nodelist, blank_finish = self.comment(match)
2312         return nodelist + errors, blank_finish
2313
2314     def explicit_list(self, blank_finish):
2315         """
2316         Create a nested state machine for a series of explicit markup
2317         constructs (including anonymous hyperlink targets).
2318         """
2319         offset = self.state_machine.line_offset + 1   # next line
2320         newline_offset, blank_finish = self.nested_list_parse(
2321               self.state_machine.input_lines[offset:],
2322               input_offset=self.state_machine.abs_line_offset() + 1,
2323               node=self.parent, initial_state='Explicit',
2324               blank_finish=blank_finish,
2325               match_titles=self.state_machine.match_titles)
2326         self.goto_line(newline_offset)
2327         if not blank_finish:
2328             self.parent += self.unindent_warning('Explicit markup')
2329
2330     def anonymous(self, match, context, next_state):
2331         """Anonymous hyperlink targets."""
2332         nodelist, blank_finish = self.anonymous_target(match)
2333         self.parent += nodelist
2334         self.explicit_list(blank_finish)
2335         return [], next_state, []
2336
2337     def anonymous_target(self, match):
2338         lineno = self.state_machine.abs_line_number()
2339         block, indent, offset, blank_finish \
2340               = self.state_machine.get_first_known_indented(match.end(),
2341                                                             until_blank=1)
2342         blocktext = match.string[:match.end()] + '\n'.join(block)
2343         block = [escape2null(line) for line in block]
2344         target = self.make_target(block, blocktext, lineno, '')
2345         return [target], blank_finish
2346
2347     def line(self, match, context, next_state):
2348         """Section title overline or transition marker."""
2349         if self.state_machine.match_titles:
2350             return [match.string], 'Line', []
2351         elif match.string.strip() == '::':
2352             raise statemachine.TransitionCorrection('text')
2353         elif len(match.string.strip()) < 4:
2354             msg = self.reporter.info(
2355                 'Unexpected possible title overline or transition.\n'
2356                 "Treating it as ordinary text because it's so short.",
2357                 line=self.state_machine.abs_line_number())
2358             self.parent += msg
2359             raise statemachine.TransitionCorrection('text')
2360         else:
2361             blocktext = self.state_machine.line
2362             msg = self.reporter.severe(
2363                   'Unexpected section title or transition.',
2364                   nodes.literal_block(blocktext, blocktext),
2365                   line=self.state_machine.abs_line_number())
2366             self.parent += msg
2367             return [], next_state, []
2368
2369     def text(self, match, context, next_state):
2370         """Titles, definition lists, paragraphs."""
2371         return [match.string], 'Text', []
2372
2373
2374 class RFC2822Body(Body):
2375
2376     """
2377     RFC2822 headers are only valid as the first constructs in documents.  As
2378     soon as anything else appears, the `Body` state should take over.
2379     """
2380
2381     patterns = Body.patterns.copy()     # can't modify the original
2382     patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2383     initial_transitions = [(name, 'Body')
2384                            for name in Body.initial_transitions]
2385     initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2386
2387     def rfc2822(self, match, context, next_state):
2388         """RFC2822-style field list item."""
2389         fieldlist = nodes.field_list(classes=['rfc2822'])
2390         self.parent += fieldlist
2391         field, blank_finish = self.rfc2822_field(match)
2392         fieldlist += field
2393         offset = self.state_machine.line_offset + 1   # next line
2394         newline_offset, blank_finish = self.nested_list_parse(
2395               self.state_machine.input_lines[offset:],
2396               input_offset=self.state_machine.abs_line_offset() + 1,
2397               node=fieldlist, initial_state='RFC2822List',
2398               blank_finish=blank_finish)
2399         self.goto_line(newline_offset)
2400         if not blank_finish:
2401             self.parent += self.unindent_warning(
2402                   'RFC2822-style field list')
2403         return [], next_state, []
2404
2405     def rfc2822_field(self, match):
2406         name = match.string[:match.string.find(':')]
2407         indented, indent, line_offset, blank_finish = \
2408               self.state_machine.get_first_known_indented(match.end(),
2409                                                           until_blank=1)
2410         fieldnode = nodes.field()
2411         fieldnode += nodes.field_name(name, name)
2412         fieldbody = nodes.field_body('\n'.join(indented))
2413         fieldnode += fieldbody
2414         if indented:
2415             self.nested_parse(indented, input_offset=line_offset,
2416                               node=fieldbody)
2417         return fieldnode, blank_finish
2418
2419
2420 class SpecializedBody(Body):
2421
2422     """
2423     Superclass for second and subsequent compound element members.  Compound
2424     elements are lists and list-like constructs.
2425
2426     All transition methods are disabled (redefined as `invalid_input`).
2427     Override individual methods in subclasses to re-enable.
2428
2429     For example, once an initial bullet list item, say, is recognized, the
2430     `BulletList` subclass takes over, with a "bullet_list" node as its
2431     container.  Upon encountering the initial bullet list item, `Body.bullet`
2432     calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2433     starts up a nested parsing session with `BulletList` as the initial state.
2434     Only the ``bullet`` transition method is enabled in `BulletList`; as long
2435     as only bullet list items are encountered, they are parsed and inserted
2436     into the container.  The first construct which is *not* a bullet list item
2437     triggers the `invalid_input` method, which ends the nested parse and
2438     closes the container.  `BulletList` needs to recognize input that is
2439     invalid in the context of a bullet list, which means everything *other
2440     than* bullet list items, so it inherits the transition list created in
2441     `Body`.
2442     """
2443
2444     def invalid_input(self, match=None, context=None, next_state=None):
2445         """Not a compound element member. Abort this state machine."""
2446         self.state_machine.previous_line() # back up so parent SM can reassess
2447         raise EOFError
2448
2449     indent = invalid_input
2450     bullet = invalid_input
2451     enumerator = invalid_input
2452     field_marker = invalid_input
2453     option_marker = invalid_input
2454     doctest = invalid_input
2455     line_block = invalid_input
2456     grid_table_top = invalid_input
2457     simple_table_top = invalid_input
2458     explicit_markup = invalid_input
2459     anonymous = invalid_input
2460     line = invalid_input
2461     text = invalid_input
2462
2463
2464 class BulletList(SpecializedBody):
2465
2466     """Second and subsequent bullet_list list_items."""
2467
2468     def bullet(self, match, context, next_state):
2469         """Bullet list item."""
2470         if match.string[0] != self.parent['bullet']:
2471             # different bullet: new list
2472             self.invalid_input()
2473         listitem, blank_finish = self.list_item(match.end())
2474         self.parent += listitem
2475         self.blank_finish = blank_finish
2476         return [], next_state, []
2477
2478
2479 class DefinitionList(SpecializedBody):
2480
2481     """Second and subsequent definition_list_items."""
2482
2483     def text(self, match, context, next_state):
2484         """Definition lists."""
2485         return [match.string], 'Definition', []
2486
2487
2488 class EnumeratedList(SpecializedBody):
2489
2490     """Second and subsequent enumerated_list list_items."""
2491
2492     def enumerator(self, match, context, next_state):
2493         """Enumerated list item."""
2494         format, sequence, text, ordinal = self.parse_enumerator(
2495               match, self.parent['enumtype'])
2496         if ( format != self.format
2497              or (sequence != '#' and (sequence != self.parent['enumtype']
2498                                       or self.auto
2499                                       or ordinal != (self.lastordinal + 1)))
2500              or not self.is_enumerated_list_item(ordinal, sequence, format)):
2501             # different enumeration: new list
2502             self.invalid_input()
2503         if sequence == '#':
2504             self.auto = 1
2505         listitem, blank_finish = self.list_item(match.end())
2506         self.parent += listitem
2507         self.blank_finish = blank_finish
2508         self.lastordinal = ordinal
2509         return [], next_state, []
2510
2511
2512 class FieldList(SpecializedBody):
2513
2514     """Second and subsequent field_list fields."""
2515
2516     def field_marker(self, match, context, next_state):
2517         """Field list field."""
2518         field, blank_finish = self.field(match)
2519         self.parent += field
2520         self.blank_finish = blank_finish
2521         return [], next_state, []
2522
2523
2524 class OptionList(SpecializedBody):
2525
2526     """Second and subsequent option_list option_list_items."""
2527
2528     def option_marker(self, match, context, next_state):
2529         """Option list item."""
2530         try:
2531             option_list_item, blank_finish = self.option_list_item(match)
2532         except MarkupError:
2533             self.invalid_input()
2534         self.parent += option_list_item
2535         self.blank_finish = blank_finish
2536         return [], next_state, []
2537
2538
2539 class RFC2822List(SpecializedBody, RFC2822Body):
2540
2541     """Second and subsequent RFC2822-style field_list fields."""
2542
2543     patterns = RFC2822Body.patterns
2544     initial_transitions = RFC2822Body.initial_transitions
2545
2546     def rfc2822(self, match, context, next_state):
2547         """RFC2822-style field list item."""
2548         field, blank_finish = self.rfc2822_field(match)
2549         self.parent += field
2550         self.blank_finish = blank_finish
2551         return [], 'RFC2822List', []
2552
2553     blank = SpecializedBody.invalid_input
2554
2555
2556 class ExtensionOptions(FieldList):
2557
2558     """
2559     Parse field_list fields for extension options.
2560
2561     No nested parsing is done (including inline markup parsing).
2562     """
2563
2564     def parse_field_body(self, indented, offset, node):
2565         """Override `Body.parse_field_body` for simpler parsing."""
2566         lines = []
2567         for line in list(indented) + ['']:
2568             if line.strip():
2569                 lines.append(line)
2570             elif lines:
2571                 text = '\n'.join(lines)
2572                 node += nodes.paragraph(text, text)
2573                 lines = []
2574
2575
2576 class LineBlock(SpecializedBody):
2577
2578     """Second and subsequent lines of a line_block."""
2579
2580     blank = SpecializedBody.invalid_input
2581
2582     def line_block(self, match, context, next_state):
2583         """New line of line block."""
2584         lineno = self.state_machine.abs_line_number()
2585         line, messages, blank_finish = self.line_block_line(match, lineno)
2586         self.parent += line
2587         self.parent.parent += messages
2588         self.blank_finish = blank_finish
2589         return [], next_state, []
2590
2591
2592 class Explicit(SpecializedBody):
2593
2594     """Second and subsequent explicit markup construct."""
2595
2596     def explicit_markup(self, match, context, next_state):
2597         """Footnotes, hyperlink targets, directives, comments."""
2598         nodelist, blank_finish = self.explicit_construct(match)
2599         self.parent += nodelist
2600         self.blank_finish = blank_finish
2601         return [], next_state, []
2602
2603     def anonymous(self, match, context, next_state):
2604         """Anonymous hyperlink targets."""
2605         nodelist, blank_finish = self.anonymous_target(match)
2606         self.parent += nodelist
2607         self.blank_finish = blank_finish
2608         return [], next_state, []
2609
2610     blank = SpecializedBody.invalid_input
2611
2612
2613 class SubstitutionDef(Body):
2614
2615     """
2616     Parser for the contents of a substitution_definition element.
2617     """
2618
2619     patterns = {
2620           'embedded_directive': re.compile(r'(%s)::( +|$)'
2621                                            % Inliner.simplename, re.UNICODE),
2622           'text': r''}
2623     initial_transitions = ['embedded_directive', 'text']
2624
2625     def embedded_directive(self, match, context, next_state):
2626         nodelist, blank_finish = self.directive(match,
2627                                                 alt=self.parent['names'][0])
2628         self.parent += nodelist
2629         if not self.state_machine.at_eof():
2630             self.blank_finish = blank_finish
2631         raise EOFError
2632
2633     def text(self, match, context, next_state):
2634         if not self.state_machine.at_eof():
2635             self.blank_finish = self.state_machine.is_next_line_blank()
2636         raise EOFError
2637
2638
2639 class Text(RSTState):
2640
2641     """
2642     Classifier of second line of a text block.
2643
2644     Could be a paragraph, a definition list item, or a title.
2645     """
2646
2647     patterns = {'underline': Body.patterns['line'],
2648                 'text': r''}
2649     initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2650
2651     def blank(self, match, context, next_state):
2652         """End of paragraph."""
2653         # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2654         paragraph, literalnext = self.paragraph(
2655               context, self.state_machine.abs_line_number() - 1)
2656         self.parent += paragraph
2657         if literalnext:
2658             self.parent += self.literal_block()
2659         return [], 'Body', []
2660
2661     def eof(self, context):
2662         if context:
2663             self.blank(None, context, None)
2664         return []
2665
2666     def indent(self, match, context, next_state):
2667         """Definition list item."""
2668         definitionlist = nodes.definition_list()
2669         definitionlistitem, blank_finish = self.definition_list_item(context)
2670         definitionlist += definitionlistitem
2671         self.parent += definitionlist
2672         offset = self.state_machine.line_offset + 1   # next line
2673         newline_offset, blank_finish = self.nested_list_parse(
2674               self.state_machine.input_lines[offset:],
2675               input_offset=self.state_machine.abs_line_offset() + 1,
2676               node=definitionlist, initial_state='DefinitionList',
2677               blank_finish=blank_finish, blank_finish_state='Definition')
2678         self.goto_line(newline_offset)
2679         if not blank_finish:
2680             self.parent += self.unindent_warning('Definition list')
2681         return [], 'Body', []
2682
2683     def underline(self, match, context, next_state):
2684         """Section title."""
2685         lineno = self.state_machine.abs_line_number()
2686         src, srcline = self.state_machine.get_source_and_line()
2687         title = context[0].rstrip()
2688         underline = match.string.rstrip()
2689         source = title + '\n' + underline
2690         messages = []
2691         if column_width(title) > len(underline):
2692             if len(underline) < 4:
2693                 if self.state_machine.match_titles:
2694                     msg = self.reporter.info(
2695                         'Possible title underline, too short for the title.\n'
2696                         "Treating it as ordinary text because it's so short.",
2697                         source=src, line=srcline)
2698                     self.parent += msg
2699                 raise statemachine.TransitionCorrection('text')
2700             else:
2701                 blocktext = context[0] + '\n' + self.state_machine.line
2702                 msg = self.reporter.warning(
2703                     'Title underline too short.',
2704                     nodes.literal_block(blocktext, blocktext),
2705                     source=src, line=srcline)
2706                 messages.append(msg)
2707         if not self.state_machine.match_titles:
2708             blocktext = context[0] + '\n' + self.state_machine.line
2709             msg = self.reporter.severe(
2710                 'Unexpected section title.',
2711                 nodes.literal_block(blocktext, blocktext),
2712                 source=src, line=srcline)
2713             self.parent += messages
2714             self.parent += msg
2715             return [], next_state, []
2716         style = underline[0]
2717         context[:] = []
2718         self.section(title, source, style, lineno - 1, messages)
2719         return [], next_state, []
2720
2721     def text(self, match, context, next_state):
2722         """Paragraph."""
2723         startline = self.state_machine.abs_line_number() - 1
2724         msg = None
2725         try:
2726             block = self.state_machine.get_text_block(flush_left=1)
2727         except statemachine.UnexpectedIndentationError, instance:
2728             block, src, srcline = instance.args
2729             msg = self.reporter.error('Unexpected indentation.',
2730                                       source=src, line=srcline)
2731         lines = context + list(block)
2732         paragraph, literalnext = self.paragraph(lines, startline)
2733         self.parent += paragraph
2734         self.parent += msg
2735         if literalnext:
2736             try:
2737                 self.state_machine.next_line()
2738             except EOFError:
2739                 pass
2740             self.parent += self.literal_block()
2741         return [], next_state, []
2742
2743     def literal_block(self):
2744         """Return a list of nodes."""
2745         indented, indent, offset, blank_finish = \
2746               self.state_machine.get_indented()
2747         while indented and not indented[-1].strip():
2748             indented.trim_end()
2749         if not indented:
2750             return self.quoted_literal_block()
2751         data = '\n'.join(indented)
2752         literal_block = nodes.literal_block(data, data)
2753         literal_block.line = offset + 1
2754         nodelist = [literal_block]
2755         if not blank_finish:
2756             nodelist.append(self.unindent_warning('Literal block'))
2757         return nodelist
2758
2759     def quoted_literal_block(self):
2760         abs_line_offset = self.state_machine.abs_line_offset()
2761         offset = self.state_machine.line_offset
2762         parent_node = nodes.Element()
2763         new_abs_offset = self.nested_parse(
2764             self.state_machine.input_lines[offset:],
2765             input_offset=abs_line_offset, node=parent_node, match_titles=0,
2766             state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2767                                   'initial_state': 'QuotedLiteralBlock'})
2768         self.goto_line(new_abs_offset)
2769         return parent_node.children
2770
2771     def definition_list_item(self, termline):
2772         indented, indent, line_offset, blank_finish = \
2773               self.state_machine.get_indented()
2774         definitionlistitem = nodes.definition_list_item(
2775             '\n'.join(termline + list(indented)))
2776         lineno = self.state_machine.abs_line_number() - 1
2777         src, srcline = self.state_machine.get_source_and_line()
2778         definitionlistitem.source = src
2779         definitionlistitem.line = srcline - 1
2780         termlist, messages = self.term(termline, lineno)
2781         definitionlistitem += termlist
2782         definition = nodes.definition('', *messages)
2783         definitionlistitem += definition
2784         if termline[0][-2:] == '::':
2785             definition += self.reporter.info(
2786                   'Blank line missing before literal block (after the "::")? '
2787                   'Interpreted as a definition list item.',
2788                   source=src, line=srcline)
2789         self.nested_parse(indented, input_offset=line_offset, node=definition)
2790         return definitionlistitem, blank_finish
2791
2792     classifier_delimiter = re.compile(' +: +')
2793
2794     def term(self, lines, lineno):
2795         """Return a definition_list's term and optional classifiers."""
2796         assert len(lines) == 1
2797         text_nodes, messages = self.inline_text(lines[0], lineno)
2798         term_node = nodes.term()
2799         node_list = [term_node]
2800         for i in range(len(text_nodes)):
2801             node = text_nodes[i]
2802             if isinstance(node, nodes.Text):
2803                 parts = self.classifier_delimiter.split(node.rawsource)
2804                 if len(parts) == 1:
2805                     node_list[-1] += node
2806                 else:
2807
2808                     node_list[-1] += nodes.Text(parts[0].rstrip())
2809                     for part in parts[1:]:
2810                         classifier_node = nodes.classifier('', part)
2811                         node_list.append(classifier_node)
2812             else:
2813                 node_list[-1] += node
2814         return node_list, messages
2815
2816
2817 class SpecializedText(Text):
2818
2819     """
2820     Superclass for second and subsequent lines of Text-variants.
2821
2822     All transition methods are disabled. Override individual methods in
2823     subclasses to re-enable.
2824     """
2825
2826     def eof(self, context):
2827         """Incomplete construct."""
2828         return []
2829
2830     def invalid_input(self, match=None, context=None, next_state=None):
2831         """Not a compound element member. Abort this state machine."""
2832         raise EOFError
2833
2834     blank = invalid_input
2835     indent = invalid_input
2836     underline = invalid_input
2837     text = invalid_input
2838
2839
2840 class Definition(SpecializedText):
2841
2842     """Second line of potential definition_list_item."""
2843
2844     def eof(self, context):
2845         """Not a definition."""
2846         self.state_machine.previous_line(2) # so parent SM can reassess
2847         return []
2848
2849     def indent(self, match, context, next_state):
2850         """Definition list item."""
2851         definitionlistitem, blank_finish = self.definition_list_item(context)
2852         self.parent += definitionlistitem
2853         self.blank_finish = blank_finish
2854         return [], 'DefinitionList', []
2855
2856
2857 class Line(SpecializedText):
2858
2859     """
2860     Second line of over- & underlined section title or transition marker.
2861     """
2862
2863     eofcheck = 1                        # @@@ ???
2864     """Set to 0 while parsing sections, so that we don't catch the EOF."""
2865
2866     def eof(self, context):
2867         """Transition marker at end of section or document."""
2868         marker = context[0].strip()
2869         if self.memo.section_bubble_up_kludge:
2870             self.memo.section_bubble_up_kludge = 0
2871         elif len(marker) < 4:
2872             self.state_correction(context)
2873         if self.eofcheck:               # ignore EOFError with sections
2874             lineno = self.state_machine.abs_line_number() - 1
2875             transition = nodes.transition(rawsource=context[0])
2876             transition.line = lineno
2877             self.parent += transition
2878         self.eofcheck = 1
2879         return []
2880
2881     def blank(self, match, context, next_state):
2882         """Transition marker."""
2883         src, srcline = self.state_machine.get_source_and_line()
2884         marker = context[0].strip()
2885         if len(marker) < 4:
2886             self.state_correction(context)
2887         transition = nodes.transition(rawsource=marker)
2888         transition.source = src
2889         transition.line = srcline - 1
2890         self.parent += transition
2891         return [], 'Body', []
2892
2893     def text(self, match, context, next_state):
2894         """Potential over- & underlined title."""
2895         lineno = self.state_machine.abs_line_number() - 1
2896         src, srcline = self.state_machine.get_source_and_line()
2897         overline = context[0]
2898         title = match.string
2899         underline = ''
2900         try:
2901             underline = self.state_machine.next_line()
2902         except EOFError:
2903             blocktext = overline + '\n' + title
2904             if len(overline.rstrip()) < 4:
2905                 self.short_overline(context, blocktext, lineno, 2)
2906             else:
2907                 msg = self.reporter.severe(
2908                     'Incomplete section title.',
2909                     nodes.literal_block(blocktext, blocktext),
2910                     source=src, line=srcline-1)
2911                 self.parent += msg
2912                 return [], 'Body', []
2913         source = '%s\n%s\n%s' % (overline, title, underline)
2914         overline = overline.rstrip()
2915         underline = underline.rstrip()
2916         if not self.transitions['underline'][0].match(underline):
2917             blocktext = overline + '\n' + title + '\n' + underline
2918             if len(overline.rstrip()) < 4:
2919                 self.short_overline(context, blocktext, lineno, 2)
2920             else:
2921                 msg = self.reporter.severe(
2922                     'Missing matching underline for section title overline.',
2923                     nodes.literal_block(source, source),
2924                     source=src, line=srcline-1)
2925                 self.parent += msg
2926                 return [], 'Body', []
2927         elif overline != underline:
2928             blocktext = overline + '\n' + title + '\n' + underline
2929             if len(overline.rstrip()) < 4:
2930                 self.short_overline(context, blocktext, lineno, 2)
2931             else:
2932                 msg = self.reporter.severe(
2933                       'Title overline & underline mismatch.',
2934                       nodes.literal_block(source, source),
2935                       source=src, line=srcline-1)
2936                 self.parent += msg
2937                 return [], 'Body', []
2938         title = title.rstrip()
2939         messages = []
2940         if column_width(title) > len(overline):
2941             blocktext = overline + '\n' + title + '\n' + underline
2942             if len(overline.rstrip()) < 4:
2943                 self.short_overline(context, blocktext, lineno, 2)
2944             else:
2945                 msg = self.reporter.warning(
2946                       'Title overline too short.',
2947                       nodes.literal_block(source, source),
2948                       source=src, line=srcline-1)
2949                 messages.append(msg)
2950         style = (overline[0], underline[0])
2951         self.eofcheck = 0               # @@@ not sure this is correct
2952         self.section(title.lstrip(), source, style, lineno + 1, messages)
2953         self.eofcheck = 1
2954         return [], 'Body', []
2955
2956     indent = text                       # indented title
2957
2958     def underline(self, match, context, next_state):
2959         overline = context[0]
2960         blocktext = overline + '\n' + self.state_machine.line
2961         lineno = self.state_machine.abs_line_number() - 1
2962         src, srcline = self.state_machine.get_source_and_line()
2963         if len(overline.rstrip()) < 4:
2964             self.short_overline(context, blocktext, lineno, 1)
2965         msg = self.reporter.error(
2966               'Invalid section title or transition marker.',
2967               nodes.literal_block(blocktext, blocktext),
2968               source=src, line=srcline-1)
2969         self.parent += msg
2970         return [], 'Body', []
2971
2972     def short_overline(self, context, blocktext, lineno, lines=1):
2973         src, srcline = self.state_machine.get_source_and_line(lineno)
2974         msg = self.reporter.info(
2975             'Possible incomplete section title.\nTreating the overline as '
2976             "ordinary text because it's so short.",
2977                     source=src, line=srcline)
2978         self.parent += msg
2979         self.state_correction(context, lines)
2980
2981     def state_correction(self, context, lines=1):
2982         self.state_machine.previous_line(lines)
2983         context[:] = []
2984         raise statemachine.StateCorrection('Body', 'text')
2985
2986
2987 class QuotedLiteralBlock(RSTState):
2988
2989     """
2990     Nested parse handler for quoted (unindented) literal blocks.
2991
2992     Special-purpose.  Not for inclusion in `state_classes`.
2993     """
2994
2995     patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
2996                 'text': r''}
2997     initial_transitions = ('initial_quoted', 'text')
2998
2999     def __init__(self, state_machine, debug=0):
3000         RSTState.__init__(self, state_machine, debug)
3001         self.messages = []
3002         self.initial_lineno = None
3003
3004     def blank(self, match, context, next_state):
3005         if context:
3006             raise EOFError
3007         else:
3008             return context, next_state, []
3009
3010     def eof(self, context):
3011         if context:
3012             src, srcline = self.state_machine.get_source_and_line(
3013                                                         self.initial_lineno)
3014             text = '\n'.join(context)
3015             literal_block = nodes.literal_block(text, text)
3016             literal_block.source = src
3017             literal_block.line = srcline
3018             self.parent += literal_block
3019         else:
3020             self.parent += self.reporter.warning(
3021                 'Literal block expected; none found.',
3022                 line=self.state_machine.abs_line_number())
3023                 # src not available, because statemachine.input_lines is empty
3024             self.state_machine.previous_line()
3025         self.parent += self.messages
3026         return []
3027
3028     def indent(self, match, context, next_state):
3029         assert context, ('QuotedLiteralBlock.indent: context should not '
3030                          'be empty!')
3031         self.messages.append(
3032             self.reporter.error('Unexpected indentation.',
3033                                 line=self.state_machine.abs_line_number()))
3034         self.state_machine.previous_line()
3035         raise EOFError
3036
3037     def initial_quoted(self, match, context, next_state):
3038         """Match arbitrary quote character on the first line only."""
3039         self.remove_transition('initial_quoted')
3040         quote = match.string[0]
3041         pattern = re.compile(re.escape(quote), re.UNICODE)
3042         # New transition matches consistent quotes only:
3043         self.add_transition('quoted',
3044                             (pattern, self.quoted, self.__class__.__name__))
3045         self.initial_lineno = self.state_machine.abs_line_number()
3046         return [match.string], next_state, []
3047
3048     def quoted(self, match, context, next_state):
3049         """Match consistent quotes on subsequent lines."""
3050         context.append(match.string)
3051         return context, next_state, []
3052
3053     def text(self, match, context, next_state):
3054         if context:
3055             src, srcline = self.state_machine.get_source_and_line()
3056             self.messages.append(
3057                 self.reporter.error('Inconsistent literal block quoting.',
3058                                     source=src, line=srcline))
3059             self.state_machine.previous_line()
3060         raise EOFError
3061
3062
3063 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3064                  OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3065                  Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3066 """Standard set of State classes used to start `RSTStateMachine`."""