docutils/docutils/parsers/rst/states.py

   1 # $Id$
   2 # Author: David Goodger <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 This is the ``docutils.parsers.rst.states`` module, the core of
   7 the reStructuredText parser.  It defines the following:
   8
   9 :Classes:
  10     - `RSTStateMachine`: reStructuredText parser's entry point.
  11     - `NestedStateMachine`: recursive StateMachine.
  12     - `RSTState`: reStructuredText State superclass.
  13     - `Inliner`: For parsing inline markup.
  14     - `Body`: Generic classifier of the first line of a block.
  15     - `SpecializedBody`: Superclass for compound element members.
  16     - `BulletList`: Second and subsequent bullet_list list_items
  17     - `DefinitionList`: Second+ definition_list_items.
  18     - `EnumeratedList`: Second+ enumerated_list list_items.
  19     - `FieldList`: Second+ fields.
  20     - `OptionList`: Second+ option_list_items.
  21     - `RFC2822List`: Second+ RFC2822-style fields.
  22     - `ExtensionOptions`: Parses directive option fields.
  23     - `Explicit`: Second+ explicit markup constructs.
  24     - `SubstitutionDef`: For embedded directives in substitution definitions.
  25     - `Text`: Classifier of second line of a text block.
  26     - `SpecializedText`: Superclass for continuation lines of Text-variants.
  27     - `Definition`: Second line of potential definition_list_item.
  28     - `Line`: Second line of overlined section title or transition marker.
  29     - `Struct`: An auxiliary collection class.
  30
  31 :Exception classes:
  32     - `MarkupError`
  33     - `ParserError`
  34     - `MarkupMismatch`
  35
  36 :Functions:
  37     - `escape2null()`: Return a string, escape-backslashes converted to nulls.
  38     - `unescape()`: Return a string, nulls removed or restored to backslashes.
  39
  40 :Attributes:
  41     - `state_classes`: set of State classes used with `RSTStateMachine`.
  42
  43 Parser Overview
  44 ===============
  45
  46 The reStructuredText parser is implemented as a recursive state machine,
  47 examining its input one line at a time.  To understand how the parser works,
  48 please first become familiar with the `docutils.statemachine` module.  In the
  49 description below, references are made to classes defined in this module;
  50 please see the individual classes for details.
  51
  52 Parsing proceeds as follows:
  53
  54 1. The state machine examines each line of input, checking each of the
  55    transition patterns of the state `Body`, in order, looking for a match.
  56    The implicit transitions (blank lines and indentation) are checked before
  57    any others.  The 'text' transition is a catch-all (matches anything).
  58
  59 2. The method associated with the matched transition pattern is called.
  60
  61    A. Some transition methods are self-contained, appending elements to the
  62       document tree (`Body.doctest` parses a doctest block).  The parser's
  63       current line index is advanced to the end of the element, and parsing
  64       continues with step 1.
  65
  66    B. Other transition methods trigger the creation of a nested state machine,
  67       whose job is to parse a compound construct ('indent' does a block quote,
  68       'bullet' does a bullet list, 'overline' does a section [first checking
  69       for a valid section header], etc.).
  70
  71       - In the case of lists and explicit markup, a one-off state machine is
  72         created and run to parse contents of the first item.
  73
  74       - A new state machine is created and its initial state is set to the
  75         appropriate specialized state (`BulletList` in the case of the
  76         'bullet' transition; see `SpecializedBody` for more detail).  This
  77         state machine is run to parse the compound element (or series of
  78         explicit markup elements), and returns as soon as a non-member element
  79         is encountered.  For example, the `BulletList` state machine ends as
  80         soon as it encounters an element which is not a list item of that
  81         bullet list.  The optional omission of inter-element blank lines is
  82         enabled by this nested state machine.
  83
  84       - The current line index is advanced to the end of the elements parsed,
  85         and parsing continues with step 1.
  86
  87    C. The result of the 'text' transition depends on the next line of text.
  88       The current state is changed to `Text`, under which the second line is
  89       examined.  If the second line is:
  90
  91       - Indented: The element is a definition list item, and parsing proceeds
  92         similarly to step 2.B, using the `DefinitionList` state.
  93
  94       - A line of uniform punctuation characters: The element is a section
  95         header; again, parsing proceeds as in step 2.B, and `Body` is still
  96         used.
  97
  98       - Anything else: The element is a paragraph, which is examined for
  99         inline markup and appended to the parent element.  Processing
 100         continues with step 1.
 101 """
 102
 103 __docformat__ = 'reStructuredText'
 104
 105
 106 import re
 107 from types import FunctionType, MethodType
 108
 109 from docutils import nodes, statemachine, utils
 110 from docutils import ApplicationError, DataError
 111 from docutils.statemachine import StateMachineWS, StateWS
 112 from docutils.nodes import fully_normalize_name as normalize_name
 113 from docutils.nodes import unescape, whitespace_normalize_name
 114 import docutils.parsers.rst
 115 from docutils.parsers.rst import directives, languages, tableparser, roles
 116 from docutils.utils import escape2null, column_width
 117 from docutils.utils import punctuation_chars, roman, urischemes
 118 from docutils.utils import split_escaped_whitespace
 119
 120
 121 class MarkupError(DataError): pass
 122 class UnknownInterpretedRoleError(DataError): pass
 123 class InterpretedRoleNotImplementedError(DataError): pass
 124 class ParserError(ApplicationError): pass
 125 class MarkupMismatch(Exception): pass
 126
 127
 128 class Struct:
 129
 130     """Stores data attributes for dotted-attribute access."""
 131
 132     def __init__(self, **keywordargs):
 133         self.__dict__.update(keywordargs)
 134
 135
 136 class RSTStateMachine(StateMachineWS):
 137
 138     """
 139     reStructuredText's master StateMachine.
 140
 141     The entry point to reStructuredText parsing is the `run()` method.
 142     """
 143
 144     def run(self, input_lines, document, input_offset=0, match_titles=True,
 145             inliner=None):
 146         """
 147         Parse `input_lines` and modify the `document` node in place.
 148
 149         Extend `StateMachineWS.run()`: set up parse-global data and
 150         run the StateMachine.
 151         """
 152         self.language = languages.get_language(
 153             document.settings.language_code, document.reporter)
 154         self.match_titles = match_titles
 155         if inliner is None:
 156             inliner = Inliner()
 157         inliner.init_customizations(document.settings)
 158         self.memo = Struct(document=document,
 159                            reporter=document.reporter,
 160                            language=self.language,
 161                            title_styles=[],
 162                            section_level=0,
 163                            section_bubble_up_kludge=False,
 164                            inliner=inliner)
 165         self.document = document
 166         self.attach_observer(document.note_source)
 167         self.reporter = self.memo.reporter
 168         self.node = document
 169         results = StateMachineWS.run(self, input_lines, input_offset,
 170                                      input_source=document['source'])
 171         assert results == [], 'RSTStateMachine.run() results should be empty!'
 172         self.node = self.memo = None    # remove unneeded references
 173
 174
 175 class NestedStateMachine(StateMachineWS):
 176
 177     """
 178     StateMachine run from within other StateMachine runs, to parse nested
 179     document structures.
 180     """
 181
 182     def run(self, input_lines, input_offset, memo, node, match_titles=True):
 183         """
 184         Parse `input_lines` and populate a `docutils.nodes.document` instance.
 185
 186         Extend `StateMachineWS.run()`: set up document-wide data.
 187         """
 188         self.match_titles = match_titles
 189         self.memo = memo
 190         self.document = memo.document
 191         self.attach_observer(self.document.note_source)
 192         self.reporter = memo.reporter
 193         self.language = memo.language
 194         self.node = node
 195         results = StateMachineWS.run(self, input_lines, input_offset)
 196         assert results == [], ('NestedStateMachine.run() results should be '
 197                                'empty!')
 198         return results
 199
 200
 201 class RSTState(StateWS):
 202
 203     """
 204     reStructuredText State superclass.
 205
 206     Contains methods used by all State subclasses.
 207     """
 208
 209     nested_sm = NestedStateMachine
 210     nested_sm_cache = []
 211
 212     def __init__(self, state_machine, debug=False):
 213         self.nested_sm_kwargs = {'state_classes': state_classes,
 214                                  'initial_state': 'Body'}
 215         StateWS.__init__(self, state_machine, debug)
 216
 217     def runtime_init(self):
 218         StateWS.runtime_init(self)
 219         memo = self.state_machine.memo
 220         self.memo = memo
 221         self.reporter = memo.reporter
 222         self.inliner = memo.inliner
 223         self.document = memo.document
 224         self.parent = self.state_machine.node
 225         # enable the reporter to determine source and source-line
 226         if not hasattr(self.reporter, 'get_source_and_line'):
 227             self.reporter.get_source_and_line = self.state_machine.get_source_and_line  # noqa:E501
 228
 229     def goto_line(self, abs_line_offset):
 230         """
 231         Jump to input line `abs_line_offset`, ignoring jumps past the end.
 232         """
 233         try:
 234             self.state_machine.goto_line(abs_line_offset)
 235         except EOFError:
 236             pass
 237
 238     def no_match(self, context, transitions):
 239         """
 240         Override `StateWS.no_match` to generate a system message.
 241
 242         This code should never be run.
 243         """
 244         self.reporter.severe(
 245             'Internal error: no transition pattern match.  State: "%s"; '
 246             'transitions: %s; context: %s; current line: %r.'
 247             % (self.__class__.__name__, transitions, context,
 248                self.state_machine.line))
 249         return context, None, []
 250
 251     def bof(self, context):
 252         """Called at beginning of file."""
 253         return [], []
 254
 255     def nested_parse(self, block, input_offset, node, match_titles=False,
 256                      state_machine_class=None, state_machine_kwargs=None):
 257         """
 258         Create a new StateMachine rooted at `node` and run it over the input
 259         `block`.
 260         """
 261         use_default = 0
 262         if state_machine_class is None:
 263             state_machine_class = self.nested_sm
 264             use_default += 1
 265         if state_machine_kwargs is None:
 266             state_machine_kwargs = self.nested_sm_kwargs
 267             use_default += 1
 268         block_length = len(block)
 269
 270         state_machine = None
 271         if use_default == 2:
 272             try:
 273                 state_machine = self.nested_sm_cache.pop()
 274             except IndexError:
 275                 pass
 276         if not state_machine:
 277             state_machine = state_machine_class(debug=self.debug,
 278                                                 **state_machine_kwargs)
 279         state_machine.run(block, input_offset, memo=self.memo,
 280                           node=node, match_titles=match_titles)
 281         if use_default == 2:
 282             self.nested_sm_cache.append(state_machine)
 283         else:
 284             state_machine.unlink()
 285         new_offset = state_machine.abs_line_offset()
 286         # No `block.parent` implies disconnected -- lines aren't in sync:
 287         if block.parent and (len(block) - block_length) != 0:
 288             # Adjustment for block if modified in nested parse:
 289             self.state_machine.next_line(len(block) - block_length)
 290         return new_offset
 291
 292     def nested_list_parse(self, block, input_offset, node, initial_state,
 293                           blank_finish,
 294                           blank_finish_state=None,
 295                           extra_settings={},
 296                           match_titles=False,
 297                           state_machine_class=None,
 298                           state_machine_kwargs=None):
 299         """
 300         Create a new StateMachine rooted at `node` and run it over the input
 301         `block`. Also keep track of optional intermediate blank lines and the
 302         required final one.
 303         """
 304         if state_machine_class is None:
 305             state_machine_class = self.nested_sm
 306         if state_machine_kwargs is None:
 307             state_machine_kwargs = self.nested_sm_kwargs.copy()
 308         state_machine_kwargs['initial_state'] = initial_state
 309         state_machine = state_machine_class(debug=self.debug,
 310                                             **state_machine_kwargs)
 311         if blank_finish_state is None:
 312             blank_finish_state = initial_state
 313         state_machine.states[blank_finish_state].blank_finish = blank_finish
 314         for key, value in extra_settings.items():
 315             setattr(state_machine.states[initial_state], key, value)
 316         state_machine.run(block, input_offset, memo=self.memo,
 317                           node=node, match_titles=match_titles)
 318         blank_finish = state_machine.states[blank_finish_state].blank_finish
 319         state_machine.unlink()
 320         return state_machine.abs_line_offset(), blank_finish
 321
 322     def section(self, title, source, style, lineno, messages):
 323         """Check for a valid subsection and create one if it checks out."""
 324         if self.check_subsection(source, style, lineno):
 325             self.new_subsection(title, lineno, messages)
 326
 327     def check_subsection(self, source, style, lineno):
 328         """
 329         Check for a valid subsection header.  Return True or False.
 330
 331         When a new section is reached that isn't a subsection of the current
 332         section, back up the line count (use ``previous_line(-x)``), then
 333         ``raise EOFError``.  The current StateMachine will finish, then the
 334         calling StateMachine can re-examine the title.  This will work its way
 335         back up the calling chain until the correct section level isreached.
 336
 337         @@@ Alternative: Evaluate the title, store the title info & level, and
 338         back up the chain until that level is reached.  Store in memo? Or
 339         return in results?
 340
 341         :Exception: `EOFError` when a sibling or supersection encountered.
 342         """
 343         memo = self.memo
 344         title_styles = memo.title_styles
 345         mylevel = memo.section_level
 346         try:                            # check for existing title style
 347             level = title_styles.index(style) + 1
 348         except ValueError:              # new title style
 349             if len(title_styles) == memo.section_level:  # new subsection
 350                 title_styles.append(style)
 351                 return True
 352             else:                       # not at lowest level
 353                 self.parent += self.title_inconsistent(source, lineno)
 354                 return False
 355         if level <= mylevel:            # sibling or supersection
 356             memo.section_level = level   # bubble up to parent section
 357             if len(style) == 2:
 358                 memo.section_bubble_up_kludge = True
 359             # back up 2 lines for underline title, 3 for overline title
 360             self.state_machine.previous_line(len(style) + 1)
 361             raise EOFError              # let parent section re-evaluate
 362         if level == mylevel + 1:        # immediate subsection
 363             return True
 364         else:                           # invalid subsection
 365             self.parent += self.title_inconsistent(source, lineno)
 366             return False
 367
 368     def title_inconsistent(self, sourcetext, lineno):
 369         error = self.reporter.severe(
 370             'Title level inconsistent:', nodes.literal_block('', sourcetext),
 371             line=lineno)
 372         return error
 373
 374     def new_subsection(self, title, lineno, messages):
 375         """Append new subsection to document tree. On return, check level."""
 376         memo = self.memo
 377         mylevel = memo.section_level
 378         memo.section_level += 1
 379         section_node = nodes.section()
 380         self.parent += section_node
 381         textnodes, title_messages = self.inline_text(title, lineno)
 382         titlenode = nodes.title(title, '', *textnodes)
 383         name = normalize_name(titlenode.astext())
 384         section_node['names'].append(name)
 385         section_node += titlenode
 386         section_node += messages
 387         section_node += title_messages
 388         self.document.note_implicit_target(section_node, section_node)
 389         offset = self.state_machine.line_offset + 1
 390         absoffset = self.state_machine.abs_line_offset() + 1
 391         newabsoffset = self.nested_parse(
 392               self.state_machine.input_lines[offset:], input_offset=absoffset,
 393               node=section_node, match_titles=True)
 394         self.goto_line(newabsoffset)
 395         if memo.section_level <= mylevel:  # can't handle next section?
 396             raise EOFError                 # bubble up to supersection
 397         # reset section_level; next pass will detect it properly
 398         memo.section_level = mylevel
 399
 400     def paragraph(self, lines, lineno):
 401         """
 402         Return a list (paragraph & messages) & a boolean: literal_block next?
 403         """
 404         data = '\n'.join(lines).rstrip()
 405         if re.search(r'(?<!\\)(\\\\)*::$', data):
 406             if len(data) == 2:
 407                 return [], 1
 408             elif data[-3] in ' \n':
 409                 text = data[:-3].rstrip()
 410             else:
 411                 text = data[:-1]
 412             literalnext = 1
 413         else:
 414             text = data
 415             literalnext = 0
 416         textnodes, messages = self.inline_text(text, lineno)
 417         p = nodes.paragraph(data, '', *textnodes)
 418         p.source, p.line = self.state_machine.get_source_and_line(lineno)
 419         return [p] + messages, literalnext
 420
 421     def inline_text(self, text, lineno):
 422         """
 423         Return 2 lists: nodes (text and inline elements), and system_messages.
 424         """
 425         nodes, messages = self.inliner.parse(text, lineno,
 426                                              self.memo, self.parent)
 427         return nodes, messages
 428
 429     def unindent_warning(self, node_name):
 430         # the actual problem is one line below the current line
 431         lineno = self.state_machine.abs_line_number() + 1
 432         return self.reporter.warning('%s ends without a blank line; '
 433                                      'unexpected unindent.' % node_name,
 434                                      line=lineno)
 435
 436
 437 def build_regexp(definition, compile=True):
 438     """
 439     Build, compile and return a regular expression based on `definition`.
 440
 441     :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
 442         where "parts" is a list of regular expressions and/or regular
 443         expression definitions to be joined into an or-group.
 444     """
 445     name, prefix, suffix, parts = definition
 446     part_strings = []
 447     for part in parts:
 448         if isinstance(part, tuple):
 449             part_strings.append(build_regexp(part, None))
 450         else:
 451             part_strings.append(part)
 452     or_group = '|'.join(part_strings)
 453     regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
 454     if compile:
 455         return re.compile(regexp)
 456     else:
 457         return regexp
 458
 459
 460 class Inliner:
 461
 462     """
 463     Parse inline markup; call the `parse()` method.
 464     """
 465
 466     def __init__(self):
 467         self.implicit_dispatch = []
 468         """List of (pattern, bound method) tuples, used by
 469         `self.implicit_inline`."""
 470
 471     def init_customizations(self, settings):
 472         # lookahead and look-behind expressions for inline markup rules
 473         if getattr(settings, 'character_level_inline_markup', False):
 474             start_string_prefix = '(^|(?<!\x00))'
 475             end_string_suffix = ''
 476         else:
 477             start_string_prefix = ('(^|(?<=\\s|[%s%s]))' %
 478                                    (punctuation_chars.openers,
 479                                     punctuation_chars.delimiters))
 480             end_string_suffix = ('($|(?=\\s|[\x00%s%s%s]))' %
 481                                  (punctuation_chars.closing_delimiters,
 482                                   punctuation_chars.delimiters,
 483                                   punctuation_chars.closers))
 484         args = locals().copy()
 485         args.update(vars(self.__class__))
 486
 487         parts = ('initial_inline', start_string_prefix, '',
 488            [
 489             ('start', '', self.non_whitespace_after,  # simple start-strings
 490              [r'\*\*',                # strong
 491               r'\*(?!\*)',            # emphasis but not strong
 492               r'``',                  # literal
 493               r'_`',                  # inline internal target
 494               r'\|(?!\|)']            # substitution reference
 495              ),
 496             ('whole', '', end_string_suffix,  # whole constructs
 497              [  # reference name & end-string
 498               r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
 499               ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
 500                [r'[0-9]+',                     # manually numbered
 501                 r'\#(%s)?' % self.simplename,  # auto-numbered (w/ label?)
 502                 r'\*',                         # auto-symbol
 503                 r'(?P<citationlabel>%s)' % self.simplename,  # citation ref
 504                 ]
 505                )
 506               ]
 507              ),
 508             ('backquote',             # interpreted text or phrase reference
 509              '(?P<role>(:%s:)?)' % self.simplename,  # optional role
 510              self.non_whitespace_after,
 511              ['`(?!`)']               # but not literal
 512              )
 513             ]
 514         )
 515         self.start_string_prefix = start_string_prefix
 516         self.end_string_suffix = end_string_suffix
 517         self.parts = parts
 518
 519         self.patterns = Struct(
 520           initial=build_regexp(parts),
 521           emphasis=re.compile(self.non_whitespace_escape_before
 522                               + r'(\*)' + end_string_suffix),
 523           strong=re.compile(self.non_whitespace_escape_before
 524                             + r'(\*\*)' + end_string_suffix),
 525           interpreted_or_phrase_ref=re.compile(
 526               r"""
 527               %(non_unescaped_whitespace_escape_before)s
 528               (
 529                 `
 530                 (?P<suffix>
 531                   (?P<role>:%(simplename)s:)?
 532                   (?P<refend>__?)?
 533                 )
 534               )
 535               %(end_string_suffix)s
 536               """ % args, re.VERBOSE),
 537           embedded_link=re.compile(
 538               r"""
 539               (
 540                 (?:[ \n]+|^)            # spaces or beginning of line/string
 541                 <                       # open bracket
 542                 %(non_whitespace_after)s
 543                 (([^<>]|\x00[<>])+)     # anything but unescaped angle brackets
 544                 %(non_whitespace_escape_before)s
 545                 >                       # close bracket
 546               )
 547               $                         # end of string
 548               """ % args, re.VERBOSE),
 549           literal=re.compile(self.non_whitespace_before + '(``)'
 550                              + end_string_suffix),
 551           target=re.compile(self.non_whitespace_escape_before
 552                             + r'(`)' + end_string_suffix),
 553           substitution_ref=re.compile(self.non_whitespace_escape_before
 554                                       + r'(\|_{0,2})'
 555                                       + end_string_suffix),
 556           email=re.compile(self.email_pattern % args + '$',
 557                            re.VERBOSE),
 558           uri=re.compile(
 559                 (r"""
 560                 %(start_string_prefix)s
 561                 (?P<whole>
 562                   (?P<absolute>           # absolute URI
 563                     (?P<scheme>             # scheme (http, ftp, mailto)
 564                       [a-zA-Z][a-zA-Z0-9.+-]*
 565                     )
 566                     :
 567                     (
 568                       (                       # either:
 569                         (//?)?                  # hierarchical URI
 570                         %(uric)s*               # URI characters
 571                         %(uri_end)s             # final URI char
 572                       )
 573                       (                       # optional query
 574                         \?%(uric)s*
 575                         %(uri_end)s
 576                       )?
 577                       (                       # optional fragment
 578                         \#%(uric)s*
 579                         %(uri_end)s
 580                       )?
 581                     )
 582                   )
 583                 |                       # *OR*
 584                   (?P<email>              # email address
 585                     """ + self.email_pattern + r"""
 586                   )
 587                 )
 588                 %(end_string_suffix)s
 589                 """) % args, re.VERBOSE),
 590           pep=re.compile(
 591                 r"""
 592                 %(start_string_prefix)s
 593                 (
 594                   (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
 595                 |
 596                   (PEP\s+(?P<pepnum2>\d+))      # reference by name
 597                 )
 598                 %(end_string_suffix)s""" % args, re.VERBOSE),
 599           rfc=re.compile(
 600                 r"""
 601                 %(start_string_prefix)s
 602                 (RFC(-|\s+)?(?P<rfcnum>\d+))
 603                 %(end_string_suffix)s""" % args, re.VERBOSE))
 604
 605         self.implicit_dispatch.append((self.patterns.uri,
 606                                        self.standalone_uri))
 607         if settings.pep_references:
 608             self.implicit_dispatch.append((self.patterns.pep,
 609                                            self.pep_reference))
 610         if settings.rfc_references:
 611             self.implicit_dispatch.append((self.patterns.rfc,
 612                                            self.rfc_reference))
 613
 614     def parse(self, text, lineno, memo, parent):
 615         # Needs to be refactored for nested inline markup.
 616         # Add nested_parse() method?
 617         """
 618         Return 2 lists: nodes (text and inline elements), and system_messages.
 619
 620         Using `self.patterns.initial`, a pattern which matches start-strings
 621         (emphasis, strong, interpreted, phrase reference, literal,
 622         substitution reference, and inline target) and complete constructs
 623         (simple reference, footnote reference), search for a candidate.  When
 624         one is found, check for validity (e.g., not a quoted '*' character).
 625         If valid, search for the corresponding end string if applicable, and
 626         check it for validity.  If not found or invalid, generate a warning
 627         and ignore the start-string.  Implicit inline markup (e.g. standalone
 628         URIs) is found last.
 629
 630         :text: source string
 631         :lineno: absolute line number (cf. statemachine.get_source_and_line())
 632         """
 633         self.reporter = memo.reporter
 634         self.document = memo.document
 635         self.language = memo.language
 636         self.parent = parent
 637         pattern_search = self.patterns.initial.search
 638         dispatch = self.dispatch
 639         remaining = escape2null(text)
 640         processed = []
 641         unprocessed = []
 642         messages = []
 643         while remaining:
 644             match = pattern_search(remaining)
 645             if match:
 646                 groups = match.groupdict()
 647                 method = dispatch[groups['start'] or groups['backquote']
 648                                   or groups['refend'] or groups['fnend']]
 649                 before, inlines, remaining, sysmessages = method(self, match,
 650                                                                  lineno)
 651                 unprocessed.append(before)
 652                 messages += sysmessages
 653                 if inlines:
 654                     processed += self.implicit_inline(''.join(unprocessed),
 655                                                       lineno)
 656                     processed += inlines
 657                     unprocessed = []
 658             else:
 659                 break
 660         remaining = ''.join(unprocessed) + remaining
 661         if remaining:
 662             processed += self.implicit_inline(remaining, lineno)
 663         return processed, messages
 664
 665     # Inline object recognition
 666     # -------------------------
 667     # See also init_customizations().
 668     non_whitespace_before = r'(?<!\s)'
 669     non_whitespace_escape_before = r'(?<![\s\x00])'
 670     non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
 671     non_whitespace_after = r'(?!\s)'
 672     # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
 673     simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
 674     # Valid URI characters (see RFC 2396 & RFC 2732);
 675     # final \x00 allows backslash escapes in URIs:
 676     uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
 677     # Delimiter indicating the end of a URI (not part of the URI):
 678     uri_end_delim = r"""[>]"""
 679     # Last URI character; same as uric but no punctuation:
 680     urilast = r"""[_~*/=+a-zA-Z0-9]"""
 681     # End of a URI (either 'urilast' or 'uric followed by a
 682     # uri_end_delim'):
 683     uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
 684     emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
 685     email_pattern = r"""
 686           %(emailc)s+(?:\.%(emailc)s+)*   # name
 687           (?<!\x00)@                      # at
 688           %(emailc)s+(?:\.%(emailc)s*)*   # host
 689           %(uri_end)s                     # final URI char
 690           """
 691
 692     def quoted_start(self, match):
 693         """Test if inline markup start-string is 'quoted'.
 694
 695         'Quoted' in this context means the start-string is enclosed in a pair
 696         of matching opening/closing delimiters (not necessarily quotes)
 697         or at the end of the match.
 698         """
 699         string = match.string
 700         start = match.start()
 701         if start == 0:                  # start-string at beginning of text
 702             return False
 703         prestart = string[start - 1]
 704         try:
 705             poststart = string[match.end()]
 706         except IndexError:          # start-string at end of text
 707             return True  # not "quoted" but no markup start-string either
 708         return punctuation_chars.match_chars(prestart, poststart)
 709
 710     def inline_obj(self, match, lineno, end_pattern, nodeclass,
 711                    restore_backslashes=False):
 712         string = match.string
 713         matchstart = match.start('start')
 714         matchend = match.end('start')
 715         if self.quoted_start(match):
 716             return string[:matchend], [], string[matchend:], [], ''
 717         endmatch = end_pattern.search(string[matchend:])
 718         if endmatch and endmatch.start(1):  # 1 or more chars
 719             text = endmatch.string[:endmatch.start(1)]
 720             if restore_backslashes:
 721                 text = unescape(text, True)
 722             textend = matchend + endmatch.end(1)
 723             rawsource = unescape(string[matchstart:textend], True)
 724             node = nodeclass(rawsource, text)
 725             return (string[:matchstart], [node],
 726                     string[textend:], [], endmatch.group(1))
 727         msg = self.reporter.warning(
 728               'Inline %s start-string without end-string.'
 729               % nodeclass.__name__, line=lineno)
 730         text = unescape(string[matchstart:matchend], True)
 731         prb = self.problematic(text, text, msg)
 732         return string[:matchstart], [prb], string[matchend:], [msg], ''
 733
 734     def problematic(self, text, rawsource, message):
 735         msgid = self.document.set_id(message, self.parent)
 736         problematic = nodes.problematic(rawsource, text, refid=msgid)
 737         prbid = self.document.set_id(problematic)
 738         message.add_backref(prbid)
 739         return problematic
 740
 741     def emphasis(self, match, lineno):
 742         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 743               match, lineno, self.patterns.emphasis, nodes.emphasis)
 744         return before, inlines, remaining, sysmessages
 745
 746     def strong(self, match, lineno):
 747         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 748               match, lineno, self.patterns.strong, nodes.strong)
 749         return before, inlines, remaining, sysmessages
 750
 751     def interpreted_or_phrase_ref(self, match, lineno):
 752         end_pattern = self.patterns.interpreted_or_phrase_ref
 753         string = match.string
 754         matchstart = match.start('backquote')
 755         matchend = match.end('backquote')
 756         rolestart = match.start('role')
 757         role = match.group('role')
 758         position = ''
 759         if role:
 760             role = role[1:-1]
 761             position = 'prefix'
 762         elif self.quoted_start(match):
 763             return string[:matchend], [], string[matchend:], []
 764         endmatch = end_pattern.search(string[matchend:])
 765         if endmatch and endmatch.start(1):  # 1 or more chars
 766             textend = matchend + endmatch.end()
 767             if endmatch.group('role'):
 768                 if role:
 769                     msg = self.reporter.warning(
 770                         'Multiple roles in interpreted text (both '
 771                         'prefix and suffix present; only one allowed).',
 772                         line=lineno)
 773                     text = unescape(string[rolestart:textend], True)
 774                     prb = self.problematic(text, text, msg)
 775                     return string[:rolestart], [prb], string[textend:], [msg]
 776                 role = endmatch.group('suffix')[1:-1]
 777                 position = 'suffix'
 778             escaped = endmatch.string[:endmatch.start(1)]
 779             rawsource = unescape(string[matchstart:textend], True)
 780             if rawsource[-1:] == '_':
 781                 if role:
 782                     msg = self.reporter.warning(
 783                           'Mismatch: both interpreted text role %s and '
 784                           'reference suffix.' % position, line=lineno)
 785                     text = unescape(string[rolestart:textend], True)
 786                     prb = self.problematic(text, text, msg)
 787                     return string[:rolestart], [prb], string[textend:], [msg]
 788                 return self.phrase_ref(string[:matchstart], string[textend:],
 789                                        rawsource, escaped)
 790             else:
 791                 rawsource = unescape(string[rolestart:textend], True)
 792                 nodelist, messages = self.interpreted(rawsource, escaped, role,
 793                                                       lineno)
 794                 return (string[:rolestart], nodelist,
 795                         string[textend:], messages)
 796         msg = self.reporter.warning(
 797               'Inline interpreted text or phrase reference start-string '
 798               'without end-string.', line=lineno)
 799         text = unescape(string[matchstart:matchend], True)
 800         prb = self.problematic(text, text, msg)
 801         return string[:matchstart], [prb], string[matchend:], [msg]
 802
 803     def phrase_ref(self, before, after, rawsource, escaped, text=None):
 804         # `text` is ignored (since 0.16)
 805         match = self.patterns.embedded_link.search(escaped)
 806         if match:  # embedded <URI> or <alias_>
 807             text = escaped[:match.start(0)]
 808             unescaped = unescape(text)
 809             rawtext = unescape(text, True)
 810             aliastext = match.group(2)
 811             rawaliastext = unescape(aliastext, True)
 812             underscore_escaped = rawaliastext.endswith(r'\_')
 813             if (aliastext.endswith('_')
 814                 and not (underscore_escaped
 815                          or self.patterns.uri.match(aliastext))):
 816                 aliastype = 'name'
 817                 alias = normalize_name(unescape(aliastext[:-1]))
 818                 target = nodes.target(match.group(1), refname=alias)
 819                 target.indirect_reference_name = whitespace_normalize_name(
 820                                                     unescape(aliastext[:-1]))
 821             else:
 822                 aliastype = 'uri'
 823                 # remove unescaped whitespace
 824                 alias_parts = split_escaped_whitespace(match.group(2))
 825                 alias = ' '.join(''.join(part.split())
 826                                  for part in alias_parts)
 827                 alias = self.adjust_uri(unescape(alias))
 828                 if alias.endswith(r'\_'):
 829                     alias = alias[:-2] + '_'
 830                 target = nodes.target(match.group(1), refuri=alias)
 831                 target.referenced = 1
 832             if not aliastext:
 833                 raise ApplicationError('problem with embedded link: %r'
 834                                        % aliastext)
 835             if not text:
 836                 text = alias
 837                 unescaped = unescape(text)
 838                 rawtext = rawaliastext
 839         else:
 840             text = escaped
 841             unescaped = unescape(text)
 842             target = None
 843             rawtext = unescape(escaped, True)
 844
 845         refname = normalize_name(unescaped)
 846         reference = nodes.reference(rawsource, text,
 847                                     name=whitespace_normalize_name(unescaped))
 848         reference[0].rawsource = rawtext
 849
 850         node_list = [reference]
 851
 852         if rawsource[-2:] == '__':
 853             if target and (aliastype == 'name'):
 854                 reference['refname'] = alias
 855                 self.document.note_refname(reference)
 856                 # self.document.note_indirect_target(target) # required?
 857             elif target and (aliastype == 'uri'):
 858                 reference['refuri'] = alias
 859             else:
 860                 reference['anonymous'] = 1
 861         else:
 862             if target:
 863                 target['names'].append(refname)
 864                 if aliastype == 'name':
 865                     reference['refname'] = alias
 866                     self.document.note_indirect_target(target)
 867                     self.document.note_refname(reference)
 868                 else:
 869                     reference['refuri'] = alias
 870                     self.document.note_explicit_target(target, self.parent)
 871                 # target.note_referenced_by(name=refname)
 872                 node_list.append(target)
 873             else:
 874                 reference['refname'] = refname
 875                 self.document.note_refname(reference)
 876         return before, node_list, after, []
 877
 878     def adjust_uri(self, uri):
 879         match = self.patterns.email.match(uri)
 880         if match:
 881             return 'mailto:' + uri
 882         else:
 883             return uri
 884
 885     def interpreted(self, rawsource, text, role, lineno):
 886         role_fn, messages = roles.role(role, self.language, lineno,
 887                                        self.reporter)
 888         if role_fn:
 889             nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
 890             return nodes, messages + messages2
 891         else:
 892             msg = self.reporter.error(
 893                 'Unknown interpreted text role "%s".' % role,
 894                 line=lineno)
 895             return ([self.problematic(rawsource, rawsource, msg)],
 896                     messages + [msg])
 897
 898     def literal(self, match, lineno):
 899         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 900               match, lineno, self.patterns.literal, nodes.literal,
 901               restore_backslashes=True)
 902         return before, inlines, remaining, sysmessages
 903
 904     def inline_internal_target(self, match, lineno):
 905         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 906               match, lineno, self.patterns.target, nodes.target)
 907         if inlines and isinstance(inlines[0], nodes.target):
 908             assert len(inlines) == 1
 909             target = inlines[0]
 910             name = normalize_name(target.astext())
 911             target['names'].append(name)
 912             self.document.note_explicit_target(target, self.parent)
 913         return before, inlines, remaining, sysmessages
 914
 915     def substitution_reference(self, match, lineno):
 916         before, inlines, remaining, sysmessages, endstring = self.inline_obj(
 917               match, lineno, self.patterns.substitution_ref,
 918               nodes.substitution_reference)
 919         if len(inlines) == 1:
 920             subref_node = inlines[0]
 921             if isinstance(subref_node, nodes.substitution_reference):
 922                 subref_text = subref_node.astext()
 923                 self.document.note_substitution_ref(subref_node, subref_text)
 924                 if endstring[-1:] == '_':
 925                     reference_node = nodes.reference(
 926                         '|%s%s' % (subref_text, endstring), '')
 927                     if endstring[-2:] == '__':
 928                         reference_node['anonymous'] = 1
 929                     else:
 930                         reference_node['refname'] = normalize_name(subref_text)
 931                         self.document.note_refname(reference_node)
 932                     reference_node += subref_node
 933                     inlines = [reference_node]
 934         return before, inlines, remaining, sysmessages
 935
 936     def footnote_reference(self, match, lineno):
 937         """
 938         Handles `nodes.footnote_reference` and `nodes.citation_reference`
 939         elements.
 940         """
 941         label = match.group('footnotelabel')
 942         refname = normalize_name(label)
 943         string = match.string
 944         before = string[:match.start('whole')]
 945         remaining = string[match.end('whole'):]
 946         if match.group('citationlabel'):
 947             refnode = nodes.citation_reference('[%s]_' % label,
 948                                                refname=refname)
 949             refnode += nodes.Text(label)
 950             self.document.note_citation_ref(refnode)
 951         else:
 952             refnode = nodes.footnote_reference('[%s]_' % label)
 953             if refname[0] == '#':
 954                 refname = refname[1:]
 955                 refnode['auto'] = 1
 956                 self.document.note_autofootnote_ref(refnode)
 957             elif refname == '*':
 958                 refname = ''
 959                 refnode['auto'] = '*'
 960                 self.document.note_symbol_footnote_ref(
 961                       refnode)
 962             else:
 963                 refnode += nodes.Text(label)
 964             if refname:
 965                 refnode['refname'] = refname
 966                 self.document.note_footnote_ref(refnode)
 967             if utils.get_trim_footnote_ref_space(self.document.settings):
 968                 before = before.rstrip()
 969         return before, [refnode], remaining, []
 970
 971     def reference(self, match, lineno, anonymous=False):
 972         referencename = match.group('refname')
 973         refname = normalize_name(referencename)
 974         referencenode = nodes.reference(
 975             referencename + match.group('refend'), referencename,
 976             name=whitespace_normalize_name(referencename))
 977         referencenode[0].rawsource = referencename
 978         if anonymous:
 979             referencenode['anonymous'] = 1
 980         else:
 981             referencenode['refname'] = refname
 982             self.document.note_refname(referencenode)
 983         string = match.string
 984         matchstart = match.start('whole')
 985         matchend = match.end('whole')
 986         return string[:matchstart], [referencenode], string[matchend:], []
 987
 988     def anonymous_reference(self, match, lineno):
 989         return self.reference(match, lineno, anonymous=True)
 990
 991     def standalone_uri(self, match, lineno):
 992         if (not match.group('scheme')
 993                 or match.group('scheme').lower() in urischemes.schemes):
 994             if match.group('email'):
 995                 addscheme = 'mailto:'
 996             else:
 997                 addscheme = ''
 998             text = match.group('whole')
 999             refuri = addscheme + unescape(text)
1000             reference = nodes.reference(unescape(text, True), text,
1001                                         refuri=refuri)
1002             return [reference]
1003         else:                   # not a valid scheme
1004             raise MarkupMismatch
1005
1006     def pep_reference(self, match, lineno):
1007         text = match.group(0)
1008         if text.startswith('pep-'):
1009             pepnum = int(unescape(match.group('pepnum1')))
1010         elif text.startswith('PEP'):
1011             pepnum = int(unescape(match.group('pepnum2')))
1012         else:
1013             raise MarkupMismatch
1014         ref = (self.document.settings.pep_base_url
1015                + self.document.settings.pep_file_url_template % pepnum)
1016         return [nodes.reference(unescape(text, True), text, refuri=ref)]
1017
1018     rfc_url = 'rfc%d.html'
1019
1020     def rfc_reference(self, match, lineno):
1021         text = match.group(0)
1022         if text.startswith('RFC'):
1023             rfcnum = int(unescape(match.group('rfcnum')))
1024             ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
1025         else:
1026             raise MarkupMismatch
1027         return [nodes.reference(unescape(text, True), text, refuri=ref)]
1028
1029     def implicit_inline(self, text, lineno):
1030         """
1031         Check each of the patterns in `self.implicit_dispatch` for a match,
1032         and dispatch to the stored method for the pattern.  Recursively check
1033         the text before and after the match.  Return a list of `nodes.Text`
1034         and inline element nodes.
1035         """
1036         if not text:
1037             return []
1038         for pattern, method in self.implicit_dispatch:
1039             match = pattern.search(text)
1040             if match:
1041                 try:
1042                     # Must recurse on strings before *and* after the match;
1043                     # there may be multiple patterns.
1044                     return (self.implicit_inline(text[:match.start()], lineno)
1045                             + method(match, lineno)
1046                             + self.implicit_inline(text[match.end():], lineno))
1047                 except MarkupMismatch:
1048                     pass
1049         return [nodes.Text(text)]
1050
1051     dispatch = {'*': emphasis,
1052                 '**': strong,
1053                 '`': interpreted_or_phrase_ref,
1054                 '``': literal,
1055                 '_`': inline_internal_target,
1056                 ']_': footnote_reference,
1057                 '|': substitution_reference,
1058                 '_': reference,
1059                 '__': anonymous_reference}
1060
1061
1062 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1063     return ord(s) - _zero
1064
1065
1066 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1067     return ord(s) - _zero
1068
1069
1070 def _lowerroman_to_int(s):
1071     return roman.fromRoman(s.upper())
1072
1073
1074 class Body(RSTState):
1075
1076     """
1077     Generic classifier of the first line of a block.
1078     """
1079
1080     double_width_pad_char = tableparser.TableParser.double_width_pad_char
1081     """Padding character for East Asian double-width text."""
1082
1083     enum = Struct()
1084     """Enumerated list parsing information."""
1085
1086     enum.formatinfo = {
1087           'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1088           'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1089           'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1090     enum.formats = enum.formatinfo.keys()
1091     enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1092                       'lowerroman', 'upperroman']  # ORDERED!
1093     enum.sequencepats = {'arabic': '[0-9]+',
1094                          'loweralpha': '[a-z]',
1095                          'upperalpha': '[A-Z]',
1096                          'lowerroman': '[ivxlcdm]+',
1097                          'upperroman': '[IVXLCDM]+'}
1098     enum.converters = {'arabic': int,
1099                        'loweralpha': _loweralpha_to_int,
1100                        'upperalpha': _upperalpha_to_int,
1101                        'lowerroman': _lowerroman_to_int,
1102                        'upperroman': roman.fromRoman}
1103
1104     enum.sequenceregexps = {}
1105     for sequence in enum.sequences:
1106         enum.sequenceregexps[sequence] = re.compile(
1107               enum.sequencepats[sequence] + '$')
1108
1109     grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1110     """Matches the top (& bottom) of a full table)."""
1111
1112     simple_table_top_pat = re.compile('=+( +=+)+ *$')
1113     """Matches the top of a simple table."""
1114
1115     simple_table_border_pat = re.compile('=+[ =]*$')
1116     """Matches the bottom & header bottom of a simple table."""
1117
1118     pats = {}
1119     """Fragments of patterns used by transitions."""
1120
1121     pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1122     pats['alpha'] = '[a-zA-Z]'
1123     pats['alphanum'] = '[a-zA-Z0-9]'
1124     pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1125     pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1126                     '|%(upperroman)s|#)' % enum.sequencepats)
1127     pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1128     # @@@ Loosen up the pattern?  Allow Unicode?
1129     pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1130     pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1131     pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1132     pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1133
1134     for format in enum.formats:
1135         pats[format] = '(?P<%s>%s%s%s)' % (
1136               format, re.escape(enum.formatinfo[format].prefix),
1137               pats['enum'], re.escape(enum.formatinfo[format].suffix))
1138
1139     patterns = {
1140           'bullet': '[-+*\u2022\u2023\u2043]( +|$)',
1141           'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1142           'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
1143           'option_marker': r'%(option)s(, %(option)s)*(  +| ?$)' % pats,
1144           'doctest': r'>>>( +|$)',
1145           'line_block': r'\|( +|$)',
1146           'grid_table_top': grid_table_top_pat,
1147           'simple_table_top': simple_table_top_pat,
1148           'explicit_markup': r'\.\.( +|$)',
1149           'anonymous': r'__( +|$)',
1150           'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1151           'text': r''}
1152     initial_transitions = (
1153           'bullet',
1154           'enumerator',
1155           'field_marker',
1156           'option_marker',
1157           'doctest',
1158           'line_block',
1159           'grid_table_top',
1160           'simple_table_top',
1161           'explicit_markup',
1162           'anonymous',
1163           'line',
1164           'text')
1165
1166     def indent(self, match, context, next_state):
1167         """Block quote."""
1168         (indented, indent, line_offset, blank_finish
1169          ) = self.state_machine.get_indented()
1170         elements = self.block_quote(indented, line_offset)
1171         self.parent += elements
1172         if not blank_finish:
1173             self.parent += self.unindent_warning('Block quote')
1174         return context, next_state, []
1175
1176     def block_quote(self, indented, line_offset):
1177         elements = []
1178         while indented:
1179             blockquote = nodes.block_quote(rawsource='\n'.join(indented))
1180             (blockquote.source, blockquote.line
1181              ) = self.state_machine.get_source_and_line(line_offset+1)
1182             (blockquote_lines,
1183              attribution_lines,
1184              attribution_offset,
1185              indented,
1186              new_line_offset) = self.split_attribution(indented, line_offset)
1187             self.nested_parse(blockquote_lines, line_offset, blockquote)
1188             elements.append(blockquote)
1189             if attribution_lines:
1190                 attribution, messages = self.parse_attribution(
1191                     attribution_lines, line_offset+attribution_offset)
1192                 blockquote += attribution
1193                 elements += messages
1194             line_offset = new_line_offset
1195             while indented and not indented[0]:
1196                 indented = indented[1:]
1197                 line_offset += 1
1198         return elements
1199
1200     # U+2014 is an em-dash:
1201     attribution_pattern = re.compile('(---?(?!-)|\u2014) *(?=[^ \\n])')
1202
1203     def split_attribution(self, indented, line_offset):
1204         """
1205         Check for a block quote attribution and split it off:
1206
1207         * First line after a blank line must begin with a dash ("--", "---",
1208           em-dash; matches `self.attribution_pattern`).
1209         * Every line after that must have consistent indentation.
1210         * Attributions must be preceded by block quote content.
1211
1212         Return a tuple of: (block quote content lines, attribution lines,
1213         attribution offset, remaining indented lines, remaining lines offset).
1214         """
1215         blank = None
1216         nonblank_seen = False
1217         for i in range(len(indented)):
1218             line = indented[i].rstrip()
1219             if line:
1220                 if nonblank_seen and blank == i - 1:  # last line blank
1221                     match = self.attribution_pattern.match(line)
1222                     if match:
1223                         attribution_end, indent = self.check_attribution(
1224                             indented, i)
1225                         if attribution_end:
1226                             a_lines = indented[i:attribution_end]
1227                             a_lines.trim_left(match.end(), end=1)
1228                             a_lines.trim_left(indent, start=1)
1229                             return (indented[:i], a_lines,
1230                                     i, indented[attribution_end:],
1231                                     line_offset + attribution_end)
1232                 nonblank_seen = True
1233             else:
1234                 blank = i
1235         else:
1236             return indented, None, None, None, None
1237
1238     def check_attribution(self, indented, attribution_start):
1239         """
1240         Check attribution shape.
1241         Return the index past the end of the attribution, and the indent.
1242         """
1243         indent = None
1244         i = attribution_start + 1
1245         for i in range(attribution_start + 1, len(indented)):
1246             line = indented[i].rstrip()
1247             if not line:
1248                 break
1249             if indent is None:
1250                 indent = len(line) - len(line.lstrip())
1251             elif len(line) - len(line.lstrip()) != indent:
1252                 return None, None       # bad shape; not an attribution
1253         else:
1254             # return index of line after last attribution line:
1255             i += 1
1256         return i, (indent or 0)
1257
1258     def parse_attribution(self, indented, line_offset):
1259         text = '\n'.join(indented).rstrip()
1260         lineno = 1 + line_offset  # line_offset is zero-based
1261         textnodes, messages = self.inline_text(text, lineno)
1262         node = nodes.attribution(text, '', *textnodes)
1263         node.source, node.line = self.state_machine.get_source_and_line(lineno)
1264         return node, messages
1265
1266     def bullet(self, match, context, next_state):
1267         """Bullet list item."""
1268         ul = nodes.bullet_list()
1269         ul.source, ul.line = self.state_machine.get_source_and_line()
1270         self.parent += ul
1271         ul['bullet'] = match.string[0]
1272         i, blank_finish = self.list_item(match.end())
1273         ul += i
1274         offset = self.state_machine.line_offset + 1   # next line
1275         new_line_offset, blank_finish = self.nested_list_parse(
1276               self.state_machine.input_lines[offset:],
1277               input_offset=self.state_machine.abs_line_offset() + 1,
1278               node=ul, initial_state='BulletList',
1279               blank_finish=blank_finish)
1280         self.goto_line(new_line_offset)
1281         if not blank_finish:
1282             self.parent += self.unindent_warning('Bullet list')
1283         return [], next_state, []
1284
1285     def list_item(self, indent):
1286         src, srcline = self.state_machine.get_source_and_line()
1287         if self.state_machine.line[indent:]:
1288             indented, line_offset, blank_finish = (
1289                 self.state_machine.get_known_indented(indent))
1290         else:
1291             indented, indent, line_offset, blank_finish = (
1292                 self.state_machine.get_first_known_indented(indent))
1293         listitem = nodes.list_item('\n'.join(indented))
1294         listitem.source, listitem.line = src, srcline
1295         if indented:
1296             self.nested_parse(indented, input_offset=line_offset,
1297                               node=listitem)
1298         return listitem, blank_finish
1299
1300     def enumerator(self, match, context, next_state):
1301         """Enumerated List Item"""
1302         format, sequence, text, ordinal = self.parse_enumerator(match)
1303         if not self.is_enumerated_list_item(ordinal, sequence, format):
1304             raise statemachine.TransitionCorrection('text')
1305         enumlist = nodes.enumerated_list()
1306         self.parent += enumlist
1307         if sequence == '#':
1308             enumlist['enumtype'] = 'arabic'
1309         else:
1310             enumlist['enumtype'] = sequence
1311         enumlist['prefix'] = self.enum.formatinfo[format].prefix
1312         enumlist['suffix'] = self.enum.formatinfo[format].suffix
1313         if ordinal != 1:
1314             enumlist['start'] = ordinal
1315             msg = self.reporter.info(
1316                 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1317                 % (text, ordinal))
1318             self.parent += msg
1319         listitem, blank_finish = self.list_item(match.end())
1320         enumlist += listitem
1321         offset = self.state_machine.line_offset + 1   # next line
1322         newline_offset, blank_finish = self.nested_list_parse(
1323               self.state_machine.input_lines[offset:],
1324               input_offset=self.state_machine.abs_line_offset() + 1,
1325               node=enumlist, initial_state='EnumeratedList',
1326               blank_finish=blank_finish,
1327               extra_settings={'lastordinal': ordinal,
1328                               'format': format,
1329                               'auto': sequence == '#'})
1330         self.goto_line(newline_offset)
1331         if not blank_finish:
1332             self.parent += self.unindent_warning('Enumerated list')
1333         return [], next_state, []
1334
1335     def parse_enumerator(self, match, expected_sequence=None):
1336         """
1337         Analyze an enumerator and return the results.
1338
1339         :Return:
1340             - the enumerator format ('period', 'parens', or 'rparen'),
1341             - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1342             - the text of the enumerator, stripped of formatting, and
1343             - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1344               ``None`` is returned for invalid enumerator text).
1345
1346         The enumerator format has already been determined by the regular
1347         expression match. If `expected_sequence` is given, that sequence is
1348         tried first. If not, we check for Roman numeral 1. This way,
1349         single-character Roman numerals (which are also alphabetical) can be
1350         matched. If no sequence has been matched, all sequences are checked in
1351         order.
1352         """
1353         groupdict = match.groupdict()
1354         sequence = ''
1355         for format in self.enum.formats:
1356             if groupdict[format]:       # was this the format matched?
1357                 break                   # yes; keep `format`
1358         else:                           # shouldn't happen
1359             raise ParserError('enumerator format not matched')
1360         text = groupdict[format][self.enum.formatinfo[format].start     # noqa: E203,E501
1361                                  : self.enum.formatinfo[format].end]
1362         if text == '#':
1363             sequence = '#'
1364         elif expected_sequence:
1365             try:
1366                 if self.enum.sequenceregexps[expected_sequence].match(text):
1367                     sequence = expected_sequence
1368             except KeyError:            # shouldn't happen
1369                 raise ParserError('unknown enumerator sequence: %s'
1370                                   % sequence)
1371         elif text == 'i':
1372             sequence = 'lowerroman'
1373         elif text == 'I':
1374             sequence = 'upperroman'
1375         if not sequence:
1376             for sequence in self.enum.sequences:
1377                 if self.enum.sequenceregexps[sequence].match(text):
1378                     break
1379             else:                       # shouldn't happen
1380                 raise ParserError('enumerator sequence not matched')
1381         if sequence == '#':
1382             ordinal = 1
1383         else:
1384             try:
1385                 ordinal = self.enum.converters[sequence](text)
1386             except roman.InvalidRomanNumeralError:
1387                 ordinal = None
1388         return format, sequence, text, ordinal
1389
1390     def is_enumerated_list_item(self, ordinal, sequence, format):
1391         """
1392         Check validity based on the ordinal value and the second line.
1393
1394         Return true if the ordinal is valid and the second line is blank,
1395         indented, or starts with the next enumerator or an auto-enumerator.
1396         """
1397         if ordinal is None:
1398             return None
1399         try:
1400             next_line = self.state_machine.next_line()
1401         except EOFError:              # end of input lines
1402             self.state_machine.previous_line()
1403             return 1
1404         else:
1405             self.state_machine.previous_line()
1406         if not next_line[:1].strip():   # blank or indented
1407             return 1
1408         result = self.make_enumerator(ordinal + 1, sequence, format)
1409         if result:
1410             next_enumerator, auto_enumerator = result
1411             try:
1412                 if (next_line.startswith(next_enumerator)
1413                     or next_line.startswith(auto_enumerator)):
1414                     return 1
1415             except TypeError:
1416                 pass
1417         return None
1418
1419     def make_enumerator(self, ordinal, sequence, format):
1420         """
1421         Construct and return the next enumerated list item marker, and an
1422         auto-enumerator ("#" instead of the regular enumerator).
1423
1424         Return ``None`` for invalid (out of range) ordinals.
1425         """
1426         if sequence == '#':
1427             enumerator = '#'
1428         elif sequence == 'arabic':
1429             enumerator = str(ordinal)
1430         else:
1431             if sequence.endswith('alpha'):
1432                 if ordinal > 26:
1433                     return None
1434                 enumerator = chr(ordinal + ord('a') - 1)
1435             elif sequence.endswith('roman'):
1436                 try:
1437                     enumerator = roman.toRoman(ordinal)
1438                 except roman.RomanError:
1439                     return None
1440             else:                       # shouldn't happen
1441                 raise ParserError('unknown enumerator sequence: "%s"'
1442                                   % sequence)
1443             if sequence.startswith('lower'):
1444                 enumerator = enumerator.lower()
1445             elif sequence.startswith('upper'):
1446                 enumerator = enumerator.upper()
1447             else:                       # shouldn't happen
1448                 raise ParserError('unknown enumerator sequence: "%s"'
1449                                   % sequence)
1450         formatinfo = self.enum.formatinfo[format]
1451         next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1452                            + ' ')
1453         auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1454         return next_enumerator, auto_enumerator
1455
1456     def field_marker(self, match, context, next_state):
1457         """Field list item."""
1458         field_list = nodes.field_list()
1459         self.parent += field_list
1460         field, blank_finish = self.field(match)
1461         field_list += field
1462         offset = self.state_machine.line_offset + 1   # next line
1463         newline_offset, blank_finish = self.nested_list_parse(
1464               self.state_machine.input_lines[offset:],
1465               input_offset=self.state_machine.abs_line_offset() + 1,
1466               node=field_list, initial_state='FieldList',
1467               blank_finish=blank_finish)
1468         self.goto_line(newline_offset)
1469         if not blank_finish:
1470             self.parent += self.unindent_warning('Field list')
1471         return [], next_state, []
1472
1473     def field(self, match):
1474         name = self.parse_field_marker(match)
1475         src, srcline = self.state_machine.get_source_and_line()
1476         lineno = self.state_machine.abs_line_number()
1477         (indented, indent, line_offset, blank_finish
1478          ) = self.state_machine.get_first_known_indented(match.end())
1479         field_node = nodes.field()
1480         field_node.source = src
1481         field_node.line = srcline
1482         name_nodes, name_messages = self.inline_text(name, lineno)
1483         field_node += nodes.field_name(name, '', *name_nodes)
1484         field_body = nodes.field_body('\n'.join(indented), *name_messages)
1485         field_node += field_body
1486         if indented:
1487             self.parse_field_body(indented, line_offset, field_body)
1488         return field_node, blank_finish
1489
1490     def parse_field_marker(self, match):
1491         """Extract & return field name from a field marker match."""
1492         field = match.group()[1:]         # strip off leading ':'
1493         field = field[:field.rfind(':')]  # strip off trailing ':' etc.
1494         return field
1495
1496     def parse_field_body(self, indented, offset, node):
1497         self.nested_parse(indented, input_offset=offset, node=node)
1498
1499     def option_marker(self, match, context, next_state):
1500         """Option list item."""
1501         optionlist = nodes.option_list()
1502         (optionlist.source, optionlist.line
1503          ) = self.state_machine.get_source_and_line()
1504         try:
1505             listitem, blank_finish = self.option_list_item(match)
1506         except MarkupError as error:
1507             # This shouldn't happen; pattern won't match.
1508             msg = self.reporter.error('Invalid option list marker: %s'
1509                                       % error)
1510             self.parent += msg
1511             (indented, indent, line_offset, blank_finish
1512              ) = self.state_machine.get_first_known_indented(match.end())
1513             elements = self.block_quote(indented, line_offset)
1514             self.parent += elements
1515             if not blank_finish:
1516                 self.parent += self.unindent_warning('Option list')
1517             return [], next_state, []
1518         self.parent += optionlist
1519         optionlist += listitem
1520         offset = self.state_machine.line_offset + 1   # next line
1521         newline_offset, blank_finish = self.nested_list_parse(
1522               self.state_machine.input_lines[offset:],
1523               input_offset=self.state_machine.abs_line_offset() + 1,
1524               node=optionlist, initial_state='OptionList',
1525               blank_finish=blank_finish)
1526         self.goto_line(newline_offset)
1527         if not blank_finish:
1528             self.parent += self.unindent_warning('Option list')
1529         return [], next_state, []
1530
1531     def option_list_item(self, match):
1532         offset = self.state_machine.abs_line_offset()
1533         options = self.parse_option_marker(match)
1534         (indented, indent, line_offset, blank_finish
1535          ) = self.state_machine.get_first_known_indented(match.end())
1536         if not indented:                # not an option list item
1537             self.goto_line(offset)
1538             raise statemachine.TransitionCorrection('text')
1539         option_group = nodes.option_group('', *options)
1540         description = nodes.description('\n'.join(indented))
1541         option_list_item = nodes.option_list_item('', option_group,
1542                                                   description)
1543         if indented:
1544             self.nested_parse(indented, input_offset=line_offset,
1545                               node=description)
1546         return option_list_item, blank_finish
1547
1548     def parse_option_marker(self, match):
1549         """
1550         Return a list of `node.option` and `node.option_argument` objects,
1551         parsed from an option marker match.
1552
1553         :Exception: `MarkupError` for invalid option markers.
1554         """
1555         optlist = []
1556         # split at ", ", except inside < > (complex arguments)
1557         optionstrings = re.split(r', (?![^<]*>)', match.group().rstrip())
1558         for optionstring in optionstrings:
1559             tokens = optionstring.split()
1560             delimiter = ' '
1561             firstopt = tokens[0].split('=', 1)
1562             if len(firstopt) > 1:
1563                 # "--opt=value" form
1564                 tokens[:1] = firstopt
1565                 delimiter = '='
1566             elif (len(tokens[0]) > 2
1567                   and ((tokens[0].startswith('-')
1568                         and not tokens[0].startswith('--'))
1569                        or tokens[0].startswith('+'))):
1570                 # "-ovalue" form
1571                 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1572                 delimiter = ''
1573             if len(tokens) > 1 and (tokens[1].startswith('<')
1574                                     and tokens[-1].endswith('>')):
1575                 # "-o <value1 value2>" form; join all values into one token
1576                 tokens[1:] = [' '.join(tokens[1:])]
1577             if 0 < len(tokens) <= 2:
1578                 option = nodes.option(optionstring)
1579                 option += nodes.option_string(tokens[0], tokens[0])
1580                 if len(tokens) > 1:
1581                     option += nodes.option_argument(tokens[1], tokens[1],
1582                                                     delimiter=delimiter)
1583                 optlist.append(option)
1584             else:
1585                 raise MarkupError(
1586                     'wrong number of option tokens (=%s), should be 1 or 2: '
1587                     '"%s"' % (len(tokens), optionstring))
1588         return optlist
1589
1590     def doctest(self, match, context, next_state):
1591         data = '\n'.join(self.state_machine.get_text_block())
1592         # TODO: prepend class value ['pycon'] (Python Console)
1593         # parse with `directives.body.CodeBlock` (returns literal-block
1594         # with class "code" and syntax highlight markup).
1595         self.parent += nodes.doctest_block(data, data)
1596         return [], next_state, []
1597
1598     def line_block(self, match, context, next_state):
1599         """First line of a line block."""
1600         block = nodes.line_block()
1601         self.parent += block
1602         lineno = self.state_machine.abs_line_number()
1603         line, messages, blank_finish = self.line_block_line(match, lineno)
1604         block += line
1605         self.parent += messages
1606         if not blank_finish:
1607             offset = self.state_machine.line_offset + 1   # next line
1608             new_line_offset, blank_finish = self.nested_list_parse(
1609                   self.state_machine.input_lines[offset:],
1610                   input_offset=self.state_machine.abs_line_offset() + 1,
1611                   node=block, initial_state='LineBlock',
1612                   blank_finish=0)
1613             self.goto_line(new_line_offset)
1614         if not blank_finish:
1615             self.parent += self.reporter.warning(
1616                 'Line block ends without a blank line.',
1617                 line=lineno+1)
1618         if len(block):
1619             if block[0].indent is None:
1620                 block[0].indent = 0
1621             self.nest_line_block_lines(block)
1622         return [], next_state, []
1623
1624     def line_block_line(self, match, lineno):
1625         """Return one line element of a line_block."""
1626         (indented, indent, line_offset, blank_finish
1627          ) = self.state_machine.get_first_known_indented(match.end(),
1628                                                          until_blank=True)
1629         text = '\n'.join(indented)
1630         text_nodes, messages = self.inline_text(text, lineno)
1631         line = nodes.line(text, '', *text_nodes)
1632         if match.string.rstrip() != '|':  # not empty
1633             line.indent = len(match.group(1)) - 1
1634         return line, messages, blank_finish
1635
1636     def nest_line_block_lines(self, block):
1637         for index in range(1, len(block)):
1638             if getattr(block[index], 'indent', None) is None:
1639                 block[index].indent = block[index - 1].indent
1640         self.nest_line_block_segment(block)
1641
1642     def nest_line_block_segment(self, block):
1643         indents = [item.indent for item in block]
1644         least = min(indents)
1645         new_items = []
1646         new_block = nodes.line_block()
1647         for item in block:
1648             if item.indent > least:
1649                 new_block.append(item)
1650             else:
1651                 if len(new_block):
1652                     self.nest_line_block_segment(new_block)
1653                     new_items.append(new_block)
1654                     new_block = nodes.line_block()
1655                 new_items.append(item)
1656         if len(new_block):
1657             self.nest_line_block_segment(new_block)
1658             new_items.append(new_block)
1659         block[:] = new_items
1660
1661     def grid_table_top(self, match, context, next_state):
1662         """Top border of a full table."""
1663         return self.table_top(match, context, next_state,
1664                               self.isolate_grid_table,
1665                               tableparser.GridTableParser)
1666
1667     def simple_table_top(self, match, context, next_state):
1668         """Top border of a simple table."""
1669         return self.table_top(match, context, next_state,
1670                               self.isolate_simple_table,
1671                               tableparser.SimpleTableParser)
1672
1673     def table_top(self, match, context, next_state,
1674                   isolate_function, parser_class):
1675         """Top border of a generic table."""
1676         nodelist, blank_finish = self.table(isolate_function, parser_class)
1677         self.parent += nodelist
1678         if not blank_finish:
1679             msg = self.reporter.warning(
1680                 'Blank line required after table.',
1681                 line=self.state_machine.abs_line_number()+1)
1682             self.parent += msg
1683         return [], next_state, []
1684
1685     def table(self, isolate_function, parser_class):
1686         """Parse a table."""
1687         block, messages, blank_finish = isolate_function()
1688         if block:
1689             try:
1690                 parser = parser_class()
1691                 tabledata = parser.parse(block)
1692                 tableline = (self.state_machine.abs_line_number() - len(block)
1693                              + 1)
1694                 table = self.build_table(tabledata, tableline)
1695                 nodelist = [table] + messages
1696             except tableparser.TableMarkupError as err:
1697                 nodelist = self.malformed_table(block, ' '.join(err.args),
1698                                                 offset=err.offset) + messages
1699         else:
1700             nodelist = messages
1701         return nodelist, blank_finish
1702
1703     def isolate_grid_table(self):
1704         messages = []
1705         blank_finish = 1
1706         try:
1707             block = self.state_machine.get_text_block(flush_left=True)
1708         except statemachine.UnexpectedIndentationError as err:
1709             block, src, srcline = err.args
1710             messages.append(self.reporter.error('Unexpected indentation.',
1711                                                 source=src, line=srcline))
1712             blank_finish = 0
1713         block.disconnect()
1714         # for East Asian chars:
1715         block.pad_double_width(self.double_width_pad_char)
1716         width = len(block[0].strip())
1717         for i in range(len(block)):
1718             block[i] = block[i].strip()
1719             if block[i][0] not in '+|':  # check left edge
1720                 blank_finish = 0
1721                 self.state_machine.previous_line(len(block) - i)
1722                 del block[i:]
1723                 break
1724         if not self.grid_table_top_pat.match(block[-1]):  # find bottom
1725             blank_finish = 0
1726             # from second-last to third line of table:
1727             for i in range(len(block) - 2, 1, -1):
1728                 if self.grid_table_top_pat.match(block[i]):
1729                     self.state_machine.previous_line(len(block) - i + 1)
1730                     del block[i+1:]
1731                     break
1732             else:
1733                 messages.extend(self.malformed_table(block))
1734                 return [], messages, blank_finish
1735         for i in range(len(block)):     # check right edge
1736             if len(block[i]) != width or block[i][-1] not in '+|':
1737                 messages.extend(self.malformed_table(block))
1738                 return [], messages, blank_finish
1739         return block, messages, blank_finish
1740
1741     def isolate_simple_table(self):
1742         start = self.state_machine.line_offset
1743         lines = self.state_machine.input_lines
1744         limit = len(lines) - 1
1745         toplen = len(lines[start].strip())
1746         pattern_match = self.simple_table_border_pat.match
1747         found = 0
1748         found_at = None
1749         i = start + 1
1750         while i <= limit:
1751             line = lines[i]
1752             match = pattern_match(line)
1753             if match:
1754                 if len(line.strip()) != toplen:
1755                     self.state_machine.next_line(i - start)
1756                     messages = self.malformed_table(
1757                         lines[start:i+1], 'Bottom/header table border does '
1758                         'not match top border.')
1759                     return [], messages, i == limit or not lines[i+1].strip()
1760                 found += 1
1761                 found_at = i
1762                 if found == 2 or i == limit or not lines[i+1].strip():
1763                     end = i
1764                     break
1765             i += 1
1766         else:                           # reached end of input_lines
1767             if found:
1768                 extra = ' or no blank line after table bottom'
1769                 self.state_machine.next_line(found_at - start)
1770                 block = lines[start:found_at+1]
1771             else:
1772                 extra = ''
1773                 self.state_machine.next_line(i - start - 1)
1774                 block = lines[start:]
1775             messages = self.malformed_table(
1776                 block, 'No bottom table border found%s.' % extra)
1777             return [], messages, not extra
1778         self.state_machine.next_line(end - start)
1779         block = lines[start:end+1]
1780         # for East Asian chars:
1781         block.pad_double_width(self.double_width_pad_char)
1782         return block, [], end == limit or not lines[end+1].strip()
1783
1784     def malformed_table(self, block, detail='', offset=0):
1785         block.replace(self.double_width_pad_char, '')
1786         data = '\n'.join(block)
1787         message = 'Malformed table.'
1788         startline = self.state_machine.abs_line_number() - len(block) + 1
1789         if detail:
1790             message += '\n' + detail
1791         error = self.reporter.error(message, nodes.literal_block(data, data),
1792                                     line=startline+offset)
1793         return [error]
1794
1795     def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1796         colwidths, headrows, bodyrows = tabledata
1797         table = nodes.table()
1798         if widths == 'auto':
1799             table['classes'] += ['colwidths-auto']
1800         elif widths:  # "grid" or list of integers
1801             table['classes'] += ['colwidths-given']
1802         tgroup = nodes.tgroup(cols=len(colwidths))
1803         table += tgroup
1804         for colwidth in colwidths:
1805             colspec = nodes.colspec(colwidth=colwidth)
1806             if stub_columns:
1807                 colspec.attributes['stub'] = 1
1808                 stub_columns -= 1
1809             tgroup += colspec
1810         if headrows:
1811             thead = nodes.thead()
1812             tgroup += thead
1813             for row in headrows:
1814                 thead += self.build_table_row(row, tableline)
1815         tbody = nodes.tbody()
1816         tgroup += tbody
1817         for row in bodyrows:
1818             tbody += self.build_table_row(row, tableline)
1819         return table
1820
1821     def build_table_row(self, rowdata, tableline):
1822         row = nodes.row()
1823         for cell in rowdata:
1824             if cell is None:
1825                 continue
1826             morerows, morecols, offset, cellblock = cell
1827             attributes = {}
1828             if morerows:
1829                 attributes['morerows'] = morerows
1830             if morecols:
1831                 attributes['morecols'] = morecols
1832             entry = nodes.entry(**attributes)
1833             row += entry
1834             if ''.join(cellblock):
1835                 self.nested_parse(cellblock, input_offset=tableline+offset,
1836                                   node=entry)
1837         return row
1838
1839     explicit = Struct()
1840     """Patterns and constants used for explicit markup recognition."""
1841
1842     explicit.patterns = Struct(
1843           target=re.compile(r"""
1844                             (
1845                               _               # anonymous target
1846                             |               # *OR*
1847                               (?!_)           # no underscore at the beginning
1848                               (?P<quote>`?)   # optional open quote
1849                               (?![ `])        # first char. not space or
1850                                               # backquote
1851                               (?P<name>       # reference name
1852                                 .+?
1853                               )
1854                               %(non_whitespace_escape_before)s
1855                               (?P=quote)      # close quote if open quote used
1856                             )
1857                             (?<!(?<!\x00):) # no unescaped colon at end
1858                             %(non_whitespace_escape_before)s
1859                             [ ]?            # optional space
1860                             :               # end of reference name
1861                             ([ ]+|$)        # followed by whitespace
1862                             """ % vars(Inliner), re.VERBOSE),
1863           reference=re.compile(r"""
1864                                (
1865                                  (?P<simple>%(simplename)s)_
1866                                |                  # *OR*
1867                                  `                  # open backquote
1868                                  (?![ ])            # not space
1869                                  (?P<phrase>.+?)    # hyperlink phrase
1870                                  %(non_whitespace_escape_before)s
1871                                  `_                 # close backquote,
1872                                                     # reference mark
1873                                )
1874                                $                  # end of string
1875                                """ % vars(Inliner), re.VERBOSE),
1876           substitution=re.compile(r"""
1877                                   (
1878                                     (?![ ])          # first char. not space
1879                                     (?P<name>.+?)    # substitution text
1880                                     %(non_whitespace_escape_before)s
1881                                     \|               # close delimiter
1882                                   )
1883                                   ([ ]+|$)           # followed by whitespace
1884                                   """ % vars(Inliner),
1885                                   re.VERBOSE),)
1886
1887     def footnote(self, match):
1888         src, srcline = self.state_machine.get_source_and_line()
1889         (indented, indent, offset, blank_finish
1890          ) = self.state_machine.get_first_known_indented(match.end())
1891         label = match.group(1)
1892         name = normalize_name(label)
1893         footnote = nodes.footnote('\n'.join(indented))
1894         footnote.source = src
1895         footnote.line = srcline
1896         if name[0] == '#':              # auto-numbered
1897             name = name[1:]             # autonumber label
1898             footnote['auto'] = 1
1899             if name:
1900                 footnote['names'].append(name)
1901             self.document.note_autofootnote(footnote)
1902         elif name == '*':               # auto-symbol
1903             name = ''
1904             footnote['auto'] = '*'
1905             self.document.note_symbol_footnote(footnote)
1906         else:                           # manually numbered
1907             footnote += nodes.label('', label)
1908             footnote['names'].append(name)
1909             self.document.note_footnote(footnote)
1910         if name:
1911             self.document.note_explicit_target(footnote, footnote)
1912         else:
1913             self.document.set_id(footnote, footnote)
1914         if indented:
1915             self.nested_parse(indented, input_offset=offset, node=footnote)
1916         return [footnote], blank_finish
1917
1918     def citation(self, match):
1919         src, srcline = self.state_machine.get_source_and_line()
1920         (indented, indent, offset, blank_finish
1921          ) = self.state_machine.get_first_known_indented(match.end())
1922         label = match.group(1)
1923         name = normalize_name(label)
1924         citation = nodes.citation('\n'.join(indented))
1925         citation.source = src
1926         citation.line = srcline
1927         citation += nodes.label('', label)
1928         citation['names'].append(name)
1929         self.document.note_citation(citation)
1930         self.document.note_explicit_target(citation, citation)
1931         if indented:
1932             self.nested_parse(indented, input_offset=offset, node=citation)
1933         return [citation], blank_finish
1934
1935     def hyperlink_target(self, match):
1936         pattern = self.explicit.patterns.target
1937         lineno = self.state_machine.abs_line_number()
1938         (block, indent, offset, blank_finish
1939          ) = self.state_machine.get_first_known_indented(
1940                  match.end(), until_blank=True, strip_indent=False)
1941         blocktext = match.string[:match.end()] + '\n'.join(block)
1942         block = [escape2null(line) for line in block]
1943         escaped = block[0]
1944         blockindex = 0
1945         while True:
1946             targetmatch = pattern.match(escaped)
1947             if targetmatch:
1948                 break
1949             blockindex += 1
1950             try:
1951                 escaped += block[blockindex]
1952             except IndexError:
1953                 raise MarkupError('malformed hyperlink target.')
1954         del block[:blockindex]
1955         block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1956         target = self.make_target(block, blocktext, lineno,
1957                                   targetmatch.group('name'))
1958         return [target], blank_finish
1959
1960     def make_target(self, block, block_text, lineno, target_name):
1961         target_type, data = self.parse_target(block, block_text, lineno)
1962         if target_type == 'refname':
1963             target = nodes.target(block_text, '', refname=normalize_name(data))
1964             target.indirect_reference_name = data
1965             self.add_target(target_name, '', target, lineno)
1966             self.document.note_indirect_target(target)
1967             return target
1968         elif target_type == 'refuri':
1969             target = nodes.target(block_text, '')
1970             self.add_target(target_name, data, target, lineno)
1971             return target
1972         else:
1973             return data
1974
1975     def parse_target(self, block, block_text, lineno):
1976         """
1977         Determine the type of reference of a target.
1978
1979         :Return: A 2-tuple, one of:
1980
1981             - 'refname' and the indirect reference name
1982             - 'refuri' and the URI
1983             - 'malformed' and a system_message node
1984         """
1985         if block and block[-1].strip()[-1:] == '_':  # possible indirect target
1986             reference = ' '.join(line.strip() for line in block)
1987             refname = self.is_reference(reference)
1988             if refname:
1989                 return 'refname', refname
1990         ref_parts = split_escaped_whitespace(' '.join(block))
1991         reference = ' '.join(''.join(unescape(part).split())
1992                              for part in ref_parts)
1993         return 'refuri', reference
1994
1995     def is_reference(self, reference):
1996         match = self.explicit.patterns.reference.match(
1997             whitespace_normalize_name(reference))
1998         if not match:
1999             return None
2000         return unescape(match.group('simple') or match.group('phrase'))
2001
2002     def add_target(self, targetname, refuri, target, lineno):
2003         target.line = lineno
2004         if targetname:
2005             name = normalize_name(unescape(targetname))
2006             target['names'].append(name)
2007             if refuri:
2008                 uri = self.inliner.adjust_uri(refuri)
2009                 if uri:
2010                     target['refuri'] = uri
2011                 else:
2012                     raise ApplicationError('problem with URI: %r' % refuri)
2013             self.document.note_explicit_target(target, self.parent)
2014         else:                       # anonymous target
2015             if refuri:
2016                 target['refuri'] = refuri
2017             target['anonymous'] = 1
2018             self.document.note_anonymous_target(target)
2019
2020     def substitution_def(self, match):
2021         pattern = self.explicit.patterns.substitution
2022         src, srcline = self.state_machine.get_source_and_line()
2023         (block, indent, offset, blank_finish
2024          ) = self.state_machine.get_first_known_indented(match.end(),
2025                                                          strip_indent=False)
2026         blocktext = (match.string[:match.end()] + '\n'.join(block))
2027         block.disconnect()
2028         escaped = escape2null(block[0].rstrip())
2029         blockindex = 0
2030         while True:
2031             subdefmatch = pattern.match(escaped)
2032             if subdefmatch:
2033                 break
2034             blockindex += 1
2035             try:
2036                 escaped = escaped + ' ' + escape2null(
2037                                               block[blockindex].strip())
2038             except IndexError:
2039                 raise MarkupError('malformed substitution definition.')
2040         del block[:blockindex]          # strip out the substitution marker
2041         start = subdefmatch.end()-len(escaped)-1
2042         block[0] = (block[0].strip() + ' ')[start:-1]
2043         if not block[0]:
2044             del block[0]
2045             offset += 1
2046         while block and not block[-1].strip():
2047             block.pop()
2048         subname = subdefmatch.group('name')
2049         substitution_node = nodes.substitution_definition(blocktext)
2050         substitution_node.source = src
2051         substitution_node.line = srcline
2052         if not block:
2053             msg = self.reporter.warning(
2054                 'Substitution definition "%s" missing contents.' % subname,
2055                 nodes.literal_block(blocktext, blocktext),
2056                 source=src, line=srcline)
2057             return [msg], blank_finish
2058         block[0] = block[0].strip()
2059         substitution_node['names'].append(
2060             nodes.whitespace_normalize_name(subname))
2061         new_abs_offset, blank_finish = self.nested_list_parse(
2062               block, input_offset=offset, node=substitution_node,
2063               initial_state='SubstitutionDef', blank_finish=blank_finish)
2064         i = 0
2065         for node in substitution_node[:]:
2066             if not (isinstance(node, nodes.Inline)
2067                     or isinstance(node, nodes.Text)):
2068                 self.parent += substitution_node[i]
2069                 del substitution_node[i]
2070             else:
2071                 i += 1
2072         for node in substitution_node.findall(nodes.Element):
2073             if self.disallowed_inside_substitution_definitions(node):
2074                 pformat = nodes.literal_block('', node.pformat().rstrip())
2075                 msg = self.reporter.error(
2076                     'Substitution definition contains illegal element <%s>:'
2077                     % node.tagname,
2078                     pformat, nodes.literal_block(blocktext, blocktext),
2079                     source=src, line=srcline)
2080                 return [msg], blank_finish
2081         if len(substitution_node) == 0:
2082             msg = self.reporter.warning(
2083                   'Substitution definition "%s" empty or invalid.' % subname,
2084                   nodes.literal_block(blocktext, blocktext),
2085                   source=src, line=srcline)
2086             return [msg], blank_finish
2087         self.document.note_substitution_def(
2088             substitution_node, subname, self.parent)
2089         return [substitution_node], blank_finish
2090
2091     def disallowed_inside_substitution_definitions(self, node):
2092         if (node['ids']
2093             or isinstance(node, nodes.reference) and node.get('anonymous')
2094             or isinstance(node, nodes.footnote_reference) and node.get('auto')):  # noqa: E501
2095             return True
2096         else:
2097             return False
2098
2099     def directive(self, match, **option_presets):
2100         """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2101         type_name = match.group(1)
2102         directive_class, messages = directives.directive(
2103             type_name, self.memo.language, self.document)
2104         self.parent += messages
2105         if directive_class:
2106             return self.run_directive(
2107                 directive_class, match, type_name, option_presets)
2108         else:
2109             return self.unknown_directive(type_name)
2110
2111     def run_directive(self, directive, match, type_name, option_presets):
2112         """
2113         Parse a directive then run its directive function.
2114
2115         Parameters:
2116
2117         - `directive`: The class implementing the directive.  Must be
2118           a subclass of `rst.Directive`.
2119
2120         - `match`: A regular expression match object which matched the first
2121           line of the directive.
2122
2123         - `type_name`: The directive name, as used in the source text.
2124
2125         - `option_presets`: A dictionary of preset options, defaults for the
2126           directive options.  Currently, only an "alt" option is passed by
2127           substitution definitions (value: the substitution name), which may
2128           be used by an embedded image directive.
2129
2130         Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2131         """
2132         if isinstance(directive, (FunctionType, MethodType)):
2133             from docutils.parsers.rst import convert_directive_function
2134             directive = convert_directive_function(directive)
2135         lineno = self.state_machine.abs_line_number()
2136         initial_line_offset = self.state_machine.line_offset
2137         (indented, indent, line_offset, blank_finish
2138          ) = self.state_machine.get_first_known_indented(match.end(),
2139                                                          strip_top=0)
2140         block_text = '\n'.join(self.state_machine.input_lines[
2141             initial_line_offset : self.state_machine.line_offset + 1])  # noqa: E203,E501
2142         try:
2143             arguments, options, content, content_offset = (
2144                 self.parse_directive_block(indented, line_offset,
2145                                            directive, option_presets))
2146         except MarkupError as detail:
2147             error = self.reporter.error(
2148                 'Error in "%s" directive:\n%s.' % (type_name,
2149                                                    ' '.join(detail.args)),
2150                 nodes.literal_block(block_text, block_text), line=lineno)
2151             return [error], blank_finish
2152         directive_instance = directive(
2153             type_name, arguments, options, content, lineno,
2154             content_offset, block_text, self, self.state_machine)
2155         try:
2156             result = directive_instance.run()
2157         except docutils.parsers.rst.DirectiveError as error:
2158             msg_node = self.reporter.system_message(error.level, error.msg,
2159                                                     line=lineno)
2160             msg_node += nodes.literal_block(block_text, block_text)
2161             result = [msg_node]
2162         assert isinstance(result, list), \
2163                'Directive "%s" must return a list of nodes.' % type_name
2164         for i in range(len(result)):
2165             assert isinstance(result[i], nodes.Node), \
2166                    ('Directive "%s" returned non-Node object (index %s): %r'
2167                     % (type_name, i, result[i]))
2168         return (result,
2169                 blank_finish or self.state_machine.is_next_line_blank())
2170
2171     def parse_directive_block(self, indented, line_offset, directive,
2172                               option_presets):
2173         option_spec = directive.option_spec
2174         has_content = directive.has_content
2175         if indented and not indented[0].strip():
2176             indented.trim_start()
2177             line_offset += 1
2178         while indented and not indented[-1].strip():
2179             indented.trim_end()
2180         if indented and (directive.required_arguments
2181                          or directive.optional_arguments
2182                          or option_spec):
2183             for i, line in enumerate(indented):
2184                 if not line.strip():
2185                     break
2186             else:
2187                 i += 1
2188             arg_block = indented[:i]
2189             content = indented[i+1:]
2190             content_offset = line_offset + i + 1
2191         else:
2192             content = indented
2193             content_offset = line_offset
2194             arg_block = []
2195         if option_spec:
2196             options, arg_block = self.parse_directive_options(
2197                 option_presets, option_spec, arg_block)
2198         else:
2199             options = {}
2200         if arg_block and not (directive.required_arguments
2201                               or directive.optional_arguments):
2202             content = arg_block + indented[i:]
2203             content_offset = line_offset
2204             arg_block = []
2205         while content and not content[0].strip():
2206             content.trim_start()
2207             content_offset += 1
2208         if directive.required_arguments or directive.optional_arguments:
2209             arguments = self.parse_directive_arguments(
2210                 directive, arg_block)
2211         else:
2212             arguments = []
2213         if content and not has_content:
2214             raise MarkupError('no content permitted')
2215         return arguments, options, content, content_offset
2216
2217     def parse_directive_options(self, option_presets, option_spec, arg_block):
2218         options = option_presets.copy()
2219         for i, line in enumerate(arg_block):
2220             if re.match(Body.patterns['field_marker'], line):
2221                 opt_block = arg_block[i:]
2222                 arg_block = arg_block[:i]
2223                 break
2224         else:
2225             opt_block = []
2226         if opt_block:
2227             success, data = self.parse_extension_options(option_spec,
2228                                                          opt_block)
2229             if success:                 # data is a dict of options
2230                 options.update(data)
2231             else:                       # data is an error string
2232                 raise MarkupError(data)
2233         return options, arg_block
2234
2235     def parse_directive_arguments(self, directive, arg_block):
2236         required = directive.required_arguments
2237         optional = directive.optional_arguments
2238         arg_text = '\n'.join(arg_block)
2239         arguments = arg_text.split()
2240         if len(arguments) < required:
2241             raise MarkupError('%s argument(s) required, %s supplied'
2242                               % (required, len(arguments)))
2243         elif len(arguments) > required + optional:
2244             if directive.final_argument_whitespace:
2245                 arguments = arg_text.split(None, required + optional - 1)
2246             else:
2247                 raise MarkupError(
2248                     'maximum %s argument(s) allowed, %s supplied'
2249                     % (required + optional, len(arguments)))
2250         return arguments
2251
2252     def parse_extension_options(self, option_spec, datalines):
2253         """
2254         Parse `datalines` for a field list containing extension options
2255         matching `option_spec`.
2256
2257         :Parameters:
2258             - `option_spec`: a mapping of option name to conversion
2259               function, which should raise an exception on bad input.
2260             - `datalines`: a list of input strings.
2261
2262         :Return:
2263             - Success value, 1 or 0.
2264             - An option dictionary on success, an error string on failure.
2265         """
2266         node = nodes.field_list()
2267         newline_offset, blank_finish = self.nested_list_parse(
2268               datalines, 0, node, initial_state='ExtensionOptions',
2269               blank_finish=True)
2270         if newline_offset != len(datalines):  # incomplete parse of block
2271             return 0, 'invalid option block'
2272         try:
2273             options = utils.extract_extension_options(node, option_spec)
2274         except KeyError as detail:
2275             return 0, 'unknown option: "%s"' % detail.args[0]
2276         except (ValueError, TypeError) as detail:
2277             return 0, 'invalid option value: %s' % ' '.join(detail.args)
2278         except utils.ExtensionOptionError as detail:
2279             return 0, 'invalid option data: %s' % ' '.join(detail.args)
2280         if blank_finish:
2281             return 1, options
2282         else:
2283             return 0, 'option data incompletely parsed'
2284
2285     def unknown_directive(self, type_name):
2286         lineno = self.state_machine.abs_line_number()
2287         (indented, indent, offset, blank_finish
2288          ) = self.state_machine.get_first_known_indented(0, strip_indent=False)
2289         text = '\n'.join(indented)
2290         error = self.reporter.error('Unknown directive type "%s".' % type_name,
2291                                     nodes.literal_block(text, text),
2292                                     line=lineno)
2293         return [error], blank_finish
2294
2295     def comment(self, match):
2296         if self.state_machine.is_next_line_blank():
2297             first_comment_line = match.string[match.end():]
2298             if not first_comment_line.strip():  # empty comment
2299                 return [nodes.comment()], True  # "A tiny but practical wart."
2300             if first_comment_line.startswith('end of inclusion from "'):
2301                 # cf. parsers.rst.directives.misc.Include
2302                 self.document.include_log.pop()
2303                 return [], True
2304         (indented, indent, offset, blank_finish
2305          ) = self.state_machine.get_first_known_indented(match.end())
2306         while indented and not indented[-1].strip():
2307             indented.trim_end()
2308         text = '\n'.join(indented)
2309         return [nodes.comment(text, text)], blank_finish
2310
2311     explicit.constructs = [
2312           (footnote,
2313            re.compile(r"""
2314                       \.\.[ ]+          # explicit markup start
2315                       \[
2316                       (                 # footnote label:
2317                           [0-9]+          # manually numbered footnote
2318                         |               # *OR*
2319                           \#              # anonymous auto-numbered footnote
2320                         |               # *OR*
2321                           \#%s            # auto-number ed?) footnote label
2322                         |               # *OR*
2323                           \*              # auto-symbol footnote
2324                       )
2325                       \]
2326                       ([ ]+|$)          # whitespace or end of line
2327                       """ % Inliner.simplename, re.VERBOSE)),
2328           (citation,
2329            re.compile(r"""
2330                       \.\.[ ]+          # explicit markup start
2331                       \[(%s)\]          # citation label
2332                       ([ ]+|$)          # whitespace or end of line
2333                       """ % Inliner.simplename, re.VERBOSE)),
2334           (hyperlink_target,
2335            re.compile(r"""
2336                       \.\.[ ]+          # explicit markup start
2337                       _                 # target indicator
2338                       (?![ ]|$)         # first char. not space or EOL
2339                       """, re.VERBOSE)),
2340           (substitution_def,
2341            re.compile(r"""
2342                       \.\.[ ]+          # explicit markup start
2343                       \|                # substitution indicator
2344                       (?![ ]|$)         # first char. not space or EOL
2345                       """, re.VERBOSE)),
2346           (directive,
2347            re.compile(r"""
2348                       \.\.[ ]+          # explicit markup start
2349                       (%s)              # directive name
2350                       [ ]?              # optional space
2351                       ::                # directive delimiter
2352                       ([ ]+|$)          # whitespace or end of line
2353                       """ % Inliner.simplename, re.VERBOSE))]
2354
2355     def explicit_markup(self, match, context, next_state):
2356         """Footnotes, hyperlink targets, directives, comments."""
2357         nodelist, blank_finish = self.explicit_construct(match)
2358         self.parent += nodelist
2359         self.explicit_list(blank_finish)
2360         return [], next_state, []
2361
2362     def explicit_construct(self, match):
2363         """Determine which explicit construct this is, parse & return it."""
2364         errors = []
2365         for method, pattern in self.explicit.constructs:
2366             expmatch = pattern.match(match.string)
2367             if expmatch:
2368                 try:
2369                     return method(self, expmatch)
2370                 except MarkupError as error:
2371                     lineno = self.state_machine.abs_line_number()
2372                     message = ' '.join(error.args)
2373                     errors.append(self.reporter.warning(message, line=lineno))
2374                     break
2375         nodelist, blank_finish = self.comment(match)
2376         return nodelist + errors, blank_finish
2377
2378     def explicit_list(self, blank_finish):
2379         """
2380         Create a nested state machine for a series of explicit markup
2381         constructs (including anonymous hyperlink targets).
2382         """
2383         offset = self.state_machine.line_offset + 1   # next line
2384         newline_offset, blank_finish = self.nested_list_parse(
2385               self.state_machine.input_lines[offset:],
2386               input_offset=self.state_machine.abs_line_offset() + 1,
2387               node=self.parent, initial_state='Explicit',
2388               blank_finish=blank_finish,
2389               match_titles=self.state_machine.match_titles)
2390         self.goto_line(newline_offset)
2391         if not blank_finish:
2392             self.parent += self.unindent_warning('Explicit markup')
2393
2394     def anonymous(self, match, context, next_state):
2395         """Anonymous hyperlink targets."""
2396         nodelist, blank_finish = self.anonymous_target(match)
2397         self.parent += nodelist
2398         self.explicit_list(blank_finish)
2399         return [], next_state, []
2400
2401     def anonymous_target(self, match):
2402         lineno = self.state_machine.abs_line_number()
2403         (block, indent, offset, blank_finish
2404          ) = self.state_machine.get_first_known_indented(match.end(),
2405                                                          until_blank=True)
2406         blocktext = match.string[:match.end()] + '\n'.join(block)
2407         block = [escape2null(line) for line in block]
2408         target = self.make_target(block, blocktext, lineno, '')
2409         return [target], blank_finish
2410
2411     def line(self, match, context, next_state):
2412         """Section title overline or transition marker."""
2413         if self.state_machine.match_titles:
2414             return [match.string], 'Line', []
2415         elif match.string.strip() == '::':
2416             raise statemachine.TransitionCorrection('text')
2417         elif len(match.string.strip()) < 4:
2418             msg = self.reporter.info(
2419                 'Unexpected possible title overline or transition.\n'
2420                 "Treating it as ordinary text because it's so short.",
2421                 line=self.state_machine.abs_line_number())
2422             self.parent += msg
2423             raise statemachine.TransitionCorrection('text')
2424         else:
2425             blocktext = self.state_machine.line
2426             msg = self.reporter.severe(
2427                   'Unexpected section title or transition.',
2428                   nodes.literal_block(blocktext, blocktext),
2429                   line=self.state_machine.abs_line_number())
2430             self.parent += msg
2431             return [], next_state, []
2432
2433     def text(self, match, context, next_state):
2434         """Titles, definition lists, paragraphs."""
2435         return [match.string], 'Text', []
2436
2437
2438 class RFC2822Body(Body):
2439
2440     """
2441     RFC2822 headers are only valid as the first constructs in documents.  As
2442     soon as anything else appears, the `Body` state should take over.
2443     """
2444
2445     patterns = Body.patterns.copy()     # can't modify the original
2446     patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2447     initial_transitions = [(name, 'Body')
2448                            for name in Body.initial_transitions]
2449     initial_transitions.insert(-1, ('rfc2822', 'Body'))  # just before 'text'
2450
2451     def rfc2822(self, match, context, next_state):
2452         """RFC2822-style field list item."""
2453         fieldlist = nodes.field_list(classes=['rfc2822'])
2454         self.parent += fieldlist
2455         field, blank_finish = self.rfc2822_field(match)
2456         fieldlist += field
2457         offset = self.state_machine.line_offset + 1  # next line
2458         newline_offset, blank_finish = self.nested_list_parse(
2459               self.state_machine.input_lines[offset:],
2460               input_offset=self.state_machine.abs_line_offset() + 1,
2461               node=fieldlist, initial_state='RFC2822List',
2462               blank_finish=blank_finish)
2463         self.goto_line(newline_offset)
2464         if not blank_finish:
2465             self.parent += self.unindent_warning(
2466                   'RFC2822-style field list')
2467         return [], next_state, []
2468
2469     def rfc2822_field(self, match):
2470         name = match.string[:match.string.find(':')]
2471         (indented, indent, line_offset, blank_finish
2472          ) = self.state_machine.get_first_known_indented(match.end(),
2473                                                          until_blank=True)
2474         fieldnode = nodes.field()
2475         fieldnode += nodes.field_name(name, name)
2476         fieldbody = nodes.field_body('\n'.join(indented))
2477         fieldnode += fieldbody
2478         if indented:
2479             self.nested_parse(indented, input_offset=line_offset,
2480                               node=fieldbody)
2481         return fieldnode, blank_finish
2482
2483
2484 class SpecializedBody(Body):
2485
2486     """
2487     Superclass for second and subsequent compound element members.  Compound
2488     elements are lists and list-like constructs.
2489
2490     All transition methods are disabled (redefined as `invalid_input`).
2491     Override individual methods in subclasses to re-enable.
2492
2493     For example, once an initial bullet list item, say, is recognized, the
2494     `BulletList` subclass takes over, with a "bullet_list" node as its
2495     container.  Upon encountering the initial bullet list item, `Body.bullet`
2496     calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2497     starts up a nested parsing session with `BulletList` as the initial state.
2498     Only the ``bullet`` transition method is enabled in `BulletList`; as long
2499     as only bullet list items are encountered, they are parsed and inserted
2500     into the container.  The first construct which is *not* a bullet list item
2501     triggers the `invalid_input` method, which ends the nested parse and
2502     closes the container.  `BulletList` needs to recognize input that is
2503     invalid in the context of a bullet list, which means everything *other
2504     than* bullet list items, so it inherits the transition list created in
2505     `Body`.
2506     """
2507
2508     def invalid_input(self, match=None, context=None, next_state=None):
2509         """Not a compound element member. Abort this state machine."""
2510         self.state_machine.previous_line()  # back up so parent SM can reassess
2511         raise EOFError
2512
2513     indent = invalid_input
2514     bullet = invalid_input
2515     enumerator = invalid_input
2516     field_marker = invalid_input
2517     option_marker = invalid_input
2518     doctest = invalid_input
2519     line_block = invalid_input
2520     grid_table_top = invalid_input
2521     simple_table_top = invalid_input
2522     explicit_markup = invalid_input
2523     anonymous = invalid_input
2524     line = invalid_input
2525     text = invalid_input
2526
2527
2528 class BulletList(SpecializedBody):
2529
2530     """Second and subsequent bullet_list list_items."""
2531
2532     def bullet(self, match, context, next_state):
2533         """Bullet list item."""
2534         if match.string[0] != self.parent['bullet']:
2535             # different bullet: new list
2536             self.invalid_input()
2537         listitem, blank_finish = self.list_item(match.end())
2538         self.parent += listitem
2539         self.blank_finish = blank_finish
2540         return [], next_state, []
2541
2542
2543 class DefinitionList(SpecializedBody):
2544
2545     """Second and subsequent definition_list_items."""
2546
2547     def text(self, match, context, next_state):
2548         """Definition lists."""
2549         return [match.string], 'Definition', []
2550
2551
2552 class EnumeratedList(SpecializedBody):
2553
2554     """Second and subsequent enumerated_list list_items."""
2555
2556     def enumerator(self, match, context, next_state):
2557         """Enumerated list item."""
2558         format, sequence, text, ordinal = self.parse_enumerator(
2559               match, self.parent['enumtype'])
2560         if (format != self.format
2561             or (sequence != '#' and (sequence != self.parent['enumtype']
2562                                      or self.auto
2563                                      or ordinal != (self.lastordinal + 1)))
2564             or not self.is_enumerated_list_item(ordinal, sequence, format)):
2565             # different enumeration: new list
2566             self.invalid_input()
2567         if sequence == '#':
2568             self.auto = 1
2569         listitem, blank_finish = self.list_item(match.end())
2570         self.parent += listitem
2571         self.blank_finish = blank_finish
2572         self.lastordinal = ordinal
2573         return [], next_state, []
2574
2575
2576 class FieldList(SpecializedBody):
2577
2578     """Second and subsequent field_list fields."""
2579
2580     def field_marker(self, match, context, next_state):
2581         """Field list field."""
2582         field, blank_finish = self.field(match)
2583         self.parent += field
2584         self.blank_finish = blank_finish
2585         return [], next_state, []
2586
2587
2588 class OptionList(SpecializedBody):
2589
2590     """Second and subsequent option_list option_list_items."""
2591
2592     def option_marker(self, match, context, next_state):
2593         """Option list item."""
2594         try:
2595             option_list_item, blank_finish = self.option_list_item(match)
2596         except MarkupError:
2597             self.invalid_input()
2598         self.parent += option_list_item
2599         self.blank_finish = blank_finish
2600         return [], next_state, []
2601
2602
2603 class RFC2822List(SpecializedBody, RFC2822Body):
2604
2605     """Second and subsequent RFC2822-style field_list fields."""
2606
2607     patterns = RFC2822Body.patterns
2608     initial_transitions = RFC2822Body.initial_transitions
2609
2610     def rfc2822(self, match, context, next_state):
2611         """RFC2822-style field list item."""
2612         field, blank_finish = self.rfc2822_field(match)
2613         self.parent += field
2614         self.blank_finish = blank_finish
2615         return [], 'RFC2822List', []
2616
2617     blank = SpecializedBody.invalid_input
2618
2619
2620 class ExtensionOptions(FieldList):
2621
2622     """
2623     Parse field_list fields for extension options.
2624
2625     No nested parsing is done (including inline markup parsing).
2626     """
2627
2628     def parse_field_body(self, indented, offset, node):
2629         """Override `Body.parse_field_body` for simpler parsing."""
2630         lines = []
2631         for line in list(indented) + ['']:
2632             if line.strip():
2633                 lines.append(line)
2634             elif lines:
2635                 text = '\n'.join(lines)
2636                 node += nodes.paragraph(text, text)
2637                 lines = []
2638
2639
2640 class LineBlock(SpecializedBody):
2641
2642     """Second and subsequent lines of a line_block."""
2643
2644     blank = SpecializedBody.invalid_input
2645
2646     def line_block(self, match, context, next_state):
2647         """New line of line block."""
2648         lineno = self.state_machine.abs_line_number()
2649         line, messages, blank_finish = self.line_block_line(match, lineno)
2650         self.parent += line
2651         self.parent.parent += messages
2652         self.blank_finish = blank_finish
2653         return [], next_state, []
2654
2655
2656 class Explicit(SpecializedBody):
2657
2658     """Second and subsequent explicit markup construct."""
2659
2660     def explicit_markup(self, match, context, next_state):
2661         """Footnotes, hyperlink targets, directives, comments."""
2662         nodelist, blank_finish = self.explicit_construct(match)
2663         self.parent += nodelist
2664         self.blank_finish = blank_finish
2665         return [], next_state, []
2666
2667     def anonymous(self, match, context, next_state):
2668         """Anonymous hyperlink targets."""
2669         nodelist, blank_finish = self.anonymous_target(match)
2670         self.parent += nodelist
2671         self.blank_finish = blank_finish
2672         return [], next_state, []
2673
2674     blank = SpecializedBody.invalid_input
2675
2676
2677 class SubstitutionDef(Body):
2678
2679     """
2680     Parser for the contents of a substitution_definition element.
2681     """
2682
2683     patterns = {
2684           'embedded_directive': re.compile(r'(%s)::( +|$)'
2685                                            % Inliner.simplename),
2686           'text': r''}
2687     initial_transitions = ['embedded_directive', 'text']
2688
2689     def embedded_directive(self, match, context, next_state):
2690         nodelist, blank_finish = self.directive(match,
2691                                                 alt=self.parent['names'][0])
2692         self.parent += nodelist
2693         if not self.state_machine.at_eof():
2694             self.blank_finish = blank_finish
2695         raise EOFError
2696
2697     def text(self, match, context, next_state):
2698         if not self.state_machine.at_eof():
2699             self.blank_finish = self.state_machine.is_next_line_blank()
2700         raise EOFError
2701
2702
2703 class Text(RSTState):
2704
2705     """
2706     Classifier of second line of a text block.
2707
2708     Could be a paragraph, a definition list item, or a title.
2709     """
2710
2711     patterns = {'underline': Body.patterns['line'],
2712                 'text': r''}
2713     initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2714
2715     def blank(self, match, context, next_state):
2716         """End of paragraph."""
2717         # NOTE: self.paragraph returns [node, system_message(s)], literalnext
2718         paragraph, literalnext = self.paragraph(
2719               context, self.state_machine.abs_line_number() - 1)
2720         self.parent += paragraph
2721         if literalnext:
2722             self.parent += self.literal_block()
2723         return [], 'Body', []
2724
2725     def eof(self, context):
2726         if context:
2727             self.blank(None, context, None)
2728         return []
2729
2730     def indent(self, match, context, next_state):
2731         """Definition list item."""
2732         dl = nodes.definition_list()
2733         # the definition list starts on the line before the indent:
2734         lineno = self.state_machine.abs_line_number() - 1
2735         dl.source, dl.line = self.state_machine.get_source_and_line(lineno)
2736         dl_item, blank_finish = self.definition_list_item(context)
2737         dl += dl_item
2738         self.parent += dl
2739         offset = self.state_machine.line_offset + 1   # next line
2740         newline_offset, blank_finish = self.nested_list_parse(
2741               self.state_machine.input_lines[offset:],
2742               input_offset=self.state_machine.abs_line_offset() + 1,
2743               node=dl, initial_state='DefinitionList',
2744               blank_finish=blank_finish, blank_finish_state='Definition')
2745         self.goto_line(newline_offset)
2746         if not blank_finish:
2747             self.parent += self.unindent_warning('Definition list')
2748         return [], 'Body', []
2749
2750     def underline(self, match, context, next_state):
2751         """Section title."""
2752         lineno = self.state_machine.abs_line_number()
2753         title = context[0].rstrip()
2754         underline = match.string.rstrip()
2755         source = title + '\n' + underline
2756         messages = []
2757         if column_width(title) > len(underline):
2758             if len(underline) < 4:
2759                 if self.state_machine.match_titles:
2760                     msg = self.reporter.info(
2761                         'Possible title underline, too short for the title.\n'
2762                         "Treating it as ordinary text because it's so short.",
2763                         line=lineno)
2764                     self.parent += msg
2765                 raise statemachine.TransitionCorrection('text')
2766             else:
2767                 blocktext = context[0] + '\n' + self.state_machine.line
2768                 msg = self.reporter.warning(
2769                     'Title underline too short.',
2770                     nodes.literal_block(blocktext, blocktext),
2771                     line=lineno)
2772                 messages.append(msg)
2773         if not self.state_machine.match_titles:
2774             blocktext = context[0] + '\n' + self.state_machine.line
2775             # We need get_source_and_line() here to report correctly
2776             src, srcline = self.state_machine.get_source_and_line()
2777             # TODO: why is abs_line_number() == srcline+1
2778             # if the error is in a table (try with test_tables.py)?
2779             # print("get_source_and_line", srcline)
2780             # print("abs_line_number", self.state_machine.abs_line_number())
2781             msg = self.reporter.severe(
2782                 'Unexpected section title.',
2783                 nodes.literal_block(blocktext, blocktext),
2784                 source=src, line=srcline)
2785             self.parent += messages
2786             self.parent += msg
2787             return [], next_state, []
2788         style = underline[0]
2789         context[:] = []
2790         self.section(title, source, style, lineno - 1, messages)
2791         return [], next_state, []
2792
2793     def text(self, match, context, next_state):
2794         """Paragraph."""
2795         startline = self.state_machine.abs_line_number() - 1
2796         msg = None
2797         try:
2798             block = self.state_machine.get_text_block(flush_left=True)
2799         except statemachine.UnexpectedIndentationError as err:
2800             block, src, srcline = err.args
2801             msg = self.reporter.error('Unexpected indentation.',
2802                                       source=src, line=srcline)
2803         lines = context + list(block)
2804         paragraph, literalnext = self.paragraph(lines, startline)
2805         self.parent += paragraph
2806         self.parent += msg
2807         if literalnext:
2808             try:
2809                 self.state_machine.next_line()
2810             except EOFError:
2811                 pass
2812             self.parent += self.literal_block()
2813         return [], next_state, []
2814
2815     def literal_block(self):
2816         """Return a list of nodes."""
2817         (indented, indent, offset, blank_finish
2818          ) = self.state_machine.get_indented()
2819         while indented and not indented[-1].strip():
2820             indented.trim_end()
2821         if not indented:
2822             return self.quoted_literal_block()
2823         data = '\n'.join(indented)
2824         literal_block = nodes.literal_block(data, data)
2825         (literal_block.source,
2826          literal_block.line) = self.state_machine.get_source_and_line(offset+1)
2827         nodelist = [literal_block]
2828         if not blank_finish:
2829             nodelist.append(self.unindent_warning('Literal block'))
2830         return nodelist
2831
2832     def quoted_literal_block(self):
2833         abs_line_offset = self.state_machine.abs_line_offset()
2834         offset = self.state_machine.line_offset
2835         parent_node = nodes.Element()
2836         new_abs_offset = self.nested_parse(
2837             self.state_machine.input_lines[offset:],
2838             input_offset=abs_line_offset, node=parent_node, match_titles=False,
2839             state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2840                                   'initial_state': 'QuotedLiteralBlock'})
2841         self.goto_line(new_abs_offset)
2842         return parent_node.children
2843
2844     def definition_list_item(self, termline):
2845         # the parser is already on the second (indented) line:
2846         dd_lineno = self.state_machine.abs_line_number()
2847         dt_lineno = dd_lineno - 1
2848         (indented, indent, line_offset, blank_finish
2849          ) = self.state_machine.get_indented()
2850         dl_item = nodes.definition_list_item(
2851                       '\n'.join(termline + list(indented)))
2852         (dl_item.source,
2853          dl_item.line) = self.state_machine.get_source_and_line(dt_lineno)
2854         dt_nodes, messages = self.term(termline, dt_lineno)
2855         dl_item += dt_nodes
2856         dd = nodes.definition('', *messages)
2857         dd.source, dd.line = self.state_machine.get_source_and_line(dd_lineno)
2858         dl_item += dd
2859         if termline[0][-2:] == '::':
2860             dd += self.reporter.info(
2861                   'Blank line missing before literal block (after the "::")? '
2862                   'Interpreted as a definition list item.',
2863                   line=dd_lineno)
2864         # TODO: drop a definition if it is an empty comment to allow
2865         #       definition list items with several terms?
2866         #       https://sourceforge.net/p/docutils/feature-requests/60/
2867         self.nested_parse(indented, input_offset=line_offset, node=dd)
2868         return dl_item, blank_finish
2869
2870     classifier_delimiter = re.compile(' +: +')
2871
2872     def term(self, lines, lineno):
2873         """Return a definition_list's term and optional classifiers."""
2874         assert len(lines) == 1
2875         text_nodes, messages = self.inline_text(lines[0], lineno)
2876         dt = nodes.term(lines[0])
2877         dt.source, dt.line = self.state_machine.get_source_and_line(lineno)
2878         node_list = [dt]
2879         for i in range(len(text_nodes)):
2880             node = text_nodes[i]
2881             if isinstance(node, nodes.Text):
2882                 parts = self.classifier_delimiter.split(node)
2883                 if len(parts) == 1:
2884                     node_list[-1] += node
2885                 else:
2886                     text = parts[0].rstrip()
2887                     textnode = nodes.Text(text)
2888                     node_list[-1] += textnode
2889                     for part in parts[1:]:
2890                         node_list.append(
2891                             nodes.classifier(unescape(part, True), part))
2892             else:
2893                 node_list[-1] += node
2894         return node_list, messages
2895
2896
2897 class SpecializedText(Text):
2898
2899     """
2900     Superclass for second and subsequent lines of Text-variants.
2901
2902     All transition methods are disabled. Override individual methods in
2903     subclasses to re-enable.
2904     """
2905
2906     def eof(self, context):
2907         """Incomplete construct."""
2908         return []
2909
2910     def invalid_input(self, match=None, context=None, next_state=None):
2911         """Not a compound element member. Abort this state machine."""
2912         raise EOFError
2913
2914     blank = invalid_input
2915     indent = invalid_input
2916     underline = invalid_input
2917     text = invalid_input
2918
2919
2920 class Definition(SpecializedText):
2921
2922     """Second line of potential definition_list_item."""
2923
2924     def eof(self, context):
2925         """Not a definition."""
2926         self.state_machine.previous_line(2)  # so parent SM can reassess
2927         return []
2928
2929     def indent(self, match, context, next_state):
2930         """Definition list item."""
2931         dl_item, blank_finish = self.definition_list_item(context)
2932         self.parent += dl_item
2933         self.blank_finish = blank_finish
2934         return [], 'DefinitionList', []
2935
2936
2937 class Line(SpecializedText):
2938
2939     """
2940     Second line of over- & underlined section title or transition marker.
2941     """
2942
2943     eofcheck = 1                        # @@@ ???
2944     """Set to 0 while parsing sections, so that we don't catch the EOF."""
2945
2946     def eof(self, context):
2947         """Transition marker at end of section or document."""
2948         marker = context[0].strip()
2949         if self.memo.section_bubble_up_kludge:
2950             self.memo.section_bubble_up_kludge = False
2951         elif len(marker) < 4:
2952             self.state_correction(context)
2953         if self.eofcheck:               # ignore EOFError with sections
2954             src, srcline = self.state_machine.get_source_and_line()
2955             # lineno = self.state_machine.abs_line_number() - 1
2956             transition = nodes.transition(rawsource=context[0])
2957             transition.source = src
2958             transition.line = srcline - 1
2959             # transition.line = lineno
2960             self.parent += transition
2961         self.eofcheck = 1
2962         return []
2963
2964     def blank(self, match, context, next_state):
2965         """Transition marker."""
2966         src, srcline = self.state_machine.get_source_and_line()
2967         marker = context[0].strip()
2968         if len(marker) < 4:
2969             self.state_correction(context)
2970         transition = nodes.transition(rawsource=marker)
2971         transition.source = src
2972         transition.line = srcline - 1
2973         self.parent += transition
2974         return [], 'Body', []
2975
2976     def text(self, match, context, next_state):
2977         """Potential over- & underlined title."""
2978         lineno = self.state_machine.abs_line_number() - 1
2979         overline = context[0]
2980         title = match.string
2981         underline = ''
2982         try:
2983             underline = self.state_machine.next_line()
2984         except EOFError:
2985             blocktext = overline + '\n' + title
2986             if len(overline.rstrip()) < 4:
2987                 self.short_overline(context, blocktext, lineno, 2)
2988             else:
2989                 msg = self.reporter.severe(
2990                     'Incomplete section title.',
2991                     nodes.literal_block(blocktext, blocktext),
2992                     line=lineno)
2993                 self.parent += msg
2994                 return [], 'Body', []
2995         source = '%s\n%s\n%s' % (overline, title, underline)
2996         overline = overline.rstrip()
2997         underline = underline.rstrip()
2998         if not self.transitions['underline'][0].match(underline):
2999             blocktext = overline + '\n' + title + '\n' + underline
3000             if len(overline.rstrip()) < 4:
3001                 self.short_overline(context, blocktext, lineno, 2)
3002             else:
3003                 msg = self.reporter.severe(
3004                     'Missing matching underline for section title overline.',
3005                     nodes.literal_block(source, source),
3006                     line=lineno)
3007                 self.parent += msg
3008                 return [], 'Body', []
3009         elif overline != underline:
3010             blocktext = overline + '\n' + title + '\n' + underline
3011             if len(overline.rstrip()) < 4:
3012                 self.short_overline(context, blocktext, lineno, 2)
3013             else:
3014                 msg = self.reporter.severe(
3015                       'Title overline & underline mismatch.',
3016                       nodes.literal_block(source, source),
3017                       line=lineno)
3018                 self.parent += msg
3019                 return [], 'Body', []
3020         title = title.rstrip()
3021         messages = []
3022         if column_width(title) > len(overline):
3023             blocktext = overline + '\n' + title + '\n' + underline
3024             if len(overline.rstrip()) < 4:
3025                 self.short_overline(context, blocktext, lineno, 2)
3026             else:
3027                 msg = self.reporter.warning(
3028                       'Title overline too short.',
3029                       nodes.literal_block(source, source),
3030                       line=lineno)
3031                 messages.append(msg)
3032         style = (overline[0], underline[0])
3033         self.eofcheck = 0               # @@@ not sure this is correct
3034         self.section(title.lstrip(), source, style, lineno + 1, messages)
3035         self.eofcheck = 1
3036         return [], 'Body', []
3037
3038     indent = text                       # indented title
3039
3040     def underline(self, match, context, next_state):
3041         overline = context[0]
3042         blocktext = overline + '\n' + self.state_machine.line
3043         lineno = self.state_machine.abs_line_number() - 1
3044         if len(overline.rstrip()) < 4:
3045             self.short_overline(context, blocktext, lineno, 1)
3046         msg = self.reporter.error(
3047               'Invalid section title or transition marker.',
3048               nodes.literal_block(blocktext, blocktext),
3049               line=lineno)
3050         self.parent += msg
3051         return [], 'Body', []
3052
3053     def short_overline(self, context, blocktext, lineno, lines=1):
3054         msg = self.reporter.info(
3055             'Possible incomplete section title.\nTreating the overline as '
3056             "ordinary text because it's so short.",
3057             line=lineno)
3058         self.parent += msg
3059         self.state_correction(context, lines)
3060
3061     def state_correction(self, context, lines=1):
3062         self.state_machine.previous_line(lines)
3063         context[:] = []
3064         raise statemachine.StateCorrection('Body', 'text')
3065
3066
3067 class QuotedLiteralBlock(RSTState):
3068
3069     """
3070     Nested parse handler for quoted (unindented) literal blocks.
3071
3072     Special-purpose.  Not for inclusion in `state_classes`.
3073     """
3074
3075     patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
3076                 'text': r''}
3077     initial_transitions = ('initial_quoted', 'text')
3078
3079     def __init__(self, state_machine, debug=False):
3080         RSTState.__init__(self, state_machine, debug)
3081         self.messages = []
3082         self.initial_lineno = None
3083
3084     def blank(self, match, context, next_state):
3085         if context:
3086             raise EOFError
3087         else:
3088             return context, next_state, []
3089
3090     def eof(self, context):
3091         if context:
3092             src, srcline = self.state_machine.get_source_and_line(
3093                                                         self.initial_lineno)
3094             text = '\n'.join(context)
3095             literal_block = nodes.literal_block(text, text)
3096             literal_block.source = src
3097             literal_block.line = srcline
3098             self.parent += literal_block
3099         else:
3100             self.parent += self.reporter.warning(
3101                 'Literal block expected; none found.',
3102                 line=self.state_machine.abs_line_number()
3103                 )  # src not available, statemachine.input_lines is empty
3104             self.state_machine.previous_line()
3105         self.parent += self.messages
3106         return []
3107
3108     def indent(self, match, context, next_state):
3109         assert context, ('QuotedLiteralBlock.indent: context should not '
3110                          'be empty!')
3111         self.messages.append(
3112             self.reporter.error('Unexpected indentation.',
3113                                 line=self.state_machine.abs_line_number()))
3114         self.state_machine.previous_line()
3115         raise EOFError
3116
3117     def initial_quoted(self, match, context, next_state):
3118         """Match arbitrary quote character on the first line only."""
3119         self.remove_transition('initial_quoted')
3120         quote = match.string[0]
3121         pattern = re.compile(re.escape(quote))
3122         # New transition matches consistent quotes only:
3123         self.add_transition('quoted',
3124                             (pattern, self.quoted, self.__class__.__name__))
3125         self.initial_lineno = self.state_machine.abs_line_number()
3126         return [match.string], next_state, []
3127
3128     def quoted(self, match, context, next_state):
3129         """Match consistent quotes on subsequent lines."""
3130         context.append(match.string)
3131         return context, next_state, []
3132
3133     def text(self, match, context, next_state):
3134         if context:
3135             self.messages.append(
3136                 self.reporter.error('Inconsistent literal block quoting.',
3137                                     line=self.state_machine.abs_line_number()))
3138             self.state_machine.previous_line()
3139         raise EOFError
3140
3141
3142 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3143                  OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3144                  Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3145 """Standard set of State classes used to start `RSTStateMachine`."""