docutils.git: docutils/docutils/parsers/rst/states.py
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 This is the ``docutils.parsers.rst.states`` module, the core of
7 the reStructuredText parser. It defines the following:
9 :Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items.
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: An auxiliary collection class.
31 :Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
36 :Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
40 :Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
43 Parser Overview
44 ===============
46 The reStructuredText parser is implemented as a recursive state machine,
47 examining its input one line at a time. To understand how the parser works,
48 please first become familiar with the `docutils.statemachine` module. In the
49 description below, references are made to classes defined in this module;
50 please see the individual classes for details.
52 Parsing proceeds as follows:
54 1. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
59 2. The method associated with the matched transition pattern is called.
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
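# Illustrative sketch, not part of the original module: the state machine
# described above is normally driven through the docutils publishing
# framework.  A minimal way to watch it work is to parse a small string and
# dump the resulting document tree (`publish_doctree` and `pformat` are
# standard docutils API):
#
#     from docutils.core import publish_doctree
#
#     doctree = publish_doctree("Title\n=====\n\nA *short* paragraph.\n")
#     print(doctree.pformat())    # pseudo-XML view of the parsed tree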
103 __docformat__ = 'reStructuredText'
106 import sys
107 import re
108 from types import FunctionType, MethodType
110 from docutils import nodes, statemachine, utils
111 from docutils import ApplicationError, DataError
112 from docutils.statemachine import StateMachineWS, StateWS
113 from docutils.nodes import fully_normalize_name as normalize_name
114 from docutils.nodes import whitespace_normalize_name
115 import docutils.parsers.rst
116 from docutils.parsers.rst import directives, languages, tableparser, roles
117 from docutils.parsers.rst.languages import en as _fallback_language_module
118 from docutils.utils import escape2null, unescape, column_width
119 from docutils.utils import punctuation_chars, roman, urischemes
120 from docutils.utils import split_escaped_whitespace
122 class MarkupError(DataError): pass
123 class UnknownInterpretedRoleError(DataError): pass
124 class InterpretedRoleNotImplementedError(DataError): pass
125 class ParserError(ApplicationError): pass
126 class MarkupMismatch(Exception): pass
129 class Struct:
131 """Stores data attributes for dotted-attribute access."""
133 def __init__(self, **keywordargs):
134 self.__dict__.update(keywordargs)
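# Illustrative sketch, not part of the original module: `Struct` is a bare
# namespace object; keyword arguments become attributes.
#
#     info = Struct(language='en', section_level=0)
#     assert info.language == 'en' and info.section_level == 0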
137 class RSTStateMachine(StateMachineWS):
140 reStructuredText's master StateMachine.
142 The entry point to reStructuredText parsing is the `run()` method.
145 def run(self, input_lines, document, input_offset=0, match_titles=True,
146 inliner=None):
148 Parse `input_lines` and modify the `document` node in place.
150 Extend `StateMachineWS.run()`: set up parse-global data and
151 run the StateMachine.
153 self.language = languages.get_language(
154 document.settings.language_code)
155 self.match_titles = match_titles
156 if inliner is None:
157 inliner = Inliner()
158 inliner.init_customizations(document.settings)
159 self.memo = Struct(document=document,
160 reporter=document.reporter,
161 language=self.language,
162 title_styles=[],
163 section_level=0,
164 section_bubble_up_kludge=False,
165 inliner=inliner)
166 self.document = document
167 self.attach_observer(document.note_source)
168 self.reporter = self.memo.reporter
169 self.node = document
170 results = StateMachineWS.run(self, input_lines, input_offset,
171 input_source=document['source'])
172 assert results == [], 'RSTStateMachine.run() results should be empty!'
173 self.node = self.memo = None # remove unneeded references
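# Illustrative sketch, not part of the original class: `RSTStateMachine` is
# normally created and run by `docutils.parsers.rst.Parser.parse()`.  Driving
# the parser by hand looks roughly like this (all names below are standard
# docutils API):
#
#     from docutils.frontend import OptionParser
#     from docutils.parsers.rst import Parser
#     from docutils.utils import new_document
#
#     parser = Parser()
#     settings = OptionParser(components=(Parser,)).get_default_values()
#     document = new_document('<sketch>', settings)
#     parser.parse("One *inline* example.\n", document)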
176 class NestedStateMachine(StateMachineWS):
179 StateMachine run from within other StateMachine runs, to parse nested
180 document structures.
183 def run(self, input_lines, input_offset, memo, node, match_titles=True):
185 Parse `input_lines` and populate a `docutils.nodes.document` instance.
187 Extend `StateMachineWS.run()`: set up document-wide data.
189 self.match_titles = match_titles
190 self.memo = memo
191 self.document = memo.document
192 self.attach_observer(self.document.note_source)
193 self.reporter = memo.reporter
194 self.language = memo.language
195 self.node = node
196 results = StateMachineWS.run(self, input_lines, input_offset)
197 assert results == [], ('NestedStateMachine.run() results should be '
198 'empty!')
199 return results
202 class RSTState(StateWS):
205 reStructuredText State superclass.
207 Contains methods used by all State subclasses.
210 nested_sm = NestedStateMachine
211 nested_sm_cache = []
213 def __init__(self, state_machine, debug=False):
214 self.nested_sm_kwargs = {'state_classes': state_classes,
215 'initial_state': 'Body'}
216 StateWS.__init__(self, state_machine, debug)
218 def runtime_init(self):
219 StateWS.runtime_init(self)
220 memo = self.state_machine.memo
221 self.memo = memo
222 self.reporter = memo.reporter
223 self.inliner = memo.inliner
224 self.document = memo.document
225 self.parent = self.state_machine.node
226 # enable the reporter to determine source and source-line
227 if not hasattr(self.reporter, 'get_source_and_line'):
228 self.reporter.get_source_and_line = self.state_machine.get_source_and_line
231 def goto_line(self, abs_line_offset):
233 Jump to input line `abs_line_offset`, ignoring jumps past the end.
235 try:
236 self.state_machine.goto_line(abs_line_offset)
237 except EOFError:
238 pass
240 def no_match(self, context, transitions):
242 Override `StateWS.no_match` to generate a system message.
244 This code should never be run.
246 self.reporter.severe(
247 'Internal error: no transition pattern match. State: "%s"; '
248 'transitions: %s; context: %s; current line: %r.'
249 % (self.__class__.__name__, transitions, context,
250 self.state_machine.line))
251 return context, None, []
253 def bof(self, context):
254 """Called at beginning of file."""
255 return [], []
257 def nested_parse(self, block, input_offset, node, match_titles=False,
258 state_machine_class=None, state_machine_kwargs=None):
260 Create a new StateMachine rooted at `node` and run it over the input
261 `block`.
263 use_default = 0
264 if state_machine_class is None:
265 state_machine_class = self.nested_sm
266 use_default += 1
267 if state_machine_kwargs is None:
268 state_machine_kwargs = self.nested_sm_kwargs
269 use_default += 1
270 block_length = len(block)
272 state_machine = None
273 if use_default == 2:
274 try:
275 state_machine = self.nested_sm_cache.pop()
276 except IndexError:
277 pass
278 if not state_machine:
279 state_machine = state_machine_class(debug=self.debug,
280 **state_machine_kwargs)
281 state_machine.run(block, input_offset, memo=self.memo,
282 node=node, match_titles=match_titles)
283 if use_default == 2:
284 self.nested_sm_cache.append(state_machine)
285 else:
286 state_machine.unlink()
287 new_offset = state_machine.abs_line_offset()
288 # No `block.parent` implies disconnected -- lines aren't in sync:
289 if block.parent and (len(block) - block_length) != 0:
290 # Adjustment for block if modified in nested parse:
291 self.state_machine.next_line(len(block) - block_length)
292 return new_offset
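# Illustrative sketch, not part of the original method: directive
# implementations commonly reuse this entry point to parse their own content
# into a container node, e.g. inside a `Directive.run()` method:
#
#     node = nodes.container()
#     self.state.nested_parse(self.content, self.content_offset, node)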
294 def nested_list_parse(self, block, input_offset, node, initial_state,
295 blank_finish,
296 blank_finish_state=None,
297 extra_settings={},
298 match_titles=False,
299 state_machine_class=None,
300 state_machine_kwargs=None):
302 Create a new StateMachine rooted at `node` and run it over the input
303 `block`. Also keep track of optional intermediate blank lines and the
304 required final one.
306 if state_machine_class is None:
307 state_machine_class = self.nested_sm
308 if state_machine_kwargs is None:
309 state_machine_kwargs = self.nested_sm_kwargs.copy()
310 state_machine_kwargs['initial_state'] = initial_state
311 state_machine = state_machine_class(debug=self.debug,
312 **state_machine_kwargs)
313 if blank_finish_state is None:
314 blank_finish_state = initial_state
315 state_machine.states[blank_finish_state].blank_finish = blank_finish
316 for key, value in extra_settings.items():
317 setattr(state_machine.states[initial_state], key, value)
318 state_machine.run(block, input_offset, memo=self.memo,
319 node=node, match_titles=match_titles)
320 blank_finish = state_machine.states[blank_finish_state].blank_finish
321 state_machine.unlink()
322 return state_machine.abs_line_offset(), blank_finish
324 def section(self, title, source, style, lineno, messages):
325 """Check for a valid subsection and create one if it checks out."""
326 if self.check_subsection(source, style, lineno):
327 self.new_subsection(title, lineno, messages)
329 def check_subsection(self, source, style, lineno):
331 Check for a valid subsection header. Return 1 (true) or None (false).
333 When a new section is reached that isn't a subsection of the current
334 section, back up the line count (use ``previous_line(-x)``), then
335 ``raise EOFError``. The current StateMachine will finish, then the
336 calling StateMachine can re-examine the title. This will work its way
337 back up the calling chain until the correct section level is reached.
339 @@@ Alternative: Evaluate the title, store the title info & level, and
340 back up the chain until that level is reached. Store in memo? Or
341 return in results?
343 :Exception: `EOFError` when a sibling or supersection encountered.
345 memo = self.memo
346 title_styles = memo.title_styles
347 mylevel = memo.section_level
348 try: # check for existing title style
349 level = title_styles.index(style) + 1
350 except ValueError: # new title style
351 if len(title_styles) == memo.section_level: # new subsection
352 title_styles.append(style)
353 return 1
354 else: # not at lowest level
355 self.parent += self.title_inconsistent(source, lineno)
356 return None
357 if level <= mylevel: # sibling or supersection
358 memo.section_level = level # bubble up to parent section
359 if len(style) == 2:
360 memo.section_bubble_up_kludge = True
361 # back up 2 lines for underline title, 3 for overline title
362 self.state_machine.previous_line(len(style) + 1)
363 raise EOFError # let parent section re-evaluate
364 if level == mylevel + 1: # immediate subsection
365 return 1
366 else: # invalid subsection
367 self.parent += self.title_inconsistent(source, lineno)
368 return None
370 def title_inconsistent(self, sourcetext, lineno):
371 error = self.reporter.severe(
372 'Title level inconsistent:', nodes.literal_block('', sourcetext),
373 line=lineno)
374 return error
376 def new_subsection(self, title, lineno, messages):
377 """Append new subsection to document tree. On return, check level."""
378 memo = self.memo
379 mylevel = memo.section_level
380 memo.section_level += 1
381 section_node = nodes.section()
382 self.parent += section_node
383 textnodes, title_messages = self.inline_text(title, lineno)
384 titlenode = nodes.title(title, '', *textnodes)
385 name = normalize_name(titlenode.astext())
386 section_node['names'].append(name)
387 section_node += titlenode
388 section_node += messages
389 section_node += title_messages
390 self.document.note_implicit_target(section_node, section_node)
391 offset = self.state_machine.line_offset + 1
392 absoffset = self.state_machine.abs_line_offset() + 1
393 newabsoffset = self.nested_parse(
394 self.state_machine.input_lines[offset:], input_offset=absoffset,
395 node=section_node, match_titles=True)
396 self.goto_line(newabsoffset)
397 if memo.section_level <= mylevel: # can't handle next section?
398 raise EOFError # bubble up to supersection
399 # reset section_level; next pass will detect it properly
400 memo.section_level = mylevel
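# Illustrative sketch, not part of the original method: `memo.title_styles`
# records adornment styles in the order first seen, so the input below gives
# "Top" level 1 and "Nested" level 2; reusing '=' afterwards makes the parser
# back up and re-evaluate "Next top" as a sibling of "Top" (level 1):
#
#     Top
#     ===
#     Nested
#     ------
#     Next top
#     ========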
402 def paragraph(self, lines, lineno):
404 Return a list (paragraph & messages) & a boolean: literal_block next?
406 data = '\n'.join(lines).rstrip()
407 if re.search(r'(?<!\\)(\\\\)*::$', data):
408 if len(data) == 2:
409 return [], 1
410 elif data[-3] in ' \n':
411 text = data[:-3].rstrip()
412 else:
413 text = data[:-1]
414 literalnext = 1
415 else:
416 text = data
417 literalnext = 0
418 textnodes, messages = self.inline_text(text, lineno)
419 p = nodes.paragraph(data, '', *textnodes)
420 p.source, p.line = self.state_machine.get_source_and_line(lineno)
421 return [p] + messages, literalnext
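# Illustrative sketch, not part of the original method: the regular
# expression above looks for an unescaped trailing "::" announcing a literal
# block.  For example:
#
#     "Example::"    -> paragraph text "Example:",  literalnext == 1
#     "Example: ::"  -> paragraph text "Example:",  literalnext == 1
#     "::"           -> no paragraph (expanded form), literalnext == 1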
423 def inline_text(self, text, lineno):
425 Return 2 lists: nodes (text and inline elements), and system_messages.
427 nodes, messages = self.inliner.parse(text, lineno,
428 self.memo, self.parent)
429 return nodes, messages
431 def unindent_warning(self, node_name):
432 # the actual problem is one line below the current line
433 lineno = self.state_machine.abs_line_number()+1
434 return self.reporter.warning('%s ends without a blank line; '
435 'unexpected unindent.' % node_name,
436 line=lineno)
439 def build_regexp(definition, compile=True):
441 Build, compile and return a regular expression based on `definition`.
443 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
444 where "parts" is a list of regular expressions and/or regular
445 expression definitions to be joined into an or-group.
447 name, prefix, suffix, parts = definition
448 part_strings = []
449 for part in parts:
450 if type(part) is tuple:
451 part_strings.append(build_regexp(part, None))
452 else:
453 part_strings.append(part)
454 or_group = '|'.join(part_strings)
455 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
456 if compile:
457 return re.compile(regexp, re.UNICODE)
458 else:
459 return regexp
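# Illustrative sketch, not part of the original module: a nested definition
# is flattened into a single named or-group.
#
#     >>> pattern = build_regexp(('start', '', '',
#     ...     [r'\*\*', r'\*(?!\*)', ('literal', '', '', [r'``'])]))
#     >>> pattern.match('**strong').groupdict()
#     {'start': '**', 'literal': None}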
462 class Inliner:
465 Parse inline markup; call the `parse()` method.
468 def __init__(self):
469 self.implicit_dispatch = []
470 """List of (pattern, bound method) tuples, used by
471 `self.implicit_inline`."""
473 def init_customizations(self, settings):
474 # lookahead and look-behind expressions for inline markup rules
475 if getattr(settings, 'character_level_inline_markup', False):
476 start_string_prefix = u'(^|(?<!\x00))'
477 end_string_suffix = u''
478 else:
479 start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
480 (punctuation_chars.openers,
481 punctuation_chars.delimiters))
482 end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
483 (punctuation_chars.closing_delimiters,
484 punctuation_chars.delimiters,
485 punctuation_chars.closers))
486 args = locals().copy()
487 args.update(vars(self.__class__))
489 parts = ('initial_inline', start_string_prefix, '',
490 [('start', '', self.non_whitespace_after, # simple start-strings
491 [r'\*\*', # strong
492 r'\*(?!\*)', # emphasis but not strong
493 r'``', # literal
494 r'_`', # inline internal target
495 r'\|(?!\|)'] # substitution reference
497 ('whole', '', end_string_suffix, # whole constructs
498 [# reference name & end-string
499 r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
500 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
501 [r'[0-9]+', # manually numbered
502 r'\#(%s)?' % self.simplename, # auto-numbered (w/ label?)
503 r'\*', # auto-symbol
504 r'(?P<citationlabel>%s)' % self.simplename] # citation reference
508 ('backquote', # interpreted text or phrase reference
509 '(?P<role>(:%s:)?)' % self.simplename, # optional role
510 self.non_whitespace_after,
511 ['`(?!`)'] # but not literal
515 self.start_string_prefix = start_string_prefix
516 self.end_string_suffix = end_string_suffix
517 self.parts = parts
519 self.patterns = Struct(
520 initial=build_regexp(parts),
521 emphasis=re.compile(self.non_whitespace_escape_before
522 + r'(\*)' + end_string_suffix, re.UNICODE),
523 strong=re.compile(self.non_whitespace_escape_before
524 + r'(\*\*)' + end_string_suffix, re.UNICODE),
525 interpreted_or_phrase_ref=re.compile(
526 r"""
527 %(non_unescaped_whitespace_escape_before)s
530 (?P<suffix>
531 (?P<role>:%(simplename)s:)?
532 (?P<refend>__?)?
535 %(end_string_suffix)s
536 """ % args, re.VERBOSE | re.UNICODE),
537 embedded_link=re.compile(
538 r"""
540 (?:[ \n]+|^) # spaces or beginning of line/string
541 < # open bracket
542 %(non_whitespace_after)s
543 (([^<>]|\x00[<>])+) # anything but unescaped angle brackets
544 %(non_whitespace_escape_before)s
545 > # close bracket
547 $ # end of string
548 """ % args, re.VERBOSE | re.UNICODE),
549 literal=re.compile(self.non_whitespace_before + '(``)'
550 + end_string_suffix, re.UNICODE),
551 target=re.compile(self.non_whitespace_escape_before
552 + r'(`)' + end_string_suffix, re.UNICODE),
553 substitution_ref=re.compile(self.non_whitespace_escape_before
554 + r'(\|_{0,2})'
555 + end_string_suffix, re.UNICODE),
556 email=re.compile(self.email_pattern % args + '$',
557 re.VERBOSE | re.UNICODE),
558 uri=re.compile(
559 (r"""
560 %(start_string_prefix)s
561 (?P<whole>
562 (?P<absolute> # absolute URI
563 (?P<scheme> # scheme (http, ftp, mailto)
564 [a-zA-Z][a-zA-Z0-9.+-]*
568 ( # either:
569 (//?)? # hierarchical URI
570 %(uric)s* # URI characters
571 %(uri_end)s # final URI char
573 ( # optional query
574 \?%(uric)s*
575 %(uri_end)s
577 ( # optional fragment
578 \#%(uric)s*
579 %(uri_end)s
583 | # *OR*
584 (?P<email> # email address
585 """ + self.email_pattern + r"""
588 %(end_string_suffix)s
589 """) % args, re.VERBOSE | re.UNICODE),
590 pep=re.compile(
591 r"""
592 %(start_string_prefix)s
594 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
596 (PEP\s+(?P<pepnum2>\d+)) # reference by name
598 %(end_string_suffix)s""" % args, re.VERBOSE | re.UNICODE),
599 rfc=re.compile(
600 r"""
601 %(start_string_prefix)s
602 (RFC(-|\s+)?(?P<rfcnum>\d+))
603 %(end_string_suffix)s""" % args, re.VERBOSE | re.UNICODE))
605 self.implicit_dispatch.append((self.patterns.uri,
606 self.standalone_uri))
607 if settings.pep_references:
608 self.implicit_dispatch.append((self.patterns.pep,
609 self.pep_reference))
610 if settings.rfc_references:
611 self.implicit_dispatch.append((self.patterns.rfc,
612 self.rfc_reference))
614 def parse(self, text, lineno, memo, parent):
615 # Needs to be refactored for nested inline markup.
616 # Add nested_parse() method?
618 Return 2 lists: nodes (text and inline elements), and system_messages.
620 Using `self.patterns.initial`, a pattern which matches start-strings
621 (emphasis, strong, interpreted, phrase reference, literal,
622 substitution reference, and inline target) and complete constructs
623 (simple reference, footnote reference), search for a candidate. When
624 one is found, check for validity (e.g., not a quoted '*' character).
625 If valid, search for the corresponding end string if applicable, and
626 check it for validity. If not found or invalid, generate a warning
627 and ignore the start-string. Implicit inline markup (e.g. standalone
628 URIs) is found last.
630 self.reporter = memo.reporter
631 self.document = memo.document
632 self.language = memo.language
633 self.parent = parent
634 pattern_search = self.patterns.initial.search
635 dispatch = self.dispatch
636 remaining = escape2null(text)
637 processed = []
638 unprocessed = []
639 messages = []
640 while remaining:
641 match = pattern_search(remaining)
642 if match:
643 groups = match.groupdict()
644 method = dispatch[groups['start'] or groups['backquote']
645 or groups['refend'] or groups['fnend']]
646 before, inlines, remaining, sysmessages = method(self, match,
647 lineno)
648 unprocessed.append(before)
649 messages += sysmessages
650 if inlines:
651 processed += self.implicit_inline(''.join(unprocessed),
652 lineno)
653 processed += inlines
654 unprocessed = []
655 else:
656 break
657 remaining = ''.join(unprocessed) + remaining
658 if remaining:
659 processed += self.implicit_inline(remaining, lineno)
660 return processed, messages
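# Illustrative sketch, not part of the original method: for the text
# "a *b* c", the initial pattern matches the "*" start-string, `dispatch`
# selects `emphasis()`, and the result is roughly
# [Text('a '), emphasis('*b*', 'b'), Text(' c')] with no system messages.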
662 # Inline object recognition
663 # -------------------------
664 # See also init_customizations().
665 non_whitespace_before = r'(?<!\s)'
666 non_whitespace_escape_before = r'(?<![\s\x00])'
667 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
668 non_whitespace_after = r'(?!\s)'
669 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
670 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
671 # Valid URI characters (see RFC 2396 & RFC 2732);
672 # final \x00 allows backslash escapes in URIs:
673 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
674 # Delimiter indicating the end of a URI (not part of the URI):
675 uri_end_delim = r"""[>]"""
676 # Last URI character; same as uric but no punctuation:
677 urilast = r"""[_~*/=+a-zA-Z0-9]"""
678 # End of a URI (either 'urilast' or 'uric followed by a
679 # uri_end_delim'):
680 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
681 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
682 email_pattern = r"""
683 %(emailc)s+(?:\.%(emailc)s+)* # name
684 (?<!\x00)@ # at
685 %(emailc)s+(?:\.%(emailc)s*)* # host
686 %(uri_end)s # final URI char
689 def quoted_start(self, match):
690 """Test if inline markup start-string is 'quoted'.
692 'Quoted' in this context means the start-string is enclosed in a pair
693 of matching opening/closing delimiters (not necessarily quotes)
694 or at the end of the match.
696 string = match.string
697 start = match.start()
698 if start == 0: # start-string at beginning of text
699 return False
700 prestart = string[start - 1]
701 try:
702 poststart = string[match.end()]
703 except IndexError: # start-string at end of text
704 return True # not "quoted" but no markup start-string either
705 return punctuation_chars.match_chars(prestart, poststart)
707 def inline_obj(self, match, lineno, end_pattern, nodeclass,
708 restore_backslashes=False):
709 string = match.string
710 matchstart = match.start('start')
711 matchend = match.end('start')
712 if self.quoted_start(match):
713 return (string[:matchend], [], string[matchend:], [], '')
714 endmatch = end_pattern.search(string[matchend:])
715 if endmatch and endmatch.start(1): # 1 or more chars
716 _text = endmatch.string[:endmatch.start(1)]
717 text = unescape(_text, restore_backslashes)
718 textend = matchend + endmatch.end(1)
719 rawsource = unescape(string[matchstart:textend], True)
720 node = nodeclass(rawsource, text)
721 node[0].rawsource = unescape(_text, True)
722 return (string[:matchstart], [node],
723 string[textend:], [], endmatch.group(1))
724 msg = self.reporter.warning(
725 'Inline %s start-string without end-string.'
726 % nodeclass.__name__, line=lineno)
727 text = unescape(string[matchstart:matchend], True)
728 rawsource = unescape(string[matchstart:matchend], True)
729 prb = self.problematic(text, rawsource, msg)
730 return string[:matchstart], [prb], string[matchend:], [msg], ''
732 def problematic(self, text, rawsource, message):
733 msgid = self.document.set_id(message, self.parent)
734 problematic = nodes.problematic(rawsource, text, refid=msgid)
735 prbid = self.document.set_id(problematic)
736 message.add_backref(prbid)
737 return problematic
739 def emphasis(self, match, lineno):
740 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
741 match, lineno, self.patterns.emphasis, nodes.emphasis)
742 return before, inlines, remaining, sysmessages
744 def strong(self, match, lineno):
745 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
746 match, lineno, self.patterns.strong, nodes.strong)
747 return before, inlines, remaining, sysmessages
749 def interpreted_or_phrase_ref(self, match, lineno):
750 end_pattern = self.patterns.interpreted_or_phrase_ref
751 string = match.string
752 matchstart = match.start('backquote')
753 matchend = match.end('backquote')
754 rolestart = match.start('role')
755 role = match.group('role')
756 position = ''
757 if role:
758 role = role[1:-1]
759 position = 'prefix'
760 elif self.quoted_start(match):
761 return (string[:matchend], [], string[matchend:], [])
762 endmatch = end_pattern.search(string[matchend:])
763 if endmatch and endmatch.start(1): # 1 or more chars
764 textend = matchend + endmatch.end()
765 if endmatch.group('role'):
766 if role:
767 msg = self.reporter.warning(
768 'Multiple roles in interpreted text (both '
769 'prefix and suffix present; only one allowed).',
770 line=lineno)
771 text = unescape(string[rolestart:textend], True)
772 prb = self.problematic(text, text, msg)
773 return string[:rolestart], [prb], string[textend:], [msg]
774 role = endmatch.group('suffix')[1:-1]
775 position = 'suffix'
776 escaped = endmatch.string[:endmatch.start(1)]
777 rawsource = unescape(string[matchstart:textend], True)
778 if rawsource[-1:] == '_':
779 if role:
780 msg = self.reporter.warning(
781 'Mismatch: both interpreted text role %s and '
782 'reference suffix.' % position, line=lineno)
783 text = unescape(string[rolestart:textend], True)
784 prb = self.problematic(text, text, msg)
785 return string[:rolestart], [prb], string[textend:], [msg]
786 return self.phrase_ref(string[:matchstart], string[textend:],
787 rawsource, escaped, unescape(escaped))
788 else:
789 rawsource = unescape(string[rolestart:textend], True)
790 nodelist, messages = self.interpreted(rawsource, escaped, role,
791 lineno)
792 return (string[:rolestart], nodelist,
793 string[textend:], messages)
794 msg = self.reporter.warning(
795 'Inline interpreted text or phrase reference start-string '
796 'without end-string.', line=lineno)
797 text = unescape(string[matchstart:matchend], True)
798 prb = self.problematic(text, text, msg)
799 return string[:matchstart], [prb], string[matchend:], [msg]
801 def phrase_ref(self, before, after, rawsource, escaped, text):
802 match = self.patterns.embedded_link.search(escaped)
803 if match: # embedded <URI> or <alias_>
804 text = unescape(escaped[:match.start(0)])
805 rawtext = unescape(escaped[:match.start(0)], True)
806 aliastext = unescape(match.group(2))
807 rawaliastext = unescape(match.group(2), True)
808 underscore_escaped = rawaliastext.endswith(r'\_')
809 if aliastext.endswith('_') and not (underscore_escaped
810 or self.patterns.uri.match(aliastext)):
811 aliastype = 'name'
812 alias = normalize_name(aliastext[:-1])
813 target = nodes.target(match.group(1), refname=alias)
814 target.indirect_reference_name = aliastext[:-1]
815 else:
816 aliastype = 'uri'
817 alias_parts = split_escaped_whitespace(match.group(2))
818 alias = ' '.join(''.join(unescape(part).split())
819 for part in alias_parts)
820 alias = self.adjust_uri(alias)
821 if alias.endswith(r'\_'):
822 alias = alias[:-2] + '_'
823 target = nodes.target(match.group(1), refuri=alias)
824 target.referenced = 1
825 if not aliastext:
826 raise ApplicationError('problem with embedded link: %r'
827 % aliastext)
828 if not text:
829 text = alias
830 rawtext = rawaliastext
831 else:
832 target = None
833 rawtext = unescape(escaped, True)
835 refname = normalize_name(text)
836 reference = nodes.reference(rawsource, text,
837 name=whitespace_normalize_name(text))
838 reference[0].rawsource = rawtext
840 node_list = [reference]
842 if rawsource[-2:] == '__':
843 if target and (aliastype == 'name'):
844 reference['refname'] = alias
845 self.document.note_refname(reference)
846 # self.document.note_indirect_target(target) # required?
847 elif target and (aliastype == 'uri'):
848 reference['refuri'] = alias
849 else:
850 reference['anonymous'] = 1
851 else:
852 if target:
853 target['names'].append(refname)
854 if aliastype == 'name':
855 reference['refname'] = alias
856 self.document.note_indirect_target(target)
857 self.document.note_refname(reference)
858 else:
859 reference['refuri'] = alias
860 self.document.note_explicit_target(target, self.parent)
861 # target.note_referenced_by(name=refname)
862 node_list.append(target)
863 else:
864 reference['refname'] = refname
865 self.document.note_refname(reference)
866 return before, node_list, after, []
869 def adjust_uri(self, uri):
870 match = self.patterns.email.match(uri)
871 if match:
872 return 'mailto:' + uri
873 else:
874 return uri
876 def interpreted(self, rawsource, text, role, lineno):
877 role_fn, messages = roles.role(role, self.language, lineno,
878 self.reporter)
879 if role_fn:
880 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
881 try:
882 nodes[0][0].rawsource = unescape(text, True)
883 except IndexError:
884 pass
885 return nodes, messages + messages2
886 else:
887 msg = self.reporter.error(
888 'Unknown interpreted text role "%s".' % role,
889 line=lineno)
890 return ([self.problematic(rawsource, rawsource, msg)],
891 messages + [msg])
893 def literal(self, match, lineno):
894 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
895 match, lineno, self.patterns.literal, nodes.literal,
896 restore_backslashes=True)
897 return before, inlines, remaining, sysmessages
899 def inline_internal_target(self, match, lineno):
900 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
901 match, lineno, self.patterns.target, nodes.target)
902 if inlines and isinstance(inlines[0], nodes.target):
903 assert len(inlines) == 1
904 target = inlines[0]
905 name = normalize_name(target.astext())
906 target['names'].append(name)
907 self.document.note_explicit_target(target, self.parent)
908 return before, inlines, remaining, sysmessages
910 def substitution_reference(self, match, lineno):
911 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
912 match, lineno, self.patterns.substitution_ref,
913 nodes.substitution_reference)
914 if len(inlines) == 1:
915 subref_node = inlines[0]
916 if isinstance(subref_node, nodes.substitution_reference):
917 subref_text = subref_node.astext()
918 self.document.note_substitution_ref(subref_node, subref_text)
919 if endstring[-1:] == '_':
920 reference_node = nodes.reference(
921 '|%s%s' % (subref_text, endstring), '')
922 if endstring[-2:] == '__':
923 reference_node['anonymous'] = 1
924 else:
925 reference_node['refname'] = normalize_name(subref_text)
926 self.document.note_refname(reference_node)
927 reference_node += subref_node
928 inlines = [reference_node]
929 return before, inlines, remaining, sysmessages
931 def footnote_reference(self, match, lineno):
933 Handles `nodes.footnote_reference` and `nodes.citation_reference`
934 elements.
936 label = match.group('footnotelabel')
937 refname = normalize_name(label)
938 string = match.string
939 before = string[:match.start('whole')]
940 remaining = string[match.end('whole'):]
941 if match.group('citationlabel'):
942 refnode = nodes.citation_reference('[%s]_' % label,
943 refname=refname)
944 refnode += nodes.Text(label)
945 self.document.note_citation_ref(refnode)
946 else:
947 refnode = nodes.footnote_reference('[%s]_' % label)
948 if refname[0] == '#':
949 refname = refname[1:]
950 refnode['auto'] = 1
951 self.document.note_autofootnote_ref(refnode)
952 elif refname == '*':
953 refname = ''
954 refnode['auto'] = '*'
955 self.document.note_symbol_footnote_ref(
956 refnode)
957 else:
958 refnode += nodes.Text(label)
959 if refname:
960 refnode['refname'] = refname
961 self.document.note_footnote_ref(refnode)
962 if utils.get_trim_footnote_ref_space(self.document.settings):
963 before = before.rstrip()
964 return (before, [refnode], remaining, [])
966 def reference(self, match, lineno, anonymous=False):
967 referencename = match.group('refname')
968 refname = normalize_name(referencename)
969 referencenode = nodes.reference(
970 referencename + match.group('refend'), referencename,
971 name=whitespace_normalize_name(referencename))
972 referencenode[0].rawsource = referencename
973 if anonymous:
974 referencenode['anonymous'] = 1
975 else:
976 referencenode['refname'] = refname
977 self.document.note_refname(referencenode)
978 string = match.string
979 matchstart = match.start('whole')
980 matchend = match.end('whole')
981 return (string[:matchstart], [referencenode], string[matchend:], [])
983 def anonymous_reference(self, match, lineno):
984 return self.reference(match, lineno, anonymous=1)
986 def standalone_uri(self, match, lineno):
987 if (not match.group('scheme')
988 or match.group('scheme').lower() in urischemes.schemes):
989 if match.group('email'):
990 addscheme = 'mailto:'
991 else:
992 addscheme = ''
993 text = match.group('whole')
994 unescaped = unescape(text)
995 rawsource = unescape(text, True)
996 reference = nodes.reference(rawsource, unescaped,
997 refuri=addscheme + unescaped)
998 reference[0].rawsource = rawsource
999 return [reference]
1000 else: # not a valid scheme
1001 raise MarkupMismatch
1003 def pep_reference(self, match, lineno):
1004 text = match.group(0)
1005 if text.startswith('pep-'):
1006 pepnum = int(match.group('pepnum1'))
1007 elif text.startswith('PEP'):
1008 pepnum = int(match.group('pepnum2'))
1009 else:
1010 raise MarkupMismatch
1011 ref = (self.document.settings.pep_base_url
1012 + self.document.settings.pep_file_url_template % pepnum)
1013 unescaped = unescape(text)
1014 return [nodes.reference(unescape(text, True), unescaped, refuri=ref)]
1016 rfc_url = 'rfc%d.html'
1018 def rfc_reference(self, match, lineno):
1019 text = match.group(0)
1020 if text.startswith('RFC'):
1021 rfcnum = int(match.group('rfcnum'))
1022 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
1023 else:
1024 raise MarkupMismatch
1025 unescaped = unescape(text)
1026 return [nodes.reference(unescape(text, True), unescaped, refuri=ref)]
1028 def implicit_inline(self, text, lineno):
1030 Check each of the patterns in `self.implicit_dispatch` for a match,
1031 and dispatch to the stored method for the pattern. Recursively check
1032 the text before and after the match. Return a list of `nodes.Text`
1033 and inline element nodes.
1035 if not text:
1036 return []
1037 for pattern, method in self.implicit_dispatch:
1038 match = pattern.search(text)
1039 if match:
1040 try:
1041 # Must recurse on strings before *and* after the match;
1042 # there may be multiple patterns.
1043 return (self.implicit_inline(text[:match.start()], lineno)
1044 + method(match, lineno) +
1045 self.implicit_inline(text[match.end():], lineno))
1046 except MarkupMismatch:
1047 pass
1048 return [nodes.Text(unescape(text), rawsource=unescape(text, True))]
1050 dispatch = {'*': emphasis,
1051 '**': strong,
1052 '`': interpreted_or_phrase_ref,
1053 '``': literal,
1054 '_`': inline_internal_target,
1055 ']_': footnote_reference,
1056 '|': substitution_reference,
1057 '_': reference,
1058 '__': anonymous_reference}
1061 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1062 return ord(s) - _zero
1064 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1065 return ord(s) - _zero
1067 def _lowerroman_to_int(s):
1068 return roman.fromRoman(s.upper())
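# Illustrative sketch, not part of the original module: these converters map
# enumerator text to ordinals, e.g.
#
#     _loweralpha_to_int('c')   == 3
#     _upperalpha_to_int('B')   == 2
#     _lowerroman_to_int('iv')  == 4
#     roman.fromRoman('XII')    == 12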
1071 class Body(RSTState):
1074 Generic classifier of the first line of a block.
1077 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1078 """Padding character for East Asian double-width text."""
1080 enum = Struct()
1081 """Enumerated list parsing information."""
1083 enum.formatinfo = {
1084 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1085 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1086 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1087 enum.formats = enum.formatinfo.keys()
1088 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1089 'lowerroman', 'upperroman'] # ORDERED!
1090 enum.sequencepats = {'arabic': '[0-9]+',
1091 'loweralpha': '[a-z]',
1092 'upperalpha': '[A-Z]',
1093 'lowerroman': '[ivxlcdm]+',
1094 'upperroman': '[IVXLCDM]+',}
1095 enum.converters = {'arabic': int,
1096 'loweralpha': _loweralpha_to_int,
1097 'upperalpha': _upperalpha_to_int,
1098 'lowerroman': _lowerroman_to_int,
1099 'upperroman': roman.fromRoman}
1101 enum.sequenceregexps = {}
1102 for sequence in enum.sequences:
1103 enum.sequenceregexps[sequence] = re.compile(
1104 enum.sequencepats[sequence] + '$', re.UNICODE)
1106 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1107 """Matches the top (& bottom) of a full table."""
1109 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1110 """Matches the top of a simple table."""
1112 simple_table_border_pat = re.compile('=+[ =]*$')
1113 """Matches the bottom & header bottom of a simple table."""
1115 pats = {}
1116 """Fragments of patterns used by transitions."""
1118 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1119 pats['alpha'] = '[a-zA-Z]'
1120 pats['alphanum'] = '[a-zA-Z0-9]'
1121 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1122 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1123 '|%(upperroman)s|#)' % enum.sequencepats)
1124 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1125 # @@@ Loosen up the pattern? Allow Unicode?
1126 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1127 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1128 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1129 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1131 for format in enum.formats:
1132 pats[format] = '(?P<%s>%s%s%s)' % (
1133 format, re.escape(enum.formatinfo[format].prefix),
1134 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1136 patterns = {
1137 'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
1138 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1139 'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
1140 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1141 'doctest': r'>>>( +|$)',
1142 'line_block': r'\|( +|$)',
1143 'grid_table_top': grid_table_top_pat,
1144 'simple_table_top': simple_table_top_pat,
1145 'explicit_markup': r'\.\.( +|$)',
1146 'anonymous': r'__( +|$)',
1147 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1148 'text': r''}
1149 initial_transitions = (
1150 'bullet',
1151 'enumerator',
1152 'field_marker',
1153 'option_marker',
1154 'doctest',
1155 'line_block',
1156 'grid_table_top',
1157 'simple_table_top',
1158 'explicit_markup',
1159 'anonymous',
1160 'line',
1161 'text')
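# Illustrative sketch, not part of the original class: each transition name
# above pairs a pattern in `patterns` with the method of the same name, e.g.
#
#     re.match(patterns['bullet'], '- item')              # -> Body.bullet()
#     re.match(patterns['enumerator'], '3. item')         # -> Body.enumerator()
#     re.match(patterns['explicit_markup'], '.. note::')  # -> Body.explicit_markup()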
1163 def indent(self, match, context, next_state):
1164 """Block quote."""
1165 indented, indent, line_offset, blank_finish = \
1166 self.state_machine.get_indented()
1167 elements = self.block_quote(indented, line_offset)
1168 self.parent += elements
1169 if not blank_finish:
1170 self.parent += self.unindent_warning('Block quote')
1171 return context, next_state, []
1173 def block_quote(self, indented, line_offset):
1174 elements = []
1175 while indented:
1176 (blockquote_lines,
1177 attribution_lines,
1178 attribution_offset,
1179 indented,
1180 new_line_offset) = self.split_attribution(indented, line_offset)
1181 blockquote = nodes.block_quote()
1182 self.nested_parse(blockquote_lines, line_offset, blockquote)
1183 elements.append(blockquote)
1184 if attribution_lines:
1185 attribution, messages = self.parse_attribution(
1186 attribution_lines, attribution_offset)
1187 blockquote += attribution
1188 elements += messages
1189 line_offset = new_line_offset
1190 while indented and not indented[0]:
1191 indented = indented[1:]
1192 line_offset += 1
1193 return elements
1195 # U+2014 is an em-dash:
1196 attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
1197 re.UNICODE)
1199 def split_attribution(self, indented, line_offset):
1201 Check for a block quote attribution and split it off:
1203 * First line after a blank line must begin with a dash ("--", "---",
1204 em-dash; matches `self.attribution_pattern`).
1205 * Every line after that must have consistent indentation.
1206 * Attributions must be preceded by block quote content.
1208 Return a tuple of: (block quote content lines, attribution lines,
1209 attribution offset, remaining indented lines, new line offset).
1211 blank = None
1212 nonblank_seen = False
1213 for i in range(len(indented)):
1214 line = indented[i].rstrip()
1215 if line:
1216 if nonblank_seen and blank == i - 1: # last line blank
1217 match = self.attribution_pattern.match(line)
1218 if match:
1219 attribution_end, indent = self.check_attribution(
1220 indented, i)
1221 if attribution_end:
1222 a_lines = indented[i:attribution_end]
1223 a_lines.trim_left(match.end(), end=1)
1224 a_lines.trim_left(indent, start=1)
1225 return (indented[:i], a_lines,
1226 i, indented[attribution_end:],
1227 line_offset + attribution_end)
1228 nonblank_seen = True
1229 else:
1230 blank = i
1231 else:
1232 return (indented, None, None, None, None)
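# Illustrative sketch, not part of the original method: in
#
#     Quoted text.
#
#     -- Attribution Name
#
# the last line (after a blank line, beginning with "--") is split off as the
# attribution; "---" and the em-dash U+2014 are accepted as well.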
1234 def check_attribution(self, indented, attribution_start):
1236 Check attribution shape.
1237 Return the index past the end of the attribution, and the indent.
1239 indent = None
1240 i = attribution_start + 1
1241 for i in range(attribution_start + 1, len(indented)):
1242 line = indented[i].rstrip()
1243 if not line:
1244 break
1245 if indent is None:
1246 indent = len(line) - len(line.lstrip())
1247 elif len(line) - len(line.lstrip()) != indent:
1248 return None, None # bad shape; not an attribution
1249 else:
1250 # return index of line after last attribution line:
1251 i += 1
1252 return i, (indent or 0)
1254 def parse_attribution(self, indented, line_offset):
1255 text = '\n'.join(indented).rstrip()
1256 lineno = self.state_machine.abs_line_number() + line_offset
1257 textnodes, messages = self.inline_text(text, lineno)
1258 node = nodes.attribution(text, '', *textnodes)
1259 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1260 return node, messages
1262 def bullet(self, match, context, next_state):
1263 """Bullet list item."""
1264 bulletlist = nodes.bullet_list()
1265 (bulletlist.source,
1266 bulletlist.line) = self.state_machine.get_source_and_line()
1267 self.parent += bulletlist
1268 bulletlist['bullet'] = match.string[0]
1269 i, blank_finish = self.list_item(match.end())
1270 bulletlist += i
1271 offset = self.state_machine.line_offset + 1 # next line
1272 new_line_offset, blank_finish = self.nested_list_parse(
1273 self.state_machine.input_lines[offset:],
1274 input_offset=self.state_machine.abs_line_offset() + 1,
1275 node=bulletlist, initial_state='BulletList',
1276 blank_finish=blank_finish)
1277 self.goto_line(new_line_offset)
1278 if not blank_finish:
1279 self.parent += self.unindent_warning('Bullet list')
1280 return [], next_state, []
1282 def list_item(self, indent):
1283 if self.state_machine.line[indent:]:
1284 indented, line_offset, blank_finish = (
1285 self.state_machine.get_known_indented(indent))
1286 else:
1287 indented, indent, line_offset, blank_finish = (
1288 self.state_machine.get_first_known_indented(indent))
1289 listitem = nodes.list_item('\n'.join(indented))
1290 if indented:
1291 self.nested_parse(indented, input_offset=line_offset,
1292 node=listitem)
1293 return listitem, blank_finish
1295 def enumerator(self, match, context, next_state):
1296 """Enumerated List Item"""
1297 format, sequence, text, ordinal = self.parse_enumerator(match)
1298 if not self.is_enumerated_list_item(ordinal, sequence, format):
1299 raise statemachine.TransitionCorrection('text')
1300 enumlist = nodes.enumerated_list()
1301 self.parent += enumlist
1302 if sequence == '#':
1303 enumlist['enumtype'] = 'arabic'
1304 else:
1305 enumlist['enumtype'] = sequence
1306 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1307 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1308 if ordinal != 1:
1309 enumlist['start'] = ordinal
1310 msg = self.reporter.info(
1311 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1312 % (text, ordinal))
1313 self.parent += msg
1314 listitem, blank_finish = self.list_item(match.end())
1315 enumlist += listitem
1316 offset = self.state_machine.line_offset + 1 # next line
1317 newline_offset, blank_finish = self.nested_list_parse(
1318 self.state_machine.input_lines[offset:],
1319 input_offset=self.state_machine.abs_line_offset() + 1,
1320 node=enumlist, initial_state='EnumeratedList',
1321 blank_finish=blank_finish,
1322 extra_settings={'lastordinal': ordinal,
1323 'format': format,
1324 'auto': sequence == '#'})
1325 self.goto_line(newline_offset)
1326 if not blank_finish:
1327 self.parent += self.unindent_warning('Enumerated list')
1328 return [], next_state, []
1330 def parse_enumerator(self, match, expected_sequence=None):
1332 Analyze an enumerator and return the results.
1334 :Return:
1335 - the enumerator format ('period', 'parens', or 'rparen'),
1336 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1337 - the text of the enumerator, stripped of formatting, and
1338 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1339 ``None`` is returned for invalid enumerator text).
1341 The enumerator format has already been determined by the regular
1342 expression match. If `expected_sequence` is given, that sequence is
1343 tried first. If not, we check for Roman numeral 1. This way,
1344 single-character Roman numerals (which are also alphabetical) can be
1345 matched. If no sequence has been matched, all sequences are checked in
1346 order.
1348 groupdict = match.groupdict()
1349 sequence = ''
1350 for format in self.enum.formats:
1351 if groupdict[format]: # was this the format matched?
1352 break # yes; keep `format`
1353 else: # shouldn't happen
1354 raise ParserError('enumerator format not matched')
1355 text = groupdict[format][self.enum.formatinfo[format].start
1356 :self.enum.formatinfo[format].end]
1357 if text == '#':
1358 sequence = '#'
1359 elif expected_sequence:
1360 try:
1361 if self.enum.sequenceregexps[expected_sequence].match(text):
1362 sequence = expected_sequence
1363 except KeyError: # shouldn't happen
1364 raise ParserError('unknown enumerator sequence: %s'
1365 % sequence)
1366 elif text == 'i':
1367 sequence = 'lowerroman'
1368 elif text == 'I':
1369 sequence = 'upperroman'
1370 if not sequence:
1371 for sequence in self.enum.sequences:
1372 if self.enum.sequenceregexps[sequence].match(text):
1373 break
1374 else: # shouldn't happen
1375 raise ParserError('enumerator sequence not matched')
1376 if sequence == '#':
1377 ordinal = 1
1378 else:
1379 try:
1380 ordinal = self.enum.converters[sequence](text)
1381 except roman.InvalidRomanNumeralError:
1382 ordinal = None
1383 return format, sequence, text, ordinal
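# Illustrative sketch, not part of the original method: for the marker
# "(iv) ", `format` is 'parens', `sequence` is 'lowerroman', `text` is 'iv',
# and the ordinal is self.enum.converters['lowerroman']('iv') == 4.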
1385 def is_enumerated_list_item(self, ordinal, sequence, format):
1387 Check validity based on the ordinal value and the second line.
1389 Return true if the ordinal is valid and the second line is blank,
1390 indented, or starts with the next enumerator or an auto-enumerator.
1392 if ordinal is None:
1393 return None
1394 try:
1395 next_line = self.state_machine.next_line()
1396 except EOFError: # end of input lines
1397 self.state_machine.previous_line()
1398 return 1
1399 else:
1400 self.state_machine.previous_line()
1401 if not next_line[:1].strip(): # blank or indented
1402 return 1
1403 result = self.make_enumerator(ordinal + 1, sequence, format)
1404 if result:
1405 next_enumerator, auto_enumerator = result
1406 try:
1407 if ( next_line.startswith(next_enumerator) or
1408 next_line.startswith(auto_enumerator) ):
1409 return 1
1410 except TypeError:
1411 pass
1412 return None
1414 def make_enumerator(self, ordinal, sequence, format):
1416 Construct and return the next enumerated list item marker, and an
1417 auto-enumerator ("#" instead of the regular enumerator).
1419 Return ``None`` for invalid (out of range) ordinals.
1420 """ #"
1421 if sequence == '#':
1422 enumerator = '#'
1423 elif sequence == 'arabic':
1424 enumerator = str(ordinal)
1425 else:
1426 if sequence.endswith('alpha'):
1427 if ordinal > 26:
1428 return None
1429 enumerator = chr(ordinal + ord('a') - 1)
1430 elif sequence.endswith('roman'):
1431 try:
1432 enumerator = roman.toRoman(ordinal)
1433 except roman.RomanError:
1434 return None
1435 else: # shouldn't happen
1436 raise ParserError('unknown enumerator sequence: "%s"'
1437 % sequence)
1438 if sequence.startswith('lower'):
1439 enumerator = enumerator.lower()
1440 elif sequence.startswith('upper'):
1441 enumerator = enumerator.upper()
1442 else: # shouldn't happen
1443 raise ParserError('unknown enumerator sequence: "%s"'
1444 % sequence)
1445 formatinfo = self.enum.formatinfo[format]
1446 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1447 + ' ')
1448 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1449 return next_enumerator, auto_enumerator
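# Illustrative sketch, not part of the original method: with ordinal 2,
# sequence 'loweralpha', and format 'parens', the result is
# ('(b) ', '(#) '); ordinals past 26 ('z') return None.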
1451 def field_marker(self, match, context, next_state):
1452 """Field list item."""
1453 field_list = nodes.field_list()
1454 self.parent += field_list
1455 field, blank_finish = self.field(match)
1456 field_list += field
1457 offset = self.state_machine.line_offset + 1 # next line
1458 newline_offset, blank_finish = self.nested_list_parse(
1459 self.state_machine.input_lines[offset:],
1460 input_offset=self.state_machine.abs_line_offset() + 1,
1461 node=field_list, initial_state='FieldList',
1462 blank_finish=blank_finish)
1463 self.goto_line(newline_offset)
1464 if not blank_finish:
1465 self.parent += self.unindent_warning('Field list')
1466 return [], next_state, []
1468 def field(self, match):
1469 name = self.parse_field_marker(match)
1470 src, srcline = self.state_machine.get_source_and_line()
1471 lineno = self.state_machine.abs_line_number()
1472 indented, indent, line_offset, blank_finish = \
1473 self.state_machine.get_first_known_indented(match.end())
1474 field_node = nodes.field()
1475 field_node.source = src
1476 field_node.line = srcline
1477 name_nodes, name_messages = self.inline_text(name, lineno)
1478 field_node += nodes.field_name(name, '', *name_nodes)
1479 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1480 field_node += field_body
1481 if indented:
1482 self.parse_field_body(indented, line_offset, field_body)
1483 return field_node, blank_finish
1485 def parse_field_marker(self, match):
1486 """Extract & return field name from a field marker match."""
1487 field = match.group()[1:] # strip off leading ':'
1488 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1489 return field
1491 def parse_field_body(self, indented, offset, node):
1492 self.nested_parse(indented, input_offset=offset, node=node)
1494 def option_marker(self, match, context, next_state):
1495 """Option list item."""
1496 optionlist = nodes.option_list()
1497 (optionlist.source, optionlist.line) = self.state_machine.get_source_and_line()
1498 try:
1499 listitem, blank_finish = self.option_list_item(match)
1500 except MarkupError as error:
1501 # This shouldn't happen; pattern won't match.
1502 msg = self.reporter.error(u'Invalid option list marker: %s' %
1503 error)
1504 self.parent += msg
1505 indented, indent, line_offset, blank_finish = \
1506 self.state_machine.get_first_known_indented(match.end())
1507 elements = self.block_quote(indented, line_offset)
1508 self.parent += elements
1509 if not blank_finish:
1510 self.parent += self.unindent_warning('Option list')
1511 return [], next_state, []
1512 self.parent += optionlist
1513 optionlist += listitem
1514 offset = self.state_machine.line_offset + 1 # next line
1515 newline_offset, blank_finish = self.nested_list_parse(
1516 self.state_machine.input_lines[offset:],
1517 input_offset=self.state_machine.abs_line_offset() + 1,
1518 node=optionlist, initial_state='OptionList',
1519 blank_finish=blank_finish)
1520 self.goto_line(newline_offset)
1521 if not blank_finish:
1522 self.parent += self.unindent_warning('Option list')
1523 return [], next_state, []
1525 def option_list_item(self, match):
1526 offset = self.state_machine.abs_line_offset()
1527 options = self.parse_option_marker(match)
1528 indented, indent, line_offset, blank_finish = \
1529 self.state_machine.get_first_known_indented(match.end())
1530 if not indented: # not an option list item
1531 self.goto_line(offset)
1532 raise statemachine.TransitionCorrection('text')
1533 option_group = nodes.option_group('', *options)
1534 description = nodes.description('\n'.join(indented))
1535 option_list_item = nodes.option_list_item('', option_group,
1536 description)
1537 if indented:
1538 self.nested_parse(indented, input_offset=line_offset,
1539 node=description)
1540 return option_list_item, blank_finish
1542 def parse_option_marker(self, match):
1544 Return a list of `node.option` and `node.option_argument` objects,
1545 parsed from an option marker match.
1547 :Exception: `MarkupError` for invalid option markers.
1549 optlist = []
1550 optionstrings = match.group().rstrip().split(', ')
1551 for optionstring in optionstrings:
1552 tokens = optionstring.split()
1553 delimiter = ' '
1554 firstopt = tokens[0].split('=', 1)
1555 if len(firstopt) > 1:
1556 # "--opt=value" form
1557 tokens[:1] = firstopt
1558 delimiter = '='
1559 elif (len(tokens[0]) > 2
1560 and ((tokens[0].startswith('-')
1561 and not tokens[0].startswith('--'))
1562 or tokens[0].startswith('+'))):
1563 # "-ovalue" form
1564 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1565 delimiter = ''
1566 if len(tokens) > 1 and (tokens[1].startswith('<')
1567 and tokens[-1].endswith('>')):
1568 # "-o <value1 value2>" form; join all values into one token
1569 tokens[1:] = [' '.join(tokens[1:])]
1570 if 0 < len(tokens) <= 2:
1571 option = nodes.option(optionstring)
1572 option += nodes.option_string(tokens[0], tokens[0])
1573 if len(tokens) > 1:
1574 option += nodes.option_argument(tokens[1], tokens[1],
1575 delimiter=delimiter)
1576 optlist.append(option)
1577 else:
1578 raise MarkupError(
1579 'wrong number of option tokens (=%s), should be 1 or 2: '
1580 '"%s"' % (len(tokens), optionstring))
1581 return optlist
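# Illustrative sketch, not part of the original method: the marker
# "-o FILE, --output=FILE" yields two `option` nodes, the first with an
# option_argument delimited by ' ' and the second delimited by '='.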
1583 def doctest(self, match, context, next_state):
1584 data = '\n'.join(self.state_machine.get_text_block())
1585 # TODO: prepend class value ['pycon'] (Python Console)
1586 # parse with `directives.body.CodeBlock` (returns literal-block
1587 # with class "code" and syntax highlight markup).
1588 self.parent += nodes.doctest_block(data, data)
1589 return [], next_state, []
1591 def line_block(self, match, context, next_state):
1592 """First line of a line block."""
1593 block = nodes.line_block()
1594 self.parent += block
1595 lineno = self.state_machine.abs_line_number()
1596 line, messages, blank_finish = self.line_block_line(match, lineno)
1597 block += line
1598 self.parent += messages
1599 if not blank_finish:
1600 offset = self.state_machine.line_offset + 1 # next line
1601 new_line_offset, blank_finish = self.nested_list_parse(
1602 self.state_machine.input_lines[offset:],
1603 input_offset=self.state_machine.abs_line_offset() + 1,
1604 node=block, initial_state='LineBlock',
1605 blank_finish=0)
1606 self.goto_line(new_line_offset)
1607 if not blank_finish:
1608 self.parent += self.reporter.warning(
1609 'Line block ends without a blank line.',
1610 line=lineno+1)
1611 if len(block):
1612 if block[0].indent is None:
1613 block[0].indent = 0
1614 self.nest_line_block_lines(block)
1615 return [], next_state, []
1617 def line_block_line(self, match, lineno):
1618 """Return one line element of a line_block."""
1619 indented, indent, line_offset, blank_finish = \
1620 self.state_machine.get_first_known_indented(match.end(),
1621 until_blank=True)
1622 text = u'\n'.join(indented)
1623 text_nodes, messages = self.inline_text(text, lineno)
1624 line = nodes.line(text, '', *text_nodes)
1625 if match.string.rstrip() != '|': # not empty
1626 line.indent = len(match.group(1)) - 1
1627 return line, messages, blank_finish
1629 def nest_line_block_lines(self, block):
1630 for index in range(1, len(block)):
1631 if getattr(block[index], 'indent', None) is None:
1632 block[index].indent = block[index - 1].indent
1633 self.nest_line_block_segment(block)
1635 def nest_line_block_segment(self, block):
1636 indents = [item.indent for item in block]
1637 least = min(indents)
1638 new_items = []
1639 new_block = nodes.line_block()
1640 for item in block:
1641 if item.indent > least:
1642 new_block.append(item)
1643 else:
1644 if len(new_block):
1645 self.nest_line_block_segment(new_block)
1646 new_items.append(new_block)
1647 new_block = nodes.line_block()
1648 new_items.append(item)
1649 if len(new_block):
1650 self.nest_line_block_segment(new_block)
1651 new_items.append(new_block)
1652 block[:] = new_items
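# Rough illustration of the nesting performed above: in a line block such as
#
#     | A one, a two
#     |     a three, a four
#     |     a five
#     | a six
#
# the two more-indented lines are gathered into a nested line_block child of
# the outer block, by recursively grouping runs of items whose indent is
# greater than the minimum indent of the segment.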
1654 def grid_table_top(self, match, context, next_state):
1655 """Top border of a full table."""
1656 return self.table_top(match, context, next_state,
1657 self.isolate_grid_table,
1658 tableparser.GridTableParser)
1660 def simple_table_top(self, match, context, next_state):
1661 """Top border of a simple table."""
1662 return self.table_top(match, context, next_state,
1663 self.isolate_simple_table,
1664 tableparser.SimpleTableParser)
1666 def table_top(self, match, context, next_state,
1667 isolate_function, parser_class):
1668 """Top border of a generic table."""
1669 nodelist, blank_finish = self.table(isolate_function, parser_class)
1670 self.parent += nodelist
1671 if not blank_finish:
1672 msg = self.reporter.warning(
1673 'Blank line required after table.',
1674 line=self.state_machine.abs_line_number()+1)
1675 self.parent += msg
1676 return [], next_state, []
1678 def table(self, isolate_function, parser_class):
1679 """Parse a table."""
1680 block, messages, blank_finish = isolate_function()
1681 if block:
1682 try:
1683 parser = parser_class()
1684 tabledata = parser.parse(block)
1685 tableline = (self.state_machine.abs_line_number() - len(block)
1686 + 1)
1687 table = self.build_table(tabledata, tableline)
1688 nodelist = [table] + messages
1689 except tableparser.TableMarkupError, err:
1690 nodelist = self.malformed_table(block, ' '.join(err.args),
1691 offset=err.offset) + messages
1692 else:
1693 nodelist = messages
1694 return nodelist, blank_finish
1696 def isolate_grid_table(self):
1697 messages = []
1698 blank_finish = 1
1699 try:
1700 block = self.state_machine.get_text_block(flush_left=True)
1701 except statemachine.UnexpectedIndentationError, err:
1702 block, src, srcline = err.args
1703 messages.append(self.reporter.error('Unexpected indentation.',
1704 source=src, line=srcline))
1705 blank_finish = 0
1706 block.disconnect()
1707 # for East Asian chars:
1708 block.pad_double_width(self.double_width_pad_char)
1709 width = len(block[0].strip())
1710 for i in range(len(block)):
1711 block[i] = block[i].strip()
1712 if block[i][0] not in '+|': # check left edge
1713 blank_finish = 0
1714 self.state_machine.previous_line(len(block) - i)
1715 del block[i:]
1716 break
1717 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1718 blank_finish = 0
1719 # from second-last to third line of table:
1720 for i in range(len(block) - 2, 1, -1):
1721 if self.grid_table_top_pat.match(block[i]):
1722 self.state_machine.previous_line(len(block) - i + 1)
1723 del block[i+1:]
1724 break
1725 else:
1726 messages.extend(self.malformed_table(block))
1727 return [], messages, blank_finish
1728 for i in range(len(block)): # check right edge
1729 if len(block[i]) != width or block[i][-1] not in '+|':
1730 messages.extend(self.malformed_table(block))
1731 return [], messages, blank_finish
1732 return block, messages, blank_finish
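# An illustrative grid table of the kind isolated above:
#
#     +------------+------------+
#     | Header 1   | Header 2   |
#     +============+============+
#     | body row   | body row   |
#     +------------+------------+
#
# The block must be flush left; every line must begin and end with '+' or
# '|' and have the same width as the top border, and the last line must
# match the grid_table_top pattern.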
1734 def isolate_simple_table(self):
1735 start = self.state_machine.line_offset
1736 lines = self.state_machine.input_lines
1737 limit = len(lines) - 1
1738 toplen = len(lines[start].strip())
1739 pattern_match = self.simple_table_border_pat.match
1740 found = 0
1741 found_at = None
1742 i = start + 1
1743 while i <= limit:
1744 line = lines[i]
1745 match = pattern_match(line)
1746 if match:
1747 if len(line.strip()) != toplen:
1748 self.state_machine.next_line(i - start)
1749 messages = self.malformed_table(
1750 lines[start:i+1], 'Bottom/header table border does '
1751 'not match top border.')
1752 return [], messages, i == limit or not lines[i+1].strip()
1753 found += 1
1754 found_at = i
1755 if found == 2 or i == limit or not lines[i+1].strip():
1756 end = i
1757 break
1758 i += 1
1759 else: # reached end of input_lines
1760 if found:
1761 extra = ' or no blank line after table bottom'
1762 self.state_machine.next_line(found_at - start)
1763 block = lines[start:found_at+1]
1764 else:
1765 extra = ''
1766 self.state_machine.next_line(i - start - 1)
1767 block = lines[start:]
1768 messages = self.malformed_table(
1769 block, 'No bottom table border found%s.' % extra)
1770 return [], messages, not extra
1771 self.state_machine.next_line(end - start)
1772 block = lines[start:end+1]
1773 # for East Asian chars:
1774 block.pad_double_width(self.double_width_pad_char)
1775 return block, [], end == limit or not lines[end+1].strip()
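# An illustrative simple table of the kind isolated above: border lines of
# '=' columns delimit the table, and isolation stops at the second border
# after the top (or at a border followed by a blank line):
#
#     =====  =====
#     A      B
#     =====  =====
#     1      2
#     =====  =====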
1777 def malformed_table(self, block, detail='', offset=0):
1778 block.replace(self.double_width_pad_char, '')
1779 data = '\n'.join(block)
1780 message = 'Malformed table.'
1781 startline = self.state_machine.abs_line_number() - len(block) + 1
1782 if detail:
1783 message += '\n' + detail
1784 error = self.reporter.error(message, nodes.literal_block(data, data),
1785 line=startline+offset)
1786 return [error]
1788 def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1789 colwidths, headrows, bodyrows = tabledata
1790 table = nodes.table()
1791 if widths == 'auto':
1792 table['classes'] += ['colwidths-auto']
1793 elif widths: # "grid" or list of integers
1794 table['classes'] += ['colwidths-given']
1795 tgroup = nodes.tgroup(cols=len(colwidths))
1796 table += tgroup
1797 for colwidth in colwidths:
1798 colspec = nodes.colspec(colwidth=colwidth)
1799 if stub_columns:
1800 colspec.attributes['stub'] = 1
1801 stub_columns -= 1
1802 tgroup += colspec
1803 if headrows:
1804 thead = nodes.thead()
1805 tgroup += thead
1806 for row in headrows:
1807 thead += self.build_table_row(row, tableline)
1808 tbody = nodes.tbody()
1809 tgroup += tbody
1810 for row in bodyrows:
1811 tbody += self.build_table_row(row, tableline)
1812 return table
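# Shape of the `tabledata` triple consumed above (values illustrative):
#
#     colwidths = [10, 10]                   # one width per column
#     headrows  = [[(0, 0, 1, ['Header'])]]  # rows of cells; each cell is
#     bodyrows  = [[(0, 0, 3, ['text'])]]    #   (morerows, morecols,
#                                            #    line offset, text block)
#                                            #   or None for a spanned cell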
1814 def build_table_row(self, rowdata, tableline):
1815 row = nodes.row()
1816 for cell in rowdata:
1817 if cell is None:
1818 continue
1819 morerows, morecols, offset, cellblock = cell
1820 attributes = {}
1821 if morerows:
1822 attributes['morerows'] = morerows
1823 if morecols:
1824 attributes['morecols'] = morecols
1825 entry = nodes.entry(**attributes)
1826 row += entry
1827 if ''.join(cellblock):
1828 self.nested_parse(cellblock, input_offset=tableline+offset,
1829 node=entry)
1830 return row
1833 explicit = Struct()
1834 """Patterns and constants used for explicit markup recognition."""
1836 explicit.patterns = Struct(
1837 target=re.compile(r"""
1838 (
1839 _ # anonymous target
1840 | # *OR*
1841 (?!_) # no underscore at the beginning
1842 (?P<quote>`?) # optional open quote
1843 (?![ `]) # first char. not space or
1844 # backquote
1845 (?P<name> # reference name
1846 .+?
1847 )
1848 %(non_whitespace_escape_before)s
1849 (?P=quote) # close quote if open quote used
1850 )
1851 (?<!(?<!\x00):) # no unescaped colon at end
1852 %(non_whitespace_escape_before)s
1853 [ ]? # optional space
1854 : # end of reference name
1855 ([ ]+|$) # followed by whitespace
1856 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1857 reference=re.compile(r"""
1858 (
1859 (?P<simple>%(simplename)s)_
1860 | # *OR*
1861 ` # open backquote
1862 (?![ ]) # not space
1863 (?P<phrase>.+?) # hyperlink phrase
1864 %(non_whitespace_escape_before)s
1865 `_ # close backquote,
1866 # reference mark
1867 )
1868 $ # end of string
1869 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1870 substitution=re.compile(r"""
1871 (
1872 (?![ ]) # first char. not space
1873 (?P<name>.+?) # substitution text
1874 %(non_whitespace_escape_before)s
1875 \| # close delimiter
1876 )
1877 ([ ]+|$) # followed by whitespace
1878 """ % vars(Inliner),
1879 re.VERBOSE | re.UNICODE),)
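# Roughly, these patterns classify the remainder of an explicit markup block
# once the ".. _" or ".. |" opener has been consumed (examples illustrative):
#
#     target:        matched against "name: http://x.example/"
#                    or "`phrase name`: http://x.example/"
#     reference:     matched against "name_" or "`phrase reference`_"
#                    (used by `is_reference` below)
#     substitution:  matched against "name| replace:: ..." up to the
#                    closing '|'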
1881 def footnote(self, match):
1882 src, srcline = self.state_machine.get_source_and_line()
1883 indented, indent, offset, blank_finish = \
1884 self.state_machine.get_first_known_indented(match.end())
1885 label = match.group(1)
1886 name = normalize_name(label)
1887 footnote = nodes.footnote('\n'.join(indented))
1888 footnote.source = src
1889 footnote.line = srcline
1890 if name[0] == '#': # auto-numbered
1891 name = name[1:] # autonumber label
1892 footnote['auto'] = 1
1893 if name:
1894 footnote['names'].append(name)
1895 self.document.note_autofootnote(footnote)
1896 elif name == '*': # auto-symbol
1897 name = ''
1898 footnote['auto'] = '*'
1899 self.document.note_symbol_footnote(footnote)
1900 else: # manually numbered
1901 footnote += nodes.label('', label)
1902 footnote['names'].append(name)
1903 self.document.note_footnote(footnote)
1904 if name:
1905 self.document.note_explicit_target(footnote, footnote)
1906 else:
1907 self.document.set_id(footnote, footnote)
1908 if indented:
1909 self.nested_parse(indented, input_offset=offset, node=footnote)
1910 return [footnote], blank_finish
1912 def citation(self, match):
1913 src, srcline = self.state_machine.get_source_and_line()
1914 indented, indent, offset, blank_finish = \
1915 self.state_machine.get_first_known_indented(match.end())
1916 label = match.group(1)
1917 name = normalize_name(label)
1918 citation = nodes.citation('\n'.join(indented))
1919 citation.source = src
1920 citation.line = srcline
1921 citation += nodes.label('', label)
1922 citation['names'].append(name)
1923 self.document.note_citation(citation)
1924 self.document.note_explicit_target(citation, citation)
1925 if indented:
1926 self.nested_parse(indented, input_offset=offset, node=citation)
1927 return [citation], blank_finish
1929 def hyperlink_target(self, match):
1930 pattern = self.explicit.patterns.target
1931 lineno = self.state_machine.abs_line_number()
1932 block, indent, offset, blank_finish = \
1933 self.state_machine.get_first_known_indented(
1934 match.end(), until_blank=True, strip_indent=False)
1935 blocktext = match.string[:match.end()] + '\n'.join(block)
1936 block = [escape2null(line) for line in block]
1937 escaped = block[0]
1938 blockindex = 0
1939 while True:
1940 targetmatch = pattern.match(escaped)
1941 if targetmatch:
1942 break
1943 blockindex += 1
1944 try:
1945 escaped += block[blockindex]
1946 except IndexError:
1947 raise MarkupError('malformed hyperlink target.')
1948 del block[:blockindex]
1949 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1950 target = self.make_target(block, blocktext, lineno,
1951 targetmatch.group('name'))
1952 return [target], blank_finish
1954 def make_target(self, block, block_text, lineno, target_name):
1955 target_type, data = self.parse_target(block, block_text, lineno)
1956 if target_type == 'refname':
1957 target = nodes.target(block_text, '', refname=normalize_name(data))
1958 target.indirect_reference_name = data
1959 self.add_target(target_name, '', target, lineno)
1960 self.document.note_indirect_target(target)
1961 return target
1962 elif target_type == 'refuri':
1963 target = nodes.target(block_text, '')
1964 self.add_target(target_name, data, target, lineno)
1965 return target
1966 else:
1967 return data
1969 def parse_target(self, block, block_text, lineno):
1970 """
1971 Determine the type of reference of a target.
1973 :Return: A 2-tuple, one of:
1975 - 'refname' and the indirect reference name
1976 - 'refuri' and the URI
1977 - 'malformed' and a system_message node
1978 """
1979 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1980 reference = ' '.join([line.strip() for line in block])
1981 refname = self.is_reference(reference)
1982 if refname:
1983 return 'refname', refname
1984 ref_parts = split_escaped_whitespace(' '.join(block))
1985 reference = ' '.join(''.join(unescape(part).split())
1986 for part in ref_parts)
1987 return 'refuri', reference
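# Illustrative results of the classification above:
#
#     .. _one: two_                ->  ('refname', 'two')   indirect target
#     .. _one: http://x.example/   ->  ('refuri', 'http://x.example/')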
1989 def is_reference(self, reference):
1990 match = self.explicit.patterns.reference.match(
1991 whitespace_normalize_name(reference))
1992 if not match:
1993 return None
1994 return unescape(match.group('simple') or match.group('phrase'))
1996 def add_target(self, targetname, refuri, target, lineno):
1997 target.line = lineno
1998 if targetname:
1999 name = normalize_name(unescape(targetname))
2000 target['names'].append(name)
2001 if refuri:
2002 uri = self.inliner.adjust_uri(refuri)
2003 if uri:
2004 target['refuri'] = uri
2005 else:
2006 raise ApplicationError('problem with URI: %r' % refuri)
2007 self.document.note_explicit_target(target, self.parent)
2008 else: # anonymous target
2009 if refuri:
2010 target['refuri'] = refuri
2011 target['anonymous'] = 1
2012 self.document.note_anonymous_target(target)
2014 def substitution_def(self, match):
2015 pattern = self.explicit.patterns.substitution
2016 src, srcline = self.state_machine.get_source_and_line()
2017 block, indent, offset, blank_finish = \
2018 self.state_machine.get_first_known_indented(match.end(),
2019 strip_indent=False)
2020 blocktext = (match.string[:match.end()] + '\n'.join(block))
2021 block.disconnect()
2022 escaped = escape2null(block[0].rstrip())
2023 blockindex = 0
2024 while True:
2025 subdefmatch = pattern.match(escaped)
2026 if subdefmatch:
2027 break
2028 blockindex += 1
2029 try:
2030 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
2031 except IndexError:
2032 raise MarkupError('malformed substitution definition.')
2033 del block[:blockindex] # strip out the substitution marker
2034 block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
2035 if not block[0]:
2036 del block[0]
2037 offset += 1
2038 while block and not block[-1].strip():
2039 block.pop()
2040 subname = subdefmatch.group('name')
2041 substitution_node = nodes.substitution_definition(blocktext)
2042 substitution_node.source = src
2043 substitution_node.line = srcline
2044 if not block:
2045 msg = self.reporter.warning(
2046 'Substitution definition "%s" missing contents.' % subname,
2047 nodes.literal_block(blocktext, blocktext),
2048 source=src, line=srcline)
2049 return [msg], blank_finish
2050 block[0] = block[0].strip()
2051 substitution_node['names'].append(
2052 nodes.whitespace_normalize_name(subname))
2053 new_abs_offset, blank_finish = self.nested_list_parse(
2054 block, input_offset=offset, node=substitution_node,
2055 initial_state='SubstitutionDef', blank_finish=blank_finish)
2056 i = 0
2057 for node in substitution_node[:]:
2058 if not (isinstance(node, nodes.Inline) or
2059 isinstance(node, nodes.Text)):
2060 self.parent += substitution_node[i]
2061 del substitution_node[i]
2062 else:
2063 i += 1
2064 for node in substitution_node.traverse(nodes.Element):
2065 if self.disallowed_inside_substitution_definitions(node):
2066 pformat = nodes.literal_block('', node.pformat().rstrip())
2067 msg = self.reporter.error(
2068 'Substitution definition contains illegal element <%s>:'
2069 % node.tagname,
2070 pformat, nodes.literal_block(blocktext, blocktext),
2071 source=src, line=srcline)
2072 return [msg], blank_finish
2073 if len(substitution_node) == 0:
2074 msg = self.reporter.warning(
2075 'Substitution definition "%s" empty or invalid.' % subname,
2076 nodes.literal_block(blocktext, blocktext),
2077 source=src, line=srcline)
2078 return [msg], blank_finish
2079 self.document.note_substitution_def(
2080 substitution_node, subname, self.parent)
2081 return [substitution_node], blank_finish
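# A typical substitution definition handled above (illustrative):
#
#     .. |RST| replace:: reStructuredText
#
# The "|RST|" marker is stripped from the block, the remainder is parsed in
# the SubstitutionDef state (here an embedded "replace" directive), nodes
# that are not inline are moved up to the parent, and disallowed elements
# trigger an error.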
2083 def disallowed_inside_substitution_definitions(self, node):
2084 if (node['ids'] or
2085 isinstance(node, nodes.reference) and node.get('anonymous') or
2086 isinstance(node, nodes.footnote_reference) and node.get('auto')):
2087 return True
2088 else:
2089 return False
2091 def directive(self, match, **option_presets):
2092 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2093 type_name = match.group(1)
2094 directive_class, messages = directives.directive(
2095 type_name, self.memo.language, self.document)
2096 self.parent += messages
2097 if directive_class:
2098 return self.run_directive(
2099 directive_class, match, type_name, option_presets)
2100 else:
2101 return self.unknown_directive(type_name)
2103 def run_directive(self, directive, match, type_name, option_presets):
2104 """
2105 Parse a directive then run its directive function.
2107 Parameters:
2109 - `directive`: The class implementing the directive. Must be
2110 a subclass of `rst.Directive`.
2112 - `match`: A regular expression match object which matched the first
2113 line of the directive.
2115 - `type_name`: The directive name, as used in the source text.
2117 - `option_presets`: A dictionary of preset options, defaults for the
2118 directive options. Currently, only an "alt" option is passed by
2119 substitution definitions (value: the substitution name), which may
2120 be used by an embedded image directive.
2122 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2123 """
2124 if isinstance(directive, (FunctionType, MethodType)):
2125 from docutils.parsers.rst import convert_directive_function
2126 directive = convert_directive_function(directive)
2127 lineno = self.state_machine.abs_line_number()
2128 initial_line_offset = self.state_machine.line_offset
2129 indented, indent, line_offset, blank_finish \
2130 = self.state_machine.get_first_known_indented(match.end(),
2131 strip_top=0)
2132 block_text = '\n'.join(self.state_machine.input_lines[
2133 initial_line_offset : self.state_machine.line_offset + 1])
2134 try:
2135 arguments, options, content, content_offset = (
2136 self.parse_directive_block(indented, line_offset,
2137 directive, option_presets))
2138 except MarkupError, detail:
2139 error = self.reporter.error(
2140 'Error in "%s" directive:\n%s.' % (type_name,
2141 ' '.join(detail.args)),
2142 nodes.literal_block(block_text, block_text), line=lineno)
2143 return [error], blank_finish
2144 directive_instance = directive(
2145 type_name, arguments, options, content, lineno,
2146 content_offset, block_text, self, self.state_machine)
2147 try:
2148 result = directive_instance.run()
2149 except docutils.parsers.rst.DirectiveError, error:
2150 msg_node = self.reporter.system_message(error.level, error.msg,
2151 line=lineno)
2152 msg_node += nodes.literal_block(block_text, block_text)
2153 result = [msg_node]
2154 assert isinstance(result, list), \
2155 'Directive "%s" must return a list of nodes.' % type_name
2156 for i in range(len(result)):
2157 assert isinstance(result[i], nodes.Node), \
2158 ('Directive "%s" returned non-Node object (index %s): %r'
2159 % (type_name, i, result[i]))
2160 return (result,
2161 blank_finish or self.state_machine.is_next_line_blank())
2163 def parse_directive_block(self, indented, line_offset, directive,
2164 option_presets):
2165 option_spec = directive.option_spec
2166 has_content = directive.has_content
2167 if indented and not indented[0].strip():
2168 indented.trim_start()
2169 line_offset += 1
2170 while indented and not indented[-1].strip():
2171 indented.trim_end()
2172 if indented and (directive.required_arguments
2173 or directive.optional_arguments
2174 or option_spec):
2175 for i, line in enumerate(indented):
2176 if not line.strip():
2177 break
2178 else:
2179 i += 1
2180 arg_block = indented[:i]
2181 content = indented[i+1:]
2182 content_offset = line_offset + i + 1
2183 else:
2184 content = indented
2185 content_offset = line_offset
2186 arg_block = []
2187 if option_spec:
2188 options, arg_block = self.parse_directive_options(
2189 option_presets, option_spec, arg_block)
2190 else:
2191 options = {}
2192 if arg_block and not (directive.required_arguments
2193 or directive.optional_arguments):
2194 content = arg_block + indented[i:]
2195 content_offset = line_offset
2196 arg_block = []
2197 while content and not content[0].strip():
2198 content.trim_start()
2199 content_offset += 1
2200 if directive.required_arguments or directive.optional_arguments:
2201 arguments = self.parse_directive_arguments(
2202 directive, arg_block)
2203 else:
2204 arguments = []
2205 if content and not has_content:
2206 raise MarkupError('no content permitted')
2207 return (arguments, options, content, content_offset)
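# Illustrative split performed above: for a directive block such as
#
#     .. figure:: picture.png
#        :width: 200px
#
#        The caption paragraph.
#
# the lines before the first blank line form the argument/option block
# ("picture.png" plus the ":width:" field) and the rest is the content.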
2209 def parse_directive_options(self, option_presets, option_spec, arg_block):
2210 options = option_presets.copy()
2211 for i, line in enumerate(arg_block):
2212 if re.match(Body.patterns['field_marker'], line):
2213 opt_block = arg_block[i:]
2214 arg_block = arg_block[:i]
2215 break
2216 else:
2217 opt_block = []
2218 if opt_block:
2219 success, data = self.parse_extension_options(option_spec,
2220 opt_block)
2221 if success: # data is a dict of options
2222 options.update(data)
2223 else: # data is an error string
2224 raise MarkupError(data)
2225 return options, arg_block
2227 def parse_directive_arguments(self, directive, arg_block):
2228 required = directive.required_arguments
2229 optional = directive.optional_arguments
2230 arg_text = '\n'.join(arg_block)
2231 arguments = arg_text.split()
2232 if len(arguments) < required:
2233 raise MarkupError('%s argument(s) required, %s supplied'
2234 % (required, len(arguments)))
2235 elif len(arguments) > required + optional:
2236 if directive.final_argument_whitespace:
2237 arguments = arg_text.split(None, required + optional - 1)
2238 else:
2239 raise MarkupError(
2240 'maximum %s argument(s) allowed, %s supplied'
2241 % (required + optional, len(arguments)))
2242 return arguments
2244 def parse_extension_options(self, option_spec, datalines):
2245 """
2246 Parse `datalines` for a field list containing extension options
2247 matching `option_spec`.
2249 :Parameters:
2250 - `option_spec`: a mapping of option name to conversion
2251 function, which should raise an exception on bad input.
2252 - `datalines`: a list of input strings.
2254 :Return:
2255 - Success value, 1 or 0.
2256 - An option dictionary on success, an error string on failure.
2257 """
2258 node = nodes.field_list()
2259 newline_offset, blank_finish = self.nested_list_parse(
2260 datalines, 0, node, initial_state='ExtensionOptions',
2261 blank_finish=True)
2262 if newline_offset != len(datalines): # incomplete parse of block
2263 return 0, 'invalid option block'
2264 try:
2265 options = utils.extract_extension_options(node, option_spec)
2266 except KeyError, detail:
2267 return 0, ('unknown option: "%s"' % detail.args[0])
2268 except (ValueError, TypeError), detail:
2269 return 0, ('invalid option value: %s' % ' '.join(detail.args))
2270 except utils.ExtensionOptionError, detail:
2271 return 0, ('invalid option data: %s' % ' '.join(detail.args))
2272 if blank_finish:
2273 return 1, options
2274 else:
2275 return 0, 'option data incompletely parsed'
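# Illustrative use of the helper above: an option block such as
#
#     :alt: alternate text
#     :width: 200px
#
# parsed against a matching `option_spec` returns (1, {converted options}),
# while an unknown field name or an invalid value returns (0, error_string).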
2277 def unknown_directive(self, type_name):
2278 lineno = self.state_machine.abs_line_number()
2279 indented, indent, offset, blank_finish = \
2280 self.state_machine.get_first_known_indented(0, strip_indent=False)
2281 text = '\n'.join(indented)
2282 error = self.reporter.error(
2283 'Unknown directive type "%s".' % type_name,
2284 nodes.literal_block(text, text), line=lineno)
2285 return [error], blank_finish
2287 def comment(self, match):
2288 if not match.string[match.end():].strip() \
2289 and self.state_machine.is_next_line_blank(): # an empty comment?
2290 return [nodes.comment()], 1 # "A tiny but practical wart."
2291 indented, indent, offset, blank_finish = \
2292 self.state_machine.get_first_known_indented(match.end())
2293 while indented and not indented[-1].strip():
2294 indented.trim_end()
2295 text = '\n'.join(indented)
2296 return [nodes.comment(text, text)], blank_finish
2298 explicit.constructs = [
2299 (footnote,
2300 re.compile(r"""
2301 \.\.[ ]+ # explicit markup start
2302 \[
2303 ( # footnote label:
2304 [0-9]+ # manually numbered footnote
2305 | # *OR*
2306 \# # anonymous auto-numbered footnote
2307 | # *OR*
2308 \#%s # auto-number_labeled footnote
2309 | # *OR*
2310 \* # auto-symbol footnote
2311 )
2312 \]
2313 ([ ]+|$) # whitespace or end of line
2314 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2315 (citation,
2316 re.compile(r"""
2317 \.\.[ ]+ # explicit markup start
2318 \[(%s)\] # citation label
2319 ([ ]+|$) # whitespace or end of line
2320 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2321 (hyperlink_target,
2322 re.compile(r"""
2323 \.\.[ ]+ # explicit markup start
2324 _ # target indicator
2325 (?![ ]|$) # first char. not space or EOL
2326 """, re.VERBOSE | re.UNICODE)),
2327 (substitution_def,
2328 re.compile(r"""
2329 \.\.[ ]+ # explicit markup start
2330 \| # substitution indicator
2331 (?![ ]|$) # first char. not space or EOL
2332 """, re.VERBOSE | re.UNICODE)),
2333 (directive,
2334 re.compile(r"""
2335 \.\.[ ]+ # explicit markup start
2336 (%s) # directive name
2337 [ ]? # optional space
2338 :: # directive delimiter
2339 ([ ]+|$) # whitespace or end of line
2340 """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
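# Typical source forms recognized by the constructs above (illustrative):
#
#     .. [1] A footnote.                      (footnote)
#     .. [#label] An auto-numbered footnote.  (footnote)
#     .. [CIT2002] A citation.                (citation)
#     .. _target: http://x.example/           (hyperlink_target)
#     .. |name| replace:: substitution text   (substitution_def)
#     .. note:: A directive.                  (directive)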
2342 def explicit_markup(self, match, context, next_state):
2343 """Footnotes, hyperlink targets, directives, comments."""
2344 nodelist, blank_finish = self.explicit_construct(match)
2345 self.parent += nodelist
2346 self.explicit_list(blank_finish)
2347 return [], next_state, []
2349 def explicit_construct(self, match):
2350 """Determine which explicit construct this is, parse & return it."""
2351 errors = []
2352 for method, pattern in self.explicit.constructs:
2353 expmatch = pattern.match(match.string)
2354 if expmatch:
2355 try:
2356 return method(self, expmatch)
2357 except MarkupError, error:
2358 lineno = self.state_machine.abs_line_number()
2359 message = ' '.join(error.args)
2360 errors.append(self.reporter.warning(message, line=lineno))
2361 break
2362 nodelist, blank_finish = self.comment(match)
2363 return nodelist + errors, blank_finish
2365 def explicit_list(self, blank_finish):
2366 """
2367 Create a nested state machine for a series of explicit markup
2368 constructs (including anonymous hyperlink targets).
2369 """
2370 offset = self.state_machine.line_offset + 1 # next line
2371 newline_offset, blank_finish = self.nested_list_parse(
2372 self.state_machine.input_lines[offset:],
2373 input_offset=self.state_machine.abs_line_offset() + 1,
2374 node=self.parent, initial_state='Explicit',
2375 blank_finish=blank_finish,
2376 match_titles=self.state_machine.match_titles)
2377 self.goto_line(newline_offset)
2378 if not blank_finish:
2379 self.parent += self.unindent_warning('Explicit markup')
2381 def anonymous(self, match, context, next_state):
2382 """Anonymous hyperlink targets."""
2383 nodelist, blank_finish = self.anonymous_target(match)
2384 self.parent += nodelist
2385 self.explicit_list(blank_finish)
2386 return [], next_state, []
2388 def anonymous_target(self, match):
2389 lineno = self.state_machine.abs_line_number()
2390 block, indent, offset, blank_finish \
2391 = self.state_machine.get_first_known_indented(match.end(),
2392 until_blank=True)
2393 blocktext = match.string[:match.end()] + '\n'.join(block)
2394 block = [escape2null(line) for line in block]
2395 target = self.make_target(block, blocktext, lineno, '')
2396 return [target], blank_finish
2398 def line(self, match, context, next_state):
2399 """Section title overline or transition marker."""
2400 if self.state_machine.match_titles:
2401 return [match.string], 'Line', []
2402 elif match.string.strip() == '::':
2403 raise statemachine.TransitionCorrection('text')
2404 elif len(match.string.strip()) < 4:
2405 msg = self.reporter.info(
2406 'Unexpected possible title overline or transition.\n'
2407 "Treating it as ordinary text because it's so short.",
2408 line=self.state_machine.abs_line_number())
2409 self.parent += msg
2410 raise statemachine.TransitionCorrection('text')
2411 else:
2412 blocktext = self.state_machine.line
2413 msg = self.reporter.severe(
2414 'Unexpected section title or transition.',
2415 nodes.literal_block(blocktext, blocktext),
2416 line=self.state_machine.abs_line_number())
2417 self.parent += msg
2418 return [], next_state, []
2420 def text(self, match, context, next_state):
2421 """Titles, definition lists, paragraphs."""
2422 return [match.string], 'Text', []
2425 class RFC2822Body(Body):
2427 """
2428 RFC2822 headers are only valid as the first constructs in documents. As
2429 soon as anything else appears, the `Body` state should take over.
2430 """
2432 patterns = Body.patterns.copy() # can't modify the original
2433 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2434 initial_transitions = [(name, 'Body')
2435 for name in Body.initial_transitions]
2436 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2438 def rfc2822(self, match, context, next_state):
2439 """RFC2822-style field list item."""
2440 fieldlist = nodes.field_list(classes=['rfc2822'])
2441 self.parent += fieldlist
2442 field, blank_finish = self.rfc2822_field(match)
2443 fieldlist += field
2444 offset = self.state_machine.line_offset + 1 # next line
2445 newline_offset, blank_finish = self.nested_list_parse(
2446 self.state_machine.input_lines[offset:],
2447 input_offset=self.state_machine.abs_line_offset() + 1,
2448 node=fieldlist, initial_state='RFC2822List',
2449 blank_finish=blank_finish)
2450 self.goto_line(newline_offset)
2451 if not blank_finish:
2452 self.parent += self.unindent_warning(
2453 'RFC2822-style field list')
2454 return [], next_state, []
2456 def rfc2822_field(self, match):
2457 name = match.string[:match.string.find(':')]
2458 indented, indent, line_offset, blank_finish = \
2459 self.state_machine.get_first_known_indented(match.end(),
2460 until_blank=True)
2461 fieldnode = nodes.field()
2462 fieldnode += nodes.field_name(name, name)
2463 fieldbody = nodes.field_body('\n'.join(indented))
2464 fieldnode += fieldbody
2465 if indented:
2466 self.nested_parse(indented, input_offset=line_offset,
2467 node=fieldbody)
2468 return fieldnode, blank_finish
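# Illustrative RFC2822-style header handled above, valid only at the very
# start of a document:
#
#     Author: J. Random Hacker
#     Date: 2002-08-18
#
# Each "Name: value" line becomes a field in a field_list carrying the
# "rfc2822" class.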
2471 class SpecializedBody(Body):
2473 """
2474 Superclass for second and subsequent compound element members. Compound
2475 elements are lists and list-like constructs.
2477 All transition methods are disabled (redefined as `invalid_input`).
2478 Override individual methods in subclasses to re-enable.
2480 For example, once an initial bullet list item, say, is recognized, the
2481 `BulletList` subclass takes over, with a "bullet_list" node as its
2482 container. Upon encountering the initial bullet list item, `Body.bullet`
2483 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2484 starts up a nested parsing session with `BulletList` as the initial state.
2485 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2486 as only bullet list items are encountered, they are parsed and inserted
2487 into the container. The first construct which is *not* a bullet list item
2488 triggers the `invalid_input` method, which ends the nested parse and
2489 closes the container. `BulletList` needs to recognize input that is
2490 invalid in the context of a bullet list, which means everything *other
2491 than* bullet list items, so it inherits the transition list created in
2492 `Body`.
2493 """
2495 def invalid_input(self, match=None, context=None, next_state=None):
2496 """Not a compound element member. Abort this state machine."""
2497 self.state_machine.previous_line() # back up so parent SM can reassess
2498 raise EOFError
2500 indent = invalid_input
2501 bullet = invalid_input
2502 enumerator = invalid_input
2503 field_marker = invalid_input
2504 option_marker = invalid_input
2505 doctest = invalid_input
2506 line_block = invalid_input
2507 grid_table_top = invalid_input
2508 simple_table_top = invalid_input
2509 explicit_markup = invalid_input
2510 anonymous = invalid_input
2511 line = invalid_input
2512 text = invalid_input
2515 class BulletList(SpecializedBody):
2517 """Second and subsequent bullet_list list_items."""
2519 def bullet(self, match, context, next_state):
2520 """Bullet list item."""
2521 if match.string[0] != self.parent['bullet']:
2522 # different bullet: new list
2523 self.invalid_input()
2524 listitem, blank_finish = self.list_item(match.end())
2525 self.parent += listitem
2526 self.blank_finish = blank_finish
2527 return [], next_state, []
2530 class DefinitionList(SpecializedBody):
2532 """Second and subsequent definition_list_items."""
2534 def text(self, match, context, next_state):
2535 """Definition lists."""
2536 return [match.string], 'Definition', []
2539 class EnumeratedList(SpecializedBody):
2541 """Second and subsequent enumerated_list list_items."""
2543 def enumerator(self, match, context, next_state):
2544 """Enumerated list item."""
2545 format, sequence, text, ordinal = self.parse_enumerator(
2546 match, self.parent['enumtype'])
2547 if ( format != self.format
2548 or (sequence != '#' and (sequence != self.parent['enumtype']
2549 or self.auto
2550 or ordinal != (self.lastordinal + 1)))
2551 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2552 # different enumeration: new list
2553 self.invalid_input()
2554 if sequence == '#':
2555 self.auto = 1
2556 listitem, blank_finish = self.list_item(match.end())
2557 self.parent += listitem
2558 self.blank_finish = blank_finish
2559 self.lastordinal = ordinal
2560 return [], next_state, []
2563 class FieldList(SpecializedBody):
2565 """Second and subsequent field_list fields."""
2567 def field_marker(self, match, context, next_state):
2568 """Field list field."""
2569 field, blank_finish = self.field(match)
2570 self.parent += field
2571 self.blank_finish = blank_finish
2572 return [], next_state, []
2575 class OptionList(SpecializedBody):
2577 """Second and subsequent option_list option_list_items."""
2579 def option_marker(self, match, context, next_state):
2580 """Option list item."""
2581 try:
2582 option_list_item, blank_finish = self.option_list_item(match)
2583 except MarkupError:
2584 self.invalid_input()
2585 self.parent += option_list_item
2586 self.blank_finish = blank_finish
2587 return [], next_state, []
2590 class RFC2822List(SpecializedBody, RFC2822Body):
2592 """Second and subsequent RFC2822-style field_list fields."""
2594 patterns = RFC2822Body.patterns
2595 initial_transitions = RFC2822Body.initial_transitions
2597 def rfc2822(self, match, context, next_state):
2598 """RFC2822-style field list item."""
2599 field, blank_finish = self.rfc2822_field(match)
2600 self.parent += field
2601 self.blank_finish = blank_finish
2602 return [], 'RFC2822List', []
2604 blank = SpecializedBody.invalid_input
2607 class ExtensionOptions(FieldList):
2609 """
2610 Parse field_list fields for extension options.
2612 No nested parsing is done (including inline markup parsing).
2613 """
2615 def parse_field_body(self, indented, offset, node):
2616 """Override `Body.parse_field_body` for simpler parsing."""
2617 lines = []
2618 for line in list(indented) + ['']:
2619 if line.strip():
2620 lines.append(line)
2621 elif lines:
2622 text = '\n'.join(lines)
2623 node += nodes.paragraph(text, text)
2624 lines = []
2627 class LineBlock(SpecializedBody):
2629 """Second and subsequent lines of a line_block."""
2631 blank = SpecializedBody.invalid_input
2633 def line_block(self, match, context, next_state):
2634 """New line of line block."""
2635 lineno = self.state_machine.abs_line_number()
2636 line, messages, blank_finish = self.line_block_line(match, lineno)
2637 self.parent += line
2638 self.parent.parent += messages
2639 self.blank_finish = blank_finish
2640 return [], next_state, []
2643 class Explicit(SpecializedBody):
2645 """Second and subsequent explicit markup construct."""
2647 def explicit_markup(self, match, context, next_state):
2648 """Footnotes, hyperlink targets, directives, comments."""
2649 nodelist, blank_finish = self.explicit_construct(match)
2650 self.parent += nodelist
2651 self.blank_finish = blank_finish
2652 return [], next_state, []
2654 def anonymous(self, match, context, next_state):
2655 """Anonymous hyperlink targets."""
2656 nodelist, blank_finish = self.anonymous_target(match)
2657 self.parent += nodelist
2658 self.blank_finish = blank_finish
2659 return [], next_state, []
2661 blank = SpecializedBody.invalid_input
2664 class SubstitutionDef(Body):
2666 """
2667 Parser for the contents of a substitution_definition element.
2668 """
2670 patterns = {
2671 'embedded_directive': re.compile(r'(%s)::( +|$)'
2672 % Inliner.simplename, re.UNICODE),
2673 'text': r''}
2674 initial_transitions = ['embedded_directive', 'text']
2676 def embedded_directive(self, match, context, next_state):
2677 nodelist, blank_finish = self.directive(match,
2678 alt=self.parent['names'][0])
2679 self.parent += nodelist
2680 if not self.state_machine.at_eof():
2681 self.blank_finish = blank_finish
2682 raise EOFError
2684 def text(self, match, context, next_state):
2685 if not self.state_machine.at_eof():
2686 self.blank_finish = self.state_machine.is_next_line_blank()
2687 raise EOFError
2690 class Text(RSTState):
2692 """
2693 Classifier of second line of a text block.
2695 Could be a paragraph, a definition list item, or a title.
2696 """
2698 patterns = {'underline': Body.patterns['line'],
2699 'text': r''}
2700 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2702 def blank(self, match, context, next_state):
2703 """End of paragraph."""
2704 # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2705 paragraph, literalnext = self.paragraph(
2706 context, self.state_machine.abs_line_number() - 1)
2707 self.parent += paragraph
2708 if literalnext:
2709 self.parent += self.literal_block()
2710 return [], 'Body', []
2712 def eof(self, context):
2713 if context:
2714 self.blank(None, context, None)
2715 return []
2717 def indent(self, match, context, next_state):
2718 """Definition list item."""
2719 definitionlist = nodes.definition_list()
2720 definitionlistitem, blank_finish = self.definition_list_item(context)
2721 definitionlist += definitionlistitem
2722 self.parent += definitionlist
2723 offset = self.state_machine.line_offset + 1 # next line
2724 newline_offset, blank_finish = self.nested_list_parse(
2725 self.state_machine.input_lines[offset:],
2726 input_offset=self.state_machine.abs_line_offset() + 1,
2727 node=definitionlist, initial_state='DefinitionList',
2728 blank_finish=blank_finish, blank_finish_state='Definition')
2729 self.goto_line(newline_offset)
2730 if not blank_finish:
2731 self.parent += self.unindent_warning('Definition list')
2732 return [], 'Body', []
2734 def underline(self, match, context, next_state):
2735 """Section title."""
2736 lineno = self.state_machine.abs_line_number()
2737 title = context[0].rstrip()
2738 underline = match.string.rstrip()
2739 source = title + '\n' + underline
2740 messages = []
2741 if column_width(title) > len(underline):
2742 if len(underline) < 4:
2743 if self.state_machine.match_titles:
2744 msg = self.reporter.info(
2745 'Possible title underline, too short for the title.\n'
2746 "Treating it as ordinary text because it's so short.",
2747 line=lineno)
2748 self.parent += msg
2749 raise statemachine.TransitionCorrection('text')
2750 else:
2751 blocktext = context[0] + '\n' + self.state_machine.line
2752 msg = self.reporter.warning('Title underline too short.',
2753 nodes.literal_block(blocktext, blocktext), line=lineno)
2754 messages.append(msg)
2755 if not self.state_machine.match_titles:
2756 blocktext = context[0] + '\n' + self.state_machine.line
2757 # We need get_source_and_line() here to report correctly
2758 src, srcline = self.state_machine.get_source_and_line()
2759 # TODO: why is abs_line_number() == srcline+1
2760 # if the error is in a table (try with test_tables.py)?
2761 # print "get_source_and_line", srcline
2762 # print "abs_line_number", self.state_machine.abs_line_number()
2763 msg = self.reporter.severe('Unexpected section title.',
2764 nodes.literal_block(blocktext, blocktext),
2765 source=src, line=srcline)
2766 self.parent += messages
2767 self.parent += msg
2768 return [], next_state, []
2769 style = underline[0]
2770 context[:] = []
2771 self.section(title, source, style, lineno - 1, messages)
2772 return [], next_state, []
2774 def text(self, match, context, next_state):
2775 """Paragraph."""
2776 startline = self.state_machine.abs_line_number() - 1
2777 msg = None
2778 try:
2779 block = self.state_machine.get_text_block(flush_left=True)
2780 except statemachine.UnexpectedIndentationError, err:
2781 block, src, srcline = err.args
2782 msg = self.reporter.error('Unexpected indentation.',
2783 source=src, line=srcline)
2784 lines = context + list(block)
2785 paragraph, literalnext = self.paragraph(lines, startline)
2786 self.parent += paragraph
2787 self.parent += msg
2788 if literalnext:
2789 try:
2790 self.state_machine.next_line()
2791 except EOFError:
2792 pass
2793 self.parent += self.literal_block()
2794 return [], next_state, []
2796 def literal_block(self):
2797 """Return a list of nodes."""
2798 indented, indent, offset, blank_finish = \
2799 self.state_machine.get_indented()
2800 while indented and not indented[-1].strip():
2801 indented.trim_end()
2802 if not indented:
2803 return self.quoted_literal_block()
2804 data = '\n'.join(indented)
2805 literal_block = nodes.literal_block(data, data)
2806 (literal_block.source,
2807 literal_block.line) = self.state_machine.get_source_and_line(offset+1)
2808 nodelist = [literal_block]
2809 if not blank_finish:
2810 nodelist.append(self.unindent_warning('Literal block'))
2811 return nodelist
2813 def quoted_literal_block(self):
2814 abs_line_offset = self.state_machine.abs_line_offset()
2815 offset = self.state_machine.line_offset
2816 parent_node = nodes.Element()
2817 new_abs_offset = self.nested_parse(
2818 self.state_machine.input_lines[offset:],
2819 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2820 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2821 'initial_state': 'QuotedLiteralBlock'})
2822 self.goto_line(new_abs_offset)
2823 return parent_node.children
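# Rough illustration of a quoted (unindented) literal block handled above:
#
#     John Doe wrote::
#
#     >> Great idea!
#     >
#     > Why didn't I think of that?
#
# When the expected indented block is absent, consistently quoted lines are
# collected by the QuotedLiteralBlock state into a literal_block.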
2825 def definition_list_item(self, termline):
2826 indented, indent, line_offset, blank_finish = \
2827 self.state_machine.get_indented()
2828 itemnode = nodes.definition_list_item(
2829 '\n'.join(termline + list(indented)))
2830 lineno = self.state_machine.abs_line_number() - 1
2831 (itemnode.source,
2832 itemnode.line) = self.state_machine.get_source_and_line(lineno)
2833 termlist, messages = self.term(termline, lineno)
2834 itemnode += termlist
2835 definition = nodes.definition('', *messages)
2836 itemnode += definition
2837 if termline[0][-2:] == '::':
2838 definition += self.reporter.info(
2839 'Blank line missing before literal block (after the "::")? '
2840 'Interpreted as a definition list item.',
2841 line=lineno+1)
2842 self.nested_parse(indented, input_offset=line_offset, node=definition)
2843 return itemnode, blank_finish
2845 classifier_delimiter = re.compile(' +: +')
2847 def term(self, lines, lineno):
2848 """Return a definition_list's term and optional classifiers."""
2849 assert len(lines) == 1
2850 text_nodes, messages = self.inline_text(lines[0], lineno)
2851 term_node = nodes.term(lines[0])
2852 (term_node.source,
2853 term_node.line) = self.state_machine.get_source_and_line(lineno)
2854 node_list = [term_node]
2855 for i in range(len(text_nodes)):
2856 node = text_nodes[i]
2857 if isinstance(node, nodes.Text):
2858 parts = self.classifier_delimiter.split(node.rawsource)
2859 if len(parts) == 1:
2860 node_list[-1] += node
2861 else:
2862 rawtext = parts[0].rstrip()
2863 textnode = nodes.Text(utils.unescape_rawsource(rawtext))
2864 textnode.rawsource = rawtext
2865 node_list[-1] += textnode
2866 for part in parts[1:]:
2867 classifier_node = nodes.classifier(part,
2868 utils.unescape_rawsource(part))
2869 # might be a reference or similar in the next node
2870 # then classifier_node is empty
2871 if len(classifier_node) > 0:
2872 classifier_node[0].rawsource = part
2873 node_list.append(classifier_node)
2874 else:
2875 node_list[-1] += node
2876 return node_list, messages
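# Illustrative classifier handling in `term` above: a definition term line
#
#     term : classifier one : classifier two
#
# is split on " : " (the `classifier_delimiter`) into a term node followed
# by two classifier nodes.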
2879 class SpecializedText(Text):
2881 """
2882 Superclass for second and subsequent lines of Text-variants.
2884 All transition methods are disabled. Override individual methods in
2885 subclasses to re-enable.
2886 """
2888 def eof(self, context):
2889 """Incomplete construct."""
2890 return []
2892 def invalid_input(self, match=None, context=None, next_state=None):
2893 """Not a compound element member. Abort this state machine."""
2894 raise EOFError
2896 blank = invalid_input
2897 indent = invalid_input
2898 underline = invalid_input
2899 text = invalid_input
2902 class Definition(SpecializedText):
2904 """Second line of potential definition_list_item."""
2906 def eof(self, context):
2907 """Not a definition."""
2908 self.state_machine.previous_line(2) # so parent SM can reassess
2909 return []
2911 def indent(self, match, context, next_state):
2912 """Definition list item."""
2913 itemnode, blank_finish = self.definition_list_item(context)
2914 self.parent += itemnode
2915 self.blank_finish = blank_finish
2916 return [], 'DefinitionList', []
2919 class Line(SpecializedText):
2921 """
2922 Second line of over- & underlined section title or transition marker.
2923 """
2925 eofcheck = 1 # @@@ ???
2926 """Set to 0 while parsing sections, so that we don't catch the EOF."""
2928 def eof(self, context):
2929 """Transition marker at end of section or document."""
2930 marker = context[0].strip()
2931 if self.memo.section_bubble_up_kludge:
2932 self.memo.section_bubble_up_kludge = False
2933 elif len(marker) < 4:
2934 self.state_correction(context)
2935 if self.eofcheck: # ignore EOFError with sections
2936 lineno = self.state_machine.abs_line_number() - 1
2937 transition = nodes.transition(rawsource=context[0])
2938 transition.line = lineno
2939 self.parent += transition
2940 self.eofcheck = 1
2941 return []
2943 def blank(self, match, context, next_state):
2944 """Transition marker."""
2945 src, srcline = self.state_machine.get_source_and_line()
2946 marker = context[0].strip()
2947 if len(marker) < 4:
2948 self.state_correction(context)
2949 transition = nodes.transition(rawsource=marker)
2950 transition.source = src
2951 transition.line = srcline - 1
2952 self.parent += transition
2953 return [], 'Body', []
2955 def text(self, match, context, next_state):
2956 """Potential over- & underlined title."""
2957 lineno = self.state_machine.abs_line_number() - 1
2958 overline = context[0]
2959 title = match.string
2960 underline = ''
2961 try:
2962 underline = self.state_machine.next_line()
2963 except EOFError:
2964 blocktext = overline + '\n' + title
2965 if len(overline.rstrip()) < 4:
2966 self.short_overline(context, blocktext, lineno, 2)
2967 else:
2968 msg = self.reporter.severe(
2969 'Incomplete section title.',
2970 nodes.literal_block(blocktext, blocktext),
2971 line=lineno)
2972 self.parent += msg
2973 return [], 'Body', []
2974 source = '%s\n%s\n%s' % (overline, title, underline)
2975 overline = overline.rstrip()
2976 underline = underline.rstrip()
2977 if not self.transitions['underline'][0].match(underline):
2978 blocktext = overline + '\n' + title + '\n' + underline
2979 if len(overline.rstrip()) < 4:
2980 self.short_overline(context, blocktext, lineno, 2)
2981 else:
2982 msg = self.reporter.severe(
2983 'Missing matching underline for section title overline.',
2984 nodes.literal_block(source, source),
2985 line=lineno)
2986 self.parent += msg
2987 return [], 'Body', []
2988 elif overline != underline:
2989 blocktext = overline + '\n' + title + '\n' + underline
2990 if len(overline.rstrip()) < 4:
2991 self.short_overline(context, blocktext, lineno, 2)
2992 else:
2993 msg = self.reporter.severe(
2994 'Title overline & underline mismatch.',
2995 nodes.literal_block(source, source),
2996 line=lineno)
2997 self.parent += msg
2998 return [], 'Body', []
2999 title = title.rstrip()
3000 messages = []
3001 if column_width(title) > len(overline):
3002 blocktext = overline + '\n' + title + '\n' + underline
3003 if len(overline.rstrip()) < 4:
3004 self.short_overline(context, blocktext, lineno, 2)
3005 else:
3006 msg = self.reporter.warning(
3007 'Title overline too short.',
3008 nodes.literal_block(source, source),
3009 line=lineno)
3010 messages.append(msg)
3011 style = (overline[0], underline[0])
3012 self.eofcheck = 0 # @@@ not sure this is correct
3013 self.section(title.lstrip(), source, style, lineno + 1, messages)
3014 self.eofcheck = 1
3015 return [], 'Body', []
3017 indent = text # indented title
3019 def underline(self, match, context, next_state):
3020 overline = context[0]
3021 blocktext = overline + '\n' + self.state_machine.line
3022 lineno = self.state_machine.abs_line_number() - 1
3023 if len(overline.rstrip()) < 4:
3024 self.short_overline(context, blocktext, lineno, 1)
3025 msg = self.reporter.error(
3026 'Invalid section title or transition marker.',
3027 nodes.literal_block(blocktext, blocktext),
3028 line=lineno)
3029 self.parent += msg
3030 return [], 'Body', []
3032 def short_overline(self, context, blocktext, lineno, lines=1):
3033 msg = self.reporter.info(
3034 'Possible incomplete section title.\nTreating the overline as '
3035 "ordinary text because it's so short.",
3036 line=lineno)
3037 self.parent += msg
3038 self.state_correction(context, lines)
3040 def state_correction(self, context, lines=1):
3041 self.state_machine.previous_line(lines)
3042 context[:] = []
3043 raise statemachine.StateCorrection('Body', 'text')
3046 class QuotedLiteralBlock(RSTState):
3048 """
3049 Nested parse handler for quoted (unindented) literal blocks.
3051 Special-purpose. Not for inclusion in `state_classes`.
3052 """
3054 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
3055 'text': r''}
3056 initial_transitions = ('initial_quoted', 'text')
3058 def __init__(self, state_machine, debug=False):
3059 RSTState.__init__(self, state_machine, debug)
3060 self.messages = []
3061 self.initial_lineno = None
3063 def blank(self, match, context, next_state):
3064 if context:
3065 raise EOFError
3066 else:
3067 return context, next_state, []
3069 def eof(self, context):
3070 if context:
3071 src, srcline = self.state_machine.get_source_and_line(
3072 self.initial_lineno)
3073 text = '\n'.join(context)
3074 literal_block = nodes.literal_block(text, text)
3075 literal_block.source = src
3076 literal_block.line = srcline
3077 self.parent += literal_block
3078 else:
3079 self.parent += self.reporter.warning(
3080 'Literal block expected; none found.',
3081 line=self.state_machine.abs_line_number())
3082 # src not available, because statemachine.input_lines is empty
3083 self.state_machine.previous_line()
3084 self.parent += self.messages
3085 return []
3087 def indent(self, match, context, next_state):
3088 assert context, ('QuotedLiteralBlock.indent: context should not '
3089 'be empty!')
3090 self.messages.append(
3091 self.reporter.error('Unexpected indentation.',
3092 line=self.state_machine.abs_line_number()))
3093 self.state_machine.previous_line()
3094 raise EOFError
3096 def initial_quoted(self, match, context, next_state):
3097 """Match arbitrary quote character on the first line only."""
3098 self.remove_transition('initial_quoted')
3099 quote = match.string[0]
3100 pattern = re.compile(re.escape(quote), re.UNICODE)
3101 # New transition matches consistent quotes only:
3102 self.add_transition('quoted',
3103 (pattern, self.quoted, self.__class__.__name__))
3104 self.initial_lineno = self.state_machine.abs_line_number()
3105 return [match.string], next_state, []
3107 def quoted(self, match, context, next_state):
3108 """Match consistent quotes on subsequent lines."""
3109 context.append(match.string)
3110 return context, next_state, []
3112 def text(self, match, context, next_state):
3113 if context:
3114 self.messages.append(
3115 self.reporter.error('Inconsistent literal block quoting.',
3116 line=self.state_machine.abs_line_number()))
3117 self.state_machine.previous_line()
3118 raise EOFError
3121 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3122 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3123 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3124 """Standard set of State classes used to start `RSTStateMachine`."""