1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 This is the ``docutils.parsers.rst.states`` module, the core of
7 the reStructuredText parser. It defines the following:
9 :Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items.
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: An auxiliary collection class.
31 :Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
36 :Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
40 :Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
43 Parser Overview
44 ===============
46 The reStructuredText parser is implemented as a recursive state machine,
47 examining its input one line at a time. To understand how the parser works,
48 please first become familiar with the `docutils.statemachine` module. In the
49 description below, references are made to classes defined in this module;
50 please see the individual classes for details.
52 Parsing proceeds as follows:
54 1. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
59 2. The method associated with the matched transition pattern is called.
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
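
A minimal usage sketch (assuming the standard Docutils front-end defaults;
normally `docutils.parsers.rst.Parser.parse()` performs the equivalent setup,
and the source name below is arbitrary)::

    from docutils import frontend, statemachine, utils
    from docutils.parsers.rst import Parser, states

    settings = frontend.OptionParser(components=(Parser,)).get_default_values()
    document = utils.new_document('<usage sketch>', settings)
    lines = statemachine.string2lines('A *simple* test.',
                                      tab_width=settings.tab_width,
                                      convert_whitespace=True)
    state_machine = states.RSTStateMachine(
        state_classes=states.state_classes, initial_state='Body')
    state_machine.run(lines, document)   # modifies `document` in place
    print(document.pformat())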
103 __docformat__ = 'reStructuredText'
106 import sys
107 import re
108 from types import FunctionType, MethodType
110 from docutils import nodes, statemachine, utils
111 from docutils import ApplicationError, DataError
112 from docutils.statemachine import StateMachineWS, StateWS
113 from docutils.nodes import fully_normalize_name as normalize_name
114 from docutils.nodes import whitespace_normalize_name
115 import docutils.parsers.rst
116 from docutils.parsers.rst import directives, languages, tableparser, roles
117 from docutils.parsers.rst.languages import en as _fallback_language_module
118 from docutils.utils import escape2null, unescape, column_width
119 from docutils.utils import punctuation_chars, roman, urischemes
120 from docutils.utils import split_escaped_whitespace
122 class MarkupError(DataError): pass
123 class UnknownInterpretedRoleError(DataError): pass
124 class InterpretedRoleNotImplementedError(DataError): pass
125 class ParserError(ApplicationError): pass
126 class MarkupMismatch(Exception): pass
129 class Struct:
131 """Stores data attributes for dotted-attribute access."""
133 def __init__(self, **keywordargs):
134 self.__dict__.update(keywordargs)
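# Example: ``Struct(a=1, b=2).b`` evaluates to ``2``; the keyword arguments
# simply become instance attributes.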
137 class RSTStateMachine(StateMachineWS):
140 reStructuredText's master StateMachine.
142 The entry point to reStructuredText parsing is the `run()` method.
145 def run(self, input_lines, document, input_offset=0, match_titles=True,
146 inliner=None):
148 Parse `input_lines` and modify the `document` node in place.
150 Extend `StateMachineWS.run()`: set up parse-global data and
151 run the StateMachine.
153 self.language = languages.get_language(
154 document.settings.language_code)
155 self.match_titles = match_titles
156 if inliner is None:
157 inliner = Inliner()
158 inliner.init_customizations(document.settings)
159 self.memo = Struct(document=document,
160 reporter=document.reporter,
161 language=self.language,
162 title_styles=[],
163 section_level=0,
164 section_bubble_up_kludge=False,
165 inliner=inliner)
166 self.document = document
167 self.attach_observer(document.note_source)
168 self.reporter = self.memo.reporter
169 self.node = document
170 results = StateMachineWS.run(self, input_lines, input_offset,
171 input_source=document['source'])
172 assert results == [], 'RSTStateMachine.run() results should be empty!'
173 self.node = self.memo = None # remove unneeded references
176 class NestedStateMachine(StateMachineWS):
179 StateMachine run from within other StateMachine runs, to parse nested
180 document structures.
183 def run(self, input_lines, input_offset, memo, node, match_titles=True):
185 Parse `input_lines` and populate a `docutils.nodes.document` instance.
187 Extend `StateMachineWS.run()`: set up document-wide data.
189 self.match_titles = match_titles
190 self.memo = memo
191 self.document = memo.document
192 self.attach_observer(self.document.note_source)
193 self.reporter = memo.reporter
194 self.language = memo.language
195 self.node = node
196 results = StateMachineWS.run(self, input_lines, input_offset)
197 assert results == [], ('NestedStateMachine.run() results should be '
198 'empty!')
199 return results
202 class RSTState(StateWS):
205 reStructuredText State superclass.
207 Contains methods used by all State subclasses.
210 nested_sm = NestedStateMachine
211 nested_sm_cache = []
213 def __init__(self, state_machine, debug=False):
214 self.nested_sm_kwargs = {'state_classes': state_classes,
215 'initial_state': 'Body'}
216 StateWS.__init__(self, state_machine, debug)
218 def runtime_init(self):
219 StateWS.runtime_init(self)
220 memo = self.state_machine.memo
221 self.memo = memo
222 self.reporter = memo.reporter
223 self.inliner = memo.inliner
224 self.document = memo.document
225 self.parent = self.state_machine.node
226 # enable the reporter to determine source and source-line
227 if not hasattr(self.reporter, 'get_source_and_line'):
228 self.reporter.get_source_and_line = self.state_machine.get_source_and_line
231 def goto_line(self, abs_line_offset):
233 Jump to input line `abs_line_offset`, ignoring jumps past the end.
235 try:
236 self.state_machine.goto_line(abs_line_offset)
237 except EOFError:
238 pass
240 def no_match(self, context, transitions):
242 Override `StateWS.no_match` to generate a system message.
244 This code should never be run.
246 self.reporter.severe(
247 'Internal error: no transition pattern match. State: "%s"; '
248 'transitions: %s; context: %s; current line: %r.'
249 % (self.__class__.__name__, transitions, context,
250 self.state_machine.line))
251 return context, None, []
253 def bof(self, context):
254 """Called at beginning of file."""
255 return [], []
257 def nested_parse(self, block, input_offset, node, match_titles=False,
258 state_machine_class=None, state_machine_kwargs=None):
260 Create a new StateMachine rooted at `node` and run it over the input
261 `block`.
263 use_default = 0
264 if state_machine_class is None:
265 state_machine_class = self.nested_sm
266 use_default += 1
267 if state_machine_kwargs is None:
268 state_machine_kwargs = self.nested_sm_kwargs
269 use_default += 1
270 block_length = len(block)
272 state_machine = None
273 if use_default == 2:
274 try:
275 state_machine = self.nested_sm_cache.pop()
276 except IndexError:
277 pass
278 if not state_machine:
279 state_machine = state_machine_class(debug=self.debug,
280 **state_machine_kwargs)
281 state_machine.run(block, input_offset, memo=self.memo,
282 node=node, match_titles=match_titles)
283 if use_default == 2:
284 self.nested_sm_cache.append(state_machine)
285 else:
286 state_machine.unlink()
287 new_offset = state_machine.abs_line_offset()
288 # No `block.parent` implies disconnected -- lines aren't in sync:
289 if block.parent and (len(block) - block_length) != 0:
290 # Adjustment for block if modified in nested parse:
291 self.state_machine.next_line(len(block) - block_length)
292 return new_offset
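# Typical use from outside this module (for example in a directive's
# ``run()`` method)::
#
#     self.state.nested_parse(self.content, self.content_offset, node)
#
# which parses the directive's content block into ``node`` with the
# current parser state.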
294 def nested_list_parse(self, block, input_offset, node, initial_state,
295 blank_finish,
296 blank_finish_state=None,
297 extra_settings={},
298 match_titles=False,
299 state_machine_class=None,
300 state_machine_kwargs=None):
302 Create a new StateMachine rooted at `node` and run it over the input
303 `block`. Also keep track of optional intermediate blank lines and the
304 required final one.
306 if state_machine_class is None:
307 state_machine_class = self.nested_sm
308 if state_machine_kwargs is None:
309 state_machine_kwargs = self.nested_sm_kwargs.copy()
310 state_machine_kwargs['initial_state'] = initial_state
311 state_machine = state_machine_class(debug=self.debug,
312 **state_machine_kwargs)
313 if blank_finish_state is None:
314 blank_finish_state = initial_state
315 state_machine.states[blank_finish_state].blank_finish = blank_finish
316 for key, value in extra_settings.items():
317 setattr(state_machine.states[initial_state], key, value)
318 state_machine.run(block, input_offset, memo=self.memo,
319 node=node, match_titles=match_titles)
320 blank_finish = state_machine.states[blank_finish_state].blank_finish
321 state_machine.unlink()
322 return state_machine.abs_line_offset(), blank_finish
324 def section(self, title, source, style, lineno, messages):
325 """Check for a valid subsection and create one if it checks out."""
326 if self.check_subsection(source, style, lineno):
327 self.new_subsection(title, lineno, messages)
329 def check_subsection(self, source, style, lineno):
331 Check for a valid subsection header. Return 1 (true) or None (false).
333 When a new section is reached that isn't a subsection of the current
334 section, back up the line count (use ``previous_line(-x)``), then
335 ``raise EOFError``. The current StateMachine will finish, then the
336 calling StateMachine can re-examine the title. This will work its way
337 back up the calling chain until the correct section level is reached.
339 @@@ Alternative: Evaluate the title, store the title info & level, and
340 back up the chain until that level is reached. Store in memo? Or
341 return in results?
343 :Exception: `EOFError` when a sibling or supersection encountered.
345 memo = self.memo
346 title_styles = memo.title_styles
347 mylevel = memo.section_level
348 try: # check for existing title style
349 level = title_styles.index(style) + 1
350 except ValueError: # new title style
351 if len(title_styles) == memo.section_level: # new subsection
352 title_styles.append(style)
353 return 1
354 else: # not at lowest level
355 self.parent += self.title_inconsistent(source, lineno)
356 return None
357 if level <= mylevel: # sibling or supersection
358 memo.section_level = level # bubble up to parent section
359 if len(style) == 2:
360 memo.section_bubble_up_kludge = True
361 # back up 2 lines for underline title, 3 for overline title
362 self.state_machine.previous_line(len(style) + 1)
363 raise EOFError # let parent section re-evaluate
364 if level == mylevel + 1: # immediate subsection
365 return 1
366 else: # invalid subsection
367 self.parent += self.title_inconsistent(source, lineno)
368 return None
370 def title_inconsistent(self, sourcetext, lineno):
371 error = self.reporter.severe(
372 'Title level inconsistent:', nodes.literal_block('', sourcetext),
373 line=lineno)
374 return error
376 def new_subsection(self, title, lineno, messages):
377 """Append new subsection to document tree. On return, check level."""
378 memo = self.memo
379 mylevel = memo.section_level
380 memo.section_level += 1
381 section_node = nodes.section()
382 self.parent += section_node
383 textnodes, title_messages = self.inline_text(title, lineno)
384 titlenode = nodes.title(title, '', *textnodes)
385 name = normalize_name(titlenode.astext())
386 section_node['names'].append(name)
387 section_node += titlenode
388 section_node += messages
389 section_node += title_messages
390 self.document.note_implicit_target(section_node, section_node)
391 offset = self.state_machine.line_offset + 1
392 absoffset = self.state_machine.abs_line_offset() + 1
393 newabsoffset = self.nested_parse(
394 self.state_machine.input_lines[offset:], input_offset=absoffset,
395 node=section_node, match_titles=True)
396 self.goto_line(newabsoffset)
397 if memo.section_level <= mylevel: # can't handle next section?
398 raise EOFError # bubble up to supersection
399 # reset section_level; next pass will detect it properly
400 memo.section_level = mylevel
402 def paragraph(self, lines, lineno):
404 Return a list (paragraph & messages) & a boolean: literal_block next?
406 data = '\n'.join(lines).rstrip()
407 if re.search(r'(?<!\\)(\\\\)*::$', data):
408 if len(data) == 2:
409 return [], 1
410 elif data[-3] in ' \n':
411 text = data[:-3].rstrip()
412 else:
413 text = data[:-1]
414 literalnext = 1
415 else:
416 text = data
417 literalnext = 0
418 textnodes, messages = self.inline_text(text, lineno)
419 p = nodes.paragraph(data, '', *textnodes)
420 p.source, p.line = self.state_machine.get_source_and_line(lineno)
421 return [p] + messages, literalnext
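# Summary of the trailing "::" handling in paragraph() above:
#
#     'Keep one colon::'   -> text 'Keep one colon:',  literal block expected
#     'Drop the colons ::' -> text 'Drop the colons',  literal block expected
#     '::'                 -> no paragraph at all,     literal block expected
#     'Plain paragraph.'   -> text unchanged,          no literal block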
423 def inline_text(self, text, lineno):
425 Return 2 lists: nodes (text and inline elements), and system_messages.
427 nodes, messages = self.inliner.parse(text, lineno,
428 self.memo, self.parent)
429 return nodes, messages
431 def unindent_warning(self, node_name):
432 # the actual problem is one line below the current line
433 lineno = self.state_machine.abs_line_number()+1
434 return self.reporter.warning('%s ends without a blank line; '
435 'unexpected unindent.' % node_name,
436 line=lineno)
439 def build_regexp(definition, compile=True):
441 Build, compile and return a regular expression based on `definition`.
443 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
444 where "parts" is a list of regular expressions and/or regular
445 expression definitions to be joined into an or-group.
447 name, prefix, suffix, parts = definition
448 part_strings = []
449 for part in parts:
450 if type(part) is tuple:
451 part_strings.append(build_regexp(part, None))
452 else:
453 part_strings.append(part)
454 or_group = '|'.join(part_strings)
455 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
456 if compile:
457 return re.compile(regexp, re.UNICODE)
458 else:
459 return regexp
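# Illustrative example (hypothetical group name and parts)::
#
#     >>> defn = ('start', '', r'(?!\s)', [r'\*\*', r'\*(?!\*)', r'``'])
#     >>> print(build_regexp(defn, compile=False))
#     (?P<start>\*\*|\*(?!\*)|``)(?!\s)
#
# Tuples nested inside "parts" are expanded recursively into further
# named groups.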
462 class Inliner:
465 Parse inline markup; call the `parse()` method.
468 def __init__(self):
469 self.implicit_dispatch = []
470 """List of (pattern, bound method) tuples, used by
471 `self.implicit_inline`."""
473 def init_customizations(self, settings):
474 # lookahead and look-behind expressions for inline markup rules
475 if getattr(settings, 'character_level_inline_markup', False):
476 start_string_prefix = u'(^|(?<!\x00))'
477 end_string_suffix = u''
478 else:
479 start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
480 (punctuation_chars.openers,
481 punctuation_chars.delimiters))
482 end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
483 (punctuation_chars.closing_delimiters,
484 punctuation_chars.delimiters,
485 punctuation_chars.closers))
486 args = locals().copy()
487 args.update(vars(self.__class__))
489 parts = ('initial_inline', start_string_prefix, '',
490 [('start', '', self.non_whitespace_after, # simple start-strings
491 [r'\*\*', # strong
492 r'\*(?!\*)', # emphasis but not strong
493 r'``', # literal
494 r'_`', # inline internal target
495 r'\|(?!\|)'] # substitution reference
497 ('whole', '', end_string_suffix, # whole constructs
498 [# reference name & end-string
499 r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
500 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
501 [r'[0-9]+', # manually numbered
502 r'\#(%s)?' % self.simplename, # auto-numbered (w/ label?)
503 r'\*', # auto-symbol
504 r'(?P<citationlabel>%s)' % self.simplename] # citation reference
508 ('backquote', # interpreted text or phrase reference
509 '(?P<role>(:%s:)?)' % self.simplename, # optional role
510 self.non_whitespace_after,
511 ['`(?!`)'] # but not literal
515 self.start_string_prefix = start_string_prefix
516 self.end_string_suffix = end_string_suffix
517 self.parts = parts
519 self.patterns = Struct(
520 initial=build_regexp(parts),
521 emphasis=re.compile(self.non_whitespace_escape_before
522 + r'(\*)' + end_string_suffix, re.UNICODE),
523 strong=re.compile(self.non_whitespace_escape_before
524 + r'(\*\*)' + end_string_suffix, re.UNICODE),
525 interpreted_or_phrase_ref=re.compile(
526 r"""
527 %(non_unescaped_whitespace_escape_before)s
530 (?P<suffix>
531 (?P<role>:%(simplename)s:)?
532 (?P<refend>__?)?
535 %(end_string_suffix)s
536 """ % args, re.VERBOSE | re.UNICODE),
537 embedded_link=re.compile(
538 r"""
540 (?:[ \n]+|^) # spaces or beginning of line/string
541 < # open bracket
542 %(non_whitespace_after)s
543 (([^<>]|\x00[<>])+) # anything but unescaped angle brackets
544 %(non_whitespace_escape_before)s
545 > # close bracket
547 $ # end of string
548 """ % args, re.VERBOSE | re.UNICODE),
549 literal=re.compile(self.non_whitespace_before + '(``)'
550 + end_string_suffix, re.UNICODE),
551 target=re.compile(self.non_whitespace_escape_before
552 + r'(`)' + end_string_suffix, re.UNICODE),
553 substitution_ref=re.compile(self.non_whitespace_escape_before
554 + r'(\|_{0,2})'
555 + end_string_suffix, re.UNICODE),
556 email=re.compile(self.email_pattern % args + '$',
557 re.VERBOSE | re.UNICODE),
558 uri=re.compile(
559 (r"""
560 %(start_string_prefix)s
561 (?P<whole>
562 (?P<absolute> # absolute URI
563 (?P<scheme> # scheme (http, ftp, mailto)
564 [a-zA-Z][a-zA-Z0-9.+-]*
568 ( # either:
569 (//?)? # hierarchical URI
570 %(uric)s* # URI characters
571 %(uri_end)s # final URI char
573 ( # optional query
574 \?%(uric)s*
575 %(uri_end)s
577 ( # optional fragment
578 \#%(uric)s*
579 %(uri_end)s
583 | # *OR*
584 (?P<email> # email address
585 """ + self.email_pattern + r"""
588 %(end_string_suffix)s
589 """) % args, re.VERBOSE | re.UNICODE),
590 pep=re.compile(
591 r"""
592 %(start_string_prefix)s
594 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
596 (PEP\s+(?P<pepnum2>\d+)) # reference by name
598 %(end_string_suffix)s""" % args, re.VERBOSE | re.UNICODE),
599 rfc=re.compile(
600 r"""
601 %(start_string_prefix)s
602 (RFC(-|\s+)?(?P<rfcnum>\d+))
603 %(end_string_suffix)s""" % args, re.VERBOSE | re.UNICODE))
605 self.implicit_dispatch.append((self.patterns.uri,
606 self.standalone_uri))
607 if settings.pep_references:
608 self.implicit_dispatch.append((self.patterns.pep,
609 self.pep_reference))
610 if settings.rfc_references:
611 self.implicit_dispatch.append((self.patterns.rfc,
612 self.rfc_reference))
614 def parse(self, text, lineno, memo, parent):
615 # Needs to be refactored for nested inline markup.
616 # Add nested_parse() method?
618 Return 2 lists: nodes (text and inline elements), and system_messages.
620 Using `self.patterns.initial`, a pattern which matches start-strings
621 (emphasis, strong, interpreted, phrase reference, literal,
622 substitution reference, and inline target) and complete constructs
623 (simple reference, footnote reference), search for a candidate. When
624 one is found, check for validity (e.g., not a quoted '*' character).
625 If valid, search for the corresponding end string if applicable, and
626 check it for validity. If not found or invalid, generate a warning
627 and ignore the start-string. Implicit inline markup (e.g. standalone
628 URIs) is found last.
630 self.reporter = memo.reporter
631 self.document = memo.document
632 self.language = memo.language
633 self.parent = parent
634 pattern_search = self.patterns.initial.search
635 dispatch = self.dispatch
636 remaining = escape2null(text)
637 processed = []
638 unprocessed = []
639 messages = []
640 while remaining:
641 match = pattern_search(remaining)
642 if match:
643 groups = match.groupdict()
644 method = dispatch[groups['start'] or groups['backquote']
645 or groups['refend'] or groups['fnend']]
646 before, inlines, remaining, sysmessages = method(self, match,
647 lineno)
648 unprocessed.append(before)
649 messages += sysmessages
650 if inlines:
651 processed += self.implicit_inline(''.join(unprocessed),
652 lineno)
653 processed += inlines
654 unprocessed = []
655 else:
656 break
657 remaining = ''.join(unprocessed) + remaining
658 if remaining:
659 processed += self.implicit_inline(remaining, lineno)
660 return processed, messages
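# A rough standalone sketch (assuming the standard front-end defaults;
# normally `RSTState.inline_text()` supplies `memo` and `parent`)::
#
#     from docutils import frontend, utils
#     from docutils.parsers.rst import Parser, languages, states
#
#     settings = frontend.OptionParser(
#         components=(Parser,)).get_default_values()
#     document = utils.new_document('<sketch>', settings)
#     inliner = states.Inliner()
#     inliner.init_customizations(settings)
#     memo = states.Struct(document=document, reporter=document.reporter,
#                          language=languages.get_language(
#                              settings.language_code))
#     nodelist, messages = inliner.parse('an *emphasized* word',
#                                        1, memo, document)
#     # nodelist is [Text, emphasis, Text]; messages is empty here.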
662 # Inline object recognition
663 # -------------------------
664 # See also init_customizations().
665 non_whitespace_before = r'(?<!\s)'
666 non_whitespace_escape_before = r'(?<![\s\x00])'
667 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
668 non_whitespace_after = r'(?!\s)'
669 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
670 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
671 # Valid URI characters (see RFC 2396 & RFC 2732);
672 # final \x00 allows backslash escapes in URIs:
673 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
674 # Delimiter indicating the end of a URI (not part of the URI):
675 uri_end_delim = r"""[>]"""
676 # Last URI character; same as uric but no punctuation:
677 urilast = r"""[_~*/=+a-zA-Z0-9]"""
678 # End of a URI (either 'urilast' or 'uric followed by a
679 # uri_end_delim'):
680 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
681 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
682 email_pattern = r"""
683 %(emailc)s+(?:\.%(emailc)s+)* # name
684 (?<!\x00)@ # at
685 %(emailc)s+(?:\.%(emailc)s*)* # host
686 %(uri_end)s # final URI char
689 def quoted_start(self, match):
690 """Test if inline markup start-string is 'quoted'.
692 'Quoted' in this context means the start-string is enclosed in a pair
693 of matching opening/closing delimiters (not necessarily quotes)
694 or at the end of the match.
696 string = match.string
697 start = match.start()
698 if start == 0: # start-string at beginning of text
699 return False
700 prestart = string[start - 1]
701 try:
702 poststart = string[match.end()]
703 except IndexError: # start-string at end of text
704 return True # not "quoted" but no markup start-string either
705 return punctuation_chars.match_chars(prestart, poststart)
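# For example, in the input "(*)" the "*" is preceded by "(" and followed
# by ")", a matching delimiter pair, so the start-string counts as quoted
# and no emphasis is started.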
707 def inline_obj(self, match, lineno, end_pattern, nodeclass,
708 restore_backslashes=False):
709 string = match.string
710 matchstart = match.start('start')
711 matchend = match.end('start')
712 if self.quoted_start(match):
713 return (string[:matchend], [], string[matchend:], [], '')
714 endmatch = end_pattern.search(string[matchend:])
715 if endmatch and endmatch.start(1): # 1 or more chars
716 _text = endmatch.string[:endmatch.start(1)]
717 text = unescape(_text, restore_backslashes)
718 textend = matchend + endmatch.end(1)
719 rawsource = unescape(string[matchstart:textend], True)
720 node = nodeclass(rawsource, text)
721 node[0].rawsource = unescape(_text, True)
722 return (string[:matchstart], [node],
723 string[textend:], [], endmatch.group(1))
724 msg = self.reporter.warning(
725 'Inline %s start-string without end-string.'
726 % nodeclass.__name__, line=lineno)
727 text = unescape(string[matchstart:matchend], True)
728 rawsource = unescape(string[matchstart:matchend], True)
729 prb = self.problematic(text, rawsource, msg)
730 return string[:matchstart], [prb], string[matchend:], [msg], ''
732 def problematic(self, text, rawsource, message):
733 msgid = self.document.set_id(message, self.parent)
734 problematic = nodes.problematic(rawsource, text, refid=msgid)
735 prbid = self.document.set_id(problematic)
736 message.add_backref(prbid)
737 return problematic
739 def emphasis(self, match, lineno):
740 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
741 match, lineno, self.patterns.emphasis, nodes.emphasis)
742 return before, inlines, remaining, sysmessages
744 def strong(self, match, lineno):
745 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
746 match, lineno, self.patterns.strong, nodes.strong)
747 return before, inlines, remaining, sysmessages
749 def interpreted_or_phrase_ref(self, match, lineno):
750 end_pattern = self.patterns.interpreted_or_phrase_ref
751 string = match.string
752 matchstart = match.start('backquote')
753 matchend = match.end('backquote')
754 rolestart = match.start('role')
755 role = match.group('role')
756 position = ''
757 if role:
758 role = role[1:-1]
759 position = 'prefix'
760 elif self.quoted_start(match):
761 return (string[:matchend], [], string[matchend:], [])
762 endmatch = end_pattern.search(string[matchend:])
763 if endmatch and endmatch.start(1): # 1 or more chars
764 textend = matchend + endmatch.end()
765 if endmatch.group('role'):
766 if role:
767 msg = self.reporter.warning(
768 'Multiple roles in interpreted text (both '
769 'prefix and suffix present; only one allowed).',
770 line=lineno)
771 text = unescape(string[rolestart:textend], True)
772 prb = self.problematic(text, text, msg)
773 return string[:rolestart], [prb], string[textend:], [msg]
774 role = endmatch.group('suffix')[1:-1]
775 position = 'suffix'
776 escaped = endmatch.string[:endmatch.start(1)]
777 rawsource = unescape(string[matchstart:textend], True)
778 if rawsource[-1:] == '_':
779 if role:
780 msg = self.reporter.warning(
781 'Mismatch: both interpreted text role %s and '
782 'reference suffix.' % position, line=lineno)
783 text = unescape(string[rolestart:textend], True)
784 prb = self.problematic(text, text, msg)
785 return string[:rolestart], [prb], string[textend:], [msg]
786 return self.phrase_ref(string[:matchstart], string[textend:],
787 rawsource, escaped, unescape(escaped))
788 else:
789 rawsource = unescape(string[rolestart:textend], True)
790 nodelist, messages = self.interpreted(rawsource, escaped, role,
791 lineno)
792 return (string[:rolestart], nodelist,
793 string[textend:], messages)
794 msg = self.reporter.warning(
795 'Inline interpreted text or phrase reference start-string '
796 'without end-string.', line=lineno)
797 text = unescape(string[matchstart:matchend], True)
798 prb = self.problematic(text, text, msg)
799 return string[:matchstart], [prb], string[matchend:], [msg]
801 def phrase_ref(self, before, after, rawsource, escaped, text):
802 match = self.patterns.embedded_link.search(escaped)
803 if match: # embedded <URI> or <alias_>
804 text = unescape(escaped[:match.start(0)])
805 rawtext = unescape(escaped[:match.start(0)], True)
806 aliastext = unescape(match.group(2))
807 rawaliastext = unescape(match.group(2), True)
808 underscore_escaped = rawaliastext.endswith(r'\_')
809 if aliastext.endswith('_') and not (underscore_escaped
810 or self.patterns.uri.match(aliastext)):
811 aliastype = 'name'
812 alias = normalize_name(aliastext[:-1])
813 target = nodes.target(match.group(1), refname=alias)
814 target.indirect_reference_name = aliastext[:-1]
815 else:
816 aliastype = 'uri'
817 alias_parts = split_escaped_whitespace(match.group(2))
818 alias = ' '.join(''.join(unescape(part).split())
819 for part in alias_parts)
820 alias = self.adjust_uri(alias)
821 if alias.endswith(r'\_'):
822 alias = alias[:-2] + '_'
823 target = nodes.target(match.group(1), refuri=alias)
824 target.referenced = 1
825 if not aliastext:
826 raise ApplicationError('problem with embedded link: %r'
827 % aliastext)
828 if not text:
829 text = alias
830 rawtext = rawaliastext
831 else:
832 target = None
833 rawtext = unescape(escaped, True)
835 refname = normalize_name(text)
836 reference = nodes.reference(rawsource, text,
837 name=whitespace_normalize_name(text))
838 reference[0].rawsource = rawtext
840 node_list = [reference]
842 if rawsource[-2:] == '__':
843 if target and (aliastype == 'name'):
844 reference['refname'] = alias
845 self.document.note_refname(reference)
846 # self.document.note_indirect_target(target) # required?
847 elif target and (aliastype == 'uri'):
848 reference['refuri'] = alias
849 else:
850 reference['anonymous'] = 1
851 else:
852 if target:
853 target['names'].append(refname)
854 if aliastype == 'name':
855 reference['refname'] = alias
856 self.document.note_indirect_target(target)
857 self.document.note_refname(reference)
858 else:
859 reference['refuri'] = alias
860 self.document.note_explicit_target(target, self.parent)
861 # target.note_referenced_by(name=refname)
862 node_list.append(target)
863 else:
864 reference['refname'] = refname
865 self.document.note_refname(reference)
866 return before, node_list, after, []
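# For example (standard reStructuredText behaviour)::
#
#     `Docutils <http://docutils.sourceforge.net/>`_
#
# produces a `reference` node with text "Docutils" plus a `target` node
# carrying the embedded URI, while::
#
#     `Docutils <docs_>`_
#
# produces a reference whose 'refname' is "docs" plus an indirect target
# named "docutils".  In both cases the reference text's rawsource keeps
# the original backslash-escaped form.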
869 def adjust_uri(self, uri):
870 match = self.patterns.email.match(uri)
871 if match:
872 return 'mailto:' + uri
873 else:
874 return uri
876 def interpreted(self, rawsource, text, role, lineno):
877 role_fn, messages = roles.role(role, self.language, lineno,
878 self.reporter)
879 if role_fn:
880 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
881 try:
882 nodes[0][0].rawsource = unescape(text, True)
883 except IndexError:
884 pass
885 return nodes, messages + messages2
886 else:
887 msg = self.reporter.error(
888 'Unknown interpreted text role "%s".' % role,
889 line=lineno)
890 return ([self.problematic(rawsource, rawsource, msg)],
891 messages + [msg])
893 def literal(self, match, lineno):
894 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
895 match, lineno, self.patterns.literal, nodes.literal,
896 restore_backslashes=True)
897 return before, inlines, remaining, sysmessages
899 def inline_internal_target(self, match, lineno):
900 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
901 match, lineno, self.patterns.target, nodes.target)
902 if inlines and isinstance(inlines[0], nodes.target):
903 assert len(inlines) == 1
904 target = inlines[0]
905 name = normalize_name(target.astext())
906 target['names'].append(name)
907 self.document.note_explicit_target(target, self.parent)
908 return before, inlines, remaining, sysmessages
910 def substitution_reference(self, match, lineno):
911 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
912 match, lineno, self.patterns.substitution_ref,
913 nodes.substitution_reference)
914 if len(inlines) == 1:
915 subref_node = inlines[0]
916 if isinstance(subref_node, nodes.substitution_reference):
917 subref_text = subref_node.astext()
918 self.document.note_substitution_ref(subref_node, subref_text)
919 if endstring[-1:] == '_':
920 reference_node = nodes.reference(
921 '|%s%s' % (subref_text, endstring), '')
922 if endstring[-2:] == '__':
923 reference_node['anonymous'] = 1
924 else:
925 reference_node['refname'] = normalize_name(subref_text)
926 self.document.note_refname(reference_node)
927 reference_node += subref_node
928 inlines = [reference_node]
929 return before, inlines, remaining, sysmessages
931 def footnote_reference(self, match, lineno):
933 Handles `nodes.footnote_reference` and `nodes.citation_reference`
934 elements.
936 label = match.group('footnotelabel')
937 refname = normalize_name(label)
938 string = match.string
939 before = string[:match.start('whole')]
940 remaining = string[match.end('whole'):]
941 if match.group('citationlabel'):
942 refnode = nodes.citation_reference('[%s]_' % label,
943 refname=refname)
944 refnode += nodes.Text(label)
945 self.document.note_citation_ref(refnode)
946 else:
947 refnode = nodes.footnote_reference('[%s]_' % label)
948 if refname[0] == '#':
949 refname = refname[1:]
950 refnode['auto'] = 1
951 self.document.note_autofootnote_ref(refnode)
952 elif refname == '*':
953 refname = ''
954 refnode['auto'] = '*'
955 self.document.note_symbol_footnote_ref(
956 refnode)
957 else:
958 refnode += nodes.Text(label)
959 if refname:
960 refnode['refname'] = refname
961 self.document.note_footnote_ref(refnode)
962 if utils.get_trim_footnote_ref_space(self.document.settings):
963 before = before.rstrip()
964 return (before, [refnode], remaining, [])
966 def reference(self, match, lineno, anonymous=False):
967 referencename = match.group('refname')
968 refname = normalize_name(referencename)
969 referencenode = nodes.reference(
970 referencename + match.group('refend'), referencename,
971 name=whitespace_normalize_name(referencename))
972 referencenode[0].rawsource = referencename
973 if anonymous:
974 referencenode['anonymous'] = 1
975 else:
976 referencenode['refname'] = refname
977 self.document.note_refname(referencenode)
978 string = match.string
979 matchstart = match.start('whole')
980 matchend = match.end('whole')
981 return (string[:matchstart], [referencenode], string[matchend:], [])
983 def anonymous_reference(self, match, lineno):
984 return self.reference(match, lineno, anonymous=1)
986 def standalone_uri(self, match, lineno):
987 if (not match.group('scheme')
988 or match.group('scheme').lower() in urischemes.schemes):
989 if match.group('email'):
990 addscheme = 'mailto:'
991 else:
992 addscheme = ''
993 text = match.group('whole')
994 unescaped = unescape(text)
995 rawsource = unescape(text, True)
996 reference = nodes.reference(rawsource, unescaped,
997 refuri=addscheme + unescaped)
998 reference[0].rawsource = rawsource
999 return [reference]
1000 else: # not a valid scheme
1001 raise MarkupMismatch
1003 def pep_reference(self, match, lineno):
1004 text = match.group(0)
1005 if text.startswith('pep-'):
1006 pepnum = int(match.group('pepnum1'))
1007 elif text.startswith('PEP'):
1008 pepnum = int(match.group('pepnum2'))
1009 else:
1010 raise MarkupMismatch
1011 ref = (self.document.settings.pep_base_url
1012 + self.document.settings.pep_file_url_template % pepnum)
1013 unescaped = unescape(text)
1014 return [nodes.reference(unescape(text, True), unescaped, refuri=ref)]
1016 rfc_url = 'rfc%d.html'
1018 def rfc_reference(self, match, lineno):
1019 text = match.group(0)
1020 if text.startswith('RFC'):
1021 rfcnum = int(match.group('rfcnum'))
1022 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
1023 else:
1024 raise MarkupMismatch
1025 unescaped = unescape(text)
1026 return [nodes.reference(unescape(text, True), unescaped, refuri=ref)]
1028 def implicit_inline(self, text, lineno):
1030 Check each of the patterns in `self.implicit_dispatch` for a match,
1031 and dispatch to the stored method for the pattern. Recursively check
1032 the text before and after the match. Return a list of `nodes.Text`
1033 and inline element nodes.
1035 if not text:
1036 return []
1037 for pattern, method in self.implicit_dispatch:
1038 match = pattern.search(text)
1039 if match:
1040 try:
1041 # Must recurse on strings before *and* after the match;
1042 # there may be multiple patterns.
1043 return (self.implicit_inline(text[:match.start()], lineno)
1044 + method(match, lineno) +
1045 self.implicit_inline(text[match.end():], lineno))
1046 except MarkupMismatch:
1047 pass
1048 return [nodes.Text(unescape(text), rawsource=unescape(text, True))]
1050 dispatch = {'*': emphasis,
1051 '**': strong,
1052 '`': interpreted_or_phrase_ref,
1053 '``': literal,
1054 '_`': inline_internal_target,
1055 ']_': footnote_reference,
1056 '|': substitution_reference,
1057 '_': reference,
1058 '__': anonymous_reference}
1061 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1062 return ord(s) - _zero
1064 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1065 return ord(s) - _zero
1067 def _lowerroman_to_int(s):
1068 return roman.fromRoman(s.upper())
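# For example, _loweralpha_to_int('c') == 3, _upperalpha_to_int('C') == 3,
# and _lowerroman_to_int('iv') == 4.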
1071 class Body(RSTState):
1074 Generic classifier of the first line of a block.
1077 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1078 """Padding character for East Asian double-width text."""
1080 enum = Struct()
1081 """Enumerated list parsing information."""
1083 enum.formatinfo = {
1084 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1085 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1086 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1087 enum.formats = enum.formatinfo.keys()
1088 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1089 'lowerroman', 'upperroman'] # ORDERED!
1090 enum.sequencepats = {'arabic': '[0-9]+',
1091 'loweralpha': '[a-z]',
1092 'upperalpha': '[A-Z]',
1093 'lowerroman': '[ivxlcdm]+',
1094 'upperroman': '[IVXLCDM]+',}
1095 enum.converters = {'arabic': int,
1096 'loweralpha': _loweralpha_to_int,
1097 'upperalpha': _upperalpha_to_int,
1098 'lowerroman': _lowerroman_to_int,
1099 'upperroman': roman.fromRoman}
1101 enum.sequenceregexps = {}
1102 for sequence in enum.sequences:
1103 enum.sequenceregexps[sequence] = re.compile(
1104 enum.sequencepats[sequence] + '$', re.UNICODE)
1106 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1107 """Matches the top (& bottom) of a full table."""
1109 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1110 """Matches the top of a simple table."""
1112 simple_table_border_pat = re.compile('=+[ =]*$')
1113 """Matches the bottom & header bottom of a simple table."""
1115 pats = {}
1116 """Fragments of patterns used by transitions."""
1118 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1119 pats['alpha'] = '[a-zA-Z]'
1120 pats['alphanum'] = '[a-zA-Z0-9]'
1121 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1122 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1123 '|%(upperroman)s|#)' % enum.sequencepats)
1124 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1125 # @@@ Loosen up the pattern? Allow Unicode?
1126 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1127 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1128 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1129 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1131 for format in enum.formats:
1132 pats[format] = '(?P<%s>%s%s%s)' % (
1133 format, re.escape(enum.formatinfo[format].prefix),
1134 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1136 patterns = {
1137 'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
1138 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1139 'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
1140 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1141 'doctest': r'>>>( +|$)',
1142 'line_block': r'\|( +|$)',
1143 'grid_table_top': grid_table_top_pat,
1144 'simple_table_top': simple_table_top_pat,
1145 'explicit_markup': r'\.\.( +|$)',
1146 'anonymous': r'__( +|$)',
1147 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1148 'text': r''}
1149 initial_transitions = (
1150 'bullet',
1151 'enumerator',
1152 'field_marker',
1153 'option_marker',
1154 'doctest',
1155 'line_block',
1156 'grid_table_top',
1157 'simple_table_top',
1158 'explicit_markup',
1159 'anonymous',
1160 'line',
1161 'text')
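# Examples of lines matching the transitions above:
#
#     '- item'        -> 'bullet'
#     '3. item'       -> 'enumerator'
#     ':field: text'  -> 'field_marker'
#     '-a, --all'     -> 'option_marker'
#     '>>> 1 + 1'     -> 'doctest'
#     '| line'        -> 'line_block'
#     '.. note:: x'   -> 'explicit_markup'
#     '__ anonymous'  -> 'anonymous'
#     '----------'    -> 'line'
#
# Anything else falls through to the catch-all 'text' transition.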
1163 def indent(self, match, context, next_state):
1164 """Block quote."""
1165 indented, indent, line_offset, blank_finish = \
1166 self.state_machine.get_indented()
1167 elements = self.block_quote(indented, line_offset)
1168 self.parent += elements
1169 if not blank_finish:
1170 self.parent += self.unindent_warning('Block quote')
1171 return context, next_state, []
1173 def block_quote(self, indented, line_offset):
1174 elements = []
1175 while indented:
1176 (blockquote_lines,
1177 attribution_lines,
1178 attribution_offset,
1179 indented,
1180 new_line_offset) = self.split_attribution(indented, line_offset)
1181 blockquote = nodes.block_quote()
1182 self.nested_parse(blockquote_lines, line_offset, blockquote)
1183 elements.append(blockquote)
1184 if attribution_lines:
1185 attribution, messages = self.parse_attribution(
1186 attribution_lines, attribution_offset)
1187 blockquote += attribution
1188 elements += messages
1189 line_offset = new_line_offset
1190 while indented and not indented[0]:
1191 indented = indented[1:]
1192 line_offset += 1
1193 return elements
1195 # U+2014 is an em-dash:
1196 attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
1197 re.UNICODE)
1199 def split_attribution(self, indented, line_offset):
1201 Check for a block quote attribution and split it off:
1203 * First line after a blank line must begin with a dash ("--", "---",
1204 em-dash; matches `self.attribution_pattern`).
1205 * Every line after that must have consistent indentation.
1206 * Attributions must be preceded by block quote content.
1208 Return a tuple of: (block quote content lines, attribution lines,
1209 attribution offset, remaining indented lines, new line offset).
1211 blank = None
1212 nonblank_seen = False
1213 for i in range(len(indented)):
1214 line = indented[i].rstrip()
1215 if line:
1216 if nonblank_seen and blank == i - 1: # last line blank
1217 match = self.attribution_pattern.match(line)
1218 if match:
1219 attribution_end, indent = self.check_attribution(
1220 indented, i)
1221 if attribution_end:
1222 a_lines = indented[i:attribution_end]
1223 a_lines.trim_left(match.end(), end=1)
1224 a_lines.trim_left(indent, start=1)
1225 return (indented[:i], a_lines,
1226 i, indented[attribution_end:],
1227 line_offset + attribution_end)
1228 nonblank_seen = True
1229 else:
1230 blank = i
1231 else:
1232 return (indented, None, None, None, None)
1234 def check_attribution(self, indented, attribution_start):
1236 Check attribution shape.
1237 Return the index past the end of the attribution, and the indent.
1239 indent = None
1240 i = attribution_start + 1
1241 for i in range(attribution_start + 1, len(indented)):
1242 line = indented[i].rstrip()
1243 if not line:
1244 break
1245 if indent is None:
1246 indent = len(line) - len(line.lstrip())
1247 elif len(line) - len(line.lstrip()) != indent:
1248 return None, None # bad shape; not an attribution
1249 else:
1250 # return index of line after last attribution line:
1251 i += 1
1252 return i, (indent or 0)
1254 def parse_attribution(self, indented, line_offset):
1255 text = '\n'.join(indented).rstrip()
1256 lineno = self.state_machine.abs_line_number() + line_offset
1257 textnodes, messages = self.inline_text(text, lineno)
1258 node = nodes.attribution(text, '', *textnodes)
1259 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1260 return node, messages
1262 def bullet(self, match, context, next_state):
1263 """Bullet list item."""
1264 bulletlist = nodes.bullet_list()
1265 (bulletlist.source,
1266 bulletlist.line) = self.state_machine.get_source_and_line()
1267 self.parent += bulletlist
1268 bulletlist['bullet'] = match.string[0]
1269 i, blank_finish = self.list_item(match.end())
1270 bulletlist += i
1271 offset = self.state_machine.line_offset + 1 # next line
1272 new_line_offset, blank_finish = self.nested_list_parse(
1273 self.state_machine.input_lines[offset:],
1274 input_offset=self.state_machine.abs_line_offset() + 1,
1275 node=bulletlist, initial_state='BulletList',
1276 blank_finish=blank_finish)
1277 self.goto_line(new_line_offset)
1278 if not blank_finish:
1279 self.parent += self.unindent_warning('Bullet list')
1280 return [], next_state, []
1282 def list_item(self, indent):
1283 if self.state_machine.line[indent:]:
1284 indented, line_offset, blank_finish = (
1285 self.state_machine.get_known_indented(indent))
1286 else:
1287 indented, indent, line_offset, blank_finish = (
1288 self.state_machine.get_first_known_indented(indent))
1289 listitem = nodes.list_item('\n'.join(indented))
1290 if indented:
1291 self.nested_parse(indented, input_offset=line_offset,
1292 node=listitem)
1293 return listitem, blank_finish
1295 def enumerator(self, match, context, next_state):
1296 """Enumerated List Item"""
1297 format, sequence, text, ordinal = self.parse_enumerator(match)
1298 if not self.is_enumerated_list_item(ordinal, sequence, format):
1299 raise statemachine.TransitionCorrection('text')
1300 enumlist = nodes.enumerated_list()
1301 self.parent += enumlist
1302 if sequence == '#':
1303 enumlist['enumtype'] = 'arabic'
1304 else:
1305 enumlist['enumtype'] = sequence
1306 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1307 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1308 if ordinal != 1:
1309 enumlist['start'] = ordinal
1310 msg = self.reporter.info(
1311 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1312 % (text, ordinal))
1313 self.parent += msg
1314 listitem, blank_finish = self.list_item(match.end())
1315 enumlist += listitem
1316 offset = self.state_machine.line_offset + 1 # next line
1317 newline_offset, blank_finish = self.nested_list_parse(
1318 self.state_machine.input_lines[offset:],
1319 input_offset=self.state_machine.abs_line_offset() + 1,
1320 node=enumlist, initial_state='EnumeratedList',
1321 blank_finish=blank_finish,
1322 extra_settings={'lastordinal': ordinal,
1323 'format': format,
1324 'auto': sequence == '#'})
1325 self.goto_line(newline_offset)
1326 if not blank_finish:
1327 self.parent += self.unindent_warning('Enumerated list')
1328 return [], next_state, []
1330 def parse_enumerator(self, match, expected_sequence=None):
1332 Analyze an enumerator and return the results.
1334 :Return:
1335 - the enumerator format ('period', 'parens', or 'rparen'),
1336 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1337 - the text of the enumerator, stripped of formatting, and
1338 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1339 ``None`` is returned for invalid enumerator text).
1341 The enumerator format has already been determined by the regular
1342 expression match. If `expected_sequence` is given, that sequence is
1343 tried first. If not, we check for Roman numeral 1. This way,
1344 single-character Roman numerals (which are also alphabetical) can be
1345 matched. If no sequence has been matched, all sequences are checked in
1346 order.
1348 groupdict = match.groupdict()
1349 sequence = ''
1350 for format in self.enum.formats:
1351 if groupdict[format]: # was this the format matched?
1352 break # yes; keep `format`
1353 else: # shouldn't happen
1354 raise ParserError('enumerator format not matched')
1355 text = groupdict[format][self.enum.formatinfo[format].start
1356 :self.enum.formatinfo[format].end]
1357 if text == '#':
1358 sequence = '#'
1359 elif expected_sequence:
1360 try:
1361 if self.enum.sequenceregexps[expected_sequence].match(text):
1362 sequence = expected_sequence
1363 except KeyError: # shouldn't happen
1364 raise ParserError('unknown enumerator sequence: %s'
1365 % sequence)
1366 elif text == 'i':
1367 sequence = 'lowerroman'
1368 elif text == 'I':
1369 sequence = 'upperroman'
1370 if not sequence:
1371 for sequence in self.enum.sequences:
1372 if self.enum.sequenceregexps[sequence].match(text):
1373 break
1374 else: # shouldn't happen
1375 raise ParserError('enumerator sequence not matched')
1376 if sequence == '#':
1377 ordinal = 1
1378 else:
1379 try:
1380 ordinal = self.enum.converters[sequence](text)
1381 except roman.InvalidRomanNumeralError:
1382 ordinal = None
1383 return format, sequence, text, ordinal
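# Sample results for some enumerator texts:
#
#     '3.'   -> ('period', 'arabic',     '3',  3)
#     '(c)'  -> ('parens', 'loweralpha', 'c',  3)
#     'iv)'  -> ('rparen', 'lowerroman', 'iv', 4)
#     '#.'   -> ('period', '#',          '#',  1)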
1385 def is_enumerated_list_item(self, ordinal, sequence, format):
1387 Check validity based on the ordinal value and the second line.
1389 Return true if the ordinal is valid and the second line is blank,
1390 indented, or starts with the next enumerator or an auto-enumerator.
1392 if ordinal is None:
1393 return None
1394 try:
1395 next_line = self.state_machine.next_line()
1396 except EOFError: # end of input lines
1397 self.state_machine.previous_line()
1398 return 1
1399 else:
1400 self.state_machine.previous_line()
1401 if not next_line[:1].strip(): # blank or indented
1402 return 1
1403 result = self.make_enumerator(ordinal + 1, sequence, format)
1404 if result:
1405 next_enumerator, auto_enumerator = result
1406 try:
1407 if ( next_line.startswith(next_enumerator) or
1408 next_line.startswith(auto_enumerator) ):
1409 return 1
1410 except TypeError:
1411 pass
1412 return None
1414 def make_enumerator(self, ordinal, sequence, format):
1416 Construct and return the next enumerated list item marker, and an
1417 auto-enumerator ("#" instead of the regular enumerator).
1419 Return ``None`` for invalid (out of range) ordinals.
1420 """ #"
1421 if sequence == '#':
1422 enumerator = '#'
1423 elif sequence == 'arabic':
1424 enumerator = str(ordinal)
1425 else:
1426 if sequence.endswith('alpha'):
1427 if ordinal > 26:
1428 return None
1429 enumerator = chr(ordinal + ord('a') - 1)
1430 elif sequence.endswith('roman'):
1431 try:
1432 enumerator = roman.toRoman(ordinal)
1433 except roman.RomanError:
1434 return None
1435 else: # shouldn't happen
1436 raise ParserError('unknown enumerator sequence: "%s"'
1437 % sequence)
1438 if sequence.startswith('lower'):
1439 enumerator = enumerator.lower()
1440 elif sequence.startswith('upper'):
1441 enumerator = enumerator.upper()
1442 else: # shouldn't happen
1443 raise ParserError('unknown enumerator sequence: "%s"'
1444 % sequence)
1445 formatinfo = self.enum.formatinfo[format]
1446 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1447 + ' ')
1448 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1449 return next_enumerator, auto_enumerator
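# For example:
#
#     make_enumerator(3, 'loweralpha', 'parens')  -> ('(c) ', '(#) ')
#     make_enumerator(4, 'upperroman', 'period')  -> ('IV. ', '#. ')
#     make_enumerator(27, 'loweralpha', 'period') -> None (out of range)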
1451 def field_marker(self, match, context, next_state):
1452 """Field list item."""
1453 field_list = nodes.field_list()
1454 self.parent += field_list
1455 field, blank_finish = self.field(match)
1456 field_list += field
1457 offset = self.state_machine.line_offset + 1 # next line
1458 newline_offset, blank_finish = self.nested_list_parse(
1459 self.state_machine.input_lines[offset:],
1460 input_offset=self.state_machine.abs_line_offset() + 1,
1461 node=field_list, initial_state='FieldList',
1462 blank_finish=blank_finish)
1463 self.goto_line(newline_offset)
1464 if not blank_finish:
1465 self.parent += self.unindent_warning('Field list')
1466 return [], next_state, []
1468 def field(self, match):
1469 name = self.parse_field_marker(match)
1470 src, srcline = self.state_machine.get_source_and_line()
1471 lineno = self.state_machine.abs_line_number()
1472 indented, indent, line_offset, blank_finish = \
1473 self.state_machine.get_first_known_indented(match.end())
1474 field_node = nodes.field()
1475 field_node.source = src
1476 field_node.line = srcline
1477 name_nodes, name_messages = self.inline_text(name, lineno)
1478 field_node += nodes.field_name(name, '', *name_nodes)
1479 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1480 field_node += field_body
1481 if indented:
1482 self.parse_field_body(indented, line_offset, field_body)
1483 return field_node, blank_finish
1485 def parse_field_marker(self, match):
1486 """Extract & return field name from a field marker match."""
1487 field = match.group()[1:] # strip off leading ':'
1488 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1489 return field
1491 def parse_field_body(self, indented, offset, node):
1492 self.nested_parse(indented, input_offset=offset, node=node)
1494 def option_marker(self, match, context, next_state):
1495 """Option list item."""
1496 optionlist = nodes.option_list()
1497 (optionlist.source, optionlist.line) = self.state_machine.get_source_and_line()
1498 try:
1499 listitem, blank_finish = self.option_list_item(match)
1500 except MarkupError as error:
1501 # This shouldn't happen; pattern won't match.
1502 msg = self.reporter.error(u'Invalid option list marker: %s' %
1503 error)
1504 self.parent += msg
1505 indented, indent, line_offset, blank_finish = \
1506 self.state_machine.get_first_known_indented(match.end())
1507 elements = self.block_quote(indented, line_offset)
1508 self.parent += elements
1509 if not blank_finish:
1510 self.parent += self.unindent_warning('Option list')
1511 return [], next_state, []
1512 self.parent += optionlist
1513 optionlist += listitem
1514 offset = self.state_machine.line_offset + 1 # next line
1515 newline_offset, blank_finish = self.nested_list_parse(
1516 self.state_machine.input_lines[offset:],
1517 input_offset=self.state_machine.abs_line_offset() + 1,
1518 node=optionlist, initial_state='OptionList',
1519 blank_finish=blank_finish)
1520 self.goto_line(newline_offset)
1521 if not blank_finish:
1522 self.parent += self.unindent_warning('Option list')
1523 return [], next_state, []
1525 def option_list_item(self, match):
1526 offset = self.state_machine.abs_line_offset()
1527 options = self.parse_option_marker(match)
1528 indented, indent, line_offset, blank_finish = \
1529 self.state_machine.get_first_known_indented(match.end())
1530 if not indented: # not an option list item
1531 self.goto_line(offset)
1532 raise statemachine.TransitionCorrection('text')
1533 option_group = nodes.option_group('', *options)
1534 description = nodes.description('\n'.join(indented))
1535 option_list_item = nodes.option_list_item('', option_group,
1536 description)
1537 if indented:
1538 self.nested_parse(indented, input_offset=line_offset,
1539 node=description)
1540 return option_list_item, blank_finish
1542 def parse_option_marker(self, match):
1544 Return a list of `node.option` and `node.option_argument` objects,
1545 parsed from an option marker match.
1547 :Exception: `MarkupError` for invalid option markers.
1549 optlist = []
1550 optionstrings = match.group().rstrip().split(', ')
1551 for optionstring in optionstrings:
1552 tokens = optionstring.split()
1553 delimiter = ' '
1554 firstopt = tokens[0].split('=', 1)
1555 if len(firstopt) > 1:
1556 # "--opt=value" form
1557 tokens[:1] = firstopt
1558 delimiter = '='
1559 elif (len(tokens[0]) > 2
1560 and ((tokens[0].startswith('-')
1561 and not tokens[0].startswith('--'))
1562 or tokens[0].startswith('+'))):
1563 # "-ovalue" form
1564 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1565 delimiter = ''
1566 if len(tokens) > 1 and (tokens[1].startswith('<')
1567 and tokens[-1].endswith('>')):
1568 # "-o <value1 value2>" form; join all values into one token
1569 tokens[1:] = [' '.join(tokens[1:])]
1570 if 0 < len(tokens) <= 2:
1571 option = nodes.option(optionstring)
1572 option += nodes.option_string(tokens[0], tokens[0])
1573 if len(tokens) > 1:
1574 option += nodes.option_argument(tokens[1], tokens[1],
1575 delimiter=delimiter)
1576 optlist.append(option)
1577 else:
1578 raise MarkupError(
1579 'wrong number of option tokens (=%s), should be 1 or 2: '
1580 '"%s"' % (len(tokens), optionstring))
1581 return optlist
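# For example, the marker "-f FILE, --file=FILE" yields two `option` nodes:
# one with option_string "-f" and option_argument "FILE" (delimiter " "),
# and one with option_string "--file" and option_argument "FILE"
# (delimiter "=").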
1583 def doctest(self, match, context, next_state):
1584 data = '\n'.join(self.state_machine.get_text_block())
1585 # TODO: prepend class value ['pycon'] (Python Console)
1586 # parse with `directives.body.CodeBlock` (returns literal-block
1587 # with class "code" and syntax highlight markup).
1588 self.parent += nodes.doctest_block(data, data)
1589 return [], next_state, []
1591 def line_block(self, match, context, next_state):
1592 """First line of a line block."""
1593 block = nodes.line_block()
1594 self.parent += block
1595 lineno = self.state_machine.abs_line_number()
1596 line, messages, blank_finish = self.line_block_line(match, lineno)
1597 block += line
1598 self.parent += messages
1599 if not blank_finish:
1600 offset = self.state_machine.line_offset + 1 # next line
1601 new_line_offset, blank_finish = self.nested_list_parse(
1602 self.state_machine.input_lines[offset:],
1603 input_offset=self.state_machine.abs_line_offset() + 1,
1604 node=block, initial_state='LineBlock',
1605 blank_finish=0)
1606 self.goto_line(new_line_offset)
1607 if not blank_finish:
1608 self.parent += self.reporter.warning(
1609 'Line block ends without a blank line.',
1610 line=lineno+1)
1611 if len(block):
1612 if block[0].indent is None:
1613 block[0].indent = 0
1614 self.nest_line_block_lines(block)
1615 return [], next_state, []
1617 def line_block_line(self, match, lineno):
1618 """Return one line element of a line_block."""
1619 indented, indent, line_offset, blank_finish = \
1620 self.state_machine.get_first_known_indented(match.end(),
1621 until_blank=True)
1622 text = u'\n'.join(indented)
1623 text_nodes, messages = self.inline_text(text, lineno)
1624 line = nodes.line(text, '', *text_nodes)
1625 if match.string.rstrip() != '|': # not empty
1626 line.indent = len(match.group(1)) - 1
1627 return line, messages, blank_finish
1629 def nest_line_block_lines(self, block):
1630 for index in range(1, len(block)):
1631 if getattr(block[index], 'indent', None) is None:
1632 block[index].indent = block[index - 1].indent
1633 self.nest_line_block_segment(block)
1635 def nest_line_block_segment(self, block):
1636 indents = [item.indent for item in block]
1637 least = min(indents)
1638 new_items = []
1639 new_block = nodes.line_block()
1640 for item in block:
1641 if item.indent > least:
1642 new_block.append(item)
1643 else:
1644 if len(new_block):
1645 self.nest_line_block_segment(new_block)
1646 new_items.append(new_block)
1647 new_block = nodes.line_block()
1648 new_items.append(item)
1649 if len(new_block):
1650 self.nest_line_block_segment(new_block)
1651 new_items.append(new_block)
1652 block[:] = new_items
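# Illustration (not part of the module): given a line block whose lines have
# indents 0, 2, 2, 0, e.g.
#
#   | first
#   |   nested a
#   |   nested b
#   | last
#
# `nest_line_block_segment` keeps the least-indented lines at the current
# level and wraps each deeper run in a child line_block, so the result is
# line, line_block(line, line), line.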
1654 def grid_table_top(self, match, context, next_state):
1655 """Top border of a full table."""
1656 return self.table_top(match, context, next_state,
1657 self.isolate_grid_table,
1658 tableparser.GridTableParser)
1660 def simple_table_top(self, match, context, next_state):
1661 """Top border of a simple table."""
1662 return self.table_top(match, context, next_state,
1663 self.isolate_simple_table,
1664 tableparser.SimpleTableParser)
1666 def table_top(self, match, context, next_state,
1667 isolate_function, parser_class):
1668 """Top border of a generic table."""
1669 nodelist, blank_finish = self.table(isolate_function, parser_class)
1670 self.parent += nodelist
1671 if not blank_finish:
1672 msg = self.reporter.warning(
1673 'Blank line required after table.',
1674 line=self.state_machine.abs_line_number()+1)
1675 self.parent += msg
1676 return [], next_state, []
1678 def table(self, isolate_function, parser_class):
1679 """Parse a table."""
1680 block, messages, blank_finish = isolate_function()
1681 if block:
1682 try:
1683 parser = parser_class()
1684 tabledata = parser.parse(block)
1685 tableline = (self.state_machine.abs_line_number() - len(block)
1686 + 1)
1687 table = self.build_table(tabledata, tableline)
1688 nodelist = [table] + messages
1689 except tableparser.TableMarkupError, err:
1690 nodelist = self.malformed_table(block, ' '.join(err.args),
1691 offset=err.offset) + messages
1692 else:
1693 nodelist = messages
1694 return nodelist, blank_finish
1696 def isolate_grid_table(self):
1697 messages = []
1698 blank_finish = 1
1699 try:
1700 block = self.state_machine.get_text_block(flush_left=True)
1701 except statemachine.UnexpectedIndentationError, err:
1702 block, src, srcline = err.args
1703 messages.append(self.reporter.error('Unexpected indentation.',
1704 source=src, line=srcline))
1705 blank_finish = 0
1706 block.disconnect()
1707 # for East Asian chars:
1708 block.pad_double_width(self.double_width_pad_char)
1709 width = len(block[0].strip())
1710 for i in range(len(block)):
1711 block[i] = block[i].strip()
1712 if block[i][0] not in '+|': # check left edge
1713 blank_finish = 0
1714 self.state_machine.previous_line(len(block) - i)
1715 del block[i:]
1716 break
1717 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1718 blank_finish = 0
1719 # from second-last to third line of table:
1720 for i in range(len(block) - 2, 1, -1):
1721 if self.grid_table_top_pat.match(block[i]):
1722 self.state_machine.previous_line(len(block) - i + 1)
1723 del block[i+1:]
1724 break
1725 else:
1726 messages.extend(self.malformed_table(block))
1727 return [], messages, blank_finish
1728 for i in range(len(block)): # check right edge
1729 if len(block[i]) != width or block[i][-1] not in '+|':
1730 messages.extend(self.malformed_table(block))
1731 return [], messages, blank_finish
1732 return block, messages, blank_finish
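# Illustration (not part of the module): a minimal grid table that passes the
# edge checks above -- every line starts and ends with "+" or "|", every line
# has the same width as the top border, and the last line matches
# `grid_table_top_pat`:
#
#   +-------+-------+
#   | col 1 | col 2 |
#   +-------+-------+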
1734 def isolate_simple_table(self):
1735 start = self.state_machine.line_offset
1736 lines = self.state_machine.input_lines
1737 limit = len(lines) - 1
1738 toplen = len(lines[start].strip())
1739 pattern_match = self.simple_table_border_pat.match
1740 found = 0
1741 found_at = None
1742 i = start + 1
1743 while i <= limit:
1744 line = lines[i]
1745 match = pattern_match(line)
1746 if match:
1747 if len(line.strip()) != toplen:
1748 self.state_machine.next_line(i - start)
1749 messages = self.malformed_table(
1750 lines[start:i+1], 'Bottom/header table border does '
1751 'not match top border.')
1752 return [], messages, i == limit or not lines[i+1].strip()
1753 found += 1
1754 found_at = i
1755 if found == 2 or i == limit or not lines[i+1].strip():
1756 end = i
1757 break
1758 i += 1
1759 else: # reached end of input_lines
1760 if found:
1761 extra = ' or no blank line after table bottom'
1762 self.state_machine.next_line(found_at - start)
1763 block = lines[start:found_at+1]
1764 else:
1765 extra = ''
1766 self.state_machine.next_line(i - start - 1)
1767 block = lines[start:]
1768 messages = self.malformed_table(
1769 block, 'No bottom table border found%s.' % extra)
1770 return [], messages, not extra
1771 self.state_machine.next_line(end - start)
1772 block = lines[start:end+1]
1773 # for East Asian chars:
1774 block.pad_double_width(self.double_width_pad_char)
1775 return block, [], end == limit or not lines[end+1].strip()
1777 def malformed_table(self, block, detail='', offset=0):
1778 block.replace(self.double_width_pad_char, '')
1779 data = '\n'.join(block)
1780 message = 'Malformed table.'
1781 startline = self.state_machine.abs_line_number() - len(block) + 1
1782 if detail:
1783 message += '\n' + detail
1784 error = self.reporter.error(message, nodes.literal_block(data, data),
1785 line=startline+offset)
1786 return [error]
1788 def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1789 colwidths, headrows, bodyrows = tabledata
1790 table = nodes.table()
1791 if widths == 'auto':
1792 table['classes'] += ['colwidths-auto']
1793 elif widths: # "grid" or list of integers
1794 table['classes'] += ['colwidths-given']
1795 tgroup = nodes.tgroup(cols=len(colwidths))
1796 table += tgroup
1797 for colwidth in colwidths:
1798 colspec = nodes.colspec(colwidth=colwidth)
1799 if stub_columns:
1800 colspec.attributes['stub'] = 1
1801 stub_columns -= 1
1802 tgroup += colspec
1803 if headrows:
1804 thead = nodes.thead()
1805 tgroup += thead
1806 for row in headrows:
1807 thead += self.build_table_row(row, tableline)
1808 tbody = nodes.tbody()
1809 tgroup += tbody
1810 for row in bodyrows:
1811 tbody += self.build_table_row(row, tableline)
1812 return table
1814 def build_table_row(self, rowdata, tableline):
1815 row = nodes.row()
1816 for cell in rowdata:
1817 if cell is None:
1818 continue
1819 morerows, morecols, offset, cellblock = cell
1820 attributes = {}
1821 if morerows:
1822 attributes['morerows'] = morerows
1823 if morecols:
1824 attributes['morecols'] = morecols
1825 entry = nodes.entry(**attributes)
1826 row += entry
1827 if ''.join(cellblock):
1828 self.nested_parse(cellblock, input_offset=tableline+offset,
1829 node=entry)
1830 return row
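# Shape of the `tabledata` consumed above (illustrative values, as produced
# by the tableparser classes):
#
#   colwidths = [7, 7]
#   headrows  = [[(0, 0, 1, ['col 1']), (0, 0, 1, ['col 2'])]]
#   bodyrows  = [[(0, 0, 3, ['a']),     (0, 0, 3, ['b'])]]
#
# Each cell is None (absorbed by a row/column span) or a 4-tuple
# (morerows, morecols, offset, cellblock); `offset` is the cell text's line
# offset within the table, used above to compute the nested-parse offset.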
1833 explicit = Struct()
1834 """Patterns and constants used for explicit markup recognition."""
1836 explicit.patterns = Struct(
1837 target=re.compile(r"""
1838 (
1839 _ # anonymous target
1840 | # *OR*
1841 (?!_) # no underscore at the beginning
1842 (?P<quote>`?) # optional open quote
1843 (?![ `]) # first char. not space or
1844 # backquote
1845 (?P<name> # reference name
1846 .+?
1847 )
1848 %(non_whitespace_escape_before)s
1849 (?P=quote) # close quote if open quote used
1850 )
1851 (?<!(?<!\x00):) # no unescaped colon at end
1852 %(non_whitespace_escape_before)s
1853 [ ]? # optional space
1854 : # end of reference name
1855 ([ ]+|$) # followed by whitespace
1856 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1857 reference=re.compile(r"""
1858 (
1859 (?P<simple>%(simplename)s)_
1860 | # *OR*
1861 ` # open backquote
1862 (?![ ]) # not space
1863 (?P<phrase>.+?) # hyperlink phrase
1864 %(non_whitespace_escape_before)s
1865 `_ # close backquote,
1866 # reference mark
1867 )
1868 $ # end of string
1869 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1870 substitution=re.compile(r"""
1871 (
1872 (?![ ]) # first char. not space
1873 (?P<name>.+?) # substitution text
1874 %(non_whitespace_escape_before)s
1875 \| # close delimiter
1876 )
1877 ([ ]+|$) # followed by whitespace
1878 """ % vars(Inliner),
1879 re.VERBOSE | re.UNICODE),)
1881 def footnote(self, match):
1882 src, srcline = self.state_machine.get_source_and_line()
1883 indented, indent, offset, blank_finish = \
1884 self.state_machine.get_first_known_indented(match.end())
1885 label = match.group(1)
1886 name = normalize_name(label)
1887 footnote = nodes.footnote('\n'.join(indented))
1888 footnote.source = src
1889 footnote.line = srcline
1890 if name[0] == '#': # auto-numbered
1891 name = name[1:] # autonumber label
1892 footnote['auto'] = 1
1893 if name:
1894 footnote['names'].append(name)
1895 self.document.note_autofootnote(footnote)
1896 elif name == '*': # auto-symbol
1897 name = ''
1898 footnote['auto'] = '*'
1899 self.document.note_symbol_footnote(footnote)
1900 else: # manually numbered
1901 footnote += nodes.label('', label)
1902 footnote['names'].append(name)
1903 self.document.note_footnote(footnote)
1904 if name:
1905 self.document.note_explicit_target(footnote, footnote)
1906 else:
1907 self.document.set_id(footnote, footnote)
1908 if indented:
1909 self.nested_parse(indented, input_offset=offset, node=footnote)
1910 return [footnote], blank_finish
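# The label forms dispatched above (illustrative reST source):
#
#   .. [1] manually numbered footnote
#   .. [#] auto-numbered footnote
#   .. [#label] auto-numbered footnote with a reference name
#   .. [*] auto-symbol footnote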
1912 def citation(self, match):
1913 src, srcline = self.state_machine.get_source_and_line()
1914 indented, indent, offset, blank_finish = \
1915 self.state_machine.get_first_known_indented(match.end())
1916 label = match.group(1)
1917 name = normalize_name(label)
1918 citation = nodes.citation('\n'.join(indented))
1919 citation.source = src
1920 citation.line = srcline
1921 citation += nodes.label('', label)
1922 citation['names'].append(name)
1923 self.document.note_citation(citation)
1924 self.document.note_explicit_target(citation, citation)
1925 if indented:
1926 self.nested_parse(indented, input_offset=offset, node=citation)
1927 return [citation], blank_finish
1929 def hyperlink_target(self, match):
1930 pattern = self.explicit.patterns.target
1931 lineno = self.state_machine.abs_line_number()
1932 block, indent, offset, blank_finish = \
1933 self.state_machine.get_first_known_indented(
1934 match.end(), until_blank=True, strip_indent=False)
1935 blocktext = match.string[:match.end()] + '\n'.join(block)
1936 block = [escape2null(line) for line in block]
1937 escaped = block[0]
1938 blockindex = 0
1939 while True:
1940 targetmatch = pattern.match(escaped)
1941 if targetmatch:
1942 break
1943 blockindex += 1
1944 try:
1945 escaped += block[blockindex]
1946 except IndexError:
1947 raise MarkupError('malformed hyperlink target.')
1948 del block[:blockindex]
1949 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1950 target = self.make_target(block, blocktext, lineno,
1951 targetmatch.group('name'))
1952 return [target], blank_finish
1954 def make_target(self, block, block_text, lineno, target_name):
1955 target_type, data = self.parse_target(block, block_text, lineno)
1956 if target_type == 'refname':
1957 target = nodes.target(block_text, '', refname=normalize_name(data))
1958 target.indirect_reference_name = data
1959 self.add_target(target_name, '', target, lineno)
1960 self.document.note_indirect_target(target)
1961 return target
1962 elif target_type == 'refuri':
1963 target = nodes.target(block_text, '')
1964 self.add_target(target_name, data, target, lineno)
1965 return target
1966 else:
1967 return data
1969 def parse_target(self, block, block_text, lineno):
1970 """
1971 Determine the type of reference of a target.
1973 :Return: A 2-tuple, one of:
1975 - 'refname' and the indirect reference name
1976 - 'refuri' and the URI
1977 - 'malformed' and a system_message node
1978 """
1979 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1980 reference = ' '.join([line.strip() for line in block])
1981 refname = self.is_reference(reference)
1982 if refname:
1983 return 'refname', refname
1984 ref_parts = split_escaped_whitespace(' '.join(block))
1985 reference = ' '.join(''.join(unescape(part).split())
1986 for part in ref_parts)
1987 return 'refuri', reference
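# Illustration (not part of the module): how the two return values arise.
#
#   .. _docutils: http://docutils.sourceforge.net/
#       -> ('refuri', 'http://docutils.sourceforge.net/')
#
#   .. _shortcut: docutils_
#       -> ('refname', 'docutils')   # indirect target, resolved later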
1989 def is_reference(self, reference):
1990 match = self.explicit.patterns.reference.match(
1991 whitespace_normalize_name(reference))
1992 if not match:
1993 return None
1994 return unescape(match.group('simple') or match.group('phrase'))
1996 def add_target(self, targetname, refuri, target, lineno):
1997 target.line = lineno
1998 if targetname:
1999 name = normalize_name(unescape(targetname))
2000 target['names'].append(name)
2001 if refuri:
2002 uri = self.inliner.adjust_uri(refuri)
2003 if uri:
2004 target['refuri'] = uri
2005 else:
2006 raise ApplicationError('problem with URI: %r' % refuri)
2007 self.document.note_explicit_target(target, self.parent)
2008 else: # anonymous target
2009 if refuri:
2010 target['refuri'] = refuri
2011 target['anonymous'] = 1
2012 self.document.note_anonymous_target(target)
2014 def substitution_def(self, match):
2015 pattern = self.explicit.patterns.substitution
2016 src, srcline = self.state_machine.get_source_and_line()
2017 block, indent, offset, blank_finish = \
2018 self.state_machine.get_first_known_indented(match.end(),
2019 strip_indent=False)
2020 blocktext = (match.string[:match.end()] + '\n'.join(block))
2021 block.disconnect()
2022 escaped = escape2null(block[0].rstrip())
2023 blockindex = 0
2024 while True:
2025 subdefmatch = pattern.match(escaped)
2026 if subdefmatch:
2027 break
2028 blockindex += 1
2029 try:
2030 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
2031 except IndexError:
2032 raise MarkupError('malformed substitution definition.')
2033 del block[:blockindex] # strip out the substitution marker
2034 block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
2035 if not block[0]:
2036 del block[0]
2037 offset += 1
2038 while block and not block[-1].strip():
2039 block.pop()
2040 subname = subdefmatch.group('name')
2041 substitution_node = nodes.substitution_definition(blocktext)
2042 substitution_node.source = src
2043 substitution_node.line = srcline
2044 if not block:
2045 msg = self.reporter.warning(
2046 'Substitution definition "%s" missing contents.' % subname,
2047 nodes.literal_block(blocktext, blocktext),
2048 source=src, line=srcline)
2049 return [msg], blank_finish
2050 block[0] = block[0].strip()
2051 substitution_node['names'].append(
2052 nodes.whitespace_normalize_name(subname))
2053 new_abs_offset, blank_finish = self.nested_list_parse(
2054 block, input_offset=offset, node=substitution_node,
2055 initial_state='SubstitutionDef', blank_finish=blank_finish)
2056 i = 0
2057 for node in substitution_node[:]:
2058 if not (isinstance(node, nodes.Inline) or
2059 isinstance(node, nodes.Text)):
2060 self.parent += substitution_node[i]
2061 del substitution_node[i]
2062 else:
2063 i += 1
2064 for node in substitution_node.traverse(nodes.Element):
2065 if self.disallowed_inside_substitution_definitions(node):
2066 pformat = nodes.literal_block('', node.pformat().rstrip())
2067 msg = self.reporter.error(
2068 'Substitution definition contains illegal element:',
2069 pformat, nodes.literal_block(blocktext, blocktext),
2070 source=src, line=srcline)
2071 return [msg], blank_finish
2072 if len(substitution_node) == 0:
2073 msg = self.reporter.warning(
2074 'Substitution definition "%s" empty or invalid.' % subname,
2075 nodes.literal_block(blocktext, blocktext),
2076 source=src, line=srcline)
2077 return [msg], blank_finish
2078 self.document.note_substitution_def(
2079 substitution_node, subname, self.parent)
2080 return [substitution_node], blank_finish
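# Illustration (not part of the module): a typical substitution definition,
#
#   .. |name| replace:: replacement text
#
# The "|name|" marker is stripped above and the rest of the block is parsed
# with the `SubstitutionDef` state, so the embedded directive ("replace",
# "image", "date", ...) supplies the definition's inline content.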
2082 def disallowed_inside_substitution_definitions(self, node):
2083 if (node['ids'] or
2084 isinstance(node, nodes.reference) and node.get('anonymous') or
2085 isinstance(node, nodes.footnote_reference) and node.get('auto')):
2086 return 1
2087 else:
2088 return 0
2090 def directive(self, match, **option_presets):
2091 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2092 type_name = match.group(1)
2093 directive_class, messages = directives.directive(
2094 type_name, self.memo.language, self.document)
2095 self.parent += messages
2096 if directive_class:
2097 return self.run_directive(
2098 directive_class, match, type_name, option_presets)
2099 else:
2100 return self.unknown_directive(type_name)
2102 def run_directive(self, directive, match, type_name, option_presets):
2103 """
2104 Parse a directive then run its directive function.
2106 Parameters:
2108 - `directive`: The class implementing the directive. Must be
2109 a subclass of `rst.Directive`.
2111 - `match`: A regular expression match object which matched the first
2112 line of the directive.
2114 - `type_name`: The directive name, as used in the source text.
2116 - `option_presets`: A dictionary of preset options, defaults for the
2117 directive options. Currently, only an "alt" option is passed by
2118 substitution definitions (value: the substitution name), which may
2119 be used by an embedded image directive.
2121 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2122 """
2123 if isinstance(directive, (FunctionType, MethodType)):
2124 from docutils.parsers.rst import convert_directive_function
2125 directive = convert_directive_function(directive)
2126 lineno = self.state_machine.abs_line_number()
2127 initial_line_offset = self.state_machine.line_offset
2128 indented, indent, line_offset, blank_finish \
2129 = self.state_machine.get_first_known_indented(match.end(),
2130 strip_top=0)
2131 block_text = '\n'.join(self.state_machine.input_lines[
2132 initial_line_offset : self.state_machine.line_offset + 1])
2133 try:
2134 arguments, options, content, content_offset = (
2135 self.parse_directive_block(indented, line_offset,
2136 directive, option_presets))
2137 except MarkupError, detail:
2138 error = self.reporter.error(
2139 'Error in "%s" directive:\n%s.' % (type_name,
2140 ' '.join(detail.args)),
2141 nodes.literal_block(block_text, block_text), line=lineno)
2142 return [error], blank_finish
2143 directive_instance = directive(
2144 type_name, arguments, options, content, lineno,
2145 content_offset, block_text, self, self.state_machine)
2146 try:
2147 result = directive_instance.run()
2148 except docutils.parsers.rst.DirectiveError, error:
2149 msg_node = self.reporter.system_message(error.level, error.msg,
2150 line=lineno)
2151 msg_node += nodes.literal_block(block_text, block_text)
2152 result = [msg_node]
2153 assert isinstance(result, list), \
2154 'Directive "%s" must return a list of nodes.' % type_name
2155 for i in range(len(result)):
2156 assert isinstance(result[i], nodes.Node), \
2157 ('Directive "%s" returned non-Node object (index %s): %r'
2158 % (type_name, i, result[i]))
2159 return (result,
2160 blank_finish or self.state_machine.is_next_line_blank())
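# A minimal sketch (hypothetical example, not part of this module) of the
# directive-class interface used above: class-level argument/option/content
# declarations consumed by `parse_directive_block`, plus a `run()` method
# that must return a list of nodes.
#
#     from docutils import nodes
#     from docutils.parsers.rst import Directive, directives
#
#     class NoteBox(Directive):                 # hypothetical directive
#         required_arguments = 0
#         optional_arguments = 1
#         final_argument_whitespace = True
#         option_spec = {'class': directives.class_option}
#         has_content = True
#
#         def run(self):
#             node = nodes.note()
#             # nested parse of the directive content into the new node
#             self.state.nested_parse(self.content, self.content_offset, node)
#             return [node]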
2162 def parse_directive_block(self, indented, line_offset, directive,
2163 option_presets):
2164 option_spec = directive.option_spec
2165 has_content = directive.has_content
2166 if indented and not indented[0].strip():
2167 indented.trim_start()
2168 line_offset += 1
2169 while indented and not indented[-1].strip():
2170 indented.trim_end()
2171 if indented and (directive.required_arguments
2172 or directive.optional_arguments
2173 or option_spec):
2174 for i, line in enumerate(indented):
2175 if not line.strip():
2176 break
2177 else:
2178 i += 1
2179 arg_block = indented[:i]
2180 content = indented[i+1:]
2181 content_offset = line_offset + i + 1
2182 else:
2183 content = indented
2184 content_offset = line_offset
2185 arg_block = []
2186 if option_spec:
2187 options, arg_block = self.parse_directive_options(
2188 option_presets, option_spec, arg_block)
2189 else:
2190 options = {}
2191 if arg_block and not (directive.required_arguments
2192 or directive.optional_arguments):
2193 content = arg_block + indented[i:]
2194 content_offset = line_offset
2195 arg_block = []
2196 while content and not content[0].strip():
2197 content.trim_start()
2198 content_offset += 1
2199 if directive.required_arguments or directive.optional_arguments:
2200 arguments = self.parse_directive_arguments(
2201 directive, arg_block)
2202 else:
2203 arguments = []
2204 if content and not has_content:
2205 raise MarkupError('no content permitted')
2206 return (arguments, options, content, content_offset)
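# Illustration (not part of the module): how an indented directive block is
# split by the code above,
#
#   .. admonition:: And, by the way...          <- arguments (up to 1st blank line)
#      :class: custom                           <- option block (field-list syntax)
#
#      You can make up your own admonition too. <- content
#
# yielding arguments ['And, by the way...'], options {'class': ['custom']},
# and the remaining indented lines (with their offset) as the content.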
2208 def parse_directive_options(self, option_presets, option_spec, arg_block):
2209 options = option_presets.copy()
2210 for i, line in enumerate(arg_block):
2211 if re.match(Body.patterns['field_marker'], line):
2212 opt_block = arg_block[i:]
2213 arg_block = arg_block[:i]
2214 break
2215 else:
2216 opt_block = []
2217 if opt_block:
2218 success, data = self.parse_extension_options(option_spec,
2219 opt_block)
2220 if success: # data is a dict of options
2221 options.update(data)
2222 else: # data is an error string
2223 raise MarkupError(data)
2224 return options, arg_block
2226 def parse_directive_arguments(self, directive, arg_block):
2227 required = directive.required_arguments
2228 optional = directive.optional_arguments
2229 arg_text = '\n'.join(arg_block)
2230 arguments = arg_text.split()
2231 if len(arguments) < required:
2232 raise MarkupError('%s argument(s) required, %s supplied'
2233 % (required, len(arguments)))
2234 elif len(arguments) > required + optional:
2235 if directive.final_argument_whitespace:
2236 arguments = arg_text.split(None, required + optional - 1)
2237 else:
2238 raise MarkupError(
2239 'maximum %s argument(s) allowed, %s supplied'
2240 % (required + optional, len(arguments)))
2241 return arguments
2243 def parse_extension_options(self, option_spec, datalines):
2244 """
2245 Parse `datalines` for a field list containing extension options
2246 matching `option_spec`.
2248 :Parameters:
2249 - `option_spec`: a mapping of option name to conversion
2250 function, which should raise an exception on bad input.
2251 - `datalines`: a list of input strings.
2253 :Return:
2254 - Success value, 1 or 0.
2255 - An option dictionary on success, an error string on failure.
2256 """
2257 node = nodes.field_list()
2258 newline_offset, blank_finish = self.nested_list_parse(
2259 datalines, 0, node, initial_state='ExtensionOptions',
2260 blank_finish=True)
2261 if newline_offset != len(datalines): # incomplete parse of block
2262 return 0, 'invalid option block'
2263 try:
2264 options = utils.extract_extension_options(node, option_spec)
2265 except KeyError, detail:
2266 return 0, ('unknown option: "%s"' % detail.args[0])
2267 except (ValueError, TypeError), detail:
2268 return 0, ('invalid option value: %s' % ' '.join(detail.args))
2269 except utils.ExtensionOptionError, detail:
2270 return 0, ('invalid option data: %s' % ' '.join(detail.args))
2271 if blank_finish:
2272 return 1, options
2273 else:
2274 return 0, 'option data incompletely parsed'
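# A sketch of the `option_spec` mapping this expects (illustrative; these
# conversion functions live in docutils.parsers.rst.directives and raise
# on bad input):
#
#     from docutils.parsers.rst import directives
#
#     option_spec = {'alt': directives.unchanged,
#                    'height': directives.length_or_unitless,
#                    'class': directives.class_option}
#
# `parse_extension_options` feeds each field body through the matching
# function and reports KeyError/ValueError/TypeError as error strings.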
2276 def unknown_directive(self, type_name):
2277 lineno = self.state_machine.abs_line_number()
2278 indented, indent, offset, blank_finish = \
2279 self.state_machine.get_first_known_indented(0, strip_indent=False)
2280 text = '\n'.join(indented)
2281 error = self.reporter.error(
2282 'Unknown directive type "%s".' % type_name,
2283 nodes.literal_block(text, text), line=lineno)
2284 return [error], blank_finish
2286 def comment(self, match):
2287 if not match.string[match.end():].strip() \
2288 and self.state_machine.is_next_line_blank(): # an empty comment?
2289 return [nodes.comment()], 1 # "A tiny but practical wart."
2290 indented, indent, offset, blank_finish = \
2291 self.state_machine.get_first_known_indented(match.end())
2292 while indented and not indented[-1].strip():
2293 indented.trim_end()
2294 text = '\n'.join(indented)
2295 return [nodes.comment(text, text)], blank_finish
2297 explicit.constructs = [
2298 (footnote,
2299 re.compile(r"""
2300 \.\.[ ]+ # explicit markup start
2301 \[ # footnote label open bracket
2302 ( # footnote label:
2303 [0-9]+ # manually numbered footnote
2304 | # *OR*
2305 \# # anonymous auto-numbered footnote
2306 | # *OR*
2307 \#%s # auto-number ed?) footnote label
2308 | # *OR*
2309 \* # auto-symbol footnote
2310 )
2311 \] # footnote label close bracket
2312 ([ ]+|$) # whitespace or end of line
2313 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2314 (citation,
2315 re.compile(r"""
2316 \.\.[ ]+ # explicit markup start
2317 \[(%s)\] # citation label
2318 ([ ]+|$) # whitespace or end of line
2319 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2320 (hyperlink_target,
2321 re.compile(r"""
2322 \.\.[ ]+ # explicit markup start
2323 _ # target indicator
2324 (?![ ]|$) # first char. not space or EOL
2325 """, re.VERBOSE | re.UNICODE)),
2326 (substitution_def,
2327 re.compile(r"""
2328 \.\.[ ]+ # explicit markup start
2329 \| # substitution indicator
2330 (?![ ]|$) # first char. not space or EOL
2331 """, re.VERBOSE | re.UNICODE)),
2332 (directive,
2333 re.compile(r"""
2334 \.\.[ ]+ # explicit markup start
2335 (%s) # directive name
2336 [ ]? # optional space
2337 :: # directive delimiter
2338 ([ ]+|$) # whitespace or end of line
2339 """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
2341 def explicit_markup(self, match, context, next_state):
2342 """Footnotes, hyperlink targets, directives, comments."""
2343 nodelist, blank_finish = self.explicit_construct(match)
2344 self.parent += nodelist
2345 self.explicit_list(blank_finish)
2346 return [], next_state, []
2348 def explicit_construct(self, match):
2349 """Determine which explicit construct this is, parse & return it."""
2350 errors = []
2351 for method, pattern in self.explicit.constructs:
2352 expmatch = pattern.match(match.string)
2353 if expmatch:
2354 try:
2355 return method(self, expmatch)
2356 except MarkupError, error:
2357 lineno = self.state_machine.abs_line_number()
2358 message = ' '.join(error.args)
2359 errors.append(self.reporter.warning(message, line=lineno))
2360 break
2361 nodelist, blank_finish = self.comment(match)
2362 return nodelist + errors, blank_finish
2364 def explicit_list(self, blank_finish):
2365 """
2366 Create a nested state machine for a series of explicit markup
2367 constructs (including anonymous hyperlink targets).
2368 """
2369 offset = self.state_machine.line_offset + 1 # next line
2370 newline_offset, blank_finish = self.nested_list_parse(
2371 self.state_machine.input_lines[offset:],
2372 input_offset=self.state_machine.abs_line_offset() + 1,
2373 node=self.parent, initial_state='Explicit',
2374 blank_finish=blank_finish,
2375 match_titles=self.state_machine.match_titles)
2376 self.goto_line(newline_offset)
2377 if not blank_finish:
2378 self.parent += self.unindent_warning('Explicit markup')
2380 def anonymous(self, match, context, next_state):
2381 """Anonymous hyperlink targets."""
2382 nodelist, blank_finish = self.anonymous_target(match)
2383 self.parent += nodelist
2384 self.explicit_list(blank_finish)
2385 return [], next_state, []
2387 def anonymous_target(self, match):
2388 lineno = self.state_machine.abs_line_number()
2389 block, indent, offset, blank_finish \
2390 = self.state_machine.get_first_known_indented(match.end(),
2391 until_blank=True)
2392 blocktext = match.string[:match.end()] + '\n'.join(block)
2393 block = [escape2null(line) for line in block]
2394 target = self.make_target(block, blocktext, lineno, '')
2395 return [target], blank_finish
2397 def line(self, match, context, next_state):
2398 """Section title overline or transition marker."""
2399 if self.state_machine.match_titles:
2400 return [match.string], 'Line', []
2401 elif match.string.strip() == '::':
2402 raise statemachine.TransitionCorrection('text')
2403 elif len(match.string.strip()) < 4:
2404 msg = self.reporter.info(
2405 'Unexpected possible title overline or transition.\n'
2406 "Treating it as ordinary text because it's so short.",
2407 line=self.state_machine.abs_line_number())
2408 self.parent += msg
2409 raise statemachine.TransitionCorrection('text')
2410 else:
2411 blocktext = self.state_machine.line
2412 msg = self.reporter.severe(
2413 'Unexpected section title or transition.',
2414 nodes.literal_block(blocktext, blocktext),
2415 line=self.state_machine.abs_line_number())
2416 self.parent += msg
2417 return [], next_state, []
2419 def text(self, match, context, next_state):
2420 """Titles, definition lists, paragraphs."""
2421 return [match.string], 'Text', []
2424 class RFC2822Body(Body):
2426 """
2427 RFC2822 headers are only valid as the first constructs in documents. As
2428 soon as anything else appears, the `Body` state should take over.
2429 """
2431 patterns = Body.patterns.copy() # can't modify the original
2432 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2433 initial_transitions = [(name, 'Body')
2434 for name in Body.initial_transitions]
2435 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2437 def rfc2822(self, match, context, next_state):
2438 """RFC2822-style field list item."""
2439 fieldlist = nodes.field_list(classes=['rfc2822'])
2440 self.parent += fieldlist
2441 field, blank_finish = self.rfc2822_field(match)
2442 fieldlist += field
2443 offset = self.state_machine.line_offset + 1 # next line
2444 newline_offset, blank_finish = self.nested_list_parse(
2445 self.state_machine.input_lines[offset:],
2446 input_offset=self.state_machine.abs_line_offset() + 1,
2447 node=fieldlist, initial_state='RFC2822List',
2448 blank_finish=blank_finish)
2449 self.goto_line(newline_offset)
2450 if not blank_finish:
2451 self.parent += self.unindent_warning(
2452 'RFC2822-style field list')
2453 return [], next_state, []
2455 def rfc2822_field(self, match):
2456 name = match.string[:match.string.find(':')]
2457 indented, indent, line_offset, blank_finish = \
2458 self.state_machine.get_first_known_indented(match.end(),
2459 until_blank=True)
2460 fieldnode = nodes.field()
2461 fieldnode += nodes.field_name(name, name)
2462 fieldbody = nodes.field_body('\n'.join(indented))
2463 fieldnode += fieldbody
2464 if indented:
2465 self.nested_parse(indented, input_offset=line_offset,
2466 node=fieldbody)
2467 return fieldnode, blank_finish
2470 class SpecializedBody(Body):
2472 """
2473 Superclass for second and subsequent compound element members. Compound
2474 elements are lists and list-like constructs.
2476 All transition methods are disabled (redefined as `invalid_input`).
2477 Override individual methods in subclasses to re-enable.
2479 For example, once an initial bullet list item, say, is recognized, the
2480 `BulletList` subclass takes over, with a "bullet_list" node as its
2481 container. Upon encountering the initial bullet list item, `Body.bullet`
2482 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2483 starts up a nested parsing session with `BulletList` as the initial state.
2484 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2485 as only bullet list items are encountered, they are parsed and inserted
2486 into the container. The first construct which is *not* a bullet list item
2487 triggers the `invalid_input` method, which ends the nested parse and
2488 closes the container. `BulletList` needs to recognize input that is
2489 invalid in the context of a bullet list, which means everything *other
2490 than* bullet list items, so it inherits the transition list created in
2491 `Body`.
2492 """
2494 def invalid_input(self, match=None, context=None, next_state=None):
2495 """Not a compound element member. Abort this state machine."""
2496 self.state_machine.previous_line() # back up so parent SM can reassess
2497 raise EOFError
2499 indent = invalid_input
2500 bullet = invalid_input
2501 enumerator = invalid_input
2502 field_marker = invalid_input
2503 option_marker = invalid_input
2504 doctest = invalid_input
2505 line_block = invalid_input
2506 grid_table_top = invalid_input
2507 simple_table_top = invalid_input
2508 explicit_markup = invalid_input
2509 anonymous = invalid_input
2510 line = invalid_input
2511 text = invalid_input
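# Illustration (not part of the module): with only the bullet transition
# re-enabled in `BulletList`, the input
#
#   - item one
#   - item two
#   * item three
#
# produces two sibling bullet_list nodes: the "*" line fails the same-bullet
# check, `invalid_input` raises EOFError to end the nested parse, and the
# parent `Body` state starts a fresh list.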
2514 class BulletList(SpecializedBody):
2516 """Second and subsequent bullet_list list_items."""
2518 def bullet(self, match, context, next_state):
2519 """Bullet list item."""
2520 if match.string[0] != self.parent['bullet']:
2521 # different bullet: new list
2522 self.invalid_input()
2523 listitem, blank_finish = self.list_item(match.end())
2524 self.parent += listitem
2525 self.blank_finish = blank_finish
2526 return [], next_state, []
2529 class DefinitionList(SpecializedBody):
2531 """Second and subsequent definition_list_items."""
2533 def text(self, match, context, next_state):
2534 """Definition lists."""
2535 return [match.string], 'Definition', []
2538 class EnumeratedList(SpecializedBody):
2540 """Second and subsequent enumerated_list list_items."""
2542 def enumerator(self, match, context, next_state):
2543 """Enumerated list item."""
2544 format, sequence, text, ordinal = self.parse_enumerator(
2545 match, self.parent['enumtype'])
2546 if ( format != self.format
2547 or (sequence != '#' and (sequence != self.parent['enumtype']
2548 or self.auto
2549 or ordinal != (self.lastordinal + 1)))
2550 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2551 # different enumeration: new list
2552 self.invalid_input()
2553 if sequence == '#':
2554 self.auto = 1
2555 listitem, blank_finish = self.list_item(match.end())
2556 self.parent += listitem
2557 self.blank_finish = blank_finish
2558 self.lastordinal = ordinal
2559 return [], next_state, []
2562 class FieldList(SpecializedBody):
2564 """Second and subsequent field_list fields."""
2566 def field_marker(self, match, context, next_state):
2567 """Field list field."""
2568 field, blank_finish = self.field(match)
2569 self.parent += field
2570 self.blank_finish = blank_finish
2571 return [], next_state, []
2574 class OptionList(SpecializedBody):
2576 """Second and subsequent option_list option_list_items."""
2578 def option_marker(self, match, context, next_state):
2579 """Option list item."""
2580 try:
2581 option_list_item, blank_finish = self.option_list_item(match)
2582 except MarkupError:
2583 self.invalid_input()
2584 self.parent += option_list_item
2585 self.blank_finish = blank_finish
2586 return [], next_state, []
2589 class RFC2822List(SpecializedBody, RFC2822Body):
2591 """Second and subsequent RFC2822-style field_list fields."""
2593 patterns = RFC2822Body.patterns
2594 initial_transitions = RFC2822Body.initial_transitions
2596 def rfc2822(self, match, context, next_state):
2597 """RFC2822-style field list item."""
2598 field, blank_finish = self.rfc2822_field(match)
2599 self.parent += field
2600 self.blank_finish = blank_finish
2601 return [], 'RFC2822List', []
2603 blank = SpecializedBody.invalid_input
2606 class ExtensionOptions(FieldList):
2608 """
2609 Parse field_list fields for extension options.
2611 No nested parsing is done (including inline markup parsing).
2612 """
2614 def parse_field_body(self, indented, offset, node):
2615 """Override `Body.parse_field_body` for simpler parsing."""
2616 lines = []
2617 for line in list(indented) + ['']:
2618 if line.strip():
2619 lines.append(line)
2620 elif lines:
2621 text = '\n'.join(lines)
2622 node += nodes.paragraph(text, text)
2623 lines = []
2626 class LineBlock(SpecializedBody):
2628 """Second and subsequent lines of a line_block."""
2630 blank = SpecializedBody.invalid_input
2632 def line_block(self, match, context, next_state):
2633 """New line of line block."""
2634 lineno = self.state_machine.abs_line_number()
2635 line, messages, blank_finish = self.line_block_line(match, lineno)
2636 self.parent += line
2637 self.parent.parent += messages
2638 self.blank_finish = blank_finish
2639 return [], next_state, []
2642 class Explicit(SpecializedBody):
2644 """Second and subsequent explicit markup construct."""
2646 def explicit_markup(self, match, context, next_state):
2647 """Footnotes, hyperlink targets, directives, comments."""
2648 nodelist, blank_finish = self.explicit_construct(match)
2649 self.parent += nodelist
2650 self.blank_finish = blank_finish
2651 return [], next_state, []
2653 def anonymous(self, match, context, next_state):
2654 """Anonymous hyperlink targets."""
2655 nodelist, blank_finish = self.anonymous_target(match)
2656 self.parent += nodelist
2657 self.blank_finish = blank_finish
2658 return [], next_state, []
2660 blank = SpecializedBody.invalid_input
2663 class SubstitutionDef(Body):
2665 """
2666 Parser for the contents of a substitution_definition element.
2667 """
2669 patterns = {
2670 'embedded_directive': re.compile(r'(%s)::( +|$)'
2671 % Inliner.simplename, re.UNICODE),
2672 'text': r''}
2673 initial_transitions = ['embedded_directive', 'text']
2675 def embedded_directive(self, match, context, next_state):
2676 nodelist, blank_finish = self.directive(match,
2677 alt=self.parent['names'][0])
2678 self.parent += nodelist
2679 if not self.state_machine.at_eof():
2680 self.blank_finish = blank_finish
2681 raise EOFError
2683 def text(self, match, context, next_state):
2684 if not self.state_machine.at_eof():
2685 self.blank_finish = self.state_machine.is_next_line_blank()
2686 raise EOFError
2689 class Text(RSTState):
2691 """
2692 Classifier of second line of a text block.
2694 Could be a paragraph, a definition list item, or a title.
2695 """
2697 patterns = {'underline': Body.patterns['line'],
2698 'text': r''}
2699 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2701 def blank(self, match, context, next_state):
2702 """End of paragraph."""
2703 # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2704 paragraph, literalnext = self.paragraph(
2705 context, self.state_machine.abs_line_number() - 1)
2706 self.parent += paragraph
2707 if literalnext:
2708 self.parent += self.literal_block()
2709 return [], 'Body', []
2711 def eof(self, context):
2712 if context:
2713 self.blank(None, context, None)
2714 return []
2716 def indent(self, match, context, next_state):
2717 """Definition list item."""
2718 definitionlist = nodes.definition_list()
2719 definitionlistitem, blank_finish = self.definition_list_item(context)
2720 definitionlist += definitionlistitem
2721 self.parent += definitionlist
2722 offset = self.state_machine.line_offset + 1 # next line
2723 newline_offset, blank_finish = self.nested_list_parse(
2724 self.state_machine.input_lines[offset:],
2725 input_offset=self.state_machine.abs_line_offset() + 1,
2726 node=definitionlist, initial_state='DefinitionList',
2727 blank_finish=blank_finish, blank_finish_state='Definition')
2728 self.goto_line(newline_offset)
2729 if not blank_finish:
2730 self.parent += self.unindent_warning('Definition list')
2731 return [], 'Body', []
2733 def underline(self, match, context, next_state):
2734 """Section title."""
2735 lineno = self.state_machine.abs_line_number()
2736 title = context[0].rstrip()
2737 underline = match.string.rstrip()
2738 source = title + '\n' + underline
2739 messages = []
2740 if column_width(title) > len(underline):
2741 if len(underline) < 4:
2742 if self.state_machine.match_titles:
2743 msg = self.reporter.info(
2744 'Possible title underline, too short for the title.\n'
2745 "Treating it as ordinary text because it's so short.",
2746 line=lineno)
2747 self.parent += msg
2748 raise statemachine.TransitionCorrection('text')
2749 else:
2750 blocktext = context[0] + '\n' + self.state_machine.line
2751 msg = self.reporter.warning('Title underline too short.',
2752 nodes.literal_block(blocktext, blocktext), line=lineno)
2753 messages.append(msg)
2754 if not self.state_machine.match_titles:
2755 blocktext = context[0] + '\n' + self.state_machine.line
2756 # We need get_source_and_line() here to report correctly
2757 src, srcline = self.state_machine.get_source_and_line()
2758 # TODO: why is abs_line_number() == srcline+1
2759 # if the error is in a table (try with test_tables.py)?
2760 # print "get_source_and_line", srcline
2761 # print "abs_line_number", self.state_machine.abs_line_number()
2762 msg = self.reporter.severe('Unexpected section title.',
2763 nodes.literal_block(blocktext, blocktext),
2764 source=src, line=srcline)
2765 self.parent += messages
2766 self.parent += msg
2767 return [], next_state, []
2768 style = underline[0]
2769 context[:] = []
2770 self.section(title, source, style, lineno - 1, messages)
2771 return [], next_state, []
2773 def text(self, match, context, next_state):
2774 """Paragraph."""
2775 startline = self.state_machine.abs_line_number() - 1
2776 msg = None
2777 try:
2778 block = self.state_machine.get_text_block(flush_left=True)
2779 except statemachine.UnexpectedIndentationError, err:
2780 block, src, srcline = err.args
2781 msg = self.reporter.error('Unexpected indentation.',
2782 source=src, line=srcline)
2783 lines = context + list(block)
2784 paragraph, literalnext = self.paragraph(lines, startline)
2785 self.parent += paragraph
2786 self.parent += msg
2787 if literalnext:
2788 try:
2789 self.state_machine.next_line()
2790 except EOFError:
2791 pass
2792 self.parent += self.literal_block()
2793 return [], next_state, []
2795 def literal_block(self):
2796 """Return a list of nodes."""
2797 indented, indent, offset, blank_finish = \
2798 self.state_machine.get_indented()
2799 while indented and not indented[-1].strip():
2800 indented.trim_end()
2801 if not indented:
2802 return self.quoted_literal_block()
2803 data = '\n'.join(indented)
2804 literal_block = nodes.literal_block(data, data)
2805 literal_block.line = offset + 1
2806 nodelist = [literal_block]
2807 if not blank_finish:
2808 nodelist.append(self.unindent_warning('Literal block'))
2809 return nodelist
2811 def quoted_literal_block(self):
2812 abs_line_offset = self.state_machine.abs_line_offset()
2813 offset = self.state_machine.line_offset
2814 parent_node = nodes.Element()
2815 new_abs_offset = self.nested_parse(
2816 self.state_machine.input_lines[offset:],
2817 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2818 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2819 'initial_state': 'QuotedLiteralBlock'})
2820 self.goto_line(new_abs_offset)
2821 return parent_node.children
2823 def definition_list_item(self, termline):
2824 indented, indent, line_offset, blank_finish = \
2825 self.state_machine.get_indented()
2826 itemnode = nodes.definition_list_item(
2827 '\n'.join(termline + list(indented)))
2828 lineno = self.state_machine.abs_line_number() - 1
2829 (itemnode.source,
2830 itemnode.line) = self.state_machine.get_source_and_line(lineno)
2831 termlist, messages = self.term(termline, lineno)
2832 itemnode += termlist
2833 definition = nodes.definition('', *messages)
2834 itemnode += definition
2835 if termline[0][-2:] == '::':
2836 definition += self.reporter.info(
2837 'Blank line missing before literal block (after the "::")? '
2838 'Interpreted as a definition list item.',
2839 line=lineno+1)
2840 self.nested_parse(indented, input_offset=line_offset, node=definition)
2841 return itemnode, blank_finish
2843 classifier_delimiter = re.compile(' +: +')
2845 def term(self, lines, lineno):
2846 """Return a definition_list's term and optional classifiers."""
2847 assert len(lines) == 1
2848 text_nodes, messages = self.inline_text(lines[0], lineno)
2849 term_node = nodes.term(lines[0])
2850 (term_node.source,
2851 term_node.line) = self.state_machine.get_source_and_line(lineno)
2852 node_list = [term_node]
2853 for i in range(len(text_nodes)):
2854 node = text_nodes[i]
2855 if isinstance(node, nodes.Text):
2856 parts = self.classifier_delimiter.split(node.rawsource)
2857 if len(parts) == 1:
2858 node_list[-1] += node
2859 else:
2860 rawtext = parts[0].rstrip()
2861 textnode = nodes.Text(utils.unescape_rawsource(rawtext))
2862 textnode.rawsource = rawtext
2863 node_list[-1] += textnode
2864 for part in parts[1:]:
2865 classifier_node = nodes.classifier(part,
2866 utils.unescape_rawsource(part))
2867 classifier_node[0].rawsource = part
2868 node_list.append(classifier_node)
2869 else:
2870 node_list[-1] += node
2871 return node_list, messages
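# Illustration (not part of the module): the classifier delimiter is
# " : " (space, colon, space), so the definition-list term line
#
#   term : classifier one : classifier two
#
# yields a `term` node plus two `classifier` nodes, while a colon without
# surrounding spaces (e.g. "http://example.org") stays inside the term.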
2874 class SpecializedText(Text):
2876 """
2877 Superclass for second and subsequent lines of Text-variants.
2879 All transition methods are disabled. Override individual methods in
2880 subclasses to re-enable.
2881 """
2883 def eof(self, context):
2884 """Incomplete construct."""
2885 return []
2887 def invalid_input(self, match=None, context=None, next_state=None):
2888 """Not a compound element member. Abort this state machine."""
2889 raise EOFError
2891 blank = invalid_input
2892 indent = invalid_input
2893 underline = invalid_input
2894 text = invalid_input
2897 class Definition(SpecializedText):
2899 """Second line of potential definition_list_item."""
2901 def eof(self, context):
2902 """Not a definition."""
2903 self.state_machine.previous_line(2) # so parent SM can reassess
2904 return []
2906 def indent(self, match, context, next_state):
2907 """Definition list item."""
2908 itemnode, blank_finish = self.definition_list_item(context)
2909 self.parent += itemnode
2910 self.blank_finish = blank_finish
2911 return [], 'DefinitionList', []
2914 class Line(SpecializedText):
2916 """
2917 Second line of over- & underlined section title or transition marker.
2918 """
2920 eofcheck = 1 # @@@ ???
2921 """Set to 0 while parsing sections, so that we don't catch the EOF."""
2923 def eof(self, context):
2924 """Transition marker at end of section or document."""
2925 marker = context[0].strip()
2926 if self.memo.section_bubble_up_kludge:
2927 self.memo.section_bubble_up_kludge = False
2928 elif len(marker) < 4:
2929 self.state_correction(context)
2930 if self.eofcheck: # ignore EOFError with sections
2931 lineno = self.state_machine.abs_line_number() - 1
2932 transition = nodes.transition(rawsource=context[0])
2933 transition.line = lineno
2934 self.parent += transition
2935 self.eofcheck = 1
2936 return []
2938 def blank(self, match, context, next_state):
2939 """Transition marker."""
2940 src, srcline = self.state_machine.get_source_and_line()
2941 marker = context[0].strip()
2942 if len(marker) < 4:
2943 self.state_correction(context)
2944 transition = nodes.transition(rawsource=marker)
2945 transition.source = src
2946 transition.line = srcline - 1
2947 self.parent += transition
2948 return [], 'Body', []
2950 def text(self, match, context, next_state):
2951 """Potential over- & underlined title."""
2952 lineno = self.state_machine.abs_line_number() - 1
2953 overline = context[0]
2954 title = match.string
2955 underline = ''
2956 try:
2957 underline = self.state_machine.next_line()
2958 except EOFError:
2959 blocktext = overline + '\n' + title
2960 if len(overline.rstrip()) < 4:
2961 self.short_overline(context, blocktext, lineno, 2)
2962 else:
2963 msg = self.reporter.severe(
2964 'Incomplete section title.',
2965 nodes.literal_block(blocktext, blocktext),
2966 line=lineno)
2967 self.parent += msg
2968 return [], 'Body', []
2969 source = '%s\n%s\n%s' % (overline, title, underline)
2970 overline = overline.rstrip()
2971 underline = underline.rstrip()
2972 if not self.transitions['underline'][0].match(underline):
2973 blocktext = overline + '\n' + title + '\n' + underline
2974 if len(overline.rstrip()) < 4:
2975 self.short_overline(context, blocktext, lineno, 2)
2976 else:
2977 msg = self.reporter.severe(
2978 'Missing matching underline for section title overline.',
2979 nodes.literal_block(source, source),
2980 line=lineno)
2981 self.parent += msg
2982 return [], 'Body', []
2983 elif overline != underline:
2984 blocktext = overline + '\n' + title + '\n' + underline
2985 if len(overline.rstrip()) < 4:
2986 self.short_overline(context, blocktext, lineno, 2)
2987 else:
2988 msg = self.reporter.severe(
2989 'Title overline & underline mismatch.',
2990 nodes.literal_block(source, source),
2991 line=lineno)
2992 self.parent += msg
2993 return [], 'Body', []
2994 title = title.rstrip()
2995 messages = []
2996 if column_width(title) > len(overline):
2997 blocktext = overline + '\n' + title + '\n' + underline
2998 if len(overline.rstrip()) < 4:
2999 self.short_overline(context, blocktext, lineno, 2)
3000 else:
3001 msg = self.reporter.warning(
3002 'Title overline too short.',
3003 nodes.literal_block(source, source),
3004 line=lineno)
3005 messages.append(msg)
3006 style = (overline[0], underline[0])
3007 self.eofcheck = 0 # @@@ not sure this is correct
3008 self.section(title.lstrip(), source, style, lineno + 1, messages)
3009 self.eofcheck = 1
3010 return [], 'Body', []
3012 indent = text # indented title
3014 def underline(self, match, context, next_state):
3015 overline = context[0]
3016 blocktext = overline + '\n' + self.state_machine.line
3017 lineno = self.state_machine.abs_line_number() - 1
3018 if len(overline.rstrip()) < 4:
3019 self.short_overline(context, blocktext, lineno, 1)
3020 msg = self.reporter.error(
3021 'Invalid section title or transition marker.',
3022 nodes.literal_block(blocktext, blocktext),
3023 line=lineno)
3024 self.parent += msg
3025 return [], 'Body', []
3027 def short_overline(self, context, blocktext, lineno, lines=1):
3028 msg = self.reporter.info(
3029 'Possible incomplete section title.\nTreating the overline as '
3030 "ordinary text because it's so short.",
3031 line=lineno)
3032 self.parent += msg
3033 self.state_correction(context, lines)
3035 def state_correction(self, context, lines=1):
3036 self.state_machine.previous_line(lines)
3037 context[:] = []
3038 raise statemachine.StateCorrection('Body', 'text')
3041 class QuotedLiteralBlock(RSTState):
3043 """
3044 Nested parse handler for quoted (unindented) literal blocks.
3046 Special-purpose. Not for inclusion in `state_classes`.
3047 """
3049 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
3050 'text': r''}
3051 initial_transitions = ('initial_quoted', 'text')
3053 def __init__(self, state_machine, debug=False):
3054 RSTState.__init__(self, state_machine, debug)
3055 self.messages = []
3056 self.initial_lineno = None
3058 def blank(self, match, context, next_state):
3059 if context:
3060 raise EOFError
3061 else:
3062 return context, next_state, []
3064 def eof(self, context):
3065 if context:
3066 src, srcline = self.state_machine.get_source_and_line(
3067 self.initial_lineno)
3068 text = '\n'.join(context)
3069 literal_block = nodes.literal_block(text, text)
3070 literal_block.source = src
3071 literal_block.line = srcline
3072 self.parent += literal_block
3073 else:
3074 self.parent += self.reporter.warning(
3075 'Literal block expected; none found.',
3076 line=self.state_machine.abs_line_number())
3077 # src not available, because statemachine.input_lines is empty
3078 self.state_machine.previous_line()
3079 self.parent += self.messages
3080 return []
3082 def indent(self, match, context, next_state):
3083 assert context, ('QuotedLiteralBlock.indent: context should not '
3084 'be empty!')
3085 self.messages.append(
3086 self.reporter.error('Unexpected indentation.',
3087 line=self.state_machine.abs_line_number()))
3088 self.state_machine.previous_line()
3089 raise EOFError
3091 def initial_quoted(self, match, context, next_state):
3092 """Match arbitrary quote character on the first line only."""
3093 self.remove_transition('initial_quoted')
3094 quote = match.string[0]
3095 pattern = re.compile(re.escape(quote), re.UNICODE)
3096 # New transition matches consistent quotes only:
3097 self.add_transition('quoted',
3098 (pattern, self.quoted, self.__class__.__name__))
3099 self.initial_lineno = self.state_machine.abs_line_number()
3100 return [match.string], next_state, []
3102 def quoted(self, match, context, next_state):
3103 """Match consistent quotes on subsequent lines."""
3104 context.append(match.string)
3105 return context, next_state, []
3107 def text(self, match, context, next_state):
3108 if context:
3109 self.messages.append(
3110 self.reporter.error('Inconsistent literal block quoting.',
3111 line=self.state_machine.abs_line_number()))
3112 self.state_machine.previous_line()
3113 raise EOFError
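# Illustration (not part of the module): a quoted literal block after "::",
#
#   ::
#
#   > line one
#   > line two
#
# The first non-blank line fixes the quote character (">"); `quoted` accepts
# only lines repeating it, `blank` ends the block, and other text or
# indentation is reported via the errors collected in `self.messages`.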
3116 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3117 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3118 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3119 """Standard set of State classes used to start `RSTStateMachine`."""