1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 This is the ``docutils.parsers.rst.states`` module, the core of
7 the reStructuredText parser. It defines the following:
9 :Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: An auxiliary collection class.
31 :Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
36 :Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
40 :Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
43 Parser Overview
44 ===============
46 The reStructuredText parser is implemented as a recursive state machine,
47 examining its input one line at a time. To understand how the parser works,
48 please first become familiar with the `docutils.statemachine` module. In the
49 description below, references are made to classes defined in this module;
50 please see the individual classes for details.
52 Parsing proceeds as follows:
54 1. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
59 2. The method associated with the matched transition pattern is called.
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
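A minimal usage sketch (illustrative; the exact client-side setup is not part
of this module): the state machine is normally driven through
`docutils.parsers.rst.Parser`, roughly as follows::

    import docutils.frontend
    import docutils.parsers.rst
    import docutils.utils

    parser = docutils.parsers.rst.Parser()
    settings = docutils.frontend.OptionParser(
        components=(docutils.parsers.rst.Parser,)).get_default_values()
    document = docutils.utils.new_document('<sketch>', settings)
    parser.parse('A paragraph with *emphasis*.\n', document)
    # `document` now holds the node tree built by `RSTStateMachine`.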
103 __docformat__ = 'reStructuredText'
106 import sys
107 import re
108 import roman
109 from types import FunctionType, MethodType
111 from docutils import nodes, statemachine, utils, urischemes
112 from docutils import ApplicationError, DataError
113 from docutils.statemachine import StateMachineWS, StateWS
114 from docutils.nodes import fully_normalize_name as normalize_name
115 from docutils.nodes import whitespace_normalize_name
116 import docutils.parsers.rst
117 from docutils.parsers.rst import directives, languages, tableparser, roles
118 from docutils.parsers.rst.languages import en as _fallback_language_module
119 from docutils.utils import escape2null, unescape, column_width
120 from docutils.utils import punctuation_chars
122 class MarkupError(DataError): pass
123 class UnknownInterpretedRoleError(DataError): pass
124 class InterpretedRoleNotImplementedError(DataError): pass
125 class ParserError(ApplicationError): pass
126 class MarkupMismatch(Exception): pass
129 class Struct:
131 """Stores data attributes for dotted-attribute access."""
133 def __init__(self, **keywordargs):
134 self.__dict__.update(keywordargs)
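# Minimal illustration (not from the original source): `Struct` just turns
# keyword arguments into attributes, e.g.
#
#     >>> memo = Struct(section_level=0, title_styles=[])
#     >>> memo.section_level
#     0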
137 class RSTStateMachine(StateMachineWS):
140 reStructuredText's master StateMachine.
142 The entry point to reStructuredText parsing is the `run()` method.
145 def run(self, input_lines, document, input_offset=0, match_titles=1,
146 inliner=None):
148 Parse `input_lines` and modify the `document` node in place.
150 Extend `StateMachineWS.run()`: set up parse-global data and
151 run the StateMachine.
153 self.language = languages.get_language(
154 document.settings.language_code)
155 self.match_titles = match_titles
156 if inliner is None:
157 inliner = Inliner()
158 inliner.init_customizations(document.settings)
159 self.memo = Struct(document=document,
160 reporter=document.reporter,
161 language=self.language,
162 title_styles=[],
163 section_level=0,
164 section_bubble_up_kludge=0,
165 inliner=inliner)
166 self.document = document
167 self.attach_observer(document.note_source)
168 self.reporter = self.memo.reporter
169 self.node = document
170 results = StateMachineWS.run(self, input_lines, input_offset,
171 input_source=document['source'])
172 assert results == [], 'RSTStateMachine.run() results should be empty!'
173 self.node = self.memo = None # remove unneeded references
176 class NestedStateMachine(StateMachineWS):
179 StateMachine run from within other StateMachine runs, to parse nested
180 document structures.
183 def run(self, input_lines, input_offset, memo, node, match_titles=1):
185 Parse `input_lines` and populate a `docutils.nodes.document` instance.
187 Extend `StateMachineWS.run()`: set up document-wide data.
189 self.match_titles = match_titles
190 self.memo = memo
191 self.document = memo.document
192 self.attach_observer(self.document.note_source)
193 self.reporter = memo.reporter
194 self.language = memo.language
195 self.node = node
196 results = StateMachineWS.run(self, input_lines, input_offset)
197 assert results == [], ('NestedStateMachine.run() results should be '
198 'empty!')
199 return results
202 class RSTState(StateWS):
205 reStructuredText State superclass.
207 Contains methods used by all State subclasses.
210 nested_sm = NestedStateMachine
211 nested_sm_cache = []
213 def __init__(self, state_machine, debug=0):
214 self.nested_sm_kwargs = {'state_classes': state_classes,
215 'initial_state': 'Body'}
216 StateWS.__init__(self, state_machine, debug)
218 def runtime_init(self):
219 StateWS.runtime_init(self)
220 memo = self.state_machine.memo
221 self.memo = memo
222 self.reporter = memo.reporter
223 self.inliner = memo.inliner
224 self.document = memo.document
225 self.parent = self.state_machine.node
226 # enable the reporter to determine source and source-line
227 if not hasattr(self.reporter, 'locator'):
228 self.reporter.locator = self.state_machine.get_source_and_line
229 # print "adding locator to reporter", self.state_machine.input_offset
232 def goto_line(self, abs_line_offset):
234 Jump to input line `abs_line_offset`, ignoring jumps past the end.
236 try:
237 self.state_machine.goto_line(abs_line_offset)
238 except EOFError:
239 pass
241 def no_match(self, context, transitions):
243 Override `StateWS.no_match` to generate a system message.
245 This code should never be run.
247 src, srcline = self.state_machine.get_source_and_line()
248 self.reporter.severe(
249 'Internal error: no transition pattern match. State: "%s"; '
250 'transitions: %s; context: %s; current line: %r.'
251 % (self.__class__.__name__, transitions, context,
252 self.state_machine.line),
253 source=src, line=srcline)
254 return context, None, []
256 def bof(self, context):
257 """Called at beginning of file."""
258 return [], []
260 def nested_parse(self, block, input_offset, node, match_titles=0,
261 state_machine_class=None, state_machine_kwargs=None):
263 Create a new StateMachine rooted at `node` and run it over the input
264 `block`.
266 use_default = 0
267 if state_machine_class is None:
268 state_machine_class = self.nested_sm
269 use_default += 1
270 if state_machine_kwargs is None:
271 state_machine_kwargs = self.nested_sm_kwargs
272 use_default += 1
273 block_length = len(block)
275 state_machine = None
276 if use_default == 2:
277 try:
278 state_machine = self.nested_sm_cache.pop()
279 except IndexError:
280 pass
281 if not state_machine:
282 state_machine = state_machine_class(debug=self.debug,
283 **state_machine_kwargs)
284 state_machine.run(block, input_offset, memo=self.memo,
285 node=node, match_titles=match_titles)
286 if use_default == 2:
287 self.nested_sm_cache.append(state_machine)
288 else:
289 state_machine.unlink()
290 new_offset = state_machine.abs_line_offset()
291 # No `block.parent` implies disconnected -- lines aren't in sync:
292 if block.parent and (len(block) - block_length) != 0:
293 # Adjustment for block if modified in nested parse:
294 self.state_machine.next_line(len(block) - block_length)
295 return new_offset
297 def nested_list_parse(self, block, input_offset, node, initial_state,
298 blank_finish,
299 blank_finish_state=None,
300 extra_settings={},
301 match_titles=0,
302 state_machine_class=None,
303 state_machine_kwargs=None):
305 Create a new StateMachine rooted at `node` and run it over the input
306 `block`. Also keep track of optional intermediate blank lines and the
307 required final one.
309 if state_machine_class is None:
310 state_machine_class = self.nested_sm
311 if state_machine_kwargs is None:
312 state_machine_kwargs = self.nested_sm_kwargs.copy()
313 state_machine_kwargs['initial_state'] = initial_state
314 state_machine = state_machine_class(debug=self.debug,
315 **state_machine_kwargs)
316 if blank_finish_state is None:
317 blank_finish_state = initial_state
318 state_machine.states[blank_finish_state].blank_finish = blank_finish
319 for key, value in extra_settings.items():
320 setattr(state_machine.states[initial_state], key, value)
321 state_machine.run(block, input_offset, memo=self.memo,
322 node=node, match_titles=match_titles)
323 blank_finish = state_machine.states[blank_finish_state].blank_finish
324 state_machine.unlink()
325 return state_machine.abs_line_offset(), blank_finish
327 def section(self, title, source, style, lineno, messages):
328 """Check for a valid subsection and create one if it checks out."""
329 if self.check_subsection(source, style, lineno):
330 self.new_subsection(title, lineno, messages)
332 def check_subsection(self, source, style, lineno):
334 Check for a valid subsection header. Return 1 (true) or None (false).
336 When a new section is reached that isn't a subsection of the current
337 section, back up the line count (use ``previous_line(-x)``), then
338 ``raise EOFError``. The current StateMachine will finish, then the
339 calling StateMachine can re-examine the title. This will work its way
340 back up the calling chain until the correct section level is reached.
342 @@@ Alternative: Evaluate the title, store the title info & level, and
343 back up the chain until that level is reached. Store in memo? Or
344 return in results?
346 :Exception: `EOFError` when a sibling or supersection encountered.
348 memo = self.memo
349 title_styles = memo.title_styles
350 mylevel = memo.section_level
351 try: # check for existing title style
352 level = title_styles.index(style) + 1
353 except ValueError: # new title style
354 if len(title_styles) == memo.section_level: # new subsection
355 title_styles.append(style)
356 return 1
357 else: # not at lowest level
358 self.parent += self.title_inconsistent(source, lineno)
359 return None
360 if level <= mylevel: # sibling or supersection
361 memo.section_level = level # bubble up to parent section
362 if len(style) == 2:
363 memo.section_bubble_up_kludge = 1
364 # back up 2 lines for underline title, 3 for overline title
365 self.state_machine.previous_line(len(style) + 1)
366 raise EOFError # let parent section re-evaluate
367 if level == mylevel + 1: # immediate subsection
368 return 1
369 else: # invalid subsection
370 self.parent += self.title_inconsistent(source, lineno)
371 return None
373 def title_inconsistent(self, sourcetext, lineno):
374 src, srcline = self.state_machine.get_source_and_line(lineno)
375 error = self.reporter.severe(
376 'Title level inconsistent:', nodes.literal_block('', sourcetext),
377 source=src, line=srcline)
378 return error
380 def new_subsection(self, title, lineno, messages):
381 """Append new subsection to document tree. On return, check level."""
382 memo = self.memo
383 mylevel = memo.section_level
384 memo.section_level += 1
385 section_node = nodes.section()
386 self.parent += section_node
387 textnodes, title_messages = self.inline_text(title, lineno)
388 titlenode = nodes.title(title, '', *textnodes)
389 name = normalize_name(titlenode.astext())
390 section_node['names'].append(name)
391 section_node += titlenode
392 section_node += messages
393 section_node += title_messages
394 self.document.note_implicit_target(section_node, section_node)
395 offset = self.state_machine.line_offset + 1
396 absoffset = self.state_machine.abs_line_offset() + 1
397 newabsoffset = self.nested_parse(
398 self.state_machine.input_lines[offset:], input_offset=absoffset,
399 node=section_node, match_titles=1)
400 self.goto_line(newabsoffset)
401 if memo.section_level <= mylevel: # can't handle next section?
402 raise EOFError # bubble up to supersection
403 # reset section_level; next pass will detect it properly
404 memo.section_level = mylevel
406 def paragraph(self, lines, lineno):
408 Return a list (paragraph & messages) & a boolean: literal_block next?
410 data = '\n'.join(lines).rstrip()
411 if re.search(r'(?<!\\)(\\\\)*::$', data):
412 if len(data) == 2:
413 return [], 1
414 elif data[-3] in ' \n':
415 text = data[:-3].rstrip()
416 else:
417 text = data[:-1]
418 literalnext = 1
419 else:
420 text = data
421 literalnext = 0
422 textnodes, messages = self.inline_text(text, lineno)
423 p = nodes.paragraph(data, '', *textnodes)
424 p.source, p.line = self.state_machine.get_source_and_line(lineno)
425 return [p] + messages, literalnext
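    # Illustrative behaviour of the "::" handling above (a sketch, not part
    # of the original source): "Example::" yields paragraph text "Example:"
    # with literalnext=1; "Example ::" yields "Example"; a lone "::" line
    # returns ([], 1), so no paragraph node is emitted at all.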
427 def inline_text(self, text, lineno):
429 Return 2 lists: nodes (text and inline elements), and system_messages.
431 return self.inliner.parse(text, lineno, self.memo, self.parent)
433 def unindent_warning(self, node_name):
434 # the actual problem is one line below the current line
435 src, srcline = self.state_machine.get_source_and_line()
436 return self.reporter.warning('%s ends without a blank line; '
437 'unexpected unindent.' % node_name,
438 source=src, line=srcline+1)
441 def build_regexp(definition, compile=1):
443 Build, compile and return a regular expression based on `definition`.
445 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
446 where "parts" is a list of regular expressions and/or regular
447 expression definitions to be joined into an or-group.
449 name, prefix, suffix, parts = definition
450 part_strings = []
451 for part in parts:
452 if type(part) is tuple:
453 part_strings.append(build_regexp(part, None))
454 else:
455 part_strings.append(part)
456 or_group = '|'.join(part_strings)
457 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
458 if compile:
459 return re.compile(regexp, re.UNICODE)
460 else:
461 return regexp
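# Illustrative example (not part of the original source):
#
#     >>> build_regexp(('fruit', '<', '>', ['apple', 'pear']), compile=0)
#     '<(?P<fruit>apple|pear)>'
#
# Tuples nested in the parts list are expanded recursively, which is how the
# large `Inliner.parts` definition below becomes a single initial pattern.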
464 class Inliner:
467 Parse inline markup; call the `parse()` method.
470 def __init__(self):
471 self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),]
472 """List of (pattern, bound method) tuples, used by
473 `self.implicit_inline`."""
475 def init_customizations(self, settings):
476 """Setting-based customizations; run when parsing begins."""
477 if settings.pep_references:
478 self.implicit_dispatch.append((self.patterns.pep,
479 self.pep_reference))
480 if settings.rfc_references:
481 self.implicit_dispatch.append((self.patterns.rfc,
482 self.rfc_reference))
484 def parse(self, text, lineno, memo, parent):
485 # Needs to be refactored for nested inline markup.
486 # Add nested_parse() method?
488 Return 2 lists: nodes (text and inline elements), and system_messages.
490 Using `self.patterns.initial`, a pattern which matches start-strings
491 (emphasis, strong, interpreted, phrase reference, literal,
492 substitution reference, and inline target) and complete constructs
493 (simple reference, footnote reference), search for a candidate. When
494 one is found, check for validity (e.g., not a quoted '*' character).
495 If valid, search for the corresponding end string if applicable, and
496 check it for validity. If not found or invalid, generate a warning
497 and ignore the start-string. Implicit inline markup (e.g. standalone
498 URIs) is found last.
500 self.reporter = memo.reporter
501 self.document = memo.document
502 self.language = memo.language
503 self.parent = parent
504 pattern_search = self.patterns.initial.search
505 dispatch = self.dispatch
506 remaining = escape2null(text)
507 processed = []
508 unprocessed = []
509 messages = []
510 while remaining:
511 match = pattern_search(remaining)
512 if match:
513 groups = match.groupdict()
514 method = dispatch[groups['start'] or groups['backquote']
515 or groups['refend'] or groups['fnend']]
516 before, inlines, remaining, sysmessages = method(self, match,
517 lineno)
518 unprocessed.append(before)
519 messages += sysmessages
520 if inlines:
521 processed += self.implicit_inline(''.join(unprocessed),
522 lineno)
523 processed += inlines
524 unprocessed = []
525 else:
526 break
527 remaining = ''.join(unprocessed) + remaining
528 if remaining:
529 processed += self.implicit_inline(remaining, lineno)
530 return processed, messages
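    # Rough illustration of the loop above (not from the original source):
    # for the input "plain *emphasized* text", `patterns.initial` matches the
    # "*" start-string, `emphasis()` consumes "*emphasized*", and `parse()`
    # returns [Text('plain '), <emphasis node>, Text(' text')] together with
    # an empty system_message list.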
532 # Inline object recognition
533 # -------------------------
534 # lookahead and look-behind expressions for inline markup rules
535 start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' %
536 (punctuation_chars.openers,
537 punctuation_chars.delimiters))
538 end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' %
539 (punctuation_chars.closing_delimiters,
540 punctuation_chars.delimiters,
541 punctuation_chars.closers))
542 # print start_string_prefix.encode('utf8')
543 # TODO: support non-ASCII whitespace in the following 4 patterns?
544 non_whitespace_before = r'(?<![ \n])'
545 non_whitespace_escape_before = r'(?<![ \n\x00])'
546 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[ \n\x00])'
547 non_whitespace_after = r'(?![ \n])'
548 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
549 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
550 # Valid URI characters (see RFC 2396 & RFC 2732);
551 # final \x00 allows backslash escapes in URIs:
552 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
553 # Delimiter indicating the end of a URI (not part of the URI):
554 uri_end_delim = r"""[>]"""
555 # Last URI character; same as uric but no punctuation:
556 urilast = r"""[_~*/=+a-zA-Z0-9]"""
557 # End of a URI (either 'urilast' or 'uric followed by a
558 # uri_end_delim'):
559 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
560 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
561 email_pattern = r"""
562 %(emailc)s+(?:\.%(emailc)s+)* # name
563 (?<!\x00)@ # at
564 %(emailc)s+(?:\.%(emailc)s*)* # host
565 %(uri_end)s # final URI char
567 parts = ('initial_inline', start_string_prefix, '',
568 [('start', '', non_whitespace_after, # simple start-strings
569 [r'\*\*', # strong
570 r'\*(?!\*)', # emphasis but not strong
571 r'``', # literal
572 r'_`', # inline internal target
573 r'\|(?!\|)'] # substitution reference
575 ('whole', '', end_string_suffix, # whole constructs
576 [# reference name & end-string
577 r'(?P<refname>%s)(?P<refend>__?)' % simplename,
578 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
579 [r'[0-9]+', # manually numbered
580 r'\#(%s)?' % simplename, # auto-numbered (w/ label?)
581 r'\*', # auto-symbol
582 r'(?P<citationlabel>%s)' % simplename] # citation reference
586 ('backquote', # interpreted text or phrase reference
587 '(?P<role>(:%s:)?)' % simplename, # optional role
588 non_whitespace_after,
589 ['`(?!`)'] # but not literal
593 patterns = Struct(
594 initial=build_regexp(parts),
595 emphasis=re.compile(non_whitespace_escape_before
596 + r'(\*)' + end_string_suffix, re.UNICODE),
597 strong=re.compile(non_whitespace_escape_before
598 + r'(\*\*)' + end_string_suffix, re.UNICODE),
599 interpreted_or_phrase_ref=re.compile(
600 r"""
601 %(non_unescaped_whitespace_escape_before)s
604 (?P<suffix>
605 (?P<role>:%(simplename)s:)?
606 (?P<refend>__?)?
609 %(end_string_suffix)s
610 """ % locals(), re.VERBOSE | re.UNICODE),
611 embedded_uri=re.compile(
612 r"""
614 (?:[ \n]+|^) # spaces or beginning of line/string
615 < # open bracket
616 %(non_whitespace_after)s
617 ([^<>\x00]+) # anything but angle brackets & nulls
618 %(non_whitespace_before)s
619 > # close bracket w/o whitespace before
621 $ # end of string
622 """ % locals(), re.VERBOSE | re.UNICODE),
623 literal=re.compile(non_whitespace_before + '(``)'
624 + end_string_suffix),
625 target=re.compile(non_whitespace_escape_before
626 + r'(`)' + end_string_suffix),
627 substitution_ref=re.compile(non_whitespace_escape_before
628 + r'(\|_{0,2})'
629 + end_string_suffix),
630 email=re.compile(email_pattern % locals() + '$',
631 re.VERBOSE | re.UNICODE),
632 uri=re.compile(
633 (r"""
634 %(start_string_prefix)s
635 (?P<whole>
636 (?P<absolute> # absolute URI
637 (?P<scheme> # scheme (http, ftp, mailto)
638 [a-zA-Z][a-zA-Z0-9.+-]*
642 ( # either:
643 (//?)? # hierarchical URI
644 %(uric)s* # URI characters
645 %(uri_end)s # final URI char
647 ( # optional query
648 \?%(uric)s*
649 %(uri_end)s
651 ( # optional fragment
652 \#%(uric)s*
653 %(uri_end)s
657 | # *OR*
658 (?P<email> # email address
659 """ + email_pattern + r"""
662 %(end_string_suffix)s
663 """) % locals(), re.VERBOSE | re.UNICODE),
664 pep=re.compile(
665 r"""
666 %(start_string_prefix)s
668 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
670 (PEP\s+(?P<pepnum2>\d+)) # reference by name
672 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE),
673 rfc=re.compile(
674 r"""
675 %(start_string_prefix)s
676 (RFC(-|\s+)?(?P<rfcnum>\d+))
677 %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE))
679 def quoted_start(self, match):
680 """Test if inline markup start-string is 'quoted'.
682 'Quoted' in this context means the start-string is enclosed in a pair
683 of matching opening/closing delimiters (not necessarily quotes)
684 or at the end of the match.
686 string = match.string
687 start = match.start()
688 if start == 0: # start-string at beginning of text
689 return False
690 prestart = string[start - 1]
691 try:
692 poststart = string[match.end()]
693 except IndexError: # start-string at end of text
694 return True # not "quoted" but no markup start-string either
695 return punctuation_chars.match_chars(prestart, poststart)
697 def inline_obj(self, match, lineno, end_pattern, nodeclass,
698 restore_backslashes=0):
699 string = match.string
700 matchstart = match.start('start')
701 matchend = match.end('start')
702 if self.quoted_start(match):
703 return (string[:matchend], [], string[matchend:], [], '')
704 endmatch = end_pattern.search(string[matchend:])
705 if endmatch and endmatch.start(1): # 1 or more chars
706 text = unescape(endmatch.string[:endmatch.start(1)],
707 restore_backslashes)
708 textend = matchend + endmatch.end(1)
709 rawsource = unescape(string[matchstart:textend], 1)
710 return (string[:matchstart], [nodeclass(rawsource, text)],
711 string[textend:], [], endmatch.group(1))
712 msg = self.reporter.warning(
713 'Inline %s start-string without end-string.'
714 % nodeclass.__name__, line=lineno)
715 text = unescape(string[matchstart:matchend], 1)
716 rawsource = unescape(string[matchstart:matchend], 1)
717 prb = self.problematic(text, rawsource, msg)
718 return string[:matchstart], [prb], string[matchend:], [msg], ''
720 def problematic(self, text, rawsource, message):
721 msgid = self.document.set_id(message, self.parent)
722 problematic = nodes.problematic(rawsource, text, refid=msgid)
723 prbid = self.document.set_id(problematic)
724 message.add_backref(prbid)
725 return problematic
727 def emphasis(self, match, lineno):
728 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
729 match, lineno, self.patterns.emphasis, nodes.emphasis)
730 return before, inlines, remaining, sysmessages
732 def strong(self, match, lineno):
733 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
734 match, lineno, self.patterns.strong, nodes.strong)
735 return before, inlines, remaining, sysmessages
737 def interpreted_or_phrase_ref(self, match, lineno):
738 end_pattern = self.patterns.interpreted_or_phrase_ref
739 string = match.string
740 matchstart = match.start('backquote')
741 matchend = match.end('backquote')
742 rolestart = match.start('role')
743 role = match.group('role')
744 position = ''
745 if role:
746 role = role[1:-1]
747 position = 'prefix'
748 elif self.quoted_start(match):
749 return (string[:matchend], [], string[matchend:], [])
750 endmatch = end_pattern.search(string[matchend:])
751 if endmatch and endmatch.start(1): # 1 or more chars
752 textend = matchend + endmatch.end()
753 if endmatch.group('role'):
754 if role:
755 msg = self.reporter.warning(
756 'Multiple roles in interpreted text (both '
757 'prefix and suffix present; only one allowed).',
758 line=lineno)
759 text = unescape(string[rolestart:textend], 1)
760 prb = self.problematic(text, text, msg)
761 return string[:rolestart], [prb], string[textend:], [msg]
762 role = endmatch.group('suffix')[1:-1]
763 position = 'suffix'
764 escaped = endmatch.string[:endmatch.start(1)]
765 rawsource = unescape(string[matchstart:textend], 1)
766 if rawsource[-1:] == '_':
767 if role:
768 msg = self.reporter.warning(
769 'Mismatch: both interpreted text role %s and '
770 'reference suffix.' % position, line=lineno)
771 text = unescape(string[rolestart:textend], 1)
772 prb = self.problematic(text, text, msg)
773 return string[:rolestart], [prb], string[textend:], [msg]
774 return self.phrase_ref(string[:matchstart], string[textend:],
775 rawsource, escaped, unescape(escaped))
776 else:
777 rawsource = unescape(string[rolestart:textend], 1)
778 nodelist, messages = self.interpreted(rawsource, escaped, role,
779 lineno)
780 return (string[:rolestart], nodelist,
781 string[textend:], messages)
782 msg = self.reporter.warning(
783 'Inline interpreted text or phrase reference start-string '
784 'without end-string.', line=lineno)
785 text = unescape(string[matchstart:matchend], 1)
786 prb = self.problematic(text, text, msg)
787 return string[:matchstart], [prb], string[matchend:], [msg]
789 def phrase_ref(self, before, after, rawsource, escaped, text):
790 match = self.patterns.embedded_uri.search(escaped)
791 if match:
792 text = unescape(escaped[:match.start(0)])
793 uri_text = match.group(2)
794 uri = ''.join(uri_text.split())
795 uri = self.adjust_uri(uri)
796 if uri:
797 target = nodes.target(match.group(1), refuri=uri)
798 else:
799 raise ApplicationError('problem with URI: %r' % uri_text)
800 if not text:
801 text = uri
802 else:
803 target = None
804 refname = normalize_name(text)
805 reference = nodes.reference(rawsource, text,
806 name=whitespace_normalize_name(text))
807 node_list = [reference]
808 if rawsource[-2:] == '__':
809 if target:
810 reference['refuri'] = uri
811 else:
812 reference['anonymous'] = 1
813 else:
814 if target:
815 reference['refuri'] = uri
816 target['names'].append(refname)
817 self.document.note_explicit_target(target, self.parent)
818 node_list.append(target)
819 else:
820 reference['refname'] = refname
821 self.document.note_refname(reference)
822 return before, node_list, after, []
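    # Example outcome (a sketch, not from the original source): the source
    # "`Docutils <http://docutils.sourceforge.net/>`_" produces a reference
    # node with refuri set plus a separate target node named "docutils";
    # with a trailing "__" only the reference (with refuri) is produced and
    # no named target node is added.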
824 def adjust_uri(self, uri):
825 match = self.patterns.email.match(uri)
826 if match:
827 return 'mailto:' + uri
828 else:
829 return uri
831 def interpreted(self, rawsource, text, role, lineno):
832 role_fn, messages = roles.role(role, self.language, lineno,
833 self.reporter)
834 if role_fn:
835 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
836 return nodes, messages + messages2
837 else:
838 msg = self.reporter.error(
839 'Unknown interpreted text role "%s".' % role,
840 line=lineno)
841 return ([self.problematic(rawsource, rawsource, msg)],
842 messages + [msg])
844 def literal(self, match, lineno):
845 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
846 match, lineno, self.patterns.literal, nodes.literal,
847 restore_backslashes=1)
848 return before, inlines, remaining, sysmessages
850 def inline_internal_target(self, match, lineno):
851 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
852 match, lineno, self.patterns.target, nodes.target)
853 if inlines and isinstance(inlines[0], nodes.target):
854 assert len(inlines) == 1
855 target = inlines[0]
856 name = normalize_name(target.astext())
857 target['names'].append(name)
858 self.document.note_explicit_target(target, self.parent)
859 return before, inlines, remaining, sysmessages
861 def substitution_reference(self, match, lineno):
862 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
863 match, lineno, self.patterns.substitution_ref,
864 nodes.substitution_reference)
865 if len(inlines) == 1:
866 subref_node = inlines[0]
867 if isinstance(subref_node, nodes.substitution_reference):
868 subref_text = subref_node.astext()
869 self.document.note_substitution_ref(subref_node, subref_text)
870 if endstring[-1:] == '_':
871 reference_node = nodes.reference(
872 '|%s%s' % (subref_text, endstring), '')
873 if endstring[-2:] == '__':
874 reference_node['anonymous'] = 1
875 else:
876 reference_node['refname'] = normalize_name(subref_text)
877 self.document.note_refname(reference_node)
878 reference_node += subref_node
879 inlines = [reference_node]
880 return before, inlines, remaining, sysmessages
882 def footnote_reference(self, match, lineno):
884 Handles `nodes.footnote_reference` and `nodes.citation_reference`
885 elements.
887 label = match.group('footnotelabel')
888 refname = normalize_name(label)
889 string = match.string
890 before = string[:match.start('whole')]
891 remaining = string[match.end('whole'):]
892 if match.group('citationlabel'):
893 refnode = nodes.citation_reference('[%s]_' % label,
894 refname=refname)
895 refnode += nodes.Text(label)
896 self.document.note_citation_ref(refnode)
897 else:
898 refnode = nodes.footnote_reference('[%s]_' % label)
899 if refname[0] == '#':
900 refname = refname[1:]
901 refnode['auto'] = 1
902 self.document.note_autofootnote_ref(refnode)
903 elif refname == '*':
904 refname = ''
905 refnode['auto'] = '*'
906 self.document.note_symbol_footnote_ref(
907 refnode)
908 else:
909 refnode += nodes.Text(label)
910 if refname:
911 refnode['refname'] = refname
912 self.document.note_footnote_ref(refnode)
913 if utils.get_trim_footnote_ref_space(self.document.settings):
914 before = before.rstrip()
915 return (before, [refnode], remaining, [])
917 def reference(self, match, lineno, anonymous=None):
918 referencename = match.group('refname')
919 refname = normalize_name(referencename)
920 referencenode = nodes.reference(
921 referencename + match.group('refend'), referencename,
922 name=whitespace_normalize_name(referencename))
923 if anonymous:
924 referencenode['anonymous'] = 1
925 else:
926 referencenode['refname'] = refname
927 self.document.note_refname(referencenode)
928 string = match.string
929 matchstart = match.start('whole')
930 matchend = match.end('whole')
931 return (string[:matchstart], [referencenode], string[matchend:], [])
933 def anonymous_reference(self, match, lineno):
934 return self.reference(match, lineno, anonymous=1)
936 def standalone_uri(self, match, lineno):
937 if (not match.group('scheme')
938 or match.group('scheme').lower() in urischemes.schemes):
939 if match.group('email'):
940 addscheme = 'mailto:'
941 else:
942 addscheme = ''
943 text = match.group('whole')
944 unescaped = unescape(text, 0)
945 return [nodes.reference(unescape(text, 1), unescaped,
946 refuri=addscheme + unescaped)]
947 else: # not a valid scheme
948 raise MarkupMismatch
950 def pep_reference(self, match, lineno):
951 text = match.group(0)
952 if text.startswith('pep-'):
953 pepnum = int(match.group('pepnum1'))
954 elif text.startswith('PEP'):
955 pepnum = int(match.group('pepnum2'))
956 else:
957 raise MarkupMismatch
958 ref = (self.document.settings.pep_base_url
959 + self.document.settings.pep_file_url_template % pepnum)
960 unescaped = unescape(text, 0)
961 return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
963 rfc_url = 'rfc%d.html'
965 def rfc_reference(self, match, lineno):
966 text = match.group(0)
967 if text.startswith('RFC'):
968 rfcnum = int(match.group('rfcnum'))
969 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
970 else:
971 raise MarkupMismatch
972 unescaped = unescape(text, 0)
973 return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
975 def implicit_inline(self, text, lineno):
977 Check each of the patterns in `self.implicit_dispatch` for a match,
978 and dispatch to the stored method for the pattern. Recursively check
979 the text before and after the match. Return a list of `nodes.Text`
980 and inline element nodes.
982 if not text:
983 return []
984 for pattern, method in self.implicit_dispatch:
985 match = pattern.search(text)
986 if match:
987 try:
988 # Must recurse on strings before *and* after the match;
989 # there may be multiple patterns.
990 return (self.implicit_inline(text[:match.start()], lineno)
991 + method(match, lineno) +
992 self.implicit_inline(text[match.end():], lineno))
993 except MarkupMismatch:
994 pass
995 return [nodes.Text(unescape(text), rawsource=unescape(text, 1))]
997 dispatch = {'*': emphasis,
998 '**': strong,
999 '`': interpreted_or_phrase_ref,
1000 '``': literal,
1001 '_`': inline_internal_target,
1002 ']_': footnote_reference,
1003 '|': substitution_reference,
1004 '_': reference,
1005 '__': anonymous_reference}
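# The dispatch table above maps each recognized start-string to its handler,
# e.g. (illustrative) '**' dispatches to `strong()`, so "**bold**" becomes a
# `nodes.strong` element, and ']_' dispatches to `footnote_reference()` for
# constructs like "[1]_".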
1008 def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1009 return ord(s) - _zero
1011 def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1012 return ord(s) - _zero
1014 def _lowerroman_to_int(s):
1015 return roman.fromRoman(s.upper())
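# Illustrative values (not part of the original source):
#
#     >>> _loweralpha_to_int('c'), _upperalpha_to_int('B'), _lowerroman_to_int('iv')
#     (3, 2, 4)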
1018 class Body(RSTState):
1021 Generic classifier of the first line of a block.
1024 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1025 """Padding character for East Asian double-width text."""
1027 enum = Struct()
1028 """Enumerated list parsing information."""
1030 enum.formatinfo = {
1031 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1032 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1033 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1034 enum.formats = enum.formatinfo.keys()
1035 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1036 'lowerroman', 'upperroman'] # ORDERED!
1037 enum.sequencepats = {'arabic': '[0-9]+',
1038 'loweralpha': '[a-z]',
1039 'upperalpha': '[A-Z]',
1040 'lowerroman': '[ivxlcdm]+',
1041 'upperroman': '[IVXLCDM]+',}
1042 enum.converters = {'arabic': int,
1043 'loweralpha': _loweralpha_to_int,
1044 'upperalpha': _upperalpha_to_int,
1045 'lowerroman': _lowerroman_to_int,
1046 'upperroman': roman.fromRoman}
1048 enum.sequenceregexps = {}
1049 for sequence in enum.sequences:
1050 enum.sequenceregexps[sequence] = re.compile(
1051 enum.sequencepats[sequence] + '$', re.UNICODE)
1053 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1054 """Matches the top (& bottom) of a full table)."""
1056 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1057 """Matches the top of a simple table."""
1059 simple_table_border_pat = re.compile('=+[ =]*$')
1060 """Matches the bottom & header bottom of a simple table."""
1062 pats = {}
1063 """Fragments of patterns used by transitions."""
1065 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1066 pats['alpha'] = '[a-zA-Z]'
1067 pats['alphanum'] = '[a-zA-Z0-9]'
1068 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1069 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1070 '|%(upperroman)s|#)' % enum.sequencepats)
1071 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1072 # @@@ Loosen up the pattern? Allow Unicode?
1073 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1074 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1075 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1076 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1078 for format in enum.formats:
1079 pats[format] = '(?P<%s>%s%s%s)' % (
1080 format, re.escape(enum.formatinfo[format].prefix),
1081 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1083 patterns = {
1084 'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
1085 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1086 'field_marker': r':(?![: ])([^:\\]|\\.)*(?<! ):( +|$)',
1087 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1088 'doctest': r'>>>( +|$)',
1089 'line_block': r'\|( +|$)',
1090 'grid_table_top': grid_table_top_pat,
1091 'simple_table_top': simple_table_top_pat,
1092 'explicit_markup': r'\.\.( +|$)',
1093 'anonymous': r'__( +|$)',
1094 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1095 'text': r''}
1096 initial_transitions = (
1097 'bullet',
1098 'enumerator',
1099 'field_marker',
1100 'option_marker',
1101 'doctest',
1102 'line_block',
1103 'grid_table_top',
1104 'simple_table_top',
1105 'explicit_markup',
1106 'anonymous',
1107 'line',
1108 'text')
1110 def indent(self, match, context, next_state):
1111 """Block quote."""
1112 indented, indent, line_offset, blank_finish = \
1113 self.state_machine.get_indented()
1114 elements = self.block_quote(indented, line_offset)
1115 self.parent += elements
1116 if not blank_finish:
1117 self.parent += self.unindent_warning('Block quote')
1118 return context, next_state, []
1120 def block_quote(self, indented, line_offset):
1121 elements = []
1122 while indented:
1123 (blockquote_lines,
1124 attribution_lines,
1125 attribution_offset,
1126 indented,
1127 new_line_offset) = self.split_attribution(indented, line_offset)
1128 blockquote = nodes.block_quote()
1129 self.nested_parse(blockquote_lines, line_offset, blockquote)
1130 elements.append(blockquote)
1131 if attribution_lines:
1132 attribution, messages = self.parse_attribution(
1133 attribution_lines, attribution_offset)
1134 blockquote += attribution
1135 elements += messages
1136 line_offset = new_line_offset
1137 while indented and not indented[0]:
1138 indented = indented[1:]
1139 line_offset += 1
1140 return elements
1142 # U+2014 is an em-dash:
1143 attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])',
1144 re.UNICODE)
1146 def split_attribution(self, indented, line_offset):
1148 Check for a block quote attribution and split it off:
1150 * First line after a blank line must begin with a dash ("--", "---",
1151 em-dash; matches `self.attribution_pattern`).
1152 * Every line after that must have consistent indentation.
1153 * Attributions must be preceded by block quote content.
1155 Return a tuple of: (block quote content lines, attribution lines,
1156 attribution offset, remaining indented lines, new line offset).
1158 blank = None
1159 nonblank_seen = False
1160 for i in range(len(indented)):
1161 line = indented[i].rstrip()
1162 if line:
1163 if nonblank_seen and blank == i - 1: # last line blank
1164 match = self.attribution_pattern.match(line)
1165 if match:
1166 attribution_end, indent = self.check_attribution(
1167 indented, i)
1168 if attribution_end:
1169 a_lines = indented[i:attribution_end]
1170 a_lines.trim_left(match.end(), end=1)
1171 a_lines.trim_left(indent, start=1)
1172 return (indented[:i], a_lines,
1173 i, indented[attribution_end:],
1174 line_offset + attribution_end)
1175 nonblank_seen = True
1176 else:
1177 blank = i
1178 else:
1179 return (indented, None, None, None, None)
1181 def check_attribution(self, indented, attribution_start):
1183 Check attribution shape.
1184 Return the index past the end of the attribution, and the indent.
1186 indent = None
1187 i = attribution_start + 1
1188 for i in range(attribution_start + 1, len(indented)):
1189 line = indented[i].rstrip()
1190 if not line:
1191 break
1192 if indent is None:
1193 indent = len(line) - len(line.lstrip())
1194 elif len(line) - len(line.lstrip()) != indent:
1195 return None, None # bad shape; not an attribution
1196 else:
1197 # return index of line after last attribution line:
1198 i += 1
1199 return i, (indent or 0)
1201 def parse_attribution(self, indented, line_offset):
1202 text = '\n'.join(indented).rstrip()
1203 lineno = self.state_machine.abs_line_number() + line_offset
1204 textnodes, messages = self.inline_text(text, lineno)
1205 node = nodes.attribution(text, '', *textnodes)
1206 node.line = lineno
1207 # report with source and source-line results in
1208 # ``IndexError: list index out of range``
1209 # node.source, node.line = self.state_machine.get_source_and_line(lineno)
1210 return node, messages
1212 def bullet(self, match, context, next_state):
1213 """Bullet list item."""
1214 bulletlist = nodes.bullet_list()
1215 self.parent += bulletlist
1216 bulletlist['bullet'] = match.string[0]
1217 i, blank_finish = self.list_item(match.end())
1218 bulletlist += i
1219 offset = self.state_machine.line_offset + 1 # next line
1220 new_line_offset, blank_finish = self.nested_list_parse(
1221 self.state_machine.input_lines[offset:],
1222 input_offset=self.state_machine.abs_line_offset() + 1,
1223 node=bulletlist, initial_state='BulletList',
1224 blank_finish=blank_finish)
1225 self.goto_line(new_line_offset)
1226 if not blank_finish:
1227 self.parent += self.unindent_warning('Bullet list')
1228 return [], next_state, []
1230 def list_item(self, indent):
1231 if self.state_machine.line[indent:]:
1232 indented, line_offset, blank_finish = (
1233 self.state_machine.get_known_indented(indent))
1234 else:
1235 indented, indent, line_offset, blank_finish = (
1236 self.state_machine.get_first_known_indented(indent))
1237 listitem = nodes.list_item('\n'.join(indented))
1238 if indented:
1239 self.nested_parse(indented, input_offset=line_offset,
1240 node=listitem)
1241 return listitem, blank_finish
1243 def enumerator(self, match, context, next_state):
1244 """Enumerated List Item"""
1245 format, sequence, text, ordinal = self.parse_enumerator(match)
1246 if not self.is_enumerated_list_item(ordinal, sequence, format):
1247 raise statemachine.TransitionCorrection('text')
1248 enumlist = nodes.enumerated_list()
1249 self.parent += enumlist
1250 if sequence == '#':
1251 enumlist['enumtype'] = 'arabic'
1252 else:
1253 enumlist['enumtype'] = sequence
1254 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1255 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1256 if ordinal != 1:
1257 enumlist['start'] = ordinal
1258 src, srcline = self.state_machine.get_source_and_line()
1259 msg = self.reporter.info(
1260 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1261 % (text, ordinal), source=src, line=srcline)
1262 self.parent += msg
1263 listitem, blank_finish = self.list_item(match.end())
1264 enumlist += listitem
1265 offset = self.state_machine.line_offset + 1 # next line
1266 newline_offset, blank_finish = self.nested_list_parse(
1267 self.state_machine.input_lines[offset:],
1268 input_offset=self.state_machine.abs_line_offset() + 1,
1269 node=enumlist, initial_state='EnumeratedList',
1270 blank_finish=blank_finish,
1271 extra_settings={'lastordinal': ordinal,
1272 'format': format,
1273 'auto': sequence == '#'})
1274 self.goto_line(newline_offset)
1275 if not blank_finish:
1276 self.parent += self.unindent_warning('Enumerated list')
1277 return [], next_state, []
1279 def parse_enumerator(self, match, expected_sequence=None):
1281 Analyze an enumerator and return the results.
1283 :Return:
1284 - the enumerator format ('period', 'parens', or 'rparen'),
1285 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1286 - the text of the enumerator, stripped of formatting, and
1287 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1288 ``None`` is returned for invalid enumerator text).
1290 The enumerator format has already been determined by the regular
1291 expression match. If `expected_sequence` is given, that sequence is
1292 tried first. If not, we check for Roman numeral 1. This way,
1293 single-character Roman numerals (which are also alphabetical) can be
1294 matched. If no sequence has been matched, all sequences are checked in
1295 order.
1297 groupdict = match.groupdict()
1298 sequence = ''
1299 for format in self.enum.formats:
1300 if groupdict[format]: # was this the format matched?
1301 break # yes; keep `format`
1302 else: # shouldn't happen
1303 raise ParserError('enumerator format not matched')
1304 text = groupdict[format][self.enum.formatinfo[format].start
1305 :self.enum.formatinfo[format].end]
1306 if text == '#':
1307 sequence = '#'
1308 elif expected_sequence:
1309 try:
1310 if self.enum.sequenceregexps[expected_sequence].match(text):
1311 sequence = expected_sequence
1312 except KeyError: # shouldn't happen
1313 raise ParserError('unknown enumerator sequence: %s'
1314 % sequence)
1315 elif text == 'i':
1316 sequence = 'lowerroman'
1317 elif text == 'I':
1318 sequence = 'upperroman'
1319 if not sequence:
1320 for sequence in self.enum.sequences:
1321 if self.enum.sequenceregexps[sequence].match(text):
1322 break
1323 else: # shouldn't happen
1324 raise ParserError('enumerator sequence not matched')
1325 if sequence == '#':
1326 ordinal = 1
1327 else:
1328 try:
1329 ordinal = self.enum.converters[sequence](text)
1330 except roman.InvalidRomanNumeralError:
1331 ordinal = None
1332 return format, sequence, text, ordinal
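    # Illustrative results (a sketch, not from the original source): a match
    # on "(ii)" returns ('parens', 'lowerroman', 'ii', 2); a match on "3."
    # returns ('period', 'arabic', '3', 3).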
1334 def is_enumerated_list_item(self, ordinal, sequence, format):
1336 Check validity based on the ordinal value and the second line.
1338 Return true if the ordinal is valid and the second line is blank,
1339 indented, or starts with the next enumerator or an auto-enumerator.
1341 if ordinal is None:
1342 return None
1343 try:
1344 next_line = self.state_machine.next_line()
1345 except EOFError: # end of input lines
1346 self.state_machine.previous_line()
1347 return 1
1348 else:
1349 self.state_machine.previous_line()
1350 if not next_line[:1].strip(): # blank or indented
1351 return 1
1352 result = self.make_enumerator(ordinal + 1, sequence, format)
1353 if result:
1354 next_enumerator, auto_enumerator = result
1355 try:
1356 if ( next_line.startswith(next_enumerator) or
1357 next_line.startswith(auto_enumerator) ):
1358 return 1
1359 except TypeError:
1360 pass
1361 return None
1363 def make_enumerator(self, ordinal, sequence, format):
1365 Construct and return the next enumerated list item marker, and an
1366 auto-enumerator ("#" instead of the regular enumerator).
1368 Return ``None`` for invalid (out of range) ordinals.
1369 """ #"
1370 if sequence == '#':
1371 enumerator = '#'
1372 elif sequence == 'arabic':
1373 enumerator = str(ordinal)
1374 else:
1375 if sequence.endswith('alpha'):
1376 if ordinal > 26:
1377 return None
1378 enumerator = chr(ordinal + ord('a') - 1)
1379 elif sequence.endswith('roman'):
1380 try:
1381 enumerator = roman.toRoman(ordinal)
1382 except roman.RomanError:
1383 return None
1384 else: # shouldn't happen
1385 raise ParserError('unknown enumerator sequence: "%s"'
1386 % sequence)
1387 if sequence.startswith('lower'):
1388 enumerator = enumerator.lower()
1389 elif sequence.startswith('upper'):
1390 enumerator = enumerator.upper()
1391 else: # shouldn't happen
1392 raise ParserError('unknown enumerator sequence: "%s"'
1393 % sequence)
1394 formatinfo = self.enum.formatinfo[format]
1395 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1396 + ' ')
1397 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1398 return next_enumerator, auto_enumerator
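    # Illustrative values (not from the original source):
    # make_enumerator(3, 'lowerroman', 'parens') == ('(iii) ', '(#) '), and
    # make_enumerator(27, 'loweralpha', 'period') is None (out of range).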
1400 def field_marker(self, match, context, next_state):
1401 """Field list item."""
1402 field_list = nodes.field_list()
1403 self.parent += field_list
1404 field, blank_finish = self.field(match)
1405 field_list += field
1406 offset = self.state_machine.line_offset + 1 # next line
1407 newline_offset, blank_finish = self.nested_list_parse(
1408 self.state_machine.input_lines[offset:],
1409 input_offset=self.state_machine.abs_line_offset() + 1,
1410 node=field_list, initial_state='FieldList',
1411 blank_finish=blank_finish)
1412 self.goto_line(newline_offset)
1413 if not blank_finish:
1414 self.parent += self.unindent_warning('Field list')
1415 return [], next_state, []
1417 def field(self, match):
1418 name = self.parse_field_marker(match)
1419 src, srcline = self.state_machine.get_source_and_line()
1420 lineno = self.state_machine.abs_line_number()
1421 indented, indent, line_offset, blank_finish = \
1422 self.state_machine.get_first_known_indented(match.end())
1423 field_node = nodes.field()
1424 field_node.source = src
1425 field_node.line = srcline
1426 name_nodes, name_messages = self.inline_text(name, lineno)
1427 field_node += nodes.field_name(name, '', *name_nodes)
1428 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1429 field_node += field_body
1430 if indented:
1431 self.parse_field_body(indented, line_offset, field_body)
1432 return field_node, blank_finish
1434 def parse_field_marker(self, match):
1435 """Extract & return field name from a field marker match."""
1436 field = match.group()[1:] # strip off leading ':'
1437 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1438 return field
1440 def parse_field_body(self, indented, offset, node):
1441 self.nested_parse(indented, input_offset=offset, node=node)
1443 def option_marker(self, match, context, next_state):
1444 """Option list item."""
1445 optionlist = nodes.option_list()
1446 try:
1447 listitem, blank_finish = self.option_list_item(match)
1448 except MarkupError, error:
1449 # This shouldn't happen; pattern won't match.
1450 src, srcline = self.state_machine.get_source_and_line()
1451 msg = self.reporter.error(u'Invalid option list marker: %s' %
1452 error, source=src, line=srcline)
1453 self.parent += msg
1454 indented, indent, line_offset, blank_finish = \
1455 self.state_machine.get_first_known_indented(match.end())
1456 elements = self.block_quote(indented, line_offset)
1457 self.parent += elements
1458 if not blank_finish:
1459 self.parent += self.unindent_warning('Option list')
1460 return [], next_state, []
1461 self.parent += optionlist
1462 optionlist += listitem
1463 offset = self.state_machine.line_offset + 1 # next line
1464 newline_offset, blank_finish = self.nested_list_parse(
1465 self.state_machine.input_lines[offset:],
1466 input_offset=self.state_machine.abs_line_offset() + 1,
1467 node=optionlist, initial_state='OptionList',
1468 blank_finish=blank_finish)
1469 self.goto_line(newline_offset)
1470 if not blank_finish:
1471 self.parent += self.unindent_warning('Option list')
1472 return [], next_state, []
1474 def option_list_item(self, match):
1475 offset = self.state_machine.abs_line_offset()
1476 options = self.parse_option_marker(match)
1477 indented, indent, line_offset, blank_finish = \
1478 self.state_machine.get_first_known_indented(match.end())
1479 if not indented: # not an option list item
1480 self.goto_line(offset)
1481 raise statemachine.TransitionCorrection('text')
1482 option_group = nodes.option_group('', *options)
1483 description = nodes.description('\n'.join(indented))
1484 option_list_item = nodes.option_list_item('', option_group,
1485 description)
1486 if indented:
1487 self.nested_parse(indented, input_offset=line_offset,
1488 node=description)
1489 return option_list_item, blank_finish
1491 def parse_option_marker(self, match):
1493 Return a list of `node.option` and `node.option_argument` objects,
1494 parsed from an option marker match.
1496 :Exception: `MarkupError` for invalid option markers.
1498 optlist = []
1499 optionstrings = match.group().rstrip().split(', ')
1500 for optionstring in optionstrings:
1501 tokens = optionstring.split()
1502 delimiter = ' '
1503 firstopt = tokens[0].split('=', 1)
1504 if len(firstopt) > 1:
1505 # "--opt=value" form
1506 tokens[:1] = firstopt
1507 delimiter = '='
1508 elif (len(tokens[0]) > 2
1509 and ((tokens[0].startswith('-')
1510 and not tokens[0].startswith('--'))
1511 or tokens[0].startswith('+'))):
1512 # "-ovalue" form
1513 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1514 delimiter = ''
1515 if len(tokens) > 1 and (tokens[1].startswith('<')
1516 and tokens[-1].endswith('>')):
1517 # "-o <value1 value2>" form; join all values into one token
1518 tokens[1:] = [' '.join(tokens[1:])]
1519 if 0 < len(tokens) <= 2:
1520 option = nodes.option(optionstring)
1521 option += nodes.option_string(tokens[0], tokens[0])
1522 if len(tokens) > 1:
1523 option += nodes.option_argument(tokens[1], tokens[1],
1524 delimiter=delimiter)
1525 optlist.append(option)
1526 else:
1527 raise MarkupError(
1528 'wrong number of option tokens (=%s), should be 1 or 2: '
1529 '"%s"' % (len(tokens), optionstring))
1530 return optlist
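    # Example (a sketch, not from the original source): the marker
    # "-f FILE, --file=FILE" yields two option nodes: the first with
    # option_string "-f" and option_argument "FILE" (delimiter " "), the
    # second with option_string "--file" and option_argument "FILE"
    # (delimiter "=").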
1532 def doctest(self, match, context, next_state):
1533 data = '\n'.join(self.state_machine.get_text_block())
1534 self.parent += nodes.doctest_block(data, data)
1535 return [], next_state, []
1537 def line_block(self, match, context, next_state):
1538 """First line of a line block."""
1539 block = nodes.line_block()
1540 self.parent += block
1541 lineno = self.state_machine.abs_line_number()
1542 line, messages, blank_finish = self.line_block_line(match, lineno)
1543 block += line
1544 self.parent += messages
1545 if not blank_finish:
1546 offset = self.state_machine.line_offset + 1 # next line
1547 new_line_offset, blank_finish = self.nested_list_parse(
1548 self.state_machine.input_lines[offset:],
1549 input_offset=self.state_machine.abs_line_offset() + 1,
1550 node=block, initial_state='LineBlock',
1551 blank_finish=0)
1552 self.goto_line(new_line_offset)
1553 if not blank_finish:
1554 src, srcline = self.state_machine.get_source_and_line()
1555 self.parent += self.reporter.warning(
1556 'Line block ends without a blank line.',
1557 source=src, line=srcline+1)
1558 if len(block):
1559 if block[0].indent is None:
1560 block[0].indent = 0
1561 self.nest_line_block_lines(block)
1562 return [], next_state, []
1564 def line_block_line(self, match, lineno):
1565 """Return one line element of a line_block."""
1566 indented, indent, line_offset, blank_finish = \
1567 self.state_machine.get_first_known_indented(match.end(),
1568 until_blank=1)
1569 text = u'\n'.join(indented)
1570 text_nodes, messages = self.inline_text(text, lineno)
1571 line = nodes.line(text, '', *text_nodes)
1572 if match.string.rstrip() != '|': # not empty
1573 line.indent = len(match.group(1)) - 1
1574 return line, messages, blank_finish
1576 def nest_line_block_lines(self, block):
1577 for index in range(1, len(block)):
1578 if block[index].indent is None:
1579 block[index].indent = block[index - 1].indent
1580 self.nest_line_block_segment(block)
1582 def nest_line_block_segment(self, block):
1583 indents = [item.indent for item in block]
1584 least = min(indents)
1585 new_items = []
1586 new_block = nodes.line_block()
1587 for item in block:
1588 if item.indent > least:
1589 new_block.append(item)
1590 else:
1591 if len(new_block):
1592 self.nest_line_block_segment(new_block)
1593 new_items.append(new_block)
1594 new_block = nodes.line_block()
1595 new_items.append(item)
1596 if len(new_block):
1597 self.nest_line_block_segment(new_block)
1598 new_items.append(new_block)
1599 block[:] = new_items
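# Sketch of the nesting performed by nest_line_block_lines() and
# nest_line_block_segment() above (assuming `docutils.core.publish_doctree`):
# lines indented deeper than their neighbours are wrapped in a nested
# line_block element.
#
#     from docutils.core import publish_doctree
#     source = ("| first line\n"
#               "|     an indented continuation\n"
#               "| back to the first level\n")
#     print(publish_doctree(source).pformat())
#     # expect: a line_block containing a <line>, a nested <line_block>
#     # holding the indented <line>, then the final <line>.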
1601 def grid_table_top(self, match, context, next_state):
1602 """Top border of a full table."""
1603 return self.table_top(match, context, next_state,
1604 self.isolate_grid_table,
1605 tableparser.GridTableParser)
1607 def simple_table_top(self, match, context, next_state):
1608 """Top border of a simple table."""
1609 return self.table_top(match, context, next_state,
1610 self.isolate_simple_table,
1611 tableparser.SimpleTableParser)
1613 def table_top(self, match, context, next_state,
1614 isolate_function, parser_class):
1615 """Top border of a generic table."""
1616 nodelist, blank_finish = self.table(isolate_function, parser_class)
1617 self.parent += nodelist
1618 if not blank_finish:
1619 src, srcline = self.state_machine.get_source_and_line()
1620 msg = self.reporter.warning(
1621 'Blank line required after table.',
1622 source=src, line=srcline+1)
1623 self.parent += msg
1624 return [], next_state, []
1626 def table(self, isolate_function, parser_class):
1627 """Parse a table."""
1628 block, messages, blank_finish = isolate_function()
1629 if block:
1630 try:
1631 parser = parser_class()
1632 tabledata = parser.parse(block)
1633 tableline = (self.state_machine.abs_line_number() - len(block)
1634 + 1)
1635 table = self.build_table(tabledata, tableline)
1636 nodelist = [table] + messages
1637 except tableparser.TableMarkupError, detail:
1638 nodelist = self.malformed_table(
1639 block, ' '.join(detail.args)) + messages
1640 else:
1641 nodelist = messages
1642 return nodelist, blank_finish
1644 def isolate_grid_table(self):
1645 messages = []
1646 blank_finish = 1
1647 try:
1648 block = self.state_machine.get_text_block(flush_left=1)
1649 except statemachine.UnexpectedIndentationError, instance:
1650 block, src, srcline = instance.args
1651 messages.append(self.reporter.error('Unexpected indentation.',
1652 source=src, line=srcline))
1653 blank_finish = 0
1654 block.disconnect()
1655 # for East Asian chars:
1656 block.pad_double_width(self.double_width_pad_char)
1657 width = len(block[0].strip())
1658 for i in range(len(block)):
1659 block[i] = block[i].strip()
1660 if block[i][0] not in '+|': # check left edge
1661 blank_finish = 0
1662 self.state_machine.previous_line(len(block) - i)
1663 del block[i:]
1664 break
1665 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1666 blank_finish = 0
1667 # from second-last to third line of table:
1668 for i in range(len(block) - 2, 1, -1):
1669 if self.grid_table_top_pat.match(block[i]):
1670 self.state_machine.previous_line(len(block) - i + 1)
1671 del block[i+1:]
1672 break
1673 else:
1674 messages.extend(self.malformed_table(block))
1675 return [], messages, blank_finish
1676 for i in range(len(block)): # check right edge
1677 if len(block[i]) != width or block[i][-1] not in '+|':
1678 messages.extend(self.malformed_table(block))
1679 return [], messages, blank_finish
1680 return block, messages, blank_finish
1682 def isolate_simple_table(self):
1683 start = self.state_machine.line_offset
1684 lines = self.state_machine.input_lines
1685 limit = len(lines) - 1
1686 toplen = len(lines[start].strip())
1687 pattern_match = self.simple_table_border_pat.match
1688 found = 0
1689 found_at = None
1690 i = start + 1
1691 while i <= limit:
1692 line = lines[i]
1693 match = pattern_match(line)
1694 if match:
1695 if len(line.strip()) != toplen:
1696 self.state_machine.next_line(i - start)
1697 messages = self.malformed_table(
1698 lines[start:i+1], 'Bottom/header table border does '
1699 'not match top border.')
1700 return [], messages, i == limit or not lines[i+1].strip()
1701 found += 1
1702 found_at = i
1703 if found == 2 or i == limit or not lines[i+1].strip():
1704 end = i
1705 break
1706 i += 1
1707 else: # reached end of input_lines
1708 if found:
1709 extra = ' or no blank line after table bottom'
1710 self.state_machine.next_line(found_at - start)
1711 block = lines[start:found_at+1]
1712 else:
1713 extra = ''
1714 self.state_machine.next_line(i - start - 1)
1715 block = lines[start:]
1716 messages = self.malformed_table(
1717 block, 'No bottom table border found%s.' % extra)
1718 return [], messages, not extra
1719 self.state_machine.next_line(end - start)
1720 block = lines[start:end+1]
1721 # for East Asian chars:
1722 block.pad_double_width(self.double_width_pad_char)
1723 return block, [], end == limit or not lines[end+1].strip()
1725 def malformed_table(self, block, detail=''):
1726 block.replace(self.double_width_pad_char, '')
1727 data = '\n'.join(block)
1728 message = 'Malformed table.'
1729 startline = self.state_machine.abs_line_number() - len(block) + 1
1730 src, srcline = self.state_machine.get_source_and_line(startline)
1731 if detail:
1732 message += '\n' + detail
1733 error = self.reporter.error(message, nodes.literal_block(data, data),
1734 source=src, line=srcline)
1735 return [error]
1737 def build_table(self, tabledata, tableline, stub_columns=0):
1738 colwidths, headrows, bodyrows = tabledata
1739 table = nodes.table()
1740 tgroup = nodes.tgroup(cols=len(colwidths))
1741 table += tgroup
1742 for colwidth in colwidths:
1743 colspec = nodes.colspec(colwidth=colwidth)
1744 if stub_columns:
1745 colspec.attributes['stub'] = 1
1746 stub_columns -= 1
1747 tgroup += colspec
1748 if headrows:
1749 thead = nodes.thead()
1750 tgroup += thead
1751 for row in headrows:
1752 thead += self.build_table_row(row, tableline)
1753 tbody = nodes.tbody()
1754 tgroup += tbody
1755 for row in bodyrows:
1756 tbody += self.build_table_row(row, tableline)
1757 return table
1759 def build_table_row(self, rowdata, tableline):
1760 row = nodes.row()
1761 for cell in rowdata:
1762 if cell is None:
1763 continue
1764 morerows, morecols, offset, cellblock = cell
1765 attributes = {}
1766 if morerows:
1767 attributes['morerows'] = morerows
1768 if morecols:
1769 attributes['morecols'] = morecols
1770 entry = nodes.entry(**attributes)
1771 row += entry
1772 if ''.join(cellblock):
1773 self.nested_parse(cellblock, input_offset=tableline+offset,
1774 node=entry)
1775 return row
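# Sketch of the table pipeline above (isolate_* -> tableparser -> build_table),
# exercised through the public API rather than by calling these methods
# directly (assuming `docutils.core.publish_doctree`):
#
#     from docutils.core import publish_doctree
#     grid = ("+-----+-----+\n"
#             "| a   | b   |\n"
#             "+-----+-----+\n"
#             "| c   | d   |\n"
#             "+-----+-----+\n")
#     print(publish_doctree(grid).pformat())
#     # expect: table > tgroup (cols=2) > two colspec elements and a tbody
#     # with two rows; a simple table with a second border line after its
#     # first row would put that row into a thead instead.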
1778 explicit = Struct()
1779 """Patterns and constants used for explicit markup recognition."""
1781 explicit.patterns = Struct(
1782 target=re.compile(r"""
1783 (
1784 _ # anonymous target
1785 | # *OR*
1786 (?!_) # no underscore at the beginning
1787 (?P<quote>`?) # optional open quote
1788 (?![ `]) # first char. not space or
1789 # backquote
1790 (?P<name> # reference name
1791 .+?
1792 )
1793 %(non_whitespace_escape_before)s
1794 (?P=quote) # close quote if open quote used
1795 )
1796 (?<!(?<!\x00):) # no unescaped colon at end
1797 %(non_whitespace_escape_before)s
1798 [ ]? # optional space
1799 : # end of reference name
1800 ([ ]+|$) # followed by whitespace
1801 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1802 reference=re.compile(r"""
1803 (
1804 (?P<simple>%(simplename)s)_
1805 | # *OR*
1806 ` # open backquote
1807 (?![ ]) # not space
1808 (?P<phrase>.+?) # hyperlink phrase
1809 %(non_whitespace_escape_before)s
1810 `_ # close backquote,
1811 # reference mark
1812 )
1813 $ # end of string
1814 """ % vars(Inliner), re.VERBOSE | re.UNICODE),
1815 substitution=re.compile(r"""
1816 (
1817 (?![ ]) # first char. not space
1818 (?P<name>.+?) # substitution text
1819 %(non_whitespace_escape_before)s
1820 \| # close delimiter
1821 )
1822 ([ ]+|$) # followed by whitespace
1823 """ % vars(Inliner),
1824 re.VERBOSE | re.UNICODE),)
1826 def footnote(self, match):
1827 src, srcline = self.state_machine.get_source_and_line()
1828 indented, indent, offset, blank_finish = \
1829 self.state_machine.get_first_known_indented(match.end())
1830 label = match.group(1)
1831 name = normalize_name(label)
1832 footnote = nodes.footnote('\n'.join(indented))
1833 footnote.source = src
1834 footnote.line = srcline
1835 if name[0] == '#': # auto-numbered
1836 name = name[1:] # autonumber label
1837 footnote['auto'] = 1
1838 if name:
1839 footnote['names'].append(name)
1840 self.document.note_autofootnote(footnote)
1841 elif name == '*': # auto-symbol
1842 name = ''
1843 footnote['auto'] = '*'
1844 self.document.note_symbol_footnote(footnote)
1845 else: # manually numbered
1846 footnote += nodes.label('', label)
1847 footnote['names'].append(name)
1848 self.document.note_footnote(footnote)
1849 if name:
1850 self.document.note_explicit_target(footnote, footnote)
1851 else:
1852 self.document.set_id(footnote, footnote)
1853 if indented:
1854 self.nested_parse(indented, input_offset=offset, node=footnote)
1855 return [footnote], blank_finish
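# The four footnote label forms dispatched above (manually numbered,
# anonymous auto-numbered, labeled auto-numbered, auto-symbol), shown as
# reStructuredText source; a sketch assuming `docutils.core.publish_doctree`:
#
#     from docutils.core import publish_doctree
#     source = ("See [1]_, [#note]_ and [*]_.\n"
#               "\n"
#               ".. [1] A manually numbered footnote.\n"
#               ".. [#] An anonymous auto-numbered footnote.\n"
#               ".. [#note] A labeled auto-numbered footnote.\n"
#               ".. [*] An auto-symbol footnote.\n")
#     print(publish_doctree(source).pformat())
#     # each definition becomes a <footnote> element; the '#' forms get
#     # auto=1 and the '*' form gets auto='*', matching the branches above.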
1857 def citation(self, match):
1858 src, srcline = self.state_machine.get_source_and_line()
1859 indented, indent, offset, blank_finish = \
1860 self.state_machine.get_first_known_indented(match.end())
1861 label = match.group(1)
1862 name = normalize_name(label)
1863 citation = nodes.citation('\n'.join(indented))
1864 citation.source = src
1865 citation.line = srcline
1866 citation += nodes.label('', label)
1867 citation['names'].append(name)
1868 self.document.note_citation(citation)
1869 self.document.note_explicit_target(citation, citation)
1870 if indented:
1871 self.nested_parse(indented, input_offset=offset, node=citation)
1872 return [citation], blank_finish
1874 def hyperlink_target(self, match):
1875 pattern = self.explicit.patterns.target
1876 lineno = self.state_machine.abs_line_number()
1877 src, srcline = self.state_machine.get_source_and_line()
1878 block, indent, offset, blank_finish = \
1879 self.state_machine.get_first_known_indented(
1880 match.end(), until_blank=1, strip_indent=0)
1881 blocktext = match.string[:match.end()] + '\n'.join(block)
1882 block = [escape2null(line) for line in block]
1883 escaped = block[0]
1884 blockindex = 0
1885 while 1:
1886 targetmatch = pattern.match(escaped)
1887 if targetmatch:
1888 break
1889 blockindex += 1
1890 try:
1891 escaped += block[blockindex]
1892 except IndexError:
1893 raise MarkupError('malformed hyperlink target.')
1894 del block[:blockindex]
1895 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
1896 target = self.make_target(block, blocktext, lineno,
1897 targetmatch.group('name'))
1898 return [target], blank_finish
1900 def make_target(self, block, block_text, lineno, target_name):
1901 target_type, data = self.parse_target(block, block_text, lineno)
1902 if target_type == 'refname':
1903 target = nodes.target(block_text, '', refname=normalize_name(data))
1904 target.indirect_reference_name = data
1905 self.add_target(target_name, '', target, lineno)
1906 self.document.note_indirect_target(target)
1907 return target
1908 elif target_type == 'refuri':
1909 target = nodes.target(block_text, '')
1910 self.add_target(target_name, data, target, lineno)
1911 return target
1912 else:
1913 return data
1915 def parse_target(self, block, block_text, lineno):
1916 """
1917 Determine the type of reference of a target.
1919 :Return: A 2-tuple, one of:
1921 - 'refname' and the indirect reference name
1922 - 'refuri' and the URI
1923 - 'malformed' and a system_message node
1924 """
1925 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1926 reference = ' '.join([line.strip() for line in block])
1927 refname = self.is_reference(reference)
1928 if refname:
1929 return 'refname', refname
1930 reference = ''.join([''.join(line.split()) for line in block])
1931 return 'refuri', unescape(reference)
1933 def is_reference(self, reference):
1934 match = self.explicit.patterns.reference.match(
1935 whitespace_normalize_name(reference))
1936 if not match:
1937 return None
1938 return unescape(match.group('simple') or match.group('phrase'))
1940 def add_target(self, targetname, refuri, target, lineno):
1941 target.line = lineno
1942 if targetname:
1943 name = normalize_name(unescape(targetname))
1944 target['names'].append(name)
1945 if refuri:
1946 uri = self.inliner.adjust_uri(refuri)
1947 if uri:
1948 target['refuri'] = uri
1949 else:
1950 raise ApplicationError('problem with URI: %r' % refuri)
1951 self.document.note_explicit_target(target, self.parent)
1952 else: # anonymous target
1953 if refuri:
1954 target['refuri'] = refuri
1955 target['anonymous'] = 1
1956 self.document.note_anonymous_target(target)
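# The target flavours distinguished by parse_target() and add_target()
# above, as reStructuredText source; a sketch assuming
# `docutils.core.publish_doctree`:
#
#     from docutils.core import publish_doctree
#     source = ("An `anonymous reference`__ and a docutils_ reference.\n"
#               "\n"
#               ".. _docutils: http://docutils.sourceforge.net/\n"
#               ".. _home page: docutils_\n"
#               "__ http://example.org/anonymous\n")
#     print(publish_doctree(source).pformat())
#     # '.. _docutils:' yields a target with a refuri, '.. _home page:' is an
#     # indirect target (refname='docutils'), and '__ ...' is an anonymous
#     # target (anonymous=1).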
1958 def substitution_def(self, match):
1959 pattern = self.explicit.patterns.substitution
1960 src, srcline = self.state_machine.get_source_and_line()
1961 block, indent, offset, blank_finish = \
1962 self.state_machine.get_first_known_indented(match.end(),
1963 strip_indent=0)
1964 blocktext = (match.string[:match.end()] + '\n'.join(block))
1965 block.disconnect()
1966 escaped = escape2null(block[0].rstrip())
1967 blockindex = 0
1968 while 1:
1969 subdefmatch = pattern.match(escaped)
1970 if subdefmatch:
1971 break
1972 blockindex += 1
1973 try:
1974 escaped = escaped + ' ' + escape2null(block[blockindex].strip())
1975 except IndexError:
1976 raise MarkupError('malformed substitution definition.')
1977 del block[:blockindex] # strip out the substitution marker
1978 block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1]
1979 if not block[0]:
1980 del block[0]
1981 offset += 1
1982 while block and not block[-1].strip():
1983 block.pop()
1984 subname = subdefmatch.group('name')
1985 substitution_node = nodes.substitution_definition(blocktext)
1986 substitution_node.source = src
1987 substitution_node.line = srcline
1988 if not block:
1989 msg = self.reporter.warning(
1990 'Substitution definition "%s" missing contents.' % subname,
1991 nodes.literal_block(blocktext, blocktext),
1992 source=src, line=srcline)
1993 return [msg], blank_finish
1994 block[0] = block[0].strip()
1995 substitution_node['names'].append(
1996 nodes.whitespace_normalize_name(subname))
1997 new_abs_offset, blank_finish = self.nested_list_parse(
1998 block, input_offset=offset, node=substitution_node,
1999 initial_state='SubstitutionDef', blank_finish=blank_finish)
2000 i = 0
2001 for node in substitution_node[:]:
2002 if not (isinstance(node, nodes.Inline) or
2003 isinstance(node, nodes.Text)):
2004 self.parent += substitution_node[i]
2005 del substitution_node[i]
2006 else:
2007 i += 1
2008 for node in substitution_node.traverse(nodes.Element):
2009 if self.disallowed_inside_substitution_definitions(node):
2010 pformat = nodes.literal_block('', node.pformat().rstrip())
2011 msg = self.reporter.error(
2012 'Substitution definition contains illegal element:',
2013 pformat, nodes.literal_block(blocktext, blocktext),
2014 source=src, line=srcline)
2015 return [msg], blank_finish
2016 if len(substitution_node) == 0:
2017 msg = self.reporter.warning(
2018 'Substitution definition "%s" empty or invalid.' % subname,
2019 nodes.literal_block(blocktext, blocktext),
2020 source=src, line=srcline)
2021 return [msg], blank_finish
2022 self.document.note_substitution_def(
2023 substitution_node, subname, self.parent)
2024 return [substitution_node], blank_finish
2026 def disallowed_inside_substitution_definitions(self, node):
2027 if (node['ids'] or
2028 isinstance(node, nodes.reference) and node.get('anonymous') or
2029 isinstance(node, nodes.footnote_reference) and node.get('auto')):
2030 return 1
2031 else:
2032 return 0
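# Substitution definitions as handled by substitution_def() above: the text
# after ".. |name|" must be a directive whose result is inline (e.g. the
# standard "replace" or "image" directives). A sketch assuming
# `docutils.core.publish_doctree`:
#
#     from docutils.core import publish_doctree
#     source = ("The |RST| syntax.\n"
#               "\n"
#               ".. |RST| replace:: reStructuredText\n")
#     print(publish_doctree(source).pformat())
#     # expect: a substitution_definition element named 'RST'; the reference
#     # in the paragraph is replaced by the definition's content
#     # ('reStructuredText') by the Substitutions transform.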
2034 def directive(self, match, **option_presets):
2035 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2036 type_name = match.group(1)
2037 directive_class, messages = directives.directive(
2038 type_name, self.memo.language, self.document)
2039 self.parent += messages
2040 if directive_class:
2041 return self.run_directive(
2042 directive_class, match, type_name, option_presets)
2043 else:
2044 return self.unknown_directive(type_name)
2046 def run_directive(self, directive, match, type_name, option_presets):
2047 """
2048 Parse a directive then run its directive function.
2050 Parameters:
2052 - `directive`: The class implementing the directive. Must be
2053 a subclass of `rst.Directive`.
2055 - `match`: A regular expression match object which matched the first
2056 line of the directive.
2058 - `type_name`: The directive name, as used in the source text.
2060 - `option_presets`: A dictionary of preset options, defaults for the
2061 directive options. Currently, only an "alt" option is passed by
2062 substitution definitions (value: the substitution name), which may
2063 be used by an embedded image directive.
2065 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2066 """
2067 if isinstance(directive, (FunctionType, MethodType)):
2068 from docutils.parsers.rst import convert_directive_function
2069 directive = convert_directive_function(directive)
2070 lineno = self.state_machine.abs_line_number()
2071 src, srcline = self.state_machine.get_source_and_line()
2072 initial_line_offset = self.state_machine.line_offset
2073 indented, indent, line_offset, blank_finish \
2074 = self.state_machine.get_first_known_indented(match.end(),
2075 strip_top=0)
2076 block_text = '\n'.join(self.state_machine.input_lines[
2077 initial_line_offset : self.state_machine.line_offset + 1])
2078 try:
2079 arguments, options, content, content_offset = (
2080 self.parse_directive_block(indented, line_offset,
2081 directive, option_presets))
2082 except MarkupError, detail:
2083 error = self.reporter.error(
2084 'Error in "%s" directive:\n%s.' % (type_name,
2085 ' '.join(detail.args)),
2086 nodes.literal_block(block_text, block_text),
2087 source=src, line=srcline)
2088 return [error], blank_finish
2089 directive_instance = directive(
2090 type_name, arguments, options, content, lineno,
2091 content_offset, block_text, self, self.state_machine)
2092 try:
2093 result = directive_instance.run()
2094 except docutils.parsers.rst.DirectiveError, error:
2095 msg_node = self.reporter.system_message(error.level, error.msg,
2096 source=src, line=srcline)
2097 msg_node += nodes.literal_block(block_text, block_text)
2098 result = [msg_node]
2099 assert isinstance(result, list), \
2100 'Directive "%s" must return a list of nodes.' % type_name
2101 for i in range(len(result)):
2102 assert isinstance(result[i], nodes.Node), \
2103 ('Directive "%s" returned non-Node object (index %s): %r'
2104 % (type_name, i, result[i]))
2105 return (result,
2106 blank_finish or self.state_machine.is_next_line_blank())
2108 def parse_directive_block(self, indented, line_offset, directive,
2109 option_presets):
2110 option_spec = directive.option_spec
2111 has_content = directive.has_content
2112 if indented and not indented[0].strip():
2113 indented.trim_start()
2114 line_offset += 1
2115 while indented and not indented[-1].strip():
2116 indented.trim_end()
2117 if indented and (directive.required_arguments
2118 or directive.optional_arguments
2119 or option_spec):
2120 for i, line in enumerate(indented):
2121 if not line.strip():
2122 break
2123 else:
2124 i += 1
2125 arg_block = indented[:i]
2126 content = indented[i+1:]
2127 content_offset = line_offset + i + 1
2128 else:
2129 content = indented
2130 content_offset = line_offset
2131 arg_block = []
2132 if option_spec:
2133 options, arg_block = self.parse_directive_options(
2134 option_presets, option_spec, arg_block)
2135 else:
2136 options = {}
2137 if arg_block and not (directive.required_arguments
2138 or directive.optional_arguments):
2139 content = arg_block + indented[i:]
2140 content_offset = line_offset
2141 arg_block = []
2142 while content and not content[0].strip():
2143 content.trim_start()
2144 content_offset += 1
2145 if directive.required_arguments or directive.optional_arguments:
2146 arguments = self.parse_directive_arguments(
2147 directive, arg_block)
2148 else:
2149 arguments = []
2150 if content and not has_content:
2151 raise MarkupError('no content permitted')
2152 return (arguments, options, content, content_offset)
2154 def parse_directive_options(self, option_presets, option_spec, arg_block):
2155 options = option_presets.copy()
2156 for i in range(len(arg_block)):
2157 if arg_block[i][:1] == ':':
2158 opt_block = arg_block[i:]
2159 arg_block = arg_block[:i]
2160 break
2161 else:
2162 opt_block = []
2163 if opt_block:
2164 success, data = self.parse_extension_options(option_spec,
2165 opt_block)
2166 if success: # data is a dict of options
2167 options.update(data)
2168 else: # data is an error string
2169 raise MarkupError(data)
2170 return options, arg_block
2172 def parse_directive_arguments(self, directive, arg_block):
2173 required = directive.required_arguments
2174 optional = directive.optional_arguments
2175 arg_text = '\n'.join(arg_block)
2176 arguments = arg_text.split()
2177 if len(arguments) < required:
2178 raise MarkupError('%s argument(s) required, %s supplied'
2179 % (required, len(arguments)))
2180 elif len(arguments) > required + optional:
2181 if directive.final_argument_whitespace:
2182 arguments = arg_text.split(None, required + optional - 1)
2183 else:
2184 raise MarkupError(
2185 'maximum %s argument(s) allowed, %s supplied'
2186 % (required + optional, len(arguments)))
2187 return arguments
2189 def parse_extension_options(self, option_spec, datalines):
2190 """
2191 Parse `datalines` for a field list containing extension options
2192 matching `option_spec`.
2194 :Parameters:
2195 - `option_spec`: a mapping of option name to conversion
2196 function, which should raise an exception on bad input.
2197 - `datalines`: a list of input strings.
2199 :Return:
2200 - Success value, 1 or 0.
2201 - An option dictionary on success, an error string on failure.
2202 """
2203 node = nodes.field_list()
2204 newline_offset, blank_finish = self.nested_list_parse(
2205 datalines, 0, node, initial_state='ExtensionOptions',
2206 blank_finish=1)
2207 if newline_offset != len(datalines): # incomplete parse of block
2208 return 0, 'invalid option block'
2209 try:
2210 options = utils.extract_extension_options(node, option_spec)
2211 except KeyError, detail:
2212 return 0, ('unknown option: "%s"' % detail.args[0])
2213 except (ValueError, TypeError), detail:
2214 return 0, ('invalid option value: %s' % ' '.join(detail.args))
2215 except utils.ExtensionOptionError, detail:
2216 return 0, ('invalid option data: %s' % ' '.join(detail.args))
2217 if blank_finish:
2218 return 1, options
2219 else:
2220 return 0, 'option data incompletely parsed'
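# How a directive block is split by parse_directive_block(),
# parse_directive_options() and parse_directive_arguments() above, using the
# standard "image" directive as an example (a sketch, values shown roughly):
#
#     .. image:: picture.png
#        :width: 200px
#        :alt: A picture
#
# arg_block -> ['picture.png']                     (one required argument)
# options   -> {'width': '200px', 'alt': 'A picture'}  (converted/validated
#              through the directive's option_spec)
# content   -> empty (the image directive takes no content)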
2222 def unknown_directive(self, type_name):
2223 src, srcline = self.state_machine.get_source_and_line()
2224 indented, indent, offset, blank_finish = \
2225 self.state_machine.get_first_known_indented(0, strip_indent=0)
2226 text = '\n'.join(indented)
2227 error = self.reporter.error(
2228 'Unknown directive type "%s".' % type_name,
2229 nodes.literal_block(text, text), source=src, line=srcline)
2230 return [error], blank_finish
2232 def comment(self, match):
2233 if not match.string[match.end():].strip() \
2234 and self.state_machine.is_next_line_blank(): # an empty comment?
2235 return [nodes.comment()], 1 # "A tiny but practical wart."
2236 indented, indent, offset, blank_finish = \
2237 self.state_machine.get_first_known_indented(match.end())
2238 while indented and not indented[-1].strip():
2239 indented.trim_end()
2240 text = '\n'.join(indented)
2241 return [nodes.comment(text, text)], blank_finish
2243 explicit.constructs = [
2244 (footnote,
2245 re.compile(r"""
2246 \.\.[ ]+ # explicit markup start
2247 \[
2248 ( # footnote label:
2249 [0-9]+ # manually numbered footnote
2250 | # *OR*
2251 \# # anonymous auto-numbered footnote
2252 | # *OR*
2253 \#%s # auto-numbered footnote with a label
2254 | # *OR*
2255 \* # auto-symbol footnote
2256 )
2257 \]
2258 ([ ]+|$) # whitespace or end of line
2259 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2260 (citation,
2261 re.compile(r"""
2262 \.\.[ ]+ # explicit markup start
2263 \[(%s)\] # citation label
2264 ([ ]+|$) # whitespace or end of line
2265 """ % Inliner.simplename, re.VERBOSE | re.UNICODE)),
2266 (hyperlink_target,
2267 re.compile(r"""
2268 \.\.[ ]+ # explicit markup start
2269 _ # target indicator
2270 (?![ ]|$) # first char. not space or EOL
2271 """, re.VERBOSE | re.UNICODE)),
2272 (substitution_def,
2273 re.compile(r"""
2274 \.\.[ ]+ # explicit markup start
2275 \| # substitution indicator
2276 (?![ ]|$) # first char. not space or EOL
2277 """, re.VERBOSE | re.UNICODE)),
2278 (directive,
2279 re.compile(r"""
2280 \.\.[ ]+ # explicit markup start
2281 (%s) # directive name
2282 [ ]? # optional space
2283 :: # directive delimiter
2284 ([ ]+|$) # whitespace or end of line
2285 """ % Inliner.simplename, re.VERBOSE | re.UNICODE))]
2287 def explicit_markup(self, match, context, next_state):
2288 """Footnotes, hyperlink targets, directives, comments."""
2289 nodelist, blank_finish = self.explicit_construct(match)
2290 self.parent += nodelist
2291 self.explicit_list(blank_finish)
2292 return [], next_state, []
2294 def explicit_construct(self, match):
2295 """Determine which explicit construct this is, parse & return it."""
2296 errors = []
2297 for method, pattern in self.explicit.constructs:
2298 expmatch = pattern.match(match.string)
2299 if expmatch:
2300 try:
2301 return method(self, expmatch)
2302 except MarkupError, error: # never reached?
2303 message = ' '.join(error.args)
2304 src, srcline = self.state_machine.get_source_and_line()
2305 errors.append(self.reporter.warning(
2306 message, source=src, line=srcline))
2307 break
2308 nodelist, blank_finish = self.comment(match)
2309 return nodelist + errors, blank_finish
2311 def explicit_list(self, blank_finish):
2312 """
2313 Create a nested state machine for a series of explicit markup
2314 constructs (including anonymous hyperlink targets).
2315 """
2316 offset = self.state_machine.line_offset + 1 # next line
2317 newline_offset, blank_finish = self.nested_list_parse(
2318 self.state_machine.input_lines[offset:],
2319 input_offset=self.state_machine.abs_line_offset() + 1,
2320 node=self.parent, initial_state='Explicit',
2321 blank_finish=blank_finish,
2322 match_titles=self.state_machine.match_titles)
2323 self.goto_line(newline_offset)
2324 if not blank_finish:
2325 self.parent += self.unindent_warning('Explicit markup')
2327 def anonymous(self, match, context, next_state):
2328 """Anonymous hyperlink targets."""
2329 nodelist, blank_finish = self.anonymous_target(match)
2330 self.parent += nodelist
2331 self.explicit_list(blank_finish)
2332 return [], next_state, []
2334 def anonymous_target(self, match):
2335 lineno = self.state_machine.abs_line_number()
2336 block, indent, offset, blank_finish \
2337 = self.state_machine.get_first_known_indented(match.end(),
2338 until_blank=1)
2339 blocktext = match.string[:match.end()] + '\n'.join(block)
2340 block = [escape2null(line) for line in block]
2341 target = self.make_target(block, blocktext, lineno, '')
2342 return [target], blank_finish
2344 def line(self, match, context, next_state):
2345 """Section title overline or transition marker."""
2346 if self.state_machine.match_titles:
2347 return [match.string], 'Line', []
2348 elif match.string.strip() == '::':
2349 raise statemachine.TransitionCorrection('text')
2350 elif len(match.string.strip()) < 4:
2351 msg = self.reporter.info(
2352 'Unexpected possible title overline or transition.\n'
2353 "Treating it as ordinary text because it's so short.",
2354 line=self.state_machine.abs_line_number())
2355 self.parent += msg
2356 raise statemachine.TransitionCorrection('text')
2357 else:
2358 blocktext = self.state_machine.line
2359 msg = self.reporter.severe(
2360 'Unexpected section title or transition.',
2361 nodes.literal_block(blocktext, blocktext),
2362 line=self.state_machine.abs_line_number())
2363 self.parent += msg
2364 return [], next_state, []
2366 def text(self, match, context, next_state):
2367 """Titles, definition lists, paragraphs."""
2368 return [match.string], 'Text', []
2371 class RFC2822Body(Body):
2373 """
2374 RFC2822 headers are only valid as the first constructs in documents. As
2375 soon as anything else appears, the `Body` state should take over.
2376 """
2378 patterns = Body.patterns.copy() # can't modify the original
2379 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2380 initial_transitions = [(name, 'Body')
2381 for name in Body.initial_transitions]
2382 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2384 def rfc2822(self, match, context, next_state):
2385 """RFC2822-style field list item."""
2386 fieldlist = nodes.field_list(classes=['rfc2822'])
2387 self.parent += fieldlist
2388 field, blank_finish = self.rfc2822_field(match)
2389 fieldlist += field
2390 offset = self.state_machine.line_offset + 1 # next line
2391 newline_offset, blank_finish = self.nested_list_parse(
2392 self.state_machine.input_lines[offset:],
2393 input_offset=self.state_machine.abs_line_offset() + 1,
2394 node=fieldlist, initial_state='RFC2822List',
2395 blank_finish=blank_finish)
2396 self.goto_line(newline_offset)
2397 if not blank_finish:
2398 self.parent += self.unindent_warning(
2399 'RFC2822-style field list')
2400 return [], next_state, []
2402 def rfc2822_field(self, match):
2403 name = match.string[:match.string.find(':')]
2404 indented, indent, line_offset, blank_finish = \
2405 self.state_machine.get_first_known_indented(match.end(),
2406 until_blank=1)
2407 fieldnode = nodes.field()
2408 fieldnode += nodes.field_name(name, name)
2409 fieldbody = nodes.field_body('\n'.join(indented))
2410 fieldnode += fieldbody
2411 if indented:
2412 self.nested_parse(indented, input_offset=line_offset,
2413 node=fieldbody)
2414 return fieldnode, blank_finish
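# RFC2822-style headers are recognized only at the very start of the input,
# and only when the parser is created with its rfc2822 option enabled
# (assumption: docutils.parsers.rst.Parser(rfc2822=1)). Sketch of the input
# this state handles:
#
#     Author: J. Random Hacker
#     Date: 2002-03-22
#
#     Body text starts here; from this point on the ordinary `Body` state
#     takes over.
#
# Each "Name: value" line becomes a field in a field_list carrying the
# 'rfc2822' class.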
2417 class SpecializedBody(Body):
2419 """
2420 Superclass for second and subsequent compound element members. Compound
2421 elements are lists and list-like constructs.
2423 All transition methods are disabled (redefined as `invalid_input`).
2424 Override individual methods in subclasses to re-enable.
2426 For example, once an initial bullet list item, say, is recognized, the
2427 `BulletList` subclass takes over, with a "bullet_list" node as its
2428 container. Upon encountering the initial bullet list item, `Body.bullet`
2429 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2430 starts up a nested parsing session with `BulletList` as the initial state.
2431 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2432 as only bullet list items are encountered, they are parsed and inserted
2433 into the container. The first construct which is *not* a bullet list item
2434 triggers the `invalid_input` method, which ends the nested parse and
2435 closes the container. `BulletList` needs to recognize input that is
2436 invalid in the context of a bullet list, which means everything *other
2437 than* bullet list items, so it inherits the transition list created in
2438 `Body`.
2439 """
2441 def invalid_input(self, match=None, context=None, next_state=None):
2442 """Not a compound element member. Abort this state machine."""
2443 self.state_machine.previous_line() # back up so parent SM can reassess
2444 raise EOFError
2446 indent = invalid_input
2447 bullet = invalid_input
2448 enumerator = invalid_input
2449 field_marker = invalid_input
2450 option_marker = invalid_input
2451 doctest = invalid_input
2452 line_block = invalid_input
2453 grid_table_top = invalid_input
2454 simple_table_top = invalid_input
2455 explicit_markup = invalid_input
2456 anonymous = invalid_input
2457 line = invalid_input
2458 text = invalid_input
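# Sketch of the behaviour described in the SpecializedBody docstring above
# (assuming `docutils.core.publish_doctree`): once a bullet list is open,
# BulletList only accepts further items with the *same* bullet character;
# anything else triggers invalid_input() and closes the list.
#
#     from docutils.core import publish_doctree
#     source = ("- first item\n"
#               "- second item\n"
#               "\n"
#               "* a new list (different bullet)\n")
#     print(publish_doctree(source).pformat())
#     # expect: two separate <bullet_list> elements, the first with
#     # bullet='-', the second with bullet='*'.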
2461 class BulletList(SpecializedBody):
2463 """Second and subsequent bullet_list list_items."""
2465 def bullet(self, match, context, next_state):
2466 """Bullet list item."""
2467 if match.string[0] != self.parent['bullet']:
2468 # different bullet: new list
2469 self.invalid_input()
2470 listitem, blank_finish = self.list_item(match.end())
2471 self.parent += listitem
2472 self.blank_finish = blank_finish
2473 return [], next_state, []
2476 class DefinitionList(SpecializedBody):
2478 """Second and subsequent definition_list_items."""
2480 def text(self, match, context, next_state):
2481 """Definition lists."""
2482 return [match.string], 'Definition', []
2485 class EnumeratedList(SpecializedBody):
2487 """Second and subsequent enumerated_list list_items."""
2489 def enumerator(self, match, context, next_state):
2490 """Enumerated list item."""
2491 format, sequence, text, ordinal = self.parse_enumerator(
2492 match, self.parent['enumtype'])
2493 if ( format != self.format
2494 or (sequence != '#' and (sequence != self.parent['enumtype']
2495 or self.auto
2496 or ordinal != (self.lastordinal + 1)))
2497 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2498 # different enumeration: new list
2499 self.invalid_input()
2500 if sequence == '#':
2501 self.auto = 1
2502 listitem, blank_finish = self.list_item(match.end())
2503 self.parent += listitem
2504 self.blank_finish = blank_finish
2505 self.lastordinal = ordinal
2506 return [], next_state, []
2509 class FieldList(SpecializedBody):
2511 """Second and subsequent field_list fields."""
2513 def field_marker(self, match, context, next_state):
2514 """Field list field."""
2515 field, blank_finish = self.field(match)
2516 self.parent += field
2517 self.blank_finish = blank_finish
2518 return [], next_state, []
2521 class OptionList(SpecializedBody):
2523 """Second and subsequent option_list option_list_items."""
2525 def option_marker(self, match, context, next_state):
2526 """Option list item."""
2527 try:
2528 option_list_item, blank_finish = self.option_list_item(match)
2529 except MarkupError:
2530 self.invalid_input()
2531 self.parent += option_list_item
2532 self.blank_finish = blank_finish
2533 return [], next_state, []
2536 class RFC2822List(SpecializedBody, RFC2822Body):
2538 """Second and subsequent RFC2822-style field_list fields."""
2540 patterns = RFC2822Body.patterns
2541 initial_transitions = RFC2822Body.initial_transitions
2543 def rfc2822(self, match, context, next_state):
2544 """RFC2822-style field list item."""
2545 field, blank_finish = self.rfc2822_field(match)
2546 self.parent += field
2547 self.blank_finish = blank_finish
2548 return [], 'RFC2822List', []
2550 blank = SpecializedBody.invalid_input
2553 class ExtensionOptions(FieldList):
2555 """
2556 Parse field_list fields for extension options.
2558 No nested parsing is done (including inline markup parsing).
2559 """
2561 def parse_field_body(self, indented, offset, node):
2562 """Override `Body.parse_field_body` for simpler parsing."""
2563 lines = []
2564 for line in list(indented) + ['']:
2565 if line.strip():
2566 lines.append(line)
2567 elif lines:
2568 text = '\n'.join(lines)
2569 node += nodes.paragraph(text, text)
2570 lines = []
2573 class LineBlock(SpecializedBody):
2575 """Second and subsequent lines of a line_block."""
2577 blank = SpecializedBody.invalid_input
2579 def line_block(self, match, context, next_state):
2580 """New line of line block."""
2581 lineno = self.state_machine.abs_line_number()
2582 line, messages, blank_finish = self.line_block_line(match, lineno)
2583 self.parent += line
2584 self.parent.parent += messages
2585 self.blank_finish = blank_finish
2586 return [], next_state, []
2589 class Explicit(SpecializedBody):
2591 """Second and subsequent explicit markup construct."""
2593 def explicit_markup(self, match, context, next_state):
2594 """Footnotes, hyperlink targets, directives, comments."""
2595 nodelist, blank_finish = self.explicit_construct(match)
2596 self.parent += nodelist
2597 self.blank_finish = blank_finish
2598 return [], next_state, []
2600 def anonymous(self, match, context, next_state):
2601 """Anonymous hyperlink targets."""
2602 nodelist, blank_finish = self.anonymous_target(match)
2603 self.parent += nodelist
2604 self.blank_finish = blank_finish
2605 return [], next_state, []
2607 blank = SpecializedBody.invalid_input
2610 class SubstitutionDef(Body):
2612 """
2613 Parser for the contents of a substitution_definition element.
2614 """
2616 patterns = {
2617 'embedded_directive': re.compile(r'(%s)::( +|$)'
2618 % Inliner.simplename, re.UNICODE),
2619 'text': r''}
2620 initial_transitions = ['embedded_directive', 'text']
2622 def embedded_directive(self, match, context, next_state):
2623 nodelist, blank_finish = self.directive(match,
2624 alt=self.parent['names'][0])
2625 self.parent += nodelist
2626 if not self.state_machine.at_eof():
2627 self.blank_finish = blank_finish
2628 raise EOFError
2630 def text(self, match, context, next_state):
2631 if not self.state_machine.at_eof():
2632 self.blank_finish = self.state_machine.is_next_line_blank()
2633 raise EOFError
2636 class Text(RSTState):
2638 """
2639 Classifier of second line of a text block.
2641 Could be a paragraph, a definition list item, or a title.
2642 """
2644 patterns = {'underline': Body.patterns['line'],
2645 'text': r''}
2646 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2648 def blank(self, match, context, next_state):
2649 """End of paragraph."""
2650 # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext
2651 paragraph, literalnext = self.paragraph(
2652 context, self.state_machine.abs_line_number() - 1)
2653 self.parent += paragraph
2654 if literalnext:
2655 self.parent += self.literal_block()
2656 return [], 'Body', []
2658 def eof(self, context):
2659 if context:
2660 self.blank(None, context, None)
2661 return []
2663 def indent(self, match, context, next_state):
2664 """Definition list item."""
2665 definitionlist = nodes.definition_list()
2666 definitionlistitem, blank_finish = self.definition_list_item(context)
2667 definitionlist += definitionlistitem
2668 self.parent += definitionlist
2669 offset = self.state_machine.line_offset + 1 # next line
2670 newline_offset, blank_finish = self.nested_list_parse(
2671 self.state_machine.input_lines[offset:],
2672 input_offset=self.state_machine.abs_line_offset() + 1,
2673 node=definitionlist, initial_state='DefinitionList',
2674 blank_finish=blank_finish, blank_finish_state='Definition')
2675 self.goto_line(newline_offset)
2676 if not blank_finish:
2677 self.parent += self.unindent_warning('Definition list')
2678 return [], 'Body', []
2680 def underline(self, match, context, next_state):
2681 """Section title."""
2682 lineno = self.state_machine.abs_line_number()
2683 src, srcline = self.state_machine.get_source_and_line()
2684 title = context[0].rstrip()
2685 underline = match.string.rstrip()
2686 source = title + '\n' + underline
2687 messages = []
2688 if column_width(title) > len(underline):
2689 if len(underline) < 4:
2690 if self.state_machine.match_titles:
2691 msg = self.reporter.info(
2692 'Possible title underline, too short for the title.\n'
2693 "Treating it as ordinary text because it's so short.",
2694 source=src, line=srcline)
2695 self.parent += msg
2696 raise statemachine.TransitionCorrection('text')
2697 else:
2698 blocktext = context[0] + '\n' + self.state_machine.line
2699 msg = self.reporter.warning(
2700 'Title underline too short.',
2701 nodes.literal_block(blocktext, blocktext),
2702 source=src, line=srcline)
2703 messages.append(msg)
2704 if not self.state_machine.match_titles:
2705 blocktext = context[0] + '\n' + self.state_machine.line
2706 msg = self.reporter.severe(
2707 'Unexpected section title.',
2708 nodes.literal_block(blocktext, blocktext),
2709 source=src, line=srcline)
2710 self.parent += messages
2711 self.parent += msg
2712 return [], next_state, []
2713 style = underline[0]
2714 context[:] = []
2715 self.section(title, source, style, lineno - 1, messages)
2716 return [], next_state, []
2718 def text(self, match, context, next_state):
2719 """Paragraph."""
2720 startline = self.state_machine.abs_line_number() - 1
2721 msg = None
2722 try:
2723 block = self.state_machine.get_text_block(flush_left=1)
2724 except statemachine.UnexpectedIndentationError, instance:
2725 block, src, srcline = instance.args
2726 msg = self.reporter.error('Unexpected indentation.',
2727 source=src, line=srcline)
2728 lines = context + list(block)
2729 paragraph, literalnext = self.paragraph(lines, startline)
2730 self.parent += paragraph
2731 self.parent += msg
2732 if literalnext:
2733 try:
2734 self.state_machine.next_line()
2735 except EOFError:
2736 pass
2737 self.parent += self.literal_block()
2738 return [], next_state, []
2740 def literal_block(self):
2741 """Return a list of nodes."""
2742 indented, indent, offset, blank_finish = \
2743 self.state_machine.get_indented()
2744 while indented and not indented[-1].strip():
2745 indented.trim_end()
2746 if not indented:
2747 return self.quoted_literal_block()
2748 data = '\n'.join(indented)
2749 literal_block = nodes.literal_block(data, data)
2750 literal_block.line = offset + 1
2751 nodelist = [literal_block]
2752 if not blank_finish:
2753 nodelist.append(self.unindent_warning('Literal block'))
2754 return nodelist
2756 def quoted_literal_block(self):
2757 abs_line_offset = self.state_machine.abs_line_offset()
2758 offset = self.state_machine.line_offset
2759 parent_node = nodes.Element()
2760 new_abs_offset = self.nested_parse(
2761 self.state_machine.input_lines[offset:],
2762 input_offset=abs_line_offset, node=parent_node, match_titles=0,
2763 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2764 'initial_state': 'QuotedLiteralBlock'})
2765 self.goto_line(new_abs_offset)
2766 return parent_node.children
2768 def definition_list_item(self, termline):
2769 indented, indent, line_offset, blank_finish = \
2770 self.state_machine.get_indented()
2771 definitionlistitem = nodes.definition_list_item(
2772 '\n'.join(termline + list(indented)))
2773 lineno = self.state_machine.abs_line_number() - 1
2774 src, srcline = self.state_machine.get_source_and_line()
2775 definitionlistitem.source = src
2776 definitionlistitem.line = srcline - 1
2777 termlist, messages = self.term(termline, lineno)
2778 definitionlistitem += termlist
2779 definition = nodes.definition('', *messages)
2780 definitionlistitem += definition
2781 if termline[0][-2:] == '::':
2782 definition += self.reporter.info(
2783 'Blank line missing before literal block (after the "::")? '
2784 'Interpreted as a definition list item.',
2785 source=src, line=srcline)
2786 self.nested_parse(indented, input_offset=line_offset, node=definition)
2787 return definitionlistitem, blank_finish
2789 classifier_delimiter = re.compile(' +: +')
2791 def term(self, lines, lineno):
2792 """Return a definition_list's term and optional classifiers."""
2793 assert len(lines) == 1
2794 text_nodes, messages = self.inline_text(lines[0], lineno)
2795 term_node = nodes.term()
2796 node_list = [term_node]
2797 for i in range(len(text_nodes)):
2798 node = text_nodes[i]
2799 if isinstance(node, nodes.Text):
2800 parts = self.classifier_delimiter.split(node.rawsource)
2801 if len(parts) == 1:
2802 node_list[-1] += node
2803 else:
2805 node_list[-1] += nodes.Text(parts[0].rstrip())
2806 for part in parts[1:]:
2807 classifier_node = nodes.classifier('', part)
2808 node_list.append(classifier_node)
2809 else:
2810 node_list[-1] += node
2811 return node_list, messages
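# Definition-list classifiers as split by term() above: ' : ' (space, colon,
# space) separates the term from one or more classifiers. A sketch assuming
# `docutils.core.publish_doctree`:
#
#     from docutils.core import publish_doctree
#     source = ("term : classifier one : classifier two\n"
#               "    The definition body.\n")
#     print(publish_doctree(source).pformat())
#     # expect: definition_list_item > term, two classifier elements, and a
#     # definition containing the paragraph.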
2814 class SpecializedText(Text):
2816 """
2817 Superclass for second and subsequent lines of Text-variants.
2819 All transition methods are disabled. Override individual methods in
2820 subclasses to re-enable.
2821 """
2823 def eof(self, context):
2824 """Incomplete construct."""
2825 return []
2827 def invalid_input(self, match=None, context=None, next_state=None):
2828 """Not a compound element member. Abort this state machine."""
2829 raise EOFError
2831 blank = invalid_input
2832 indent = invalid_input
2833 underline = invalid_input
2834 text = invalid_input
2837 class Definition(SpecializedText):
2839 """Second line of potential definition_list_item."""
2841 def eof(self, context):
2842 """Not a definition."""
2843 self.state_machine.previous_line(2) # so parent SM can reassess
2844 return []
2846 def indent(self, match, context, next_state):
2847 """Definition list item."""
2848 definitionlistitem, blank_finish = self.definition_list_item(context)
2849 self.parent += definitionlistitem
2850 self.blank_finish = blank_finish
2851 return [], 'DefinitionList', []
2854 class Line(SpecializedText):
2856 """
2857 Second line of over- & underlined section title or transition marker.
2858 """
2860 eofcheck = 1 # @@@ ???
2861 """Set to 0 while parsing sections, so that we don't catch the EOF."""
2863 def eof(self, context):
2864 """Transition marker at end of section or document."""
2865 marker = context[0].strip()
2866 if self.memo.section_bubble_up_kludge:
2867 self.memo.section_bubble_up_kludge = 0
2868 elif len(marker) < 4:
2869 self.state_correction(context)
2870 if self.eofcheck: # ignore EOFError with sections
2871 lineno = self.state_machine.abs_line_number() - 1
2872 transition = nodes.transition(rawsource=context[0])
2873 transition.line = lineno
2874 self.parent += transition
2875 self.eofcheck = 1
2876 return []
2878 def blank(self, match, context, next_state):
2879 """Transition marker."""
2880 src, srcline = self.state_machine.get_source_and_line()
2881 marker = context[0].strip()
2882 if len(marker) < 4:
2883 self.state_correction(context)
2884 transition = nodes.transition(rawsource=marker)
2885 transition.source = src
2886 transition.line = srcline - 1
2887 self.parent += transition
2888 return [], 'Body', []
2890 def text(self, match, context, next_state):
2891 """Potential over- & underlined title."""
2892 lineno = self.state_machine.abs_line_number() - 1
2893 src, srcline = self.state_machine.get_source_and_line()
2894 overline = context[0]
2895 title = match.string
2896 underline = ''
2897 try:
2898 underline = self.state_machine.next_line()
2899 except EOFError:
2900 blocktext = overline + '\n' + title
2901 if len(overline.rstrip()) < 4:
2902 self.short_overline(context, blocktext, lineno, 2)
2903 else:
2904 msg = self.reporter.severe(
2905 'Incomplete section title.',
2906 nodes.literal_block(blocktext, blocktext),
2907 source=src, line=srcline-1)
2908 self.parent += msg
2909 return [], 'Body', []
2910 source = '%s\n%s\n%s' % (overline, title, underline)
2911 overline = overline.rstrip()
2912 underline = underline.rstrip()
2913 if not self.transitions['underline'][0].match(underline):
2914 blocktext = overline + '\n' + title + '\n' + underline
2915 if len(overline.rstrip()) < 4:
2916 self.short_overline(context, blocktext, lineno, 2)
2917 else:
2918 msg = self.reporter.severe(
2919 'Missing matching underline for section title overline.',
2920 nodes.literal_block(source, source),
2921 source=src, line=srcline-1)
2922 self.parent += msg
2923 return [], 'Body', []
2924 elif overline != underline:
2925 blocktext = overline + '\n' + title + '\n' + underline
2926 if len(overline.rstrip()) < 4:
2927 self.short_overline(context, blocktext, lineno, 2)
2928 else:
2929 msg = self.reporter.severe(
2930 'Title overline & underline mismatch.',
2931 nodes.literal_block(source, source),
2932 source=src, line=srcline-1)
2933 self.parent += msg
2934 return [], 'Body', []
2935 title = title.rstrip()
2936 messages = []
2937 if column_width(title) > len(overline):
2938 blocktext = overline + '\n' + title + '\n' + underline
2939 if len(overline.rstrip()) < 4:
2940 self.short_overline(context, blocktext, lineno, 2)
2941 else:
2942 msg = self.reporter.warning(
2943 'Title overline too short.',
2944 nodes.literal_block(source, source),
2945 source=src, line=srcline-1)
2946 messages.append(msg)
2947 style = (overline[0], underline[0])
2948 self.eofcheck = 0 # @@@ not sure this is correct
2949 self.section(title.lstrip(), source, style, lineno + 1, messages)
2950 self.eofcheck = 1
2951 return [], 'Body', []
2953 indent = text # indented title
2955 def underline(self, match, context, next_state):
2956 overline = context[0]
2957 blocktext = overline + '\n' + self.state_machine.line
2958 lineno = self.state_machine.abs_line_number() - 1
2959 src, srcline = self.state_machine.get_source_and_line()
2960 if len(overline.rstrip()) < 4:
2961 self.short_overline(context, blocktext, lineno, 1)
2962 msg = self.reporter.error(
2963 'Invalid section title or transition marker.',
2964 nodes.literal_block(blocktext, blocktext),
2965 source=src, line=srcline-1)
2966 self.parent += msg
2967 return [], 'Body', []
2969 def short_overline(self, context, blocktext, lineno, lines=1):
2970 src, srcline = self.state_machine.get_source_and_line(lineno)
2971 msg = self.reporter.info(
2972 'Possible incomplete section title.\nTreating the overline as '
2973 "ordinary text because it's so short.",
2974 source=src, line=srcline)
2975 self.parent += msg
2976 self.state_correction(context, lines)
2978 def state_correction(self, context, lines=1):
2979 self.state_machine.previous_line(lines)
2980 context[:] = []
2981 raise statemachine.StateCorrection('Body', 'text')
2984 class QuotedLiteralBlock(RSTState):
2986 """
2987 Nested parse handler for quoted (unindented) literal blocks.
2989 Special-purpose. Not for inclusion in `state_classes`.
2990 """
2992 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
2993 'text': r''}
2994 initial_transitions = ('initial_quoted', 'text')
2996 def __init__(self, state_machine, debug=0):
2997 RSTState.__init__(self, state_machine, debug)
2998 self.messages = []
2999 self.initial_lineno = None
3001 def blank(self, match, context, next_state):
3002 if context:
3003 raise EOFError
3004 else:
3005 return context, next_state, []
3007 def eof(self, context):
3008 if context:
3009 src, srcline = self.state_machine.get_source_and_line(
3010 self.initial_lineno)
3011 text = '\n'.join(context)
3012 literal_block = nodes.literal_block(text, text)
3013 literal_block.source = src
3014 literal_block.line = srcline
3015 self.parent += literal_block
3016 else:
3017 self.parent += self.reporter.warning(
3018 'Literal block expected; none found.',
3019 line=self.state_machine.abs_line_number())
3020 # src not available, because statemachine.input_lines is empty
3021 self.state_machine.previous_line()
3022 self.parent += self.messages
3023 return []
3025 def indent(self, match, context, next_state):
3026 assert context, ('QuotedLiteralBlock.indent: context should not '
3027 'be empty!')
3028 self.messages.append(
3029 self.reporter.error('Unexpected indentation.',
3030 line=self.state_machine.abs_line_number()))
3031 self.state_machine.previous_line()
3032 raise EOFError
3034 def initial_quoted(self, match, context, next_state):
3035 """Match arbitrary quote character on the first line only."""
3036 self.remove_transition('initial_quoted')
3037 quote = match.string[0]
3038 pattern = re.compile(re.escape(quote), re.UNICODE)
3039 # New transition matches consistent quotes only:
3040 self.add_transition('quoted',
3041 (pattern, self.quoted, self.__class__.__name__))
3042 self.initial_lineno = self.state_machine.abs_line_number()
3043 return [match.string], next_state, []
3045 def quoted(self, match, context, next_state):
3046 """Match consistent quotes on subsequent lines."""
3047 context.append(match.string)
3048 return context, next_state, []
3050 def text(self, match, context, next_state):
3051 if context:
3052 src, srcline = self.state_machine.get_source_and_line()
3053 self.messages.append(
3054 self.reporter.error('Inconsistent literal block quoting.',
3055 source=src, line=srcline))
3056 self.state_machine.previous_line()
3057 raise EOFError
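# Quoted (unindented) literal blocks as handled by this state: every line
# must begin with the same quoting character, taken from the first line.
# A sketch assuming `docutils.core.publish_doctree`:
#
#     from docutils.core import publish_doctree
#     source = ("A paragraph introducing code::\n"
#               "\n"
#               "> quoted line one\n"
#               "> quoted line two\n")
#     print(publish_doctree(source).pformat())
#     # expect: a literal_block containing both '>'-prefixed lines; a line
#     # starting with a different character would instead end the block with
#     # an 'Inconsistent literal block quoting.' error.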
3060 state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3061 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3062 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3063 """Standard set of State classes used to start `RSTStateMachine`."""