docutils/docutils/transforms/frontmatter.py

   1 # $Id$
   2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 Transforms related to the front matter of a document or a section
   7 (information found before the main text):
   8
   9 - `DocTitle`: Used to transform a lone top level section's title to
  10   the document title, promote a remaining lone top-level section's
  11   title to the document subtitle, and determine the document's title
  12   metadata (document['title']) based on the document title and/or the
  13   "title" setting.
  14
  15 - `SectionSubTitle`: Used to transform a lone subsection into a
  16   subtitle.
  17
  18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
  19   elements.
  20 """
  21
  22 __docformat__ = 'reStructuredText'
  23
  24 import re
  25 from docutils import nodes, utils
  26 from docutils.transforms import TransformError, Transform
  27
  28
  29 class TitlePromoter(Transform):
  30
  31     """
  32     Abstract base class for DocTitle and SectionSubTitle transforms.
  33     """
  34
  35     def promote_title(self, node):
  36         """
  37         Transform the following tree::
  38
  39             <node>
  40                 <section>
  41                     <title>
  42                     ...
  43
  44         into ::
  45
  46             <node>
  47                 <title>
  48                 ...
  49
  50         `node` is normally a document.
  51         """
  52         # Type check
  53         if not isinstance(node, nodes.Element):
  54             raise TypeError, 'node must be of Element-derived type.'
  55
  56         # `node` must not have a title yet.
  57         assert not (len(node) and isinstance(node[0], nodes.title))
  58         section, index = self.candidate_index(node)
  59         if index is None:
  60             return None
  61
  62         # Transfer the section's attributes to the node:
  63         # NOTE: Change second parameter to False to NOT replace
  64         #       attributes that already exist in node with those in
  65         #       section
  66         # NOTE: Remove third parameter to NOT copy the 'source'
  67         #       attribute from section
  68         node.update_all_atts_concatenating(section, True, True)
  69
  70         # setup_child is called automatically for all nodes.
  71         node[:] = (section[:1]        # section title
  72                    + node[:index]     # everything that was in the
  73                                       # node before the section
  74                    + section[1:])     # everything that was in the section
  75         assert isinstance(node[0], nodes.title)
  76         return 1
  77
  78     def promote_subtitle(self, node):
  79         """
  80         Transform the following node tree::
  81
  82             <node>
  83                 <title>
  84                 <section>
  85                     <title>
  86                     ...
  87
  88         into ::
  89
  90             <node>
  91                 <title>
  92                 <subtitle>
  93                 ...
  94         """
  95         # Type check
  96         if not isinstance(node, nodes.Element):
  97             raise TypeError, 'node must be of Element-derived type.'
  98
  99         subsection, index = self.candidate_index(node)
 100         if index is None:
 101             return None
 102         subtitle = nodes.subtitle()
 103
 104         # Transfer the subsection's attributes to the new subtitle
 105         # NOTE: Change second parameter to False to NOT replace
 106         #       attributes that already exist in node with those in
 107         #       section
 108         # NOTE: Remove third parameter to NOT copy the 'source'
 109         #       attribute from section
 110         subtitle.update_all_atts_concatenating(subsection, True, True)
 111
 112         # Transfer the contents of the subsection's title to the
 113         # subtitle:
 114         subtitle[:] = subsection[0][:]
 115         node[:] = (node[:1]       # title
 116                    + [subtitle]
 117                    # everything that was before the section:
 118                    + node[1:index]
 119                    # everything that was in the subsection:
 120                    + subsection[1:])
 121         return 1
 122
 123     def candidate_index(self, node):
 124         """
 125         Find and return the promotion candidate and its index.
 126
 127         Return (None, None) if no valid candidate was found.
 128         """
 129         index = node.first_child_not_matching_class(
 130             nodes.PreBibliographic)
 131         if index is None or len(node) > (index + 1) or \
 132                not isinstance(node[index], nodes.section):
 133             return None, None
 134         else:
 135             return node[index], index
 136
 137
 138 class DocTitle(TitlePromoter):
 139
 140     """
 141     In reStructuredText_, there is no way to specify a document title
 142     and subtitle explicitly. Instead, we can supply the document title
 143     (and possibly the subtitle as well) implicitly, and use this
 144     two-step transform to "raise" or "promote" the title(s) (and their
 145     corresponding section contents) to the document level.
 146
 147     1. If the document contains a single top-level section as its
 148        first non-comment element, the top-level section's title
 149        becomes the document's title, and the top-level section's
 150        contents become the document's immediate contents. The lone
 151        top-level section header must be the first non-comment element
 152        in the document.
 153
 154        For example, take this input text::
 155
 156            =================
 157             Top-Level Title
 158            =================
 159
 160            A paragraph.
 161
 162        Once parsed, it looks like this::
 163
 164            <document>
 165                <section names="top-level title">
 166                    <title>
 167                        Top-Level Title
 168                    <paragraph>
 169                        A paragraph.
 170
 171        After running the DocTitle transform, we have::
 172
 173            <document names="top-level title">
 174                <title>
 175                    Top-Level Title
 176                <paragraph>
 177                    A paragraph.
 178
 179     2. If step 1 successfully determines the document title, we
 180        continue by checking for a subtitle.
 181
 182        If the lone top-level section itself contains a single
 183        second-level section as its first non-comment element, that
 184        section's title is promoted to the document's subtitle, and
 185        that section's contents become the document's immediate
 186        contents. Given this input text::
 187
 188            =================
 189             Top-Level Title
 190            =================
 191
 192            Second-Level Title
 193            ~~~~~~~~~~~~~~~~~~
 194
 195            A paragraph.
 196
 197        After parsing and running the Section Promotion transform, the
 198        result is::
 199
 200            <document names="top-level title">
 201                <title>
 202                    Top-Level Title
 203                <subtitle names="second-level title">
 204                    Second-Level Title
 205                <paragraph>
 206                    A paragraph.
 207
 208        (Note that the implicit hyperlink target generated by the
 209        "Second-Level Title" is preserved on the "subtitle" element
 210        itself.)
 211
 212     Any comment elements occurring before the document title or
 213     subtitle are accumulated and inserted as the first body elements
 214     after the title(s).
 215
 216     This transform also sets the document's metadata title
 217     (document['title']).
 218
 219     .. _reStructuredText: http://docutils.sf.net/rst.html
 220     """
 221
 222     default_priority = 320
 223
 224     def set_metadata(self):
 225         """
 226         Set document['title'] metadata title from the following
 227         sources, listed in order of priority:
 228
 229         * Existing document['title'] attribute.
 230         * "title" setting.
 231         * Document title node (as promoted by promote_title).
 232         """
 233         if not self.document.hasattr('title'):
 234             if self.document.settings.title is not None:
 235                 self.document['title'] = self.document.settings.title
 236             elif len(self.document) and isinstance(self.document[0], nodes.title):
 237                 self.document['title'] = self.document[0].astext()
 238
 239     def apply(self):
 240         if getattr(self.document.settings, 'doctitle_xform', 1):
 241             # promote_(sub)title defined in TitlePromoter base class.
 242             if self.promote_title(self.document):
 243                 # If a title has been promoted, also try to promote a
 244                 # subtitle.
 245                 self.promote_subtitle(self.document)
 246         # Set document['title'].
 247         self.set_metadata()
 248
 249
 250 class SectionSubTitle(TitlePromoter):
 251
 252     """
 253     This works like document subtitles, but for sections.  For example, ::
 254
 255         <section>
 256             <title>
 257                 Title
 258             <section>
 259                 <title>
 260                     Subtitle
 261                 ...
 262
 263     is transformed into ::
 264
 265         <section>
 266             <title>
 267                 Title
 268             <subtitle>
 269                 Subtitle
 270             ...
 271
 272     For details refer to the docstring of DocTitle.
 273     """
 274
 275     default_priority = 350
 276
 277     def apply(self):
 278         if not getattr(self.document.settings, 'sectsubtitle_xform', 1):
 279             return
 280         for section in self.document.traverse(nodes.section):
 281             # On our way through the node tree, we are deleting
 282             # sections, but we call self.promote_subtitle for those
 283             # sections nonetheless.  To do: Write a test case which
 284             # shows the problem and discuss on Docutils-develop.
 285             self.promote_subtitle(section)
 286
 287
 288 class DocInfo(Transform):
 289
 290     """
 291     This transform is specific to the reStructuredText_ markup syntax;
 292     see "Bibliographic Fields" in the `reStructuredText Markup
 293     Specification`_ for a high-level description. This transform
 294     should be run *after* the `DocTitle` transform.
 295
 296     Given a field list as the first non-comment element after the
 297     document title and subtitle (if present), registered bibliographic
 298     field names are transformed to the corresponding DTD elements,
 299     becoming child elements of the "docinfo" element (except for a
 300     dedication and/or an abstract, which become "topic" elements after
 301     "docinfo").
 302
 303     For example, given this document fragment after parsing::
 304
 305         <document>
 306             <title>
 307                 Document Title
 308             <field_list>
 309                 <field>
 310                     <field_name>
 311                         Author
 312                     <field_body>
 313                         <paragraph>
 314                             A. Name
 315                 <field>
 316                     <field_name>
 317                         Status
 318                     <field_body>
 319                         <paragraph>
 320                             $RCSfile$
 321             ...
 322
 323     After running the bibliographic field list transform, the
 324     resulting document tree would look like this::
 325
 326         <document>
 327             <title>
 328                 Document Title
 329             <docinfo>
 330                 <author>
 331                     A. Name
 332                 <status>
 333                     frontmatter.py
 334             ...
 335
 336     The "Status" field contained an expanded RCS keyword, which is
 337     normally (but optionally) cleaned up by the transform. The sole
 338     contents of the field body must be a paragraph containing an
 339     expanded RCS keyword of the form "$keyword: expansion text $". Any
 340     RCS keyword can be processed in any bibliographic field. The
 341     dollar signs and leading RCS keyword name are removed. Extra
 342     processing is done for the following RCS keywords:
 343
 344     - "RCSfile" expands to the name of the file in the RCS or CVS
 345       repository, which is the name of the source file with a ",v"
 346       suffix appended. The transform will remove the ",v" suffix.
 347
 348     - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
 349       time zone). The RCS Keywords transform will extract just the
 350       date itself and transform it to an ISO 8601 format date, as in
 351       "2000-12-31".
 352
 353       (Since the source file for this text is itself stored under CVS,
 354       we can't show an example of the "Date" RCS keyword because we
 355       can't prevent any RCS keywords used in this explanation from
 356       being expanded. Only the "RCSfile" keyword is stable; its
 357       expansion text changes only if the file name changes.)
 358
 359     .. _reStructuredText: http://docutils.sf.net/rst.html
 360     .. _reStructuredText Markup Specification:
 361        http://docutils.sf.net/docs/ref/rst/restructuredtext.html
 362     """
 363
 364     default_priority = 340
 365
 366     biblio_nodes = {
 367           'author': nodes.author,
 368           'authors': nodes.authors,
 369           'organization': nodes.organization,
 370           'address': nodes.address,
 371           'contact': nodes.contact,
 372           'version': nodes.version,
 373           'revision': nodes.revision,
 374           'status': nodes.status,
 375           'date': nodes.date,
 376           'copyright': nodes.copyright,
 377           'dedication': nodes.topic,
 378           'abstract': nodes.topic}
 379     """Canonical field name (lowcased) to node class name mapping for
 380     bibliographic fields (field_list)."""
 381
 382     def apply(self):
 383         if not getattr(self.document.settings, 'docinfo_xform', 1):
 384             return
 385         document = self.document
 386         index = document.first_child_not_matching_class(
 387               nodes.PreBibliographic)
 388         if index is None:
 389             return
 390         candidate = document[index]
 391         if isinstance(candidate, nodes.field_list):
 392             biblioindex = document.first_child_not_matching_class(
 393                   (nodes.Titular, nodes.Decorative))
 394             nodelist = self.extract_bibliographic(candidate)
 395             del document[index]         # untransformed field list (candidate)
 396             document[biblioindex:biblioindex] = nodelist
 397
 398     def extract_bibliographic(self, field_list):
 399         docinfo = nodes.docinfo()
 400         bibliofields = self.language.bibliographic_fields
 401         labels = self.language.labels
 402         topics = {'dedication': None, 'abstract': None}
 403         for field in field_list:
 404             try:
 405                 name = field[0][0].astext()
 406                 normedname = nodes.fully_normalize_name(name)
 407                 if not (len(field) == 2 and normedname in bibliofields
 408                         and self.check_empty_biblio_field(field, name)):
 409                     raise TransformError
 410                 canonical = bibliofields[normedname]
 411                 biblioclass = self.biblio_nodes[canonical]
 412                 if issubclass(biblioclass, nodes.TextElement):
 413                     if not self.check_compound_biblio_field(field, name):
 414                         raise TransformError
 415                     utils.clean_rcs_keywords(
 416                           field[1][0], self.rcs_keyword_substitutions)
 417                     docinfo.append(biblioclass('', '', *field[1][0]))
 418                 elif issubclass(biblioclass, nodes.authors):
 419                     self.extract_authors(field, name, docinfo)
 420                 elif issubclass(biblioclass, nodes.topic):
 421                     if topics[canonical]:
 422                         field[-1] += self.document.reporter.warning(
 423                             'There can only be one "%s" field.' % name,
 424                             base_node=field)
 425                         raise TransformError
 426                     title = nodes.title(name, labels[canonical])
 427                     topics[canonical] = biblioclass(
 428                         '', title, classes=[canonical], *field[1].children)
 429                 else:
 430                     docinfo.append(biblioclass('', *field[1].children))
 431             except TransformError:
 432                 if len(field[-1]) == 1 \
 433                        and isinstance(field[-1][0], nodes.paragraph):
 434                     utils.clean_rcs_keywords(
 435                         field[-1][0], self.rcs_keyword_substitutions)
 436                 if normedname not in bibliofields:
 437                     classvalue = nodes.make_id(normedname)
 438                     if classvalue:
 439                         field['classes'].append(classvalue)
 440                 docinfo.append(field)
 441         nodelist = []
 442         if len(docinfo) != 0:
 443             nodelist.append(docinfo)
 444         for name in ('dedication', 'abstract'):
 445             if topics[name]:
 446                 nodelist.append(topics[name])
 447         return nodelist
 448
 449     def check_empty_biblio_field(self, field, name):
 450         if len(field[-1]) < 1:
 451             field[-1] += self.document.reporter.warning(
 452                   'Cannot extract empty bibliographic field "%s".' % name,
 453                   base_node=field)
 454             return None
 455         return 1
 456
 457     def check_compound_biblio_field(self, field, name):
 458         if len(field[-1]) > 1:
 459             field[-1] += self.document.reporter.warning(
 460                   'Cannot extract compound bibliographic field "%s".' % name,
 461                   base_node=field)
 462             return None
 463         if not isinstance(field[-1][0], nodes.paragraph):
 464             field[-1] += self.document.reporter.warning(
 465                   'Cannot extract bibliographic field "%s" containing '
 466                   'anything other than a single paragraph.' % name,
 467                   base_node=field)
 468             return None
 469         return 1
 470
 471     rcs_keyword_substitutions = [
 472           (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
 473                       r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
 474           (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
 475           (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1'),]
 476
 477     def extract_authors(self, field, name, docinfo):
 478         try:
 479             if len(field[1]) == 1:
 480                 if isinstance(field[1][0], nodes.paragraph):
 481                     authors = self.authors_from_one_paragraph(field)
 482                 elif isinstance(field[1][0], nodes.bullet_list):
 483                     authors = self.authors_from_bullet_list(field)
 484                 else:
 485                     raise TransformError
 486             else:
 487                 authors = self.authors_from_paragraphs(field)
 488             authornodes = [nodes.author('', '', *author)
 489                            for author in authors if author]
 490             if len(authornodes) >= 1:
 491                 docinfo.append(nodes.authors('', *authornodes))
 492             else:
 493                 raise TransformError
 494         except TransformError:
 495             field[-1] += self.document.reporter.warning(
 496                   'Bibliographic field "%s" incompatible with extraction: '
 497                   'it must contain either a single paragraph (with authors '
 498                   'separated by one of "%s"), multiple paragraphs (one per '
 499                   'author), or a bullet list with one paragraph (one author) '
 500                   'per item.'
 501                   % (name, ''.join(self.language.author_separators)),
 502                   base_node=field)
 503             raise
 504
 505     def authors_from_one_paragraph(self, field):
 506         text = field[1][0].astext().strip()
 507         if not text:
 508             raise TransformError
 509         for authorsep in self.language.author_separators:
 510             authornames = text.split(authorsep)
 511             if len(authornames) > 1:
 512                 break
 513         authornames = [author.strip() for author in authornames]
 514         authors = [[nodes.Text(author)] for author in authornames if author]
 515         return authors
 516
 517     def authors_from_bullet_list(self, field):
 518         authors = []
 519         for item in field[1][0]:
 520             if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
 521                 raise TransformError
 522             authors.append(item[0].children)
 523         if not authors:
 524             raise TransformError
 525         return authors
 526
 527     def authors_from_paragraphs(self, field):
 528         for item in field[1]:
 529             if not isinstance(item, nodes.paragraph):
 530                 raise TransformError
 531         authors = [item.children for item in field[1]]
 532         return authors