docutils/transforms/frontmatter.py

   1 # $Id$
   2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 Transforms related to the front matter of a document or a section
   7 (information found before the main text):
   8
   9 - `DocTitle`: Used to transform a lone top level section's title to
  10   the document title, promote a remaining lone top-level section's
  11   title to the document subtitle, and determine the document's title
  12   metadata (document['title']) based on the document title and/or the
  13   "title" setting.
  14
  15 - `SectionSubTitle`: Used to transform a lone subsection into a
  16   subtitle.
  17
  18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
  19   elements.
  20 """
  21
  22 __docformat__ = 'reStructuredText'
  23
  24 import re
  25 from docutils import nodes, utils
  26 from docutils.transforms import TransformError, Transform
  27
  28
  29 class TitlePromoter(Transform):
  30
  31     """
  32     Abstract base class for DocTitle and SectionSubTitle transforms.
  33     """
  34
  35     def promote_title(self, node):
  36         """
  37         Transform the following tree::
  38
  39             <node>
  40                 <section>
  41                     <title>
  42                     ...
  43
  44         into ::
  45
  46             <node>
  47                 <title>
  48                 ...
  49
  50         `node` is normally a document.
  51         """
  52         # Type check
  53         if not isinstance(node, nodes.Element):
  54             raise TypeError, 'node must be of Element-derived type.'
  55
  56         # `node` must not have a title yet.
  57         assert not (len(node) and isinstance(node[0], nodes.title))
  58         section, index = self.candidate_index(node)
  59         if index is None:
  60             return None
  61
  62         # Transfer the section's attributes to the node:
  63         # NOTE: Change second parameter to False to NOT replace
  64         #       attributes that already exist in node with those in
  65         #       section
  66         # NOTE: Remove third parameter to NOT copy the 'source'
  67         #       attribute from section
  68         node.update_all_atts_concatenating(section, True, True)
  69
  70         # setup_child is called automatically for all nodes.
  71         node[:] = (section[:1]        # section title
  72                    + node[:index]     # everything that was in the
  73                                       # node before the section
  74                    + section[1:])     # everything that was in the section
  75         assert isinstance(node[0], nodes.title)
  76         return 1
  77
  78     def promote_subtitle(self, node):
  79         """
  80         Transform the following node tree::
  81
  82             <node>
  83                 <title>
  84                 <section>
  85                     <title>
  86                     ...
  87
  88         into ::
  89
  90             <node>
  91                 <title>
  92                 <subtitle>
  93                 ...
  94         """
  95         # Type check
  96         if not isinstance(node, nodes.Element):
  97             raise TypeError, 'node must be of Element-derived type.'
  98
  99         subsection, index = self.candidate_index(node)
 100         if index is None:
 101             return None
 102         subtitle = nodes.subtitle()
 103
 104         # Transfer the subsection's attributes to the new subtitle
 105         # NOTE: Change second parameter to False to NOT replace
 106         #       attributes that already exist in node with those in
 107         #       section
 108         # NOTE: Remove third parameter to NOT copy the 'source'
 109         #       attribute from section
 110         subtitle.update_all_atts_concatenating(subsection, True, True)
 111
 112         # Transfer the contents of the subsection's title to the
 113         # subtitle:
 114         subtitle[:] = subsection[0][:]
 115         node[:] = (node[:1]       # title
 116                    + [subtitle]
 117                    # everything that was before the section:
 118                    + node[1:index]
 119                    # everything that was in the subsection:
 120                    + subsection[1:])
 121         return 1
 122
 123     def candidate_index(self, node):
 124         """
 125         Find and return the promotion candidate and its index.
 126
 127         Return (None, None) if no valid candidate was found.
 128         """
 129         index = node.first_child_not_matching_class(
 130             nodes.PreBibliographic)
 131         if index is None or len(node) > (index + 1) or \
 132                not isinstance(node[index], nodes.section):
 133             return None, None
 134         else:
 135             return node[index], index
 136
 137
 138 class DocTitle(TitlePromoter):
 139
 140     """
 141     In reStructuredText_, there is no way to specify a document title
 142     and subtitle explicitly. Instead, we can supply the document title
 143     (and possibly the subtitle as well) implicitly, and use this
 144     two-step transform to "raise" or "promote" the title(s) (and their
 145     corresponding section contents) to the document level.
 146
 147     1. If the document contains a single top-level section as its
 148        first non-comment element, the top-level section's title
 149        becomes the document's title, and the top-level section's
 150        contents become the document's immediate contents. The lone
 151        top-level section header must be the first non-comment element
 152        in the document.
 153
 154        For example, take this input text::
 155
 156            =================
 157             Top-Level Title
 158            =================
 159
 160            A paragraph.
 161
 162        Once parsed, it looks like this::
 163
 164            <document>
 165                <section names="top-level title">
 166                    <title>
 167                        Top-Level Title
 168                    <paragraph>
 169                        A paragraph.
 170
 171        After running the DocTitle transform, we have::
 172
 173            <document names="top-level title">
 174                <title>
 175                    Top-Level Title
 176                <paragraph>
 177                    A paragraph.
 178
 179     2. If step 1 successfully determines the document title, we
 180        continue by checking for a subtitle.
 181
 182        If the lone top-level section itself contains a single
 183        second-level section as its first non-comment element, that
 184        section's title is promoted to the document's subtitle, and
 185        that section's contents become the document's immediate
 186        contents. Given this input text::
 187
 188            =================
 189             Top-Level Title
 190            =================
 191
 192            Second-Level Title
 193            ~~~~~~~~~~~~~~~~~~
 194
 195            A paragraph.
 196
 197        After parsing and running the Section Promotion transform, the
 198        result is::
 199
 200            <document names="top-level title">
 201                <title>
 202                    Top-Level Title
 203                <subtitle names="second-level title">
 204                    Second-Level Title
 205                <paragraph>
 206                    A paragraph.
 207
 208        (Note that the implicit hyperlink target generated by the
 209        "Second-Level Title" is preserved on the "subtitle" element
 210        itself.)
 211
 212     Any comment elements occurring before the document title or
 213     subtitle are accumulated and inserted as the first body elements
 214     after the title(s).
 215
 216     This transform also sets the document's metadata title
 217     (document['title']).
 218
 219     .. _reStructuredText: http://docutils.sf.net/rst.html
 220     """
 221
 222     default_priority = 320
 223
 224     def set_metadata(self):
 225         """
 226         Set document['title'] metadata title from the following
 227         sources, listed in order of priority:
 228
 229         * Existing document['title'] attribute.
 230         * "title" setting.
 231         * Document title node (as promoted by promote_title).
 232         """
 233         if not self.document.hasattr('title'):
 234             if self.document.settings.title is not None:
 235                 self.document['title'] = self.document.settings.title
 236             elif len(self.document) and isinstance(self.document[0], nodes.title):
 237                 self.document['title'] = self.document[0].astext()
 238
 239     def apply(self):
 240         if getattr(self.document.settings, 'doctitle_xform', 1):
 241             # promote_(sub)title defined in TitlePromoter base class.
 242             if self.promote_title(self.document):
 243                 # If a title has been promoted, also try to promote a
 244                 # subtitle.
 245                 self.promote_subtitle(self.document)
 246         # Set document['title'].
 247         self.set_metadata()
 248
 249
 250 class SectionSubTitle(TitlePromoter):
 251
 252     """
 253     This works like document subtitles, but for sections.  For example, ::
 254
 255         <section>
 256             <title>
 257                 Title
 258             <section>
 259                 <title>
 260                     Subtitle
 261                 ...
 262
 263     is transformed into ::
 264
 265         <section>
 266             <title>
 267                 Title
 268             <subtitle>
 269                 Subtitle
 270             ...
 271
 272     For details refer to the docstring of DocTitle.
 273     """
 274
 275     default_priority = 350
 276
 277     def apply(self):
 278         if not getattr(self.document.settings, 'sectsubtitle_xform', 1):
 279             return
 280         for section in self.document.traverse(nodes.section):
 281             # On our way through the node tree, we are deleting
 282             # sections, but we call self.promote_subtitle for those
 283             # sections nonetheless.  To do: Write a test case which
 284             # shows the problem and discuss on Docutils-develop.
 285             self.promote_subtitle(section)
 286
 287
 288 class DocInfo(Transform):
 289
 290     """
 291     This transform is specific to the reStructuredText_ markup syntax;
 292     see "Bibliographic Fields" in the `reStructuredText Markup
 293     Specification`_ for a high-level description. This transform
 294     should be run *after* the `DocTitle` transform.
 295
 296     Given a field list as the first non-comment element after the
 297     document title and subtitle (if present), registered bibliographic
 298     field names are transformed to the corresponding DTD elements,
 299     becoming child elements of the "docinfo" element (except for a
 300     dedication and/or an abstract, which become "topic" elements after
 301     "docinfo").
 302
 303     For example, given this document fragment after parsing::
 304
 305         <document>
 306             <title>
 307                 Document Title
 308             <field_list>
 309                 <field>
 310                     <field_name>
 311                         Author
 312                     <field_body>
 313                         <paragraph>
 314                             A. Name
 315                 <field>
 316                     <field_name>
 317                         Status
 318                     <field_body>
 319                         <paragraph>
 320                             $RCSfile$
 321             ...
 322
 323     After running the bibliographic field list transform, the
 324     resulting document tree would look like this::
 325
 326         <document>
 327             <title>
 328                 Document Title
 329             <docinfo>
 330                 <author>
 331                     A. Name
 332                 <status>
 333                     frontmatter.py
 334             ...
 335
 336     The "Status" field contained an expanded RCS keyword, which is
 337     normally (but optionally) cleaned up by the transform. The sole
 338     contents of the field body must be a paragraph containing an
 339     expanded RCS keyword of the form "$keyword: expansion text $". Any
 340     RCS keyword can be processed in any bibliographic field. The
 341     dollar signs and leading RCS keyword name are removed. Extra
 342     processing is done for the following RCS keywords:
 343
 344     - "RCSfile" expands to the name of the file in the RCS or CVS
 345       repository, which is the name of the source file with a ",v"
 346       suffix appended. The transform will remove the ",v" suffix.
 347
 348     - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
 349       time zone). The RCS Keywords transform will extract just the
 350       date itself and transform it to an ISO 8601 format date, as in
 351       "2000-12-31".
 352
 353       (Since the source file for this text is itself stored under CVS,
 354       we can't show an example of the "Date" RCS keyword because we
 355       can't prevent any RCS keywords used in this explanation from
 356       being expanded. Only the "RCSfile" keyword is stable; its
 357       expansion text changes only if the file name changes.)
 358
 359     .. _reStructuredText: http://docutils.sf.net/rst.html
 360     .. _reStructuredText Markup Specification:
 361        http://docutils.sf.net/docs/ref/rst/restructuredtext.html
 362     """
 363
 364     default_priority = 340
 365
 366     biblio_nodes = {
 367           'author': nodes.author,
 368           'authors': nodes.authors,
 369           'organization': nodes.organization,
 370           'address': nodes.address,
 371           'contact': nodes.contact,
 372           'version': nodes.version,
 373           'revision': nodes.revision,
 374           'status': nodes.status,
 375           'date': nodes.date,
 376           'copyright': nodes.copyright,
 377           'dedication': nodes.topic,
 378           'abstract': nodes.topic}
 379     """Canonical field name (lowcased) to node class name mapping for
 380     bibliographic fields (field_list)."""
 381
 382     def apply(self):
 383         if not getattr(self.document.settings, 'docinfo_xform', 1):
 384             return
 385         document = self.document
 386         index = document.first_child_not_matching_class(
 387               nodes.PreBibliographic)
 388         if index is None:
 389             return
 390         candidate = document[index]
 391         if isinstance(candidate, nodes.field_list):
 392             biblioindex = document.first_child_not_matching_class(
 393                   (nodes.Titular, nodes.Decorative))
 394             nodelist = self.extract_bibliographic(candidate)
 395             del document[index]         # untransformed field list (candidate)
 396             document[biblioindex:biblioindex] = nodelist
 397
 398     def extract_bibliographic(self, field_list):
 399         docinfo = nodes.docinfo()
 400         bibliofields = self.language.bibliographic_fields
 401         labels = self.language.labels
 402         topics = {'dedication': None, 'abstract': None}
 403         for field in field_list:
 404             try:
 405                 name = field[0][0].astext()
 406                 normedname = nodes.fully_normalize_name(name)
 407                 if not (len(field) == 2 and normedname in bibliofields
 408                         and self.check_empty_biblio_field(field, name)):
 409                     raise TransformError
 410                 canonical = bibliofields[normedname]
 411                 biblioclass = self.biblio_nodes[canonical]
 412                 if issubclass(biblioclass, nodes.TextElement):
 413                     if not self.check_compound_biblio_field(field, name):
 414                         raise TransformError
 415                     utils.clean_rcs_keywords(
 416                           field[1][0], self.rcs_keyword_substitutions)
 417                     docinfo.append(biblioclass('', '', *field[1][0]))
 418                 elif issubclass(biblioclass, nodes.authors):
 419                     self.extract_authors(field, name, docinfo)
 420                 elif issubclass(biblioclass, nodes.topic):
 421                     if topics[canonical]:
 422                         field[-1] += self.document.reporter.warning(
 423                             'There can only be one "%s" field.' % name,
 424                             base_node=field)
 425                         raise TransformError
 426                     title = nodes.title(name, labels[canonical])
 427                     topics[canonical] = biblioclass(
 428                         '', title, classes=[canonical], *field[1].children)
 429                 else:
 430                     docinfo.append(biblioclass('', *field[1].children))
 431             except TransformError:
 432                 if len(field[-1]) == 1 \
 433                        and isinstance(field[-1][0], nodes.paragraph):
 434                     utils.clean_rcs_keywords(
 435                         field[-1][0], self.rcs_keyword_substitutions)
 436                 docinfo.append(field)
 437         nodelist = []
 438         if len(docinfo) != 0:
 439             nodelist.append(docinfo)
 440         for name in ('dedication', 'abstract'):
 441             if topics[name]:
 442                 nodelist.append(topics[name])
 443         return nodelist
 444
 445     def check_empty_biblio_field(self, field, name):
 446         if len(field[-1]) < 1:
 447             field[-1] += self.document.reporter.warning(
 448                   'Cannot extract empty bibliographic field "%s".' % name,
 449                   base_node=field)
 450             return None
 451         return 1
 452
 453     def check_compound_biblio_field(self, field, name):
 454         if len(field[-1]) > 1:
 455             field[-1] += self.document.reporter.warning(
 456                   'Cannot extract compound bibliographic field "%s".' % name,
 457                   base_node=field)
 458             return None
 459         if not isinstance(field[-1][0], nodes.paragraph):
 460             field[-1] += self.document.reporter.warning(
 461                   'Cannot extract bibliographic field "%s" containing '
 462                   'anything other than a single paragraph.' % name,
 463                   base_node=field)
 464             return None
 465         return 1
 466
 467     rcs_keyword_substitutions = [
 468           (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
 469                       r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
 470           (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
 471           (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1'),]
 472
 473     def extract_authors(self, field, name, docinfo):
 474         try:
 475             if len(field[1]) == 1:
 476                 if isinstance(field[1][0], nodes.paragraph):
 477                     authors = self.authors_from_one_paragraph(field)
 478                 elif isinstance(field[1][0], nodes.bullet_list):
 479                     authors = self.authors_from_bullet_list(field)
 480                 else:
 481                     raise TransformError
 482             else:
 483                 authors = self.authors_from_paragraphs(field)
 484             authornodes = [nodes.author('', '', *author)
 485                            for author in authors if author]
 486             if len(authornodes) >= 1:
 487                 docinfo.append(nodes.authors('', *authornodes))
 488             else:
 489                 raise TransformError
 490         except TransformError:
 491             field[-1] += self.document.reporter.warning(
 492                   'Bibliographic field "%s" incompatible with extraction: '
 493                   'it must contain either a single paragraph (with authors '
 494                   'separated by one of "%s"), multiple paragraphs (one per '
 495                   'author), or a bullet list with one paragraph (one author) '
 496                   'per item.'
 497                   % (name, ''.join(self.language.author_separators)),
 498                   base_node=field)
 499             raise
 500
 501     def authors_from_one_paragraph(self, field):
 502         text = field[1][0].astext().strip()
 503         if not text:
 504             raise TransformError
 505         for authorsep in self.language.author_separators:
 506             authornames = text.split(authorsep)
 507             if len(authornames) > 1:
 508                 break
 509         authornames = [author.strip() for author in authornames]
 510         authors = [[nodes.Text(author)] for author in authornames if author]
 511         return authors
 512
 513     def authors_from_bullet_list(self, field):
 514         authors = []
 515         for item in field[1][0]:
 516             if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
 517                 raise TransformError
 518             authors.append(item[0].children)
 519         if not authors:
 520             raise TransformError
 521         return authors
 522
 523     def authors_from_paragraphs(self, field):
 524         for item in field[1]:
 525             if not isinstance(item, nodes.paragraph):
 526                 raise TransformError
 527         authors = [item.children for item in field[1]]
 528         return authors