docutils/docutils/transforms/frontmatter.py

   1 # $Id$
   2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 Transforms_ related to the front matter of a document or a section
   7 (information found before the main text):
   8
   9 - `DocTitle`: Used to transform a lone top level section's title to
  10   the document title, promote a remaining lone top-level section's
  11   title to the document subtitle, and determine the document's title
  12   metadata (document['title']) based on the document title and/or the
  13   "title" setting.
  14
  15 - `SectionSubTitle`: Used to transform a lone subsection into a
  16   subtitle.
  17
  18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
  19   elements.
  20
  21 .. _transforms: https://docutils.sourceforge.io/docs/api/transforms.html
  22 """
  23
  24 __docformat__ = 'reStructuredText'
  25
  26 import re
  27
  28 from docutils import nodes, parsers, utils
  29 from docutils.transforms import TransformError, Transform
  30
  31
  32 class TitlePromoter(Transform):
  33
  34     """
  35     Abstract base class for DocTitle and SectionSubTitle transforms.
  36     """
  37
  38     def promote_title(self, node) -> bool:
  39         """
  40         Transform the following tree::
  41
  42             <node>
  43                 <section>
  44                     <title>
  45                     ...
  46
  47         into ::
  48
  49             <node>
  50                 <title>
  51                 ...
  52
  53         `node` is normally a document.
  54         """
  55         # Type check
  56         if not isinstance(node, nodes.Element):
  57             raise TypeError('node must be of Element-derived type.')
  58
  59         # `node` must not have a title yet.
  60         assert not (len(node) and isinstance(node[0], nodes.title))
  61         section, index = self.candidate_index(node)
  62         if index is None:
  63             return False
  64
  65         # Transfer the section's attributes to the node:
  66         # NOTE: Change `replace` to False to NOT replace attributes that
  67         #       already exist in node with those in section.
  68         # NOTE: Remove `and_source` to NOT copy the 'source'
  69         #       attribute from section
  70         node.update_all_atts_concatenating(section, replace=True,
  71                                            and_source=True)
  72
  73         # setup_child is called automatically for all nodes.
  74         node[:] = (section[:1]        # section title
  75                    + node[:index]     # everything that was in the
  76                                       # node before the section
  77                    + section[1:])     # everything that was in the section
  78         assert isinstance(node[0], nodes.title)
  79         return True
  80
  81     def promote_subtitle(self, node) -> bool:
  82         """
  83         Transform the following node tree::
  84
  85             <node>
  86                 <title>
  87                 <section>
  88                     <title>
  89                     ...
  90
  91         into ::
  92
  93             <node>
  94                 <title>
  95                 <subtitle>
  96                 ...
  97         """
  98         # Type check
  99         if not isinstance(node, nodes.Element):
 100             raise TypeError('node must be of Element-derived type.')
 101
 102         subsection, index = self.candidate_index(node)
 103         if index is None:
 104             return False
 105         subtitle = nodes.subtitle()
 106
 107         # Transfer the subsection's attributes to the new subtitle
 108         # NOTE: Change `replace` to False to NOT replace attributes
 109         #       that already exist in node with those in section.
 110         # NOTE: Remove `and_source` to NOT copy the 'source'
 111         #       attribute from section.
 112         subtitle.update_all_atts_concatenating(subsection, replace=True,
 113                                                and_source=True)
 114
 115         # Transfer the contents of the subsection's title to the
 116         # subtitle:
 117         subtitle[:] = subsection[0][:]
 118         node[:] = (node[:1]       # title
 119                    + [subtitle]
 120                    # everything that was before the section:
 121                    + node[1:index]
 122                    # everything that was in the subsection:
 123                    + subsection[1:])
 124         return True
 125
 126     def candidate_index(self, node):
 127         """
 128         Find and return the promotion candidate and its index.
 129
 130         Return (None, None) if no valid candidate was found.
 131         """
 132         index = node.first_child_not_matching_class(
 133             nodes.PreBibliographic)
 134         if (index is None or len(node) > (index + 1)
 135             or not isinstance(node[index], nodes.section)):
 136             return None, None
 137         else:
 138             return node[index], index
 139
 140
 141 class DocTitle(TitlePromoter):
 142
 143     """
 144     In reStructuredText_, there is no way to specify a document title
 145     and subtitle explicitly. Instead, we can supply the document title
 146     (and possibly the subtitle as well) implicitly, and use this
 147     two-step transform to "raise" or "promote" the title(s) (and their
 148     corresponding section contents) to the document level.
 149
 150     1. If the document contains a single top-level section as its first
 151        element (instances of `nodes.PreBibliographic` are ignored),
 152        the top-level section's title becomes the document's title, and
 153        the top-level section's contents become the document's immediate
 154        contents. The title is also used for the <document> element's
 155        "title" attribute default value.
 156
 157     2. If step 1 successfully determines the document title, we
 158        continue by checking for a subtitle.
 159
 160        If the lone top-level section itself contains a single second-level
 161        section as its first "non-PreBibliographic" element, that section's
 162        title is promoted to the document's subtitle, and that section's
 163        contents become the document's immediate contents.
 164
 165     Example:
 166        Given this input text::
 167
 168            =================
 169             Top-Level Title
 170            =================
 171
 172            Second-Level Title
 173            ~~~~~~~~~~~~~~~~~~
 174
 175            A paragraph.
 176
 177        After parsing and running the DocTitle transform, the result is::
 178
 179            <document names="top-level title">
 180                <title>
 181                    Top-Level Title
 182                <subtitle names="second-level title">
 183                    Second-Level Title
 184                <paragraph>
 185                    A paragraph.
 186
 187        (Note that the implicit hyperlink target generated by the
 188        "Second-Level Title" is preserved on the <subtitle> element
 189        itself.)
 190
 191     Any `nodes.PreBibliographic` instances occurring before the
 192     document title or subtitle are accumulated and inserted as
 193     the first body elements after the title(s).
 194
 195     .. _reStructuredText: https://docutils.sourceforge.io/rst.html
 196     """
 197
 198     default_priority = 320
 199
 200     def set_metadata(self) -> None:
 201         """
 202         Set document['title'] metadata title from the following
 203         sources, listed in order of priority:
 204
 205         * Existing document['title'] attribute.
 206         * "title" setting.
 207         * Document title node (as promoted by promote_title).
 208         """
 209         if not self.document.hasattr('title'):
 210             if self.document.settings.title is not None:
 211                 self.document['title'] = self.document.settings.title
 212             elif len(self.document) and isinstance(self.document[0],
 213                                                    nodes.title):
 214                 self.document['title'] = self.document[0].astext()
 215
 216     def apply(self) -> None:
 217         if self.document.settings.setdefault('doctitle_xform', True):
 218             # promote_(sub)title defined in TitlePromoter base class.
 219             if self.promote_title(self.document):
 220                 # If a title has been promoted, also try to promote a
 221                 # subtitle.
 222                 self.promote_subtitle(self.document)
 223         # Set document['title'].
 224         self.set_metadata()
 225
 226
 227 class SectionSubTitle(TitlePromoter):
 228
 229     """
 230     This works like document subtitles, but for sections.  For example, ::
 231
 232         <section>
 233             <title>
 234                 Title
 235             <section>
 236                 <title>
 237                     Subtitle
 238                 ...
 239
 240     is transformed into ::
 241
 242         <section>
 243             <title>
 244                 Title
 245             <subtitle>
 246                 Subtitle
 247             ...
 248
 249     For details refer to the docstring of DocTitle.
 250     """
 251
 252     default_priority = 350
 253
 254     def apply(self) -> None:
 255         if not self.document.settings.setdefault('sectsubtitle_xform', True):
 256             return
 257         for section in self.document.findall(nodes.section):
 258             # On our way through the node tree, we are modifying it
 259             # but only the not-yet-visited part, so that the iterator
 260             # returned by findall() is not corrupted.
 261             self.promote_subtitle(section)
 262
 263
 264 class DocInfo(Transform):
 265
 266     """
 267     This transform is specific to the reStructuredText_ markup syntax;
 268     see "Bibliographic Fields" in the `reStructuredText Markup
 269     Specification`_ for a high-level description. This transform
 270     should be run *after* the `DocTitle` transform.
 271
 272     If the document contains a field list as the first element (instances
 273     of `nodes.PreBibliographic` are ignored), registered bibliographic
 274     field names are transformed to the corresponding DTD elements,
 275     becoming child elements of the <docinfo> element (except for a
 276     dedication and/or an abstract, which become <topic> elements after
 277     <docinfo>).
 278
 279     For example, given this document fragment after parsing::
 280
 281         <document>
 282             <title>
 283                 Document Title
 284             <field_list>
 285                 <field>
 286                     <field_name>
 287                         Author
 288                     <field_body>
 289                         <paragraph>
 290                             A. Name
 291                 <field>
 292                     <field_name>
 293                         Status
 294                     <field_body>
 295                         <paragraph>
 296                             $RCSfile$
 297             ...
 298
 299     After running the bibliographic field list transform, the
 300     resulting document tree would look like this::
 301
 302         <document>
 303             <title>
 304                 Document Title
 305             <docinfo>
 306                 <author>
 307                     A. Name
 308                 <status>
 309                     frontmatter.py
 310             ...
 311
 312     The "Status" field contained an expanded RCS keyword, which is
 313     normally (but optionally) cleaned up by the transform. The sole
 314     contents of the field body must be a paragraph containing an
 315     expanded RCS keyword of the form "$keyword: expansion text $". Any
 316     RCS keyword can be processed in any bibliographic field. The
 317     dollar signs and leading RCS keyword name are removed. Extra
 318     processing is done for the following RCS keywords:
 319
 320     - "RCSfile" expands to the name of the file in the RCS or CVS
 321       repository, which is the name of the source file with a ",v"
 322       suffix appended. The transform will remove the ",v" suffix.
 323
 324     - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
 325       time zone). The RCS Keywords transform will extract just the
 326       date itself and transform it to an ISO 8601 format date, as in
 327       "2000-12-31".
 328
 329       (Since the source file for this text is itself stored under CVS,
 330       we can't show an example of the "Date" RCS keyword because we
 331       can't prevent any RCS keywords used in this explanation from
 332       being expanded. Only the "RCSfile" keyword is stable; its
 333       expansion text changes only if the file name changes.)
 334
 335     .. _reStructuredText: https://docutils.sourceforge.io/rst.html
 336     .. _reStructuredText Markup Specification:
 337        https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
 338     """
 339
 340     default_priority = 340
 341
 342     biblio_nodes = {
 343           'author': nodes.author,
 344           'authors': nodes.authors,
 345           'organization': nodes.organization,
 346           'address': nodes.address,
 347           'contact': nodes.contact,
 348           'version': nodes.version,
 349           'revision': nodes.revision,
 350           'status': nodes.status,
 351           'date': nodes.date,
 352           'copyright': nodes.copyright,
 353           'dedication': nodes.topic,
 354           'abstract': nodes.topic}
 355     """Canonical field name (lowcased) to node class name mapping for
 356     bibliographic fields (field_list)."""
 357
 358     def apply(self) -> None:
 359         if not self.document.settings.setdefault('docinfo_xform', True):
 360             return
 361         document = self.document
 362         index = document.first_child_not_matching_class(
 363               nodes.PreBibliographic)
 364         if index is None:
 365             return
 366         candidate = document[index]
 367         if isinstance(candidate, nodes.field_list):
 368             biblioindex = document.first_child_not_matching_class(
 369                   (nodes.Titular, nodes.decoration, nodes.meta))
 370             nodelist = self.extract_bibliographic(candidate)
 371             del document[index]         # untransformed field list (candidate)
 372             document[biblioindex:biblioindex] = nodelist
 373
 374     def extract_bibliographic(self, field_list):
 375         docinfo = nodes.docinfo()
 376         bibliofields = self.language.bibliographic_fields
 377         labels = self.language.labels
 378         topics = {'dedication': None, 'abstract': None}
 379         for field in field_list:
 380             try:
 381                 name = field[0][0].astext()
 382                 normedname = nodes.fully_normalize_name(name)
 383                 if not (len(field) == 2 and normedname in bibliofields
 384                         and self.check_empty_biblio_field(field, name)):
 385                     raise TransformError
 386                 canonical = bibliofields[normedname]
 387                 biblioclass = self.biblio_nodes[canonical]
 388                 if issubclass(biblioclass, nodes.TextElement):
 389                     if not self.check_compound_biblio_field(field, name):
 390                         raise TransformError
 391                     utils.clean_rcs_keywords(
 392                           field[1][0], self.rcs_keyword_substitutions)
 393                     docinfo.append(biblioclass('', '', *field[1][0]))
 394                 elif issubclass(biblioclass, nodes.authors):
 395                     self.extract_authors(field, name, docinfo)
 396                 elif issubclass(biblioclass, nodes.topic):
 397                     if topics[canonical]:
 398                         field[-1] += self.document.reporter.warning(
 399                             'There can only be one "%s" field.' % name,
 400                             base_node=field)
 401                         raise TransformError
 402                     title = nodes.title(name, labels[canonical])
 403                     title[0].rawsource = labels[canonical]
 404                     topics[canonical] = biblioclass(
 405                         '', title, classes=[canonical], *field[1].children)
 406                 else:
 407                     docinfo.append(biblioclass('', *field[1].children))
 408             except TransformError:
 409                 if len(field[-1]) == 1 \
 410                        and isinstance(field[-1][0], nodes.paragraph):
 411                     utils.clean_rcs_keywords(
 412                         field[-1][0], self.rcs_keyword_substitutions)
 413                 # if normedname not in bibliofields:
 414                 classvalue = nodes.make_id(normedname)
 415                 if classvalue:
 416                     field['classes'].append(classvalue)
 417                 docinfo.append(field)
 418         nodelist = []
 419         if len(docinfo):
 420             nodelist.append(docinfo)
 421         if topics['dedication']:
 422             nodelist.append(topics['dedication'])
 423         if topics['abstract']:
 424             nodelist.append(topics['abstract'])
 425         return nodelist
 426
 427     def check_empty_biblio_field(self, field, name) -> bool:
 428         if len(field[-1]) < 1:
 429             field[-1] += self.document.reporter.warning(
 430                   f'Cannot extract empty bibliographic field "{name}".',
 431                   base_node=field)
 432             return False
 433         return True
 434
 435     def check_compound_biblio_field(self, field, name) -> bool:
 436         # Check that the `field` body contains a single paragraph
 437         # (i.e. it must *not* be a compound element).
 438         f_body = field[-1]
 439         if len(f_body) == 1 and isinstance(f_body[0], nodes.paragraph):
 440             return True
 441         # Restore single author name with initial (E. Xampl) parsed as
 442         # enumerated list
 443         # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#enumerated-lists
 444         if (isinstance(f_body[0], nodes.enumerated_list)
 445             and '\n' not in f_body.rawsource.strip()):
 446             # parse into a dummy document and use created nodes
 447             _document = utils.new_document('*DocInfo transform*',
 448                                            field.document.settings)
 449             parser = parsers.rst.Parser()
 450             parser.parse('\\'+f_body.rawsource, _document)
 451             if (len(_document.children) == 1
 452                 and isinstance(_document.children[0], nodes.paragraph)):
 453                 f_body.children = _document.children
 454                 return True
 455         # Check failed, add a warning
 456         content = [f'<{e.tagname}>' for e in f_body.children]
 457         if len(content) > 1:
 458             content = '[' + ', '.join(content) + ']'
 459         else:
 460             content = 'a ' + content[0]
 461         f_body += self.document.reporter.warning(
 462                       f'Bibliographic field "{name}"\nmust contain '
 463                       f'a single <paragraph>, not {content}.',
 464                       base_node=field)
 465         return False
 466
 467     rcs_keyword_substitutions = [
 468           (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
 469                       r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
 470           (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
 471           (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]
 472
 473     def extract_authors(self, field, name, docinfo):
 474         try:
 475             if len(field[1]) == 1:
 476                 if isinstance(field[1][0], nodes.paragraph):
 477                     authors = self.authors_from_one_paragraph(field)
 478                 elif isinstance(field[1][0], nodes.bullet_list):
 479                     authors = self.authors_from_bullet_list(field)
 480                 else:
 481                     raise TransformError
 482             else:
 483                 authors = self.authors_from_paragraphs(field)
 484             authornodes = [nodes.author('', '', *author)
 485                            for author in authors if author]
 486             if len(authornodes) >= 1:
 487                 docinfo.append(nodes.authors('', *authornodes))
 488             else:
 489                 raise TransformError
 490         except TransformError:
 491             field[-1] += self.document.reporter.warning(
 492                 f'Cannot extract "{name}" from bibliographic field:\n'
 493                 f'Bibliographic field "{name}" must contain either\n'
 494                 ' a single paragraph (with author names separated by one of '
 495                 f'"{"".join(self.language.author_separators)}"),\n'
 496                 ' multiple paragraphs (one per author),\n'
 497                 ' or a bullet list with one author name per item.\n'
 498                 'Note: Leading initials can cause (mis)recognizing names '
 499                 'as enumerated list.',
 500                 base_node=field)
 501             raise
 502
 503     def authors_from_one_paragraph(self, field):
 504         """Return list of Text nodes with author names in `field`.
 505
 506         Author names must be separated by one of the "autor separators"
 507         defined for the document language (default: ";" or ",").
 508         """
 509         # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
 510         text = ''.join(str(node)
 511                        for node in field[1].findall(nodes.Text))
 512         if not text:
 513             raise TransformError
 514         for authorsep in self.language.author_separators:
 515             # don't split at escaped `authorsep`:
 516             pattern = '(?<!\x00)%s' % authorsep
 517             authornames = re.split(pattern, text)
 518             if len(authornames) > 1:
 519                 break
 520         authornames = (name.strip() for name in authornames)
 521         return [[nodes.Text(name)] for name in authornames if name]
 522
 523     def authors_from_bullet_list(self, field):
 524         authors = []
 525         for item in field[1][0]:
 526             if isinstance(item, nodes.comment):
 527                 continue
 528             if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
 529                 raise TransformError
 530             authors.append(item[0].children)
 531         if not authors:
 532             raise TransformError
 533         return authors
 534
 535     def authors_from_paragraphs(self, field):
 536         for item in field[1]:
 537             if not isinstance(item, (nodes.paragraph, nodes.comment)):
 538                 raise TransformError
 539         authors = [item.children for item in field[1]
 540                    if not isinstance(item, nodes.comment)]
 541         return authors