# Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
# Copyright: This module has been placed in the public domain.
"""
Transforms_ related to the front matter of a document or a section
(information found before the main text):

- `DocTitle`: Used to transform a lone top level section's title to
  the document title, promote a remaining lone top-level section's
  title to the document subtitle, and determine the document's title
  metadata (document['title']) based on the document title and/or the
  "title" setting.

- `SectionSubTitle`: Used to transform a lone subsection into a
  subtitle.

- `DocInfo`: Used to transform a bibliographic field list into docinfo
  elements.

.. _transforms: https://docutils.sourceforge.io/docs/api/transforms.html
"""
24 __docformat__
= 'reStructuredText'
import re

from docutils import nodes, parsers, utils
from docutils.transforms import TransformError, Transform
class TitlePromoter(Transform):

    """
    Abstract base class for DocTitle and SectionSubTitle transforms.

    Provides the shared tree manipulation: locating a lone section that
    qualifies for promotion (`candidate_index()`) and moving its title and
    contents up into the parent node (`promote_title()`,
    `promote_subtitle()`).
    """

    def promote_title(self, node) -> bool:
        """
        Promote the title of a lone child section to the title of `node`.

        Transform the following tree::

            <node>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                ...

        `node` is normally a document.

        Return True if a title was promoted, False if `node` has no
        promotion candidate (see `candidate_index()`).
        Raise TypeError if `node` is not derived from `nodes.Element`.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        # `node` must not have a title yet.
        assert not (len(node) and isinstance(node[0], nodes.title))
        section, index = self.candidate_index(node)
        if index is None:
            return False

        # Transfer the section's attributes to the node:
        # NOTE: Change `replace` to False to NOT replace attributes that
        #       already exist in node with those in section.
        # NOTE: Remove `and_source` to NOT copy the 'source'
        #       attribute from section
        node.update_all_atts_concatenating(section, replace=True,
                                           and_source=True)

        # setup_child is called automatically for all nodes.
        node[:] = (section[:1]        # section title
                   + node[:index]     # everything that was in the
                                      # node before the section
                   + section[1:])     # everything that was in the section
        assert isinstance(node[0], nodes.title)
        return True

    def promote_subtitle(self, node) -> bool:
        """
        Promote the title of a lone child section to the subtitle of `node`.

        Transform the following node tree::

            <node>
                <title>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                <subtitle>
                ...

        Return True if a subtitle was promoted, False if `node` has no
        promotion candidate (see `candidate_index()`).
        Raise TypeError if `node` is not derived from `nodes.Element`.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        subsection, index = self.candidate_index(node)
        if index is None:
            return False
        subtitle = nodes.subtitle()

        # Transfer the subsection's attributes to the new subtitle
        # NOTE: Change `replace` to False to NOT replace attributes
        #       that already exist in node with those in section.
        # NOTE: Remove `and_source` to NOT copy the 'source'
        #       attribute from section.
        subtitle.update_all_atts_concatenating(subsection, replace=True,
                                               and_source=True)

        # Transfer the contents of the subsection's title to the
        # subtitle:
        subtitle[:] = subsection[0][:]
        node[:] = (node[:1]       # title
                   + [subtitle]
                   # everything that was before the section:
                   + node[1:index]
                   # everything that was in the subsection:
                   + subsection[1:])
        return True

    def candidate_index(self, node):
        """
        Find and return the promotion candidate and its index.

        The candidate is the first child of `node` that is not a
        `nodes.PreBibliographic` instance.  It is only valid if it is
        also the last child of `node` and is a `nodes.section`.

        Return (None, None) if no valid candidate was found.
        """
        index = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        if (index is None or len(node) > (index + 1)
                or not isinstance(node[index], nodes.section)):
            return None, None
        return node[index], index
class DocTitle(TitlePromoter):

    """
    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly. Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its first
       element (instances of `nodes.PreBibliographic` are ignored),
       the top-level section's title becomes the document's title, and
       the top-level section's contents become the document's immediate
       contents. The title is also used for the <document> element's
       "title" attribute default value.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single second-level
       section as its first "non-PreBibliographic" element, that section's
       title is promoted to the document's subtitle, and that section's
       contents become the document's immediate contents.

    Example
    =======

    Given this input text::

        =================
         Top-Level Title
        =================

        Second-Level Title
        ~~~~~~~~~~~~~~~~~~

        A paragraph.

    After parsing and running the DocTitle transform, the result is::

        <document names="top-level title">
            <title>
                Top-Level Title
            <subtitle names="second-level title">
                Second-Level Title
            <paragraph>
                A paragraph.

    (Note that the implicit hyperlink target generated by the
    "Second-Level Title" is preserved on the <subtitle> element
    itself.)

    Any `nodes.PreBibliographic` instances occurring before the
    document title or subtitle are accumulated and inserted as
    the first body elements after the title(s).

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    """

    default_priority = 320

    def set_metadata(self) -> None:
        """
        Set document['title'] metadata title from the following
        sources, listed in order of priority:

        * Existing document['title'] attribute.
        * "title" setting.
        * Document title node (as promoted by promote_title).
        """
        document = self.document
        if document.hasattr('title'):
            return  # an existing attribute always wins
        if document.settings.title is not None:
            document['title'] = document.settings.title
        elif len(document) and isinstance(document[0], nodes.title):
            document['title'] = document[0].astext()

    def apply(self) -> None:
        """
        Promote the document title and subtitle, then set the metadata.

        Promotion is skipped when the "doctitle_xform" setting is false
        (it defaults to True); the document['title'] metadata is set
        in either case.
        """
        if self.document.settings.setdefault('doctitle_xform', True):
            # promote_(sub)title defined in TitlePromoter base class.
            if self.promote_title(self.document):
                # If a title has been promoted, also try to promote a
                # subtitle.
                self.promote_subtitle(self.document)
        # Set document['title'].
        self.set_metadata()
class SectionSubTitle(TitlePromoter):

    """
    This works like document subtitles, but for sections.  For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle
                ...

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle
            ...

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self) -> None:
        """
        Try to promote a subtitle in every section of the document.

        Does nothing when the "sectsubtitle_xform" setting is false
        (it defaults to True).
        """
        if not self.document.settings.setdefault('sectsubtitle_xform', True):
            return
        for section in self.document.findall(nodes.section):
            # On our way through the node tree, we are modifying it
            # but only the not-yet-visited part, so that the iterator
            # returned by findall() is not corrupted.
            self.promote_subtitle(section)
class DocInfo(Transform):

    """
    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description. This transform
    should be run *after* the `DocTitle` transform.

    If the document contains a field list as the first element (instances
    of `nodes.PreBibliographic` are ignored), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the <docinfo> element (except for a
    dedication and/or an abstract, which become <topic> elements after
    <docinfo>).

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile: frontmatter.py,v $
            ...

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py
            ...

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform. The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $". Any
    RCS keyword can be processed in any bibliographic field. The
    dollar signs and leading RCS keyword name are removed. Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended. The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone). The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

      (Since the source file for this text is itself stored under CVS,
      we can't show an example of the "Date" RCS keyword because we
      can't prevent any RCS keywords used in this explanation from
      being expanded. Only the "RCSfile" keyword is stable; its
      expansion text changes only if the file name changes.)

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    .. _reStructuredText Markup Specification:
       https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
        'author': nodes.author,
        'authors': nodes.authors,
        'organization': nodes.organization,
        'address': nodes.address,
        'contact': nodes.contact,
        'version': nodes.version,
        'revision': nodes.revision,
        'status': nodes.status,
        'date': nodes.date,
        'copyright': nodes.copyright,
        'dedication': nodes.topic,
        'abstract': nodes.topic}
    """Canonical field name (lowcased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self) -> None:
        """
        Replace a leading bibliographic field list with docinfo nodes.

        Does nothing when the "docinfo_xform" setting is false (it
        defaults to True), when the document has no child that is not
        `nodes.PreBibliographic`, or when the first such child is not
        a field list.
        """
        if not self.document.settings.setdefault('docinfo_xform', True):
            return
        document = self.document
        index = document.first_child_not_matching_class(
            nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            # The new nodes go right after titles, decoration and meta
            # elements; this position is never after `index`, so deleting
            # the candidate first does not shift it.
            biblioindex = document.first_child_not_matching_class(
                (nodes.Titular, nodes.decoration, nodes.meta))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]         # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist

    def extract_bibliographic(self, field_list):
        """
        Convert the fields of `field_list` and return the new nodes.

        Return a list with (in this order, each only if present) the
        <docinfo> element, the "dedication" topic, and the "abstract"
        topic.  Fields that are not registered bibliographic fields or
        that fail validation are kept as generic <field> elements inside
        <docinfo>, with a class value derived from the field name.
        """
        docinfo = nodes.docinfo()
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                        field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    title[0].rawsource = labels[canonical]
                    topics[canonical] = biblioclass(
                        '', title, *field[1].children, classes=[canonical])
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                # Fall back to a generic field; still clean up a lone
                # paragraph's RCS keywords.
                if (len(field[-1]) == 1
                        and isinstance(field[-1][0], nodes.paragraph)):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                # if normedname not in bibliofields:
                classvalue = nodes.make_id(normedname)
                if classvalue:
                    field['classes'].append(classvalue)
                docinfo.append(field)
        nodelist = []
        if len(docinfo) != 0:
            nodelist.append(docinfo)
        if topics['dedication']:
            nodelist.append(topics['dedication'])
        if topics['abstract']:
            nodelist.append(topics['abstract'])
        return nodelist

    def check_empty_biblio_field(self, field, name) -> bool:
        """
        Return True if the body of `field` is not empty.

        For an empty body, append a warning to it and return False.
        """
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract empty bibliographic field "{name}".',
                base_node=field)
            return False
        return True

    def check_compound_biblio_field(self, field, name) -> bool:
        """
        Return True if the body of `field` is a single paragraph.

        A one-line body that was parsed as an enumerated list is
        re-parsed with a leading backslash; if that yields a single
        paragraph, the body's children are replaced by the result and
        True is returned.  Otherwise append a warning to the field body
        and return False.

        The field body must not be empty (see
        `check_empty_biblio_field()`).
        """
        # Check that the `field` body contains a single paragraph
        # (i.e. it must *not* be a compound element).
        f_body = field[-1]
        if len(f_body) == 1 and isinstance(f_body[0], nodes.paragraph):
            return True
        # Restore single author name with initial (E. Xampl) parsed as
        # enumerated list
        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#enumerated-lists
        if (isinstance(f_body[0], nodes.enumerated_list)
                and '\n' not in f_body.rawsource.strip()):
            # parse into a dummy document and use created nodes
            _document = utils.new_document('*DocInfo transform*',
                                           field.document.settings)
            parser = parsers.rst.Parser()
            parser.parse('\\'+f_body.rawsource, _document)
            if (len(_document.children) == 1
                    and isinstance(_document.children[0], nodes.paragraph)):
                f_body.children = _document.children
                return True
        # Check failed, add a warning
        content = [f'<{e.tagname}>' for e in f_body.children]
        if len(content) > 1:
            content = '[' + ', '.join(content) + ']'
        else:
            content = 'a ' + content[0]
        f_body += self.document.reporter.warning(
            f'Bibliographic field "{name}"\nmust contain '
            f'a single <paragraph>, not {content}.',
            base_node=field)
        return False

    # (pattern, replacement) pairs passed to `utils.clean_rcs_keywords()`.
    # The leading r'\$' is a separate literal so that the keyword
    # patterns are not themselves expanded by RCS/CVS.
    rcs_keyword_substitutions = [
        (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                    r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
        (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
        (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]

    def extract_authors(self, field, name, docinfo):
        """
        Append an <authors> element built from `field` to `docinfo`.

        The field body may be a single paragraph, a single bullet list,
        or several paragraphs.  If no author can be extracted, append a
        warning to the field body and re-raise the `TransformError` so
        that the caller can fall back to a generic field.
        """
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract "{name}" from bibliographic field:\n'
                f'Bibliographic field "{name}" must contain either\n'
                ' a single paragraph (with author names separated by one of '
                f'"{"".join(self.language.author_separators)}"),\n'
                ' multiple paragraphs (one per author),\n'
                ' or a bullet list with one author name per item.\n'
                'Note: Leading initials can cause (mis)recognizing names '
                'as enumerated list.',
                base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """Return list of Text nodes with author names in `field`.

        Author names must be separated by one of the "author separators"
        defined for the document language (default: ";" or ",").

        Raise `TransformError` if the field body contains no text.
        """
        # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
        text = ''.join(str(node)
                       for node in field[1].findall(nodes.Text))
        if not text:
            raise TransformError
        # Pre-bind so that a language without any author separators
        # yields the whole text as one name instead of an unbound local.
        authornames = [text]
        for authorsep in self.language.author_separators:
            # don't split at escaped `authorsep`:
            pattern = '(?<!\x00)%s' % authorsep
            authornames = re.split(pattern, text)
            if len(authornames) > 1:
                break  # the first separator that splits the text wins
        authornames = (name.strip() for name in authornames)
        return [[nodes.Text(name)] for name in authornames if name]

    def authors_from_bullet_list(self, field):
        """
        Return a list of child-node lists, one per bullet list item.

        Comments are skipped.  Raise `TransformError` if an item does
        not consist of exactly one paragraph or if no author is found.
        """
        authors = []
        for item in field[1][0]:
            if isinstance(item, nodes.comment):
                continue
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        """
        Return a list of child-node lists, one per paragraph in `field`.

        Comments are skipped.  Raise `TransformError` if the field body
        contains anything but paragraphs and comments.
        """
        for item in field[1]:
            if not isinstance(item, (nodes.paragraph, nodes.comment)):
                raise TransformError
        authors = [item.children for item in field[1]
                   if not isinstance(item, nodes.comment)]
        return authors