Use `nodes.parse_measure()` in rST directive option conversion.
[docutils.git] / docutils / docutils / transforms / frontmatter.py
blob b85a2816aa98104e8c9e85361938ca005398ae69
1 # $Id$
2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 Transforms_ related to the front matter of a document or a section
7 (information found before the main text):
9 - `DocTitle`: Used to transform a lone top level section's title to
10 the document title, promote a remaining lone top-level section's
11 title to the document subtitle, and determine the document's title
12 metadata (document['title']) based on the document title and/or the
13 "title" setting.
15 - `SectionSubTitle`: Used to transform a lone subsection into a
16 subtitle.
18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
19 elements.
21 .. _transforms: https://docutils.sourceforge.io/docs/api/transforms.html
22 """
24 __docformat__ = 'reStructuredText'
26 import re
28 from docutils import nodes, parsers, utils
29 from docutils.transforms import TransformError, Transform
class TitlePromoter(Transform):

    """
    Abstract base class for DocTitle and SectionSubTitle transforms.

    Supplies the shared tree surgery: finding a lone child section and
    turning its title into the title or the subtitle of the parent node.
    """

    def promote_title(self, node) -> bool:
        """
        Replace a lone child section of `node` by its own contents.

        The tree ::

            <node>
                <section>
                    <title>
                    ...

        is rewritten to ::

            <node>
                <title>
                ...

        `node` is normally a document.

        Return True if a section was promoted, False if `node` has no
        promotion candidate (see `candidate_index()`).
        Raise `TypeError` if `node` is not a `nodes.Element`.
        """
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        # Precondition: `node` does not start with a title yet.
        assert not len(node) or not isinstance(node[0], nodes.title)

        lone_section, position = self.candidate_index(node)
        if position is None:
            return False

        # Merge the section's attributes into `node`.
        # NOTE: Pass ``replace=False`` to keep attribute values that
        #       already exist in `node` instead of overwriting them.
        # NOTE: Drop ``and_source`` to NOT copy the 'source' attribute
        #       from the section.
        node.update_all_atts_concatenating(
            lone_section, replace=True, and_source=True)

        # Build the new child list (setup_child is called automatically
        # for all nodes on slice assignment).
        promoted_title = lone_section[:1]
        preceding = node[:position]       # what came before the section
        section_body = lone_section[1:]   # what was inside the section
        node[:] = promoted_title + preceding + section_body
        assert isinstance(node[0], nodes.title)
        return True

    def promote_subtitle(self, node) -> bool:
        """
        Turn the title of a lone child section into a subtitle of `node`.

        The tree ::

            <node>
                <title>
                <section>
                    <title>
                    ...

        is rewritten to ::

            <node>
                <title>
                <subtitle>
                ...

        Return True if a subtitle was created, False if `node` has no
        promotion candidate (see `candidate_index()`).
        Raise `TypeError` if `node` is not a `nodes.Element`.
        """
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        lone_section, position = self.candidate_index(node)
        if position is None:
            return False

        new_subtitle = nodes.subtitle()
        # Merge the subsection's attributes into the new subtitle.
        # NOTE: Pass ``replace=False`` to keep attribute values that
        #       already exist instead of overwriting them.
        # NOTE: Drop ``and_source`` to NOT copy the 'source' attribute
        #       from the subsection.
        new_subtitle.update_all_atts_concatenating(
            lone_section, replace=True, and_source=True)

        # The subsection's title contents become the subtitle contents.
        new_subtitle[:] = lone_section[0][:]

        existing_title = node[:1]
        preceding = node[1:position]      # what came before the section
        section_body = lone_section[1:]   # what was inside the subsection
        node[:] = existing_title + [new_subtitle] + preceding + section_body
        return True

    def candidate_index(self, node):
        """
        Find and return the promotion candidate and its index.

        The candidate is the first child of `node` that is not a
        `nodes.PreBibliographic` instance.  It only qualifies if it is
        also the last child of `node` and is a `nodes.section`.

        Return (None, None) if no valid candidate was found.
        """
        position = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        if position is None:
            return None, None
        if len(node) > position + 1:
            # Something follows the candidate: it is not a *lone* section.
            return None, None
        candidate = node[position]
        if not isinstance(candidate, nodes.section):
            return None, None
        return candidate, position
class DocTitle(TitlePromoter):

    """
    Promote section titles to the document title and subtitle.

    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly.  Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its first
       element (instances of `nodes.PreBibliographic` are ignored),
       the top-level section's title becomes the document's title, and
       the top-level section's contents become the document's immediate
       contents.  The title is also used for the <document> element's
       "title" attribute default value.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single second-level
       section as its first "non-PreBibliographic" element, that section's
       title is promoted to the document's subtitle, and that section's
       contents become the document's immediate contents.

    Example:
        Given this input text::

            =================
             Top-Level Title
            =================

            Second-Level Title
            ~~~~~~~~~~~~~~~~~~

            A paragraph.

        After parsing and running the DocTitle transform, the result is::

            <document names="top-level title">
                <title>
                    Top-Level Title
                <subtitle names="second-level title">
                    Second-Level Title
                <paragraph>
                    A paragraph.

        (Note that the implicit hyperlink target generated by the
        "Second-Level Title" is preserved on the <subtitle> element
        itself.)

    Any `nodes.PreBibliographic` instances occurring before the
    document title or subtitle are accumulated and inserted as
    the first body elements after the title(s).

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    """

    default_priority = 320

    def set_metadata(self) -> None:
        """
        Set document['title'] metadata title from the following
        sources, listed in order of priority:

        * Existing document['title'] attribute.
        * "title" setting.
        * Document title node (as promoted by promote_title).

        If none of the sources applies, the attribute is left unset.
        """
        document = self.document
        if document.hasattr('title'):
            # An explicit attribute always wins.
            return
        configured_title = document.settings.title
        if configured_title is not None:
            document['title'] = configured_title
            return
        if len(document) and isinstance(document[0], nodes.title):
            document['title'] = document[0].astext()

    def apply(self) -> None:
        """
        Run the title/subtitle promotion and set the title metadata.

        Promotion is skipped when the "doctitle_xform" setting is false
        (it defaults to True); the metadata is set in either case.
        """
        document = self.document
        promotion_enabled = document.settings.setdefault(
            'doctitle_xform', True)
        # promote_(sub)title are defined in the TitlePromoter base class.
        # A subtitle is only looked for after a title has been promoted.
        if promotion_enabled and self.promote_title(document):
            self.promote_subtitle(document)
        # Set document['title'].
        self.set_metadata()
class SectionSubTitle(TitlePromoter):

    """
    Promote lone subsection titles to section subtitles.

    This works like document subtitles, but for sections.  For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle
                ...

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle
            ...

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self) -> None:
        """
        Try to promote a subtitle in every section of the document.

        Does nothing when the "sectsubtitle_xform" setting is false
        (it defaults to True).
        """
        settings = self.document.settings
        if settings.setdefault('sectsubtitle_xform', True):
            # The tree is modified while findall() walks it, but only in
            # the part that has not been visited yet, so the iterator is
            # not corrupted.  Keep the iteration lazy for that reason.
            for section in self.document.findall(nodes.section):
                self.promote_subtitle(section)
class DocInfo(Transform):

    """
    Transform a bibliographic field list into docinfo elements.

    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description.  This transform
    should be run *after* the `DocTitle` transform.

    If the document contains a field list as the first element (instances
    of `nodes.PreBibliographic` are ignored), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the <docinfo> element (except for a
    dedication and/or an abstract, which become <topic> elements after
    <docinfo>).

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile$
            ...

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py
            ...

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform.  The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $".  Any
    RCS keyword can be processed in any bibliographic field.  The
    dollar signs and leading RCS keyword name are removed.  Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended.  The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone).  The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

      (Since the source file for this text is itself stored under CVS,
      we can't show an example of the "Date" RCS keyword because we
      can't prevent any RCS keywords used in this explanation from
      being expanded.  Only the "RCSfile" keyword is stable; its
      expansion text changes only if the file name changes.)

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    .. _reStructuredText Markup Specification:
       https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
          'author': nodes.author,
          'authors': nodes.authors,
          'organization': nodes.organization,
          'address': nodes.address,
          'contact': nodes.contact,
          'version': nodes.version,
          'revision': nodes.revision,
          'status': nodes.status,
          'date': nodes.date,
          'copyright': nodes.copyright,
          'dedication': nodes.topic,
          'abstract': nodes.topic}
    """Canonical field name (lowercased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self) -> None:
        """
        Replace a leading bibliographic field list by docinfo nodes.

        Does nothing when the "docinfo_xform" setting is false (it
        defaults to True) or when the first child that is not a
        `nodes.PreBibliographic` instance is not a field list.
        """
        if not self.document.settings.setdefault('docinfo_xform', True):
            return
        document = self.document
        index = document.first_child_not_matching_class(
              nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            # Insertion point: first child that is neither a title/subtitle,
            # a decoration, nor a meta element.
            biblioindex = document.first_child_not_matching_class(
                  (nodes.Titular, nodes.decoration, nodes.meta))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]         # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist

    def extract_bibliographic(self, field_list):
        """
        Convert the fields of `field_list` and return the new nodes.

        Return a list containing, in this order and only if present:
        the <docinfo> element, the "dedication" topic, and the
        "abstract" topic.

        Fields that cannot be converted (unregistered name, wrong number
        of children, empty or unsuitable body, repeated topic) are kept
        as generic <field> elements inside <docinfo>, with a class value
        derived from the normalized field name.
        """
        docinfo = nodes.docinfo()
        # NOTE(review): `self.language` is not set in this class;
        # presumably provided by the `Transform` base class.
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    # Simple text field: body must be a single paragraph.
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                          field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    title[0].rawsource = labels[canonical]
                    topics[canonical] = biblioclass(
                        '', title, classes=[canonical], *field[1].children)
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                # Fall back to a generic field; still clean RCS keywords
                # if the body is a single paragraph.
                if (len(field[-1]) == 1
                        and isinstance(field[-1][0], nodes.paragraph)):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                # if normedname not in bibliofields:
                classvalue = nodes.make_id(normedname)
                if classvalue:
                    field['classes'].append(classvalue)
                docinfo.append(field)
        nodelist = []
        if len(docinfo):
            nodelist.append(docinfo)
        if topics['dedication']:
            nodelist.append(topics['dedication'])
        if topics['abstract']:
            nodelist.append(topics['abstract'])
        return nodelist

    def check_empty_biblio_field(self, field, name) -> bool:
        """
        Return True if the body of `field` has at least one child.

        Otherwise append a warning to the field body and return False.
        """
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract empty bibliographic field "{name}".',
                base_node=field)
            return False
        return True

    def check_compound_biblio_field(self, field, name) -> bool:
        """
        Return True if the body of `field` is a single paragraph.

        A body that was parsed as an enumerated list from a single source
        line (e.g. a name with a leading initial) is re-parsed with the
        list marker escaped; if that yields a single paragraph, the body's
        children are replaced by it and True is returned.

        Otherwise append a warning to the field body and return False.
        The body must not be empty (see `check_empty_biblio_field()`).
        """
        # Check that the `field` body contains a single paragraph
        # (i.e. it must *not* be a compound element).
        f_body = field[-1]
        if len(f_body) == 1 and isinstance(f_body[0], nodes.paragraph):
            return True
        # Restore single author name with initial (E. Xampl) parsed as
        # enumerated list
        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#enumerated-lists
        if (isinstance(f_body[0], nodes.enumerated_list)
                and '\n' not in f_body.rawsource.strip()):
            # parse into a dummy document and use created nodes
            _document = utils.new_document('*DocInfo transform*',
                                           field.document.settings)
            parser = parsers.rst.Parser()
            # The leading backslash escapes the enumerator.
            parser.parse('\\'+f_body.rawsource, _document)
            if (len(_document.children) == 1
                    and isinstance(_document.children[0], nodes.paragraph)):
                f_body.children = _document.children
                return True
        # Check failed, add a warning
        content = [f'<{e.tagname}>' for e in f_body.children]
        if len(content) > 1:
            content = '[' + ', '.join(content) + ']'
        else:
            content = 'a ' + content[0]
        f_body += self.document.reporter.warning(
            f'Bibliographic field "{name}"\nmust contain '
            f'a single <paragraph>, not {content}.',
            base_node=field)
        return False

    # (pattern, replacement) pairs passed to `utils.clean_rcs_keywords()`.
    # NOTE(review): the ``r'\$' r'Date...'`` split presumably keeps the
    # literals from being expanded as RCS keywords themselves -- confirm.
    rcs_keyword_substitutions = [
          (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                      r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
          (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
          (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]

    def extract_authors(self, field, name, docinfo):
        """
        Append an <authors> element built from `field` to `docinfo`.

        The field body may be a single paragraph, a single bullet list,
        or several paragraphs (comments are tolerated).

        On failure, append a warning to the field body and re-raise the
        `TransformError`.
        """
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            # Skip empty child lists.
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract "{name}" from bibliographic field:\n'
                f'Bibliographic field "{name}" must contain either\n'
                ' a single paragraph (with author names separated by one of '
                f'"{"".join(self.language.author_separators)}"),\n'
                ' multiple paragraphs (one per author),\n'
                ' or a bullet list with one author name per item.\n'
                'Note: Leading initials can cause (mis)recognizing names '
                'as enumerated list.',
                base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """Return a list of author names found in the body of `field`.

        Each item is a one-element list holding a `nodes.Text` instance.

        Author names must be separated by one of the "author separators"
        defined for the document language (default: ";" or ",").
        The separators are tried in order; the first one that splits the
        text into more than one part is used.  If none does (or if no
        separators are defined), the whole text is one author name.

        Raise `TransformError` if the field body contains no text.
        """
        # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
        text = ''.join(str(node)
                       for node in field[1].findall(nodes.Text))
        if not text:
            raise TransformError
        # Fallback for an empty separator list (previously an unbound
        # local variable).
        authornames = [text]
        for authorsep in self.language.author_separators:
            # don't split at escaped `authorsep`;
            # separators are literal strings, not regular expressions:
            pattern = '(?<!\x00)' + re.escape(authorsep)
            authornames = re.split(pattern, text)
            if len(authornames) > 1:
                break
        authornames = (name.strip() for name in authornames)
        return [[nodes.Text(name)] for name in authornames if name]

    def authors_from_bullet_list(self, field):
        """Return a list of child-node lists, one per bullet list item.

        Comments in the list are skipped.  Raise `TransformError` if an
        item is not a single paragraph or if no author is found.
        """
        authors = []
        for item in field[1][0]:
            if isinstance(item, nodes.comment):
                continue
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        """Return a list of child-node lists, one per body paragraph.

        Comments are skipped.  Raise `TransformError` if the field body
        contains anything but paragraphs and comments.
        """
        for item in field[1]:
            if not isinstance(item, (nodes.paragraph, nodes.comment)):
                raise TransformError
        authors = [item.children for item in field[1]
                   if not isinstance(item, nodes.comment)]
        return authors