Change the default input encoding from ``None`` to "utf-8" in io.py.
[docutils.git] / docutils / docutils / transforms / frontmatter.py
blob0629026a885a3e6847dc70bc59256e080b264803
1 # $Id$
2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 Transforms_ related to the front matter of a document or a section
7 (information found before the main text):
9 - `DocTitle`: Used to transform a lone top level section's title to
10 the document title, promote a remaining lone top-level section's
11 title to the document subtitle, and determine the document's title
12 metadata (document['title']) based on the document title and/or the
13 "title" setting.
15 - `SectionSubTitle`: Used to transform a lone subsection into a
16 subtitle.
18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
19 elements.
21 .. _transforms: https://docutils.sourceforge.io/docs/api/transforms.html
22 """
24 __docformat__ = 'reStructuredText'
26 import re
28 from docutils import nodes, parsers, utils
29 from docutils.transforms import TransformError, Transform
class TitlePromoter(Transform):

    """
    Abstract base class for the `DocTitle` and `SectionSubTitle`
    transforms; provides the shared title/subtitle promotion helpers.
    """

    def promote_title(self, node):
        """
        Lift the title of a lone child section up into `node`.

        The tree ::

            <node>
                <section>
                    <title>
                    ...

        becomes ::

            <node>
                <title>
                ...

        `node` is normally a document.

        Return True if a title was promoted, False if `node` has no
        promotion candidate.  Raise TypeError if `node` is not a
        `nodes.Element`.
        """
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        # `node` must not start with a title yet.
        assert len(node) == 0 or not isinstance(node[0], nodes.title)

        promoted, position = self.candidate_index(node)
        if position is None:
            return False

        # Merge the section's attributes into the node.
        # NOTE: Pass `replace=False` to keep attributes that already
        #       exist in node instead of overwriting them.
        # NOTE: Drop `and_source` to NOT copy the 'source' attribute
        #       from the section.
        node.update_all_atts_concatenating(
            promoted, replace=True, and_source=True)

        # setup_child is called automatically for all nodes.
        heading = promoted[:1]      # the section title
        leading = node[:position]   # what preceded the section in node
        remainder = promoted[1:]    # the rest of the section
        node[:] = heading + leading + remainder
        assert isinstance(node[0], nodes.title)
        return True

    def promote_subtitle(self, node):
        """
        Turn the title of a lone child section into a subtitle of `node`.

        The tree ::

            <node>
                <title>
                <section>
                    <title>
                    ...

        becomes ::

            <node>
                <title>
                <subtitle>
                ...

        Return True if a subtitle was promoted, False if `node` has no
        promotion candidate.  Raise TypeError if `node` is not a
        `nodes.Element`.
        """
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        promoted, position = self.candidate_index(node)
        if position is None:
            return False

        new_subtitle = nodes.subtitle()

        # Merge the subsection's attributes into the new subtitle.
        # NOTE: Pass `replace=False` to keep already existing
        #       attributes instead of overwriting them.
        # NOTE: Drop `and_source` to NOT copy the 'source' attribute
        #       from the subsection.
        new_subtitle.update_all_atts_concatenating(
            promoted, replace=True, and_source=True)

        # The subtitle takes over the children of the subsection's title.
        new_subtitle[:] = promoted[0][:]

        node[:] = [
            *node[:1],             # title
            new_subtitle,
            *node[1:position],     # everything that preceded the section
            *promoted[1:],         # everything that was in the subsection
        ]
        return True

    def candidate_index(self, node):
        """
        Locate the promotion candidate of `node`.

        Return a tuple ``(candidate, index)``, or ``(None, None)`` if
        there is no valid candidate.  A valid candidate is the first
        child that is not a `nodes.PreBibliographic` instance, provided
        it is a `nodes.section` and the last child of `node`.
        """
        position = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        if position is None:
            return None, None
        if len(node) > position + 1:
            # Something follows the candidate: it is not "lone".
            return None, None
        candidate = node[position]
        if not isinstance(candidate, nodes.section):
            return None, None
        return candidate, position
class DocTitle(TitlePromoter):

    """
    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly. Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its first
       element (instances of `nodes.PreBibliographic` are ignored),
       the top-level section's title becomes the document's title, and
       the top-level section's contents become the document's immediate
       contents. The title is also used for the <document> element's
       "title" attribute default value.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single second-level
       section as its first "non-PreBibliographic" element, that section's
       title is promoted to the document's subtitle, and that section's
       contents become the document's immediate contents.

    Example:
        Given this input text::

            =================
             Top-Level Title
            =================

            Second-Level Title
            ~~~~~~~~~~~~~~~~~~

            A paragraph.

        After parsing and running the DocTitle transform, the result is::

            <document names="top-level title">
                <title>
                    Top-Level Title
                <subtitle names="second-level title">
                    Second-Level Title
                <paragraph>
                    A paragraph.

        (Note that the implicit hyperlink target generated by the
        "Second-Level Title" is preserved on the <subtitle> element
        itself.)

    Any `nodes.PreBibliographic` instances occurring before the
    document title or subtitle are accumulated and inserted as
    the first body elements after the title(s).

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    """

    default_priority = 320

    def set_metadata(self):
        """
        Fill in the document['title'] metadata title.

        The first available of these sources wins:

        * An already present document['title'] attribute.
        * The "title" setting.
        * The document title node (as promoted by promote_title).
        """
        document = self.document
        if document.hasattr('title'):
            return
        configured = document.settings.title
        if configured is not None:
            document['title'] = configured
            return
        if len(document) and isinstance(document[0], nodes.title):
            document['title'] = document[0].astext()

    def apply(self):
        """
        Promote title and subtitle (unless disabled by the
        "doctitle_xform" setting), then set the title metadata.
        """
        document = self.document
        enabled = document.settings.setdefault('doctitle_xform', True)
        # promote_(sub)title are defined in the TitlePromoter base class.
        # A subtitle is only looked for after a title has been promoted.
        if enabled and self.promote_title(document):
            self.promote_subtitle(document)
        # Set document['title'].
        self.set_metadata()
class SectionSubTitle(TitlePromoter):

    """
    This works like document subtitles, but for sections.  For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle
                ...

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle
            ...

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self):
        """
        Try to promote a subtitle in every section of the document,
        unless disabled by the "sectsubtitle_xform" setting.
        """
        document = self.document
        if document.settings.setdefault('sectsubtitle_xform', True):
            # The tree is modified while we walk it, but only in the
            # part not yet visited, so the iterator returned by
            # findall() is not corrupted.
            for sect in document.findall(nodes.section):
                self.promote_subtitle(sect)
class DocInfo(Transform):

    """
    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description. This transform
    should be run *after* the `DocTitle` transform.

    If the document contains a field list as the first element (instances
    of `nodes.PreBibliographic` are ignored), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the <docinfo> element (except for a
    dedication and/or an abstract, which become <topic> elements after
    <docinfo>).

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile$
            ...

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py
            ...

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform. The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $". Any
    RCS keyword can be processed in any bibliographic field. The
    dollar signs and leading RCS keyword name are removed. Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended. The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone). The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

      (Since the source file for this text is itself stored under CVS,
      we can't show an example of the "Date" RCS keyword because we
      can't prevent any RCS keywords used in this explanation from
      being expanded. Only the "RCSfile" keyword is stable; its
      expansion text changes only if the file name changes.)

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    .. _reStructuredText Markup Specification:
       https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
          'author': nodes.author,
          'authors': nodes.authors,
          'organization': nodes.organization,
          'address': nodes.address,
          'contact': nodes.contact,
          'version': nodes.version,
          'revision': nodes.revision,
          'status': nodes.status,
          'date': nodes.date,
          'copyright': nodes.copyright,
          'dedication': nodes.topic,
          'abstract': nodes.topic}
    """Canonical field name (lowcased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self):
        """
        Replace a leading bibliographic field list by docinfo elements.

        Does nothing if the "docinfo_xform" setting is false or if the
        first child that is not `nodes.PreBibliographic` is not a
        field list.
        """
        if not self.document.settings.setdefault('docinfo_xform', True):
            return
        document = self.document
        index = document.first_child_not_matching_class(
              nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            # Insertion point: behind titles, decoration and meta nodes.
            biblioindex = document.first_child_not_matching_class(
                  (nodes.Titular, nodes.decoration, nodes.meta))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]         # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist

    def extract_bibliographic(self, field_list):
        """
        Convert the fields of `field_list` and return a list of nodes.

        The returned list holds a <docinfo> element (if it got any
        children) followed by the "dedication" and "abstract" topics
        (if present).  Fields that cannot be converted are appended to
        the docinfo unchanged, apart from RCS keyword cleanup and an
        additional "classes" value derived from the field name.
        """
        docinfo = nodes.docinfo()
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                          field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    title[0].rawsource = labels[canonical]
                    topics[canonical] = biblioclass(
                        '', title, classes=[canonical], *field[1].children)
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                # Keep the field as generic docinfo field.
                if len(field[-1]) == 1 \
                   and isinstance(field[-1][0], nodes.paragraph):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                # Add a class value derived from the field name
                # (for registered bibliographic field names, too).
                classvalue = nodes.make_id(normedname)
                if classvalue:
                    field['classes'].append(classvalue)
                docinfo.append(field)
        nodelist = []
        # Use len(): the result must not depend on node truthiness.
        if len(docinfo) != 0:
            nodelist.append(docinfo)
        for name in ('dedication', 'abstract'):
            if topics[name]:
                nodelist.append(topics[name])
        return nodelist

    def check_empty_biblio_field(self, field, name):
        """
        Return True if the body of `field` is not empty.

        Otherwise append a warning to the field body and return False.
        """
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract empty bibliographic field "{name}".',
                base_node=field)
            return False
        return True

    def check_compound_biblio_field(self, field, name):
        """
        Return True if the body of `field` is a single paragraph.

        A one-line body parsed as enumerated list (e.g. an author name
        with leading initial) is re-parsed and replaced by the resulting
        paragraph.  If the check fails, append a warning to the field
        body and return False.
        """
        # Check that the `field` body contains a single paragraph
        # (i.e. it must *not* be a compound element).
        f_body = field[-1]
        if len(f_body) == 1 and isinstance(f_body[0], nodes.paragraph):
            return True
        # Restore single author name with initial (E. Xampl) parsed as
        # enumerated list
        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#enumerated-lists
        if (isinstance(f_body[0], nodes.enumerated_list)
            and '\n' not in f_body.rawsource.strip()):
            # parse into a dummy document and use created nodes
            _document = utils.new_document('*DocInfo transform*',
                                           field.document.settings)
            parser = parsers.rst.Parser()
            # The prepended backslash escapes the enumerator.
            parser.parse('\\'+f_body.rawsource, _document)
            if (len(_document.children) == 1
                and isinstance(_document.children[0], nodes.paragraph)):
                f_body.children = _document.children
                return True
        # Check failed, add a warning
        content = [f'<{e.tagname}>' for e in f_body.children]
        if len(content) > 1:
            content = '[' + ', '.join(content) + ']'
        else:
            content = 'a ' + content[0]
        f_body += self.document.reporter.warning(
            f'Bibliographic field "{name}"\nmust contain '
            f'a single <paragraph>, not {content}.',
            base_node=field)
        return False

    # (pattern, replacement) pairs passed to `utils.clean_rcs_keywords()`.
    # The adjacent string literals keep the keywords in this file from
    # being expanded themselves.
    rcs_keyword_substitutions = [
          (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                      r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
          (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
          (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]

    def extract_authors(self, field, name, docinfo):
        """
        Append an <authors> element for `field` to `docinfo`.

        The field body may be a single paragraph, a bullet list, or
        several paragraphs.  If no author can be extracted, append a
        warning to the field body and re-raise the `TransformError`.
        """
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract "{name}" from bibliographic field:\n'
                f'Bibliographic field "{name}" must contain either\n'
                ' a single paragraph (with author names separated by one of '
                f'"{"".join(self.language.author_separators)}"),\n'
                ' multiple paragraphs (one per author),\n'
                ' or a bullet list with one author name per item.\n'
                'Note: Leading initials can cause (mis)recognizing names '
                'as enumerated list.',
                base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """Return a list of author names found in the paragraph of `field`.

        Each list item is a one-element list holding a `nodes.Text`
        instance with one author name.

        Author names must be separated by one of the "author separators"
        defined for the document language (default: ";" or ",").
        The separators are tried in order; the first one that actually
        splits the text is used.

        Raise `TransformError` if the field body contains no text.
        """
        # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
        text = ''.join(str(node)
                       for node in field[1].findall(nodes.Text))
        if not text:
            raise TransformError
        # Without any (matching) separator, the text is a single name.
        authornames = [text]
        for authorsep in self.language.author_separators:
            # don't split at escaped `authorsep`;
            # the separator is literal text, not a regular expression:
            pattern = '(?<!\x00)' + re.escape(authorsep)
            authornames = re.split(pattern, text)
            if len(authornames) > 1:
                break
        authornames = (name.strip() for name in authornames)
        return [[nodes.Text(name)] for name in authornames if name]

    def authors_from_bullet_list(self, field):
        """
        Return a list with the paragraph children of each list item.

        Comments in the list are skipped.  Raise `TransformError` if an
        item is not a single paragraph or if there is no author at all.
        """
        authors = []
        for item in field[1][0]:
            if isinstance(item, nodes.comment):
                continue
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        """
        Return a list with the children of each paragraph in the body.

        Comments are skipped.  Raise `TransformError` if the body
        contains anything but paragraphs and comments.
        """
        for item in field[1]:
            if not isinstance(item, (nodes.paragraph, nodes.comment)):
                raise TransformError
        authors = [item.children for item in field[1]
                   if not isinstance(item, nodes.comment)]
        return authors