Apply [ 2714873 ] Fix for the overwriting of document attributes.
[docutils.git] / docutils / transforms / frontmatter.py
blob507d662af09804867004613e243d8b52b278f5f9
1 # $Id$
2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 Transforms related to the front matter of a document or a section
7 (information found before the main text):
9 - `DocTitle`: Used to transform a lone top level section's title to
10 the document title, promote a remaining lone top-level section's
11 title to the document subtitle, and determine the document's title
12 metadata (document['title']) based on the document title and/or the
13 "title" setting.
15 - `SectionSubTitle`: Used to transform a lone subsection into a
16 subtitle.
18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
19 elements.
20 """
22 __docformat__ = 'reStructuredText'
24 import re
25 from docutils import nodes, utils
26 from docutils.transforms import TransformError, Transform
class TitlePromoter(Transform):

    """
    Abstract base class for DocTitle and SectionSubTitle transforms.
    """

    def promote_title(self, node):
        """
        Transform the following tree::

            <node>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                ...

        `node` is normally a document.

        Return 1 if a title was promoted, None if `node` has no
        promotion candidate (see `candidate_index`).  Raise TypeError
        if `node` is not a `nodes.Element`.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        # `node` must not have a title yet.
        assert not (len(node) and isinstance(node[0], nodes.title))
        section, index = self.candidate_index(node)
        if index is None:
            return None

        # Transfer the section's attributes to the node:
        # NOTE: Change second parameter to False to NOT replace
        #       attributes that already exist in node with those in
        #       section
        # NOTE: Remove third parameter to NOT copy the 'source'
        #       attribute from section
        node.update_all_atts_concatenating(section, True, True)

        # setup_child is called automatically for all nodes.
        node[:] = (section[:1]        # section title
                   + node[:index]     # everything that was in the
                                      # node before the section
                   + section[1:])     # everything that was in the section
        assert isinstance(node[0], nodes.title)
        return 1

    def promote_subtitle(self, node):
        """
        Transform the following node tree::

            <node>
                <title>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                <subtitle>
                ...

        Return 1 if a subtitle was promoted, None if `node` has no
        promotion candidate (see `candidate_index`).  Raise TypeError
        if `node` is not a `nodes.Element`.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        subsection, index = self.candidate_index(node)
        if index is None:
            return None
        subtitle = nodes.subtitle()

        # Transfer the subsection's attributes to the new subtitle
        # NOTE: Change second parameter to False to NOT replace
        #       attributes that already exist in node with those in
        #       section
        # NOTE: Remove third parameter to NOT copy the 'source'
        #       attribute from section
        subtitle.update_all_atts_concatenating(subsection, True, True)

        # Transfer the contents of the subsection's title to the
        # subtitle:
        subtitle[:] = subsection[0][:]
        node[:] = (node[:1]       # title
                   + [subtitle]
                   # everything that was before the section:
                   + node[1:index]
                   # everything that was in the subsection:
                   + subsection[1:])
        return 1

    def candidate_index(self, node):
        """
        Find and return the promotion candidate and its index.

        The candidate is the first child of `node` that is not a
        `nodes.PreBibliographic` instance; it only qualifies if it is
        also the last child of `node` and is a `nodes.section`.

        Return (None, None) if no valid candidate was found.
        """
        index = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        if index is None or len(node) > (index + 1) or \
               not isinstance(node[index], nodes.section):
            return None, None
        else:
            return node[index], index
class DocTitle(TitlePromoter):

    """
    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly. Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its
       first non-comment element, the top-level section's title
       becomes the document's title, and the top-level section's
       contents become the document's immediate contents. The lone
       top-level section header must be the first non-comment element
       in the document.

       For example, take this input text::

           =================
            Top-Level Title
           =================

           A paragraph.

       Once parsed, it looks like this::

           <document>
               <section names="top-level title">
                   <title>
                       Top-Level Title
                   <paragraph>
                       A paragraph.

       After running the DocTitle transform, we have::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <paragraph>
                   A paragraph.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single
       second-level section as its first non-comment element, that
       section's title is promoted to the document's subtitle, and
       that section's contents become the document's immediate
       contents. Given this input text::

           =================
            Top-Level Title
           =================

           Second-Level Title
           ~~~~~~~~~~~~~~~~~~

           A paragraph.

       After parsing and running the Section Promotion transform, the
       result is::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <subtitle names="second-level title">
                   Second-Level Title
               <paragraph>
                   A paragraph.

       (Note that the implicit hyperlink target generated by the
       "Second-Level Title" is preserved on the "subtitle" element
       itself.)

    Any comment elements occurring before the document title or
    subtitle are accumulated and inserted as the first body elements
    after the title(s).

    This transform also sets the document's metadata title
    (document['title']).

    .. _reStructuredText: http://docutils.sf.net/rst.html
    """

    default_priority = 320

    def set_metadata(self):
        """
        Set document['title'] metadata title from the following
        sources, listed in order of priority:

        * Existing document['title'] attribute.
        * "title" setting.
        * Document title node (as promoted by promote_title).
        """
        if not self.document.hasattr('title'):
            if self.document.settings.title is not None:
                self.document['title'] = self.document.settings.title
            elif len(self.document) and isinstance(self.document[0],
                                                   nodes.title):
                self.document['title'] = self.document[0].astext()

    def apply(self):
        """
        Promote the document title (and then, if that succeeded, the
        subtitle) unless the "doctitle_xform" setting is false, then
        set the document['title'] metadata in either case.
        """
        # A missing "doctitle_xform" setting counts as enabled.
        if getattr(self.document.settings, 'doctitle_xform', 1):
            # promote_(sub)title defined in TitlePromoter base class.
            if self.promote_title(self.document):
                # If a title has been promoted, also try to promote a
                # subtitle.
                self.promote_subtitle(self.document)
        # Set document['title'].
        self.set_metadata()
class SectionSubTitle(TitlePromoter):

    """
    This works like document subtitles, but for sections.  For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self):
        """
        Try to promote a subtitle in every section of the document,
        unless the "sectsubtitle_xform" setting is false.
        """
        # A missing "sectsubtitle_xform" setting counts as enabled.
        if not getattr(self.document.settings, 'sectsubtitle_xform', 1):
            return
        for section in self.document.traverse(nodes.section):
            # On our way through the node tree, we are deleting
            # sections, but we call self.promote_subtitle for those
            # sections nonetheless.  To do: Write a test case which
            # shows the problem and discuss on Docutils-develop.
            self.promote_subtitle(section)
class DocInfo(Transform):

    """
    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description. This transform
    should be run *after* the `DocTitle` transform.

    Given a field list as the first non-comment element after the
    document title and subtitle (if present), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the "docinfo" element (except for a
    dedication and/or an abstract, which become "topic" elements after
    "docinfo").

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile$

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform. The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $". Any
    RCS keyword can be processed in any bibliographic field. The
    dollar signs and leading RCS keyword name are removed. Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended. The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone). The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

      (Since the source file for this text is itself stored under CVS,
      we can't show an example of the "Date" RCS keyword because we
      can't prevent any RCS keywords used in this explanation from
      being expanded. Only the "RCSfile" keyword is stable; its
      expansion text changes only if the file name changes.)

    .. _reStructuredText: http://docutils.sf.net/rst.html
    .. _reStructuredText Markup Specification:
       http://docutils.sf.net/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
          'author': nodes.author,
          'authors': nodes.authors,
          'organization': nodes.organization,
          'address': nodes.address,
          'contact': nodes.contact,
          'version': nodes.version,
          'revision': nodes.revision,
          'status': nodes.status,
          'date': nodes.date,
          'copyright': nodes.copyright,
          'dedication': nodes.topic,
          'abstract': nodes.topic}
    """Canonical field name (lowcased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self):
        """
        Replace a leading bibliographic field list by docinfo nodes.

        Does nothing if the "docinfo_xform" setting is false, or if the
        first child of the document that is not `nodes.PreBibliographic`
        is missing or is not a field list.
        """
        # A missing "docinfo_xform" setting counts as enabled.
        if not getattr(self.document.settings, 'docinfo_xform', 1):
            return
        document = self.document
        index = document.first_child_not_matching_class(
              nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            # Insertion point: the first child that is neither Titular
            # nor Decorative.
            biblioindex = document.first_child_not_matching_class(
                  (nodes.Titular, nodes.Decorative))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]         # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist

    def extract_bibliographic(self, field_list):
        """
        Convert the fields of `field_list` to bibliographic nodes.

        Return a list holding the docinfo element (only if it has
        children), followed by the "dedication" and "abstract" topics,
        in that order, where present.  Fields that cannot be converted
        are appended to the docinfo element unchanged, apart from RCS
        keyword cleanup of a single-paragraph body.
        """
        docinfo = nodes.docinfo()
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                          field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    topics[canonical] = biblioclass(
                        '', title, classes=[canonical], *field[1].children)
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                # Keep the field as a generic field inside docinfo.
                if len(field[-1]) == 1 \
                       and isinstance(field[-1][0], nodes.paragraph):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                docinfo.append(field)
        nodelist = []
        # Explicit length test kept on purpose.
        # NOTE(review): node truthiness may not reflect emptiness --
        # confirm before simplifying to ``if docinfo:``.
        if len(docinfo) != 0:
            nodelist.append(docinfo)
        for name in ('dedication', 'abstract'):
            if topics[name]:
                nodelist.append(topics[name])
        return nodelist

    def check_empty_biblio_field(self, field, name):
        """
        Return 1 if the body of `field` has at least one child.

        Otherwise append a warning to the field body and return None.
        """
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                  'Cannot extract empty bibliographic field "%s".' % name,
                  base_node=field)
            return None
        return 1

    def check_compound_biblio_field(self, field, name):
        """
        Return 1 if the body of `field` is exactly one paragraph.

        Otherwise append a warning to the field body and return None.
        """
        if len(field[-1]) > 1:
            field[-1] += self.document.reporter.warning(
                  'Cannot extract compound bibliographic field "%s".' % name,
                  base_node=field)
            return None
        if not isinstance(field[-1][0], nodes.paragraph):
            field[-1] += self.document.reporter.warning(
                  'Cannot extract bibliographic field "%s" containing '
                  'anything other than a single paragraph.' % name,
                  base_node=field)
            return None
        return 1

    # (compiled pattern, replacement) pairs handed to
    # utils.clean_rcs_keywords.
    # NOTE(review): the leading '$' is presumably split from the keyword
    # so that version control does not expand these literals -- confirm.
    rcs_keyword_substitutions = [
          (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                      r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
          (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
          (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1'),]

    def extract_authors(self, field, name, docinfo):
        """
        Append an "authors" element built from `field` to `docinfo`.

        The field body may be a single paragraph, a single bullet list,
        or several paragraphs.  If no author can be extracted, append a
        warning to the field body and re-raise TransformError.
        """
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                  'Bibliographic field "%s" incompatible with extraction: '
                  'it must contain either a single paragraph (with authors '
                  'separated by one of "%s"), multiple paragraphs (one per '
                  'author), or a bullet list with one paragraph (one author) '
                  'per item.'
                  % (name, ''.join(self.language.author_separators)),
                  base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """
        Split the text of a single-paragraph field body into authors.

        The text is split on the first entry of
        `self.language.author_separators` that yields more than one
        part.  Return a list with one ``[nodes.Text]`` list per
        non-empty name.  Raise TransformError if the text is empty.
        """
        text = field[1][0].astext().strip()
        if not text:
            raise TransformError
        # Default keeps `authornames` bound even if the language module
        # defines no separators at all.
        authornames = [text]
        for authorsep in self.language.author_separators:
            authornames = text.split(authorsep)
            if len(authornames) > 1:
                break
        authornames = [author.strip() for author in authornames]
        authors = [[nodes.Text(author)] for author in authornames if author]
        return authors

    def authors_from_bullet_list(self, field):
        """
        Return the children of each list item's paragraph, one list per
        author.

        Raise TransformError if an item is not exactly one paragraph or
        if the list yields no authors.
        """
        authors = []
        for item in field[1][0]:
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        """
        Return the children of each paragraph in the field body, one
        list per author.

        Raise TransformError if any body element is not a paragraph.
        """
        for item in field[1]:
            if not isinstance(item, nodes.paragraph):
                raise TransformError
        authors = [item.children for item in field[1]]
        return authors