Fix [ 320 ] Russian docinfo fields not recognized.
[docutils.git] / docutils / docutils / transforms / frontmatter.py
blob1b5f552dd9774d2e78030d32188b698eaa4240f3
1 # $Id$
2 # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 Transforms related to the front matter of a document or a section
7 (information found before the main text):
9 - `DocTitle`: Used to transform a lone top level section's title to
10 the document title, promote a remaining lone top-level section's
11 title to the document subtitle, and determine the document's title
12 metadata (document['title']) based on the document title and/or the
13 "title" setting.
15 - `SectionSubTitle`: Used to transform a lone subsection into a
16 subtitle.
18 - `DocInfo`: Used to transform a bibliographic field list into docinfo
19 elements.
20 """
22 __docformat__ = 'reStructuredText'
24 import re
25 from docutils import nodes, utils
26 from docutils.transforms import TransformError, Transform
class TitlePromoter(Transform):

    """
    Abstract base class for DocTitle and SectionSubTitle transforms.
    """

    def promote_title(self, node):
        """
        Transform the following tree::

            <node>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                ...

        `node` is normally a document.

        Return 1 if a section title was promoted, or None if
        `candidate_index` found no promotion candidate.  Raise TypeError
        if `node` is not a `nodes.Element` instance.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            # Call form of ``raise``: valid on both Python 2 and Python 3
            # (the ``raise Class, 'message'`` statement form is Python 2 only).
            raise TypeError('node must be of Element-derived type.')

        # `node` must not have a title yet.
        assert not (len(node) and isinstance(node[0], nodes.title))
        section, index = self.candidate_index(node)
        if index is None:
            return None

        # Transfer the section's attributes to the node:
        # NOTE: Change second parameter to False to NOT replace
        #       attributes that already exist in node with those in
        #       section
        # NOTE: Remove third parameter to NOT copy the 'source'
        #       attribute from section
        node.update_all_atts_concatenating(section, True, True)

        # setup_child is called automatically for all nodes.
        node[:] = (section[:1]        # section title
                   + node[:index]     # everything that was in the
                                      # node before the section
                   + section[1:])     # everything that was in the section
        assert isinstance(node[0], nodes.title)
        return 1

    def promote_subtitle(self, node):
        """
        Transform the following node tree::

            <node>
                <title>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                <subtitle>
                ...

        Return 1 if a subsection title was promoted to a subtitle, or
        None if `candidate_index` found no promotion candidate.  Raise
        TypeError if `node` is not a `nodes.Element` instance.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            # Call form of ``raise``: valid on both Python 2 and Python 3.
            raise TypeError('node must be of Element-derived type.')

        subsection, index = self.candidate_index(node)
        if index is None:
            return None
        subtitle = nodes.subtitle()

        # Transfer the subsection's attributes to the new subtitle
        # NOTE: Change second parameter to False to NOT replace
        #       attributes that already exist in node with those in
        #       section
        # NOTE: Remove third parameter to NOT copy the 'source'
        #       attribute from section
        subtitle.update_all_atts_concatenating(subsection, True, True)

        # Transfer the contents of the subsection's title to the
        # subtitle:
        subtitle[:] = subsection[0][:]
        node[:] = (node[:1]       # title
                   + [subtitle]
                   # everything that was before the section:
                   + node[1:index]
                   # everything that was in the subsection:
                   + subsection[1:])
        return 1

    def candidate_index(self, node):
        """
        Find and return the promotion candidate and its index.

        Return (None, None) if no valid candidate was found.
        """
        index = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        # A candidate must be a section and must be the last child of
        # `node` (nothing may follow it).
        if index is None or len(node) > (index + 1) or \
               not isinstance(node[index], nodes.section):
            return None, None
        else:
            return node[index], index
class DocTitle(TitlePromoter):

    """
    Promote implicit document title and subtitle to the document level.

    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly. Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its
       first non-comment element, the top-level section's title
       becomes the document's title, and the top-level section's
       contents become the document's immediate contents. The lone
       top-level section header must be the first non-comment element
       in the document.

       For example, take this input text::

           =================
            Top-Level Title
           =================

           A paragraph.

       Once parsed, it looks like this::

           <document>
               <section names="top-level title">
                   <title>
                       Top-Level Title
                   <paragraph>
                       A paragraph.

       After running the DocTitle transform, we have::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <paragraph>
                   A paragraph.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single
       second-level section as its first non-comment element, that
       section's title is promoted to the document's subtitle, and
       that section's contents become the document's immediate
       contents. Given this input text::

           =================
            Top-Level Title
           =================

           Second-Level Title
           ~~~~~~~~~~~~~~~~~~

           A paragraph.

       After parsing and running the Section Promotion transform, the
       result is::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <subtitle names="second-level title">
                   Second-Level Title
               <paragraph>
                   A paragraph.

       (Note that the implicit hyperlink target generated by the
       "Second-Level Title" is preserved on the "subtitle" element
       itself.)

    Any comment elements occurring before the document title or
    subtitle are accumulated and inserted as the first body elements
    after the title(s).

    This transform also sets the document's metadata title
    (document['title']).

    .. _reStructuredText: http://docutils.sf.net/rst.html
    """

    default_priority = 320

    def set_metadata(self):
        """
        Set document['title'] metadata title from the following
        sources, listed in order of priority:

        * Existing document['title'] attribute.
        * "title" setting.
        * Document title node (as promoted by promote_title).
        """
        doc = self.document
        if doc.hasattr('title'):
            # An explicit attribute always wins; nothing to do.
            return
        configured_title = doc.settings.title
        if configured_title is not None:
            doc['title'] = configured_title
        elif len(doc) and isinstance(doc[0], nodes.title):
            doc['title'] = doc[0].astext()

    def apply(self):
        """
        Promote the title (and then the subtitle) unless the
        "doctitle_xform" setting is false, then set document['title'].
        """
        doc = self.document
        promotion_enabled = getattr(doc.settings, 'doctitle_xform', 1)
        # promote_(sub)title are defined in the TitlePromoter base class.
        # A subtitle is only looked for once a title has been promoted.
        if promotion_enabled and self.promote_title(doc):
            self.promote_subtitle(doc)
        # The metadata title is set whether or not anything was promoted.
        self.set_metadata()
class SectionSubTitle(TitlePromoter):

    """
    Promote a lone subsection's title to the subtitle of its parent
    section.

    This works like document subtitles, but for sections.  For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle
                ...

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle
            ...

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self):
        """
        Try subtitle promotion on every section of the document, unless
        the "sectsubtitle_xform" setting is false.
        """
        settings = self.document.settings
        if getattr(settings, 'sectsubtitle_xform', 1):
            for sect in self.document.traverse(nodes.section):
                # On our way through the node tree, we are deleting
                # sections, but we call self.promote_subtitle for those
                # sections nonetheless.  To do: Write a test case which
                # shows the problem and discuss on Docutils-develop.
                self.promote_subtitle(sect)
class DocInfo(Transform):

    """
    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description. This transform
    should be run *after* the `DocTitle` transform.

    Given a field list as the first non-comment element after the
    document title and subtitle (if present), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the "docinfo" element (except for a
    dedication and/or an abstract, which become "topic" elements after
    "docinfo").

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile$
            ...

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py
            ...

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform. The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $". Any
    RCS keyword can be processed in any bibliographic field. The
    dollar signs and leading RCS keyword name are removed. Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended. The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone). The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

      (Since the source file for this text is itself stored under CVS,
      we can't show an example of the "Date" RCS keyword because we
      can't prevent any RCS keywords used in this explanation from
      being expanded. Only the "RCSfile" keyword is stable; its
      expansion text changes only if the file name changes.)

    .. _reStructuredText: http://docutils.sf.net/rst.html
    .. _reStructuredText Markup Specification:
       http://docutils.sf.net/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
          'author': nodes.author,
          'authors': nodes.authors,
          'organization': nodes.organization,
          'address': nodes.address,
          'contact': nodes.contact,
          'version': nodes.version,
          'revision': nodes.revision,
          'status': nodes.status,
          'date': nodes.date,
          'copyright': nodes.copyright,
          'dedication': nodes.topic,
          'abstract': nodes.topic}
    """Canonical field name (lowcased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self):
        """
        Replace a leading bibliographic field list with the nodes
        produced by `extract_bibliographic`.

        Does nothing if the "docinfo_xform" setting is false, or if the
        first child that is not a `nodes.PreBibliographic` is missing or
        is not a field list.
        """
        if not getattr(self.document.settings, 'docinfo_xform', 1):
            return
        document = self.document
        index = document.first_child_not_matching_class(
              nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            # Insertion point: right after any titular/decorative nodes.
            biblioindex = document.first_child_not_matching_class(
                  (nodes.Titular, nodes.Decorative))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]         # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist

    def extract_bibliographic(self, field_list):
        """
        Convert the fields of `field_list` and return a list of nodes.

        The returned list holds a "docinfo" node (only if it has at
        least one child), followed by the "dedication" and "abstract"
        topics, in that order, when present.  Fields that cannot be
        converted (TransformError raised during processing) are appended
        to the docinfo unchanged as generic fields.
        """
        docinfo = nodes.docinfo()
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                # Look the field up by its normalized name in the
                # language module's table of bibliographic fields.
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                          field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    topics[canonical] = biblioclass(
                        '', title, classes=[canonical], *field[1].children)
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                # Fall back to keeping the field as a generic docinfo
                # field; still clean RCS keywords in a lone paragraph.
                if len(field[-1]) == 1 \
                       and isinstance(field[-1][0], nodes.paragraph):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                if normedname not in bibliofields:
                    # Unregistered field: tag it with a class derived
                    # from its normalized name.
                    classvalue = nodes.make_id(normedname)
                    if classvalue:
                        field['classes'].append(classvalue)
                docinfo.append(field)
        nodelist = []
        # Explicit length test kept on purpose.
        # NOTE(review): node truthiness may not reflect emptiness --
        # confirm against docutils.nodes before simplifying.
        if len(docinfo) != 0:
            nodelist.append(docinfo)
        for name in ('dedication', 'abstract'):
            if topics[name]:
                nodelist.append(topics[name])
        return nodelist

    def check_empty_biblio_field(self, field, name):
        """
        Return 1 if the field body has content; otherwise append a
        warning to the field body and return None.
        """
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                  'Cannot extract empty bibliographic field "%s".' % name,
                  base_node=field)
            return None
        return 1

    def check_compound_biblio_field(self, field, name):
        """
        Return 1 if the field body is exactly one paragraph; otherwise
        append a warning to the field body and return None.
        """
        if len(field[-1]) > 1:
            field[-1] += self.document.reporter.warning(
                  'Cannot extract compound bibliographic field "%s".' % name,
                  base_node=field)
            return None
        if not isinstance(field[-1][0], nodes.paragraph):
            field[-1] += self.document.reporter.warning(
                  'Cannot extract bibliographic field "%s" containing '
                  'anything other than a single paragraph.' % name,
                  base_node=field)
            return None
        return 1

    # (pattern, replacement) pairs passed to `utils.clean_rcs_keywords`.
    # The "Date" and "RCSfile" patterns precede the generic keyword one.
    rcs_keyword_substitutions = [
          (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                      r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
          (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
          (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1'),]

    def extract_authors(self, field, name, docinfo):
        """
        Append an "authors" node built from `field` to `docinfo`.

        The field body may be a single paragraph, a single bullet list,
        or several paragraphs.  If no author can be extracted, a warning
        is appended to the field body and TransformError is re-raised.
        """
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                  'Bibliographic field "%s" incompatible with extraction: '
                  'it must contain either a single paragraph (with authors '
                  'separated by one of "%s"), multiple paragraphs (one per '
                  'author), or a bullet list with one paragraph (one author) '
                  'per item.'
                  % (name, ''.join(self.language.author_separators)),
                  base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """
        Split the text of a single-paragraph field body into author
        names and return a list of one-element lists of `nodes.Text`.

        The first separator in `self.language.author_separators` that
        actually splits the text is used.  Raise TransformError if the
        paragraph text is empty.
        """
        text = field[1][0].astext().strip()
        if not text:
            raise TransformError
        # Pre-seed the result so it is bound even when the language
        # module defines no separators (previously an UnboundLocalError).
        # When separators exist, the loop overwrites this exactly as
        # before.
        authornames = [text]
        for authorsep in self.language.author_separators:
            authornames = text.split(authorsep)
            if len(authornames) > 1:
                break
        authornames = [author.strip() for author in authornames]
        authors = [[nodes.Text(author)] for author in authornames if author]
        return authors

    def authors_from_bullet_list(self, field):
        """
        Return the children of each list item's paragraph, one entry per
        author.

        Raise TransformError if any item is not exactly one paragraph,
        or if the list is empty.
        """
        authors = []
        for item in field[1][0]:
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        """
        Return the children of each paragraph in the field body, one
        entry per author.

        Raise TransformError if any body element is not a paragraph.
        """
        for item in field[1]:
            if not isinstance(item, nodes.paragraph):
                raise TransformError
        authors = [item.children for item in field[1]]
        return authors