2 # -*- coding: utf-8 -*-
3 # extract_texi_filenames.py
5 # USAGE: extract_texi_filenames.py [-o OUTDIR] FILES
7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
10 # This script parses the .texi file given and creates a file with the
11 # nodename <=> filename/anchor map.
12 # The idea behind it: unnumbered subsections go into the same file as the
13 # previous numbered section; @translationof gives the original node name,
14 # which is then used for the filename/anchor.
16 # If this script is run on a file texifile.texi, it produces a file
17 # texifile[.LANG].xref-map with tab-separated entries of the form
18 # NODE\tFILENAME\tANCHOR
19 # LANG is the document language in case it's not 'en'
20 # Note: The filename does not have any extension appended!
21 # This file can then be used by our texi2html init script to determine
22 # the correct file name and anchor for external refs
# Parse the command line.  The option string 'o:' declares a single option,
# "-o", that takes an argument; getopt returns the parsed (option, value)
# pairs in optlist and the remaining positional arguments in args.
29 optlist
, args
= getopt
.getopt (sys
.argv
[1:],'o:')
# Output-directory check.  The outer test is true when outdir is not an
# existing directory; the inner test is true when something other than a
# directory already exists at that path.
# NOTE(review): the bodies of both tests (and the statements that assign
# outdir) are not visible in this view -- presumably the stray entry is
# dealt with and the directory is then created; confirm against the full
# file before relying on this.
37 if not os
.path
.isdir (outdir
):
38 if os
.path
.exists (outdir
):
# Matches "@include NAME.texi" at the end of a line and captures NAME.
# The negative lookahead rejects names that start with any two characters
# followed by "/lily-": the two dots are regex wildcards, not literal dots.
# NOTE(review): presumably this skips generated snippet includes -- confirm.
include_re = re.compile(r'@include ((?!../lily-).*?)\.texi$', re.MULTILINE)

# Matches a run of one or more whitespace characters.
whitespaces = re.compile(r'\s+')

# Matches one sectioning-like command at the start of a line and captures
# (command name, argument).  Recognized commands: node, top, chapter,
# [sub[sub]]section, unnumbered/appendix with an optional [sub[sub]]sec
# suffix, majorheading, chapheading, [sub[sub]]heading and translationof.
# The argument is matched lazily so trailing whitespace is not captured.
section_translation_re = re.compile(
    r'^@(node|(?:unnumbered|appendix)'
    r'(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|'
    r'(?:major|chap|(?:sub){0,2})heading|translationof) (.*?)\s*$',
    re.M)
def expand_includes(m, filename):
    """Return the section lines of the file named by an ``@include`` match.

    m is a match object whose group 1 is the include target without its
    ``.texi`` extension.  The target is resolved relative to the directory
    of filename, the file that contains the ``@include``.

    Returns the second element of extract_sections() for the included
    file.  If the file does not exist, a message is printed and an empty
    string is returned, so the ``@include`` contributes no sections.
    """
    include_dir = os.path.dirname(filename)
    filepath = os.path.join(include_dir, m.group(1)) + '.texi'
    if not os.path.exists(filepath):
        # A single parenthesized argument prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print("Unable to locate include file " + filepath)
        # This value is used as a regex replacement, so always hand back
        # a string.
        return ''
    return extract_sections(filepath)[1]
# Matches an "@documentlanguage LANG" line and captures the rest of the line.
lang_re = re.compile(r'^@documentlanguage (.+)', re.M)


def extract_sections(filename):
    """Read a texinfo file and list its sectioning commands.

    Returns a tuple (lang_suffix, result):

    * lang_suffix is '.' + LANG when the file itself contains an
      ``@documentlanguage LANG`` line with LANG other than 'en', and ''
      otherwise.  The search runs before includes are expanded, so only
      the file's own declaration is considered.
    * result is a string with one "@COMMAND ARGUMENT" line per match of
      section_translation_re, in document order, after every ``@include``
      matched by include_re has been replaced by the section lines of the
      included file (see expand_includes).
    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as f:
        page = f.read()

    # Search document language
    m = lang_re.search(page)
    if m and m.group(1) != 'en':
        lang_suffix = '.' + m.group(1)
    else:
        lang_suffix = ''

    # Replace all includes by their list of sections and extract all sections
    page = include_re.sub(lambda inc: expand_includes(inc, filename), page)
    sections = section_translation_re.findall(page)
    result = ''.join(
        ["@" + sec[0] + " " + sec[1] + "\n" for sec in sections])
    return (lang_suffix, result)
# Convert a given node name to its proper file name (normalization as
# explained in the texinfo manual):
# http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
def texinfo_file_name(title):
    """Return the normalized file/anchor name for the node name title.

    "Top" maps to "index".  Otherwise whitespace is trimmed and collapsed,
    spaces become '-', ASCII letters and digits are kept, and every other
    character is written as '_xxxx' (four hex digits of its character code)
    or '__xxxxxx' (six hex digits) when the code does not fit in four
    digits.  A result that starts with a digit gets the prefix 't_g'.
    An empty title yields an empty string.
    """
    # exception: The top node is always mapped to index.html
    if title == "Top":
        return "index"
    # File name normalization by texinfo (described in the texinfo manual):
    # 1/2: letters and numbers are left unchanged
    # 3/4: multiple, leading and trailing whitespace is removed
    #      (split() drops leading/trailing whitespace and splits on runs)
    title = ' '.join(title.split())
    # 5: all remaining spaces are converted to '-'
    # 6: all other chars are replaced by _xxxx (xxxx = hex character code)
    pieces = []
    for char in title:
        if char == ' ':  # space -> '-'
            pieces.append('-')
        elif (('0' <= char <= '9') or
              ('A' <= char <= 'Z') or
              ('a' <= char <= 'z')):  # number or letter
            pieces.append(char)
        else:
            ccode = ord(char)
            if ccode <= 0xFFFF:
                pieces.append("_%04x" % ccode)
            else:
                pieces.append("__%06x" % ccode)
    result = ''.join(pieces)
    # 7: if name begins with number, prepend 't_g' (so it starts with a
    #    letter).  The comparison is inclusive so that '9' counts as a digit.
    if result != '' and '0' <= result[0] <= '9':
        result = 't_g' + result
    return result
# Matches "@...{...}" and captures the text between the braces.  Both
# wildcards are greedy, so when a title holds several commands a single
# match can span all of them and only the text after the last '{' is kept.
texinfo_re = re.compile(r'@.*{(.*)}')


def remove_texinfo(title):
    """Return title with every texinfo_re match replaced by its group 1."""
    plain = texinfo_re.sub(r'\1', title)
    return plain
def create_texinfo_anchor(title):
    """Return the anchor name for title.

    The title is first passed through remove_texinfo() and the outcome is
    then normalized with texinfo_file_name().
    """
    plain_title = remove_texinfo(title)
    return texinfo_file_name(plain_title)
# Matches command names that begin with "unnumbered"; it is applied with
# .match to the command name of each section entry below.
117 unnumbered_re
= re
.compile (r
'unnumbered.*')
# process_sections (filename, lang_suffix, page)
#
# Write the node-name map for one input file.  `page` is scanned with
# section_translation_re, giving (command, argument) pairs; for each cached
# section a line "TITLE<tab>FILENAME<tab>ANCHOR" is written to f.
# The state carried between entries is this_title, this_filename,
# this_anchor, this_unnumbered and had_section.
# NOTE(review): the statements that open f, initialize this_title,
# this_anchor and had_section, start the loop over `sections`, test for the
# "node" command and update had_section are not visible in this view --
# presumably f is opened for writing on the path p built below; confirm.
118 def process_sections (filename
, lang_suffix
, page
):
119 sections
= section_translation_re
.findall (page
)
# Output path: OUTDIR/<input basename without extension><lang_suffix>.xref-map
120 basename
= os
.path
.splitext (os
.path
.basename (filename
))[0]
121 p
= os
.path
.join (outdir
, basename
) + lang_suffix
+ '.xref-map'
# Until a section assigns its own file name, entries map to 'index'.
125 this_filename
= 'index'
127 this_unnumbered
= False
131 # Write out the cached values to the file and start a new section:
# Entries with an empty title or the title 'Top' are never written here.
132 if this_title
!= '' and this_title
!= 'Top':
133 f
.write (this_title
+ "\t" + this_filename
+ "\t" + this_anchor
+ "\n")
# The title keeps its spelling minus texinfo commands; the anchor is the
# normalized form of the same argument.
135 this_title
= remove_texinfo (sec
[1])
136 this_anchor
= create_texinfo_anchor (sec
[1])
137 elif sec
[0] == "translationof":
138 anchor
= create_texinfo_anchor (sec
[1])
139 # If @translationof is used, it gives the original node name, which
140 # we use for the anchor and the file name (if it is a numbered node)
# The file name is only replaced when the current section is not flagged
# as unnumbered.
142 if not this_unnumbered
:
143 this_filename
= anchor
145 # Some pages might not use a node for every section, so treat this
146 # case here, too: If we already had a section and encounter enother
147 # one before the next @node, we write out the old one and start
148 # with the new values
149 if had_section
and this_title
!= '':
150 f
.write (this_title
+ "\t" + this_filename
+ "\t" + this_anchor
+ "\n")
151 this_title
= remove_texinfo (sec
[1])
152 this_anchor
= create_texinfo_anchor (sec
[1])
155 # unnumbered nodes use the previously used file name, only numbered
156 # nodes get their own filename! However, top-level @unnumbered
157 # still get their own file.
# this_unnumbered is a match object or None and is only used for its
# truth value.
158 this_unnumbered
= unnumbered_re
.match (sec
[0])
159 if not this_unnumbered
or sec
[0] == "unnumbered":
160 this_filename
= this_anchor
# Final flush: write the last cached entry, with the same empty/'Top'
# exclusion as above.
162 if this_title
!= '' and this_title
!= 'Top':
163 f
.write (this_title
+ "\t" + this_filename
+ "\t" + this_anchor
+ "\n")
# Build one xref-map per input file named on the command line.
for filename in files:
    # A single parenthesized argument prints the same text whether print is
    # a statement (Python 2) or a function (Python 3).
    print("extract_texi_filenames.py: Processing %s" % filename)
    # `sections` is the newline-separated "@COMMAND ARGUMENT" text produced
    # by extract_sections, with includes already expanded.
    (lang_suffix, sections) = extract_sections(filename)
    process_sections(filename, lang_suffix, sections)