buildscripts/extract_texi_filenames.py

   1 #!@PYTHON@
   2 # -*- coding: utf-8 -*-
   3 # extract_texi_filenames.py
   4
   5 # USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
   6 #
   7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
   8 #
   9 # Description:
  10 # This script parses the .texi file given and creates a file with the
  11 # nodename <=> filename/anchor map.
  12 # The idea behind: Unnumbered subsections go into the same file as the
  13 # previous numbered section, @translationof gives the original node name,
  14 # which is then used for the filename/anchor.
  15 #
  16 # If this script is run on a file texifile.texi, it produces a file
  17 # texifile[.LANG].xref-map with tab-separated entries of the form
  18 #        NODE\tFILENAME\tANCHOR
  19 # LANG is the document language in case it's not 'en'
  20 # Note: The filename does not have any extension appended!
  21 # This file can then be used by our texi2html init script to determine
  22 # the correct file name and anchor for external refs
  23
  24 import sys
  25 import re
  26 import os
  27 import getopt
  28
  29 optlist, args = getopt.getopt (sys.argv[1:],'o:')
  30 files = args
  31
  32 outdir = '.'
  33 for x in optlist:
  34     if x[0] == '-o':
  35         outdir = x[1]
  36
  37 if not os.path.isdir (outdir):
  38     if os.path.exists (outdir):
  39         os.unlink (outdir)
  40     os.makedirs (outdir)
  41
# Matches "@include NAME.texi" at the end of a line; group 1 is NAME without
# the '.texi' extension.  The negative lookahead rejects names that start
# with two characters followed by "/lily-".
# NOTE(review): the two dots in "../lily-" are not escaped, so they match any
# two characters rather than only a literal ".." -- confirm this is intended.
include_re = re.compile (r'@include ((?!../lily-).*?)\.texi$', re.M)
# Matches a run of one or more whitespace characters.
whitespaces = re.compile (r'\s+')
# Matches a line of the form "@COMMAND ARGUMENT".  Group 1 is the command:
# node, top, chapter, translationof, unnumbered/appendix (optionally followed
# by sec, subsec or subsubsec), section/subsection/subsubsection, or one of
# majorheading, chapheading, heading, subheading, subsubheading.
# Group 2 is the argument with trailing whitespace excluded.
section_translation_re = re.compile ('^@(node|(?:unnumbered|appendix)\
(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|\
(?:major|chap|(?:sub){0,2})heading|translationof) (.*?)\\s*$', re.MULTILINE)
  47
  48 def expand_includes (m, filename):
  49     filepath = os.path.join (os.path.dirname (filename), m.group(1)) + '.texi'
  50     if os.path.exists (filepath):
  51         return extract_sections (filepath)[1]
  52     else:
  53         print "Unable to locate include file " + filepath
  54         return ''
  55
  56 lang_re = re.compile (r'^@documentlanguage (.+)', re.M)
  57
  58 def extract_sections (filename):
  59     result = ''
  60     f = open (filename, 'r')
  61     page = f.read ()
  62     f.close()
  63     # Search document language
  64     m = lang_re.search (page)
  65     if m and m.group (1) != 'en':
  66         lang_suffix = '.' + m.group (1)
  67     else:
  68         lang_suffix = ''
  69     # Replace all includes by their list of sections and extract all sections
  70     page = include_re.sub (lambda m: expand_includes (m, filename), page)
  71     sections = section_translation_re.findall (page)
  72     for sec in sections:
  73         result += "@" + sec[0] + " " + sec[1] + "\n"
  74     return (lang_suffix, result)
  75
  76 # Convert a given node name to its proper file name (normalization as explained
  77 # in the texinfo manual:
  78 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
  79 def texinfo_file_name(title):
  80     # exception: The top node is always mapped to index.html
  81     if title == "Top":
  82         return "index"
  83     # File name normalization by texinfo (described in the texinfo manual):
  84     # 1/2: letters and numbers are left unchanged
  85     # 3/4: multiple, leading and trailing whitespace is removed
  86     title = title.strip ();
  87     title = whitespaces.sub (' ', title)
  88     # 5:   all remaining spaces are converted to '-'
  89     # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
  90     result = ''
  91     for index in range(len(title)):
  92         char = title[index]
  93         if char == ' ': # space -> '-'
  94             result += '-'
  95         elif ( ('0' <= char and char <= '9' ) or
  96                ('A' <= char and char <= 'Z' ) or
  97                ('a' <= char and char <= 'z' ) ):  # number or letter
  98             result += char
  99         else:
 100             ccode = ord(char)
 101             if ccode <= 0xFFFF:
 102                 result += "_%04x" % ccode
 103             else:
 104                 result += "__%06x" % ccode
 105     # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
 106     if (result != '') and (ord(result[0]) in range (ord('0'), ord('9'))):
 107         result = 't_g' + result
 108     return result
 109
 110 texinfo_re = re.compile (r'@.*{(.*)}')
 111 def remove_texinfo (title):
 112     return texinfo_re.sub (r'\1', title)
 113
 114 def create_texinfo_anchor (title):
 115     return texinfo_file_name (remove_texinfo (title))
 116
 117 unnumbered_re = re.compile (r'unnumbered.*')
 118 def process_sections (filename, lang_suffix, page):
 119     sections = section_translation_re.findall (page)
 120     basename = os.path.splitext (os.path.basename (filename))[0]
 121     p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
 122     f = open (p, 'w')
 123
 124     this_title = ''
 125     this_filename = 'index'
 126     this_anchor = ''
 127     this_unnumbered = False
 128     had_section = False
 129     for sec in sections:
 130         if sec[0] == "node":
 131             # Write out the cached values to the file and start a new section:
 132             if this_title != '' and this_title != 'Top':
 133                     f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 134             had_section = False
 135             this_title = remove_texinfo (sec[1])
 136             this_anchor = create_texinfo_anchor (sec[1])
 137         elif sec[0] == "translationof":
 138             anchor = create_texinfo_anchor (sec[1])
 139             # If @translationof is used, it gives the original node name, which
 140             # we use for the anchor and the file name (if it is a numbered node)
 141             this_anchor = anchor
 142             if not this_unnumbered:
 143                 this_filename = anchor
 144         else:
 145             # Some pages might not use a node for every section, so treat this
 146             # case here, too: If we already had a section and encounter enother
 147             # one before the next @node, we write out the old one and start
 148             # with the new values
 149             if had_section and this_title != '':
 150                 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 151                 this_title = remove_texinfo (sec[1])
 152                 this_anchor = create_texinfo_anchor (sec[1])
 153             had_section = True
 154
 155             # unnumbered nodes use the previously used file name, only numbered
 156             # nodes get their own filename! However, top-level @unnumbered
 157             # still get their own file.
 158             this_unnumbered = unnumbered_re.match (sec[0])
 159             if not this_unnumbered or sec[0] == "unnumbered":
 160                 this_filename = this_anchor
 161
 162     if this_title != '' and this_title != 'Top':
 163         f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 164     f.close ()
 165
 166
 167 for filename in files:
 168     print "extract_texi_filenames.py: Processing %s" % filename
 169     (lang_suffix, sections) = extract_sections (filename)
 170     process_sections (filename, lang_suffix, sections)