added test of len() method for SQLTable
[pygr.git] / doc / tools / buildindex.py
blobc53b7fabd9e634e343c73c75b2cc7160b74761e6
1 #! /usr/bin/env python
3 __version__ = '$Revision: 1.1 $'
5 import os.path
6 import re
7 import string
8 import sys
10 from xml.sax.saxutils import quoteattr
# Bound join methods used as tiny helpers throughout the module.
bang_join = "!".join
null_join = "".join


def _macro_rule(name, text):
    """Return a (compiled pattern, replacement) pair for one LaTeX macro.

    The pattern matches a backslash followed by *name* and a word boundary.
    """
    return re.compile(r"\\%s\b" % name), text


# Ordered (compiled pattern, replacement text) pairs; Node.__init__ applies
# each of them to the raw entry string before splitting it.
REPLACEMENTS = [
    # Hackish way to deal with macros replaced with simple text
    _macro_rule("ABC", "ABC"),
    _macro_rule("ASCII", "ASCII"),
    _macro_rule("Cpp", "C++"),
    _macro_rule("EOF", "EOF"),
    _macro_rule("NULL", "NULL"),
    _macro_rule("POSIX", "POSIX"),
    _macro_rule("UNIX", "Unix"),
    # deal with turds left over from LaTeX2HTML
    (re.compile(r"<#\d+#>"), ""),
]
class Node:
    """One index entry: its link markup, display text parts and sort key."""

    # Number of leading key levels shared with the last entry of the
    # previous column; set by split_columns(), read by format_column().
    continuation = 0

    def __init__(self, link, str, seqno):
        """Build a node from one link, the raw entry string and a sequence id.

        The entry string is first run through every REPLACEMENTS rule and
        then split into display text parts and sort-key parts.
        """
        self.links = [link]
        self.seqno = seqno
        entry = str
        for pattern, replacement in REPLACEMENTS:
            entry = pattern.sub(replacement, entry)
        # build up the text
        self.text = split_entry_text(entry)
        self.key = split_entry_key(entry)

    def __cmp__(self, other):
        """Comparison operator includes sequence number, for use with
        list.sort()."""
        order = self.cmp_entry(other)
        if order:
            return order
        return cmp(self.seqno, other.seqno)

    def cmp_entry(self, other):
        """Comparison 'operator' that ignores sequence number."""
        depth = min(len(self.key), len(other.key))
        for level in range(depth):
            # Keys decide first; the display text breaks ties at this level.
            order = cmp_part(self.key[level], other.key[level])
            if not order:
                order = cmp_part(self.text[level], other.text[level])
            if order:
                return order
        # All shared levels are equal: fall back to whole-list comparison.
        return cmp(self.key, other.key) or cmp(self.text, other.text)

    def __repr__(self):
        label = bang_join(self.text)
        return "<Node for %s (%s)>" % (label, self.seqno)

    def __str__(self):
        return bang_join(self.key)

    def dump(self):
        """Return the one-line serialized form of this node.

        Links are joined with chr(1), followed by chr(1), the '!'-joined
        text, '###' and the sequence number.
        """
        joined_links = "\1".join(self.links)
        return "%s\1%s###%s\n" % (joined_links,
                                  bang_join(self.text),
                                  self.seqno)
def _three_way(a, b):
    """Return -1, 0 or 1 as a is less than, equal to or greater than b."""
    return (a > b) - (a < b)


def cmp_part(s1, s2):
    """Three-way compare two key or text parts for index ordering.

    Returns 0 for identical strings.  Otherwise a string that is a
    case-insensitive proper prefix of the other sorts first; failing that,
    the strings are compared case-insensitively, with a case-sensitive
    comparison as the tie-breaker.

    Uses explicit comparisons rather than the builtin cmp(), which only
    exists on Python 2.
    """
    if s1 == s2:
        return 0
    l1 = s1.lower()
    l2 = s2.lower()
    if len(s1) < len(s2) and l1 == l2[:len(s1)]:
        return -1
    if len(s2) < len(s1) and l2 == l1[:len(s2)]:
        return 1
    return _three_way(l1, l2) or _three_way(s1, s2)
def split_entry(str, which):
    """Split an index entry on '!' and pick one '@' field from each level.

    A level without an '@' contributes itself; otherwise the field at index
    `which` is used.  Returns the list of selected strings.
    """
    picked = []
    for level in str.split('!'):
        fields = level.split('@')
        if len(fields) == 1:
            picked.append(fields[0])
        else:
            picked.append(fields[which])
    return picked
98 _rmtt = re.compile(r"""(.*)<tt(?: class=['"][a-z0-9]+["'])?>(.*)</tt>(.*)$""",
99 re.IGNORECASE)
100 _rmparens = re.compile(r"\(\)")
def split_entry_key(str):
    """Return the list of normalized sort-key parts for an index entry.

    Each part has <tt> markup stripped when present (otherwise it is
    lower-cased), loses any '()' and is finally passed through
    trim_ignored_letters().
    """
    keys = []
    for part in split_entry(str, 1):
        tt = _rmtt.match(part)
        if tt:
            part = null_join(tt.group(1, 2, 3))
        else:
            part = part.lower()
        # remove '()' from the key:
        part = _rmparens.sub('', part)
        keys.append(trim_ignored_letters(part))
    return keys
def split_entry_text(str):
    """Return the list of display-text parts for an index entry.

    If the entry contains '<' and matches the <tt> pattern, the <tt> tags
    are dropped (their content is kept) before the entry is split.
    """
    text = str
    match = '<' in text and _rmtt.match(text)
    if match:
        text = null_join(match.group(1, 2, 3))
    return split_entry(text, 1)
def load(fp):
    """Read index lines from the file object fp and return a list of Nodes.

    Lines that do not match the expected 'link chr(1) text ### seqno' layout
    are silently skipped.
    """
    # The pattern string is deliberately not raw: "\1" is the literal
    # control character chr(1) that Node.dump() writes as field separator.
    line_rx = re.compile("(.*)\1(.*)###(.*)$")
    nodes = []
    line = fp.readline()
    while line:
        found = line_rx.match(line)
        if found:
            nodes.append(Node(*found.group(1, 2, 3)))
        line = fp.readline()
    return nodes
def trim_ignored_letters(s):
    """Return s lower-cased, without a single leading '$' if it has one."""
    # ignore $ to keep environment variables with the
    # leading letter from the name
    if s[:1] == "$":
        s = s[1:]
    return s.lower()
def get_first_letter(s):
    """Return the character under which the entry text s is grouped.

    Text starting with the LaTeX2HTML percent placeholder is grouped under
    '%'; anything else under the first character of its trimmed,
    lower-cased form.
    """
    if not s.startswith("<tex2html_percent_mark>"):
        return trim_ignored_letters(s)[0]
    return "%"
def split_letters(nodes):
    """Group consecutive nodes by the first letter of their top-level text.

    Returns a list of (letter, [nodes]) tuples in input order; a new group
    is started every time the letter changes.
    """
    letter_groups = []
    current = None
    for node in nodes:
        letter = get_first_letter(node.text[0])
        if not letter_groups or letter != current:
            current = letter
            letter_groups.append((letter, []))
        letter_groups[-1][1].append(node)
    return letter_groups
def group_symbols(groups):
    """Merge the leading non-identifier letter groups into one "Symbols" group.

    `groups` is a list of (letter, nodes) tuples and is modified in place:
    every leading group whose letter is not an ASCII letter or underscore is
    removed and its nodes are collected, in order, into a single
    ("Symbols", nodes) tuple inserted at the front.

    An empty list, or a list made up only of symbol groups, is handled
    without running off the end of the list.
    """
    ident_letters = string.ascii_letters + "_"
    entries = []
    split = 0
    # Bound the scan by the list length so that exhausting the groups
    # cannot raise IndexError.
    while split < len(groups) and groups[split][0] not in ident_letters:
        entries.extend(groups[split][1])
        split += 1
    del groups[:split]
    if entries:
        groups.insert(0, ("Symbols", entries))
180 # need a function to separate the nodes into columns...
def split_columns(nodes, columns=1):
    """Distribute nodes over `columns` lists of nearly equal length.

    With columns <= 1 the input list itself is returned wrapped in a list.
    Otherwise the longer columns come first, and the nodes placed in them
    are deleted from `nodes` (the argument is modified in place).  When a
    column starts with an entry sharing leading key levels with the last
    entry of the previous column, that entry's `continuation` attribute is
    set to the number of shared levels.
    """
    if columns <= 1:
        return [nodes]
    # This is a rough height; we may have to increase to avoid breaks before
    # a subitem.
    height, numlong = divmod(len(nodes), columns)
    if numlong:
        height += 1
    else:
        numlong = columns
    cols = [nodes[k * height:(k + 1) * height] for k in range(numlong)]
    del nodes[:numlong * height]
    # Whatever is left goes into columns that are one entry shorter.
    height -= 1
    try:
        numshort = len(nodes) // height
    except ZeroDivisionError:
        # Short columns would be empty: pad up to the requested count.
        cols = cols + (columns - len(cols)) * [[]]
    else:
        for k in range(numshort):
            cols.append(nodes[k * height:(k + 1) * height])

    # If items continue across columns, make sure they are marked
    # as continuations so the user knows to look at the previous column.
    for left, right in zip(cols, cols[1:]):
        try:
            tail = left[-1]
            head = right[0]
        except IndexError:
            return cols
        shared = min(len(tail.key), len(head.key))
        for depth in range(shared):
            if tail.key[depth] != head.key[depth]:
                break
            head.continuation = depth + 1
    return cols
# Indentation unit emitted once per nesting level in the generated HTML.
DL_LEVEL_INDENT = " "


def format_column(nodes):
    """Render a sequence of nodes as nested HTML <dl> lists.

    Each node's `text` parts are treated as a path: leading parts shared
    with the previous node are not repeated, new intermediate parts open a
    nested <dl>, and the last part is emitted after the node's first link.
    Additional links are appended as "[Link]" references.  Intermediate
    terms below the node's `continuation` depth get " (continued)".
    Returns the HTML as one string.
    """
    out = ["<dl compact='compact'>"]
    level = 0
    previous = []
    for node in nodes:
        current = node.text
        # Count the leading parts this entry shares with the previous one.
        shared = 0
        for old, new in zip(previous, current):
            if old != new:
                break
            shared += 1
        if shared > level:
            out.append("<dl compact='compact'>" * (shared - level) + "\n")
        elif shared < level:
            out.append("\n")
            out.append(level * DL_LEVEL_INDENT)
            out.append("</dl>" * (level - shared))
        level = shared
        # Open a nested list for every new intermediate part.
        for depth in range(shared, len(current) - 1):
            level += 1
            extra = ""
            if node.continuation > depth:
                extra = " (continued)"
            out.append("\n<dt>%s%s\n<dd>\n%s<dl compact='compact'>"
                       % (current[depth], extra, level * DL_LEVEL_INDENT))
        indent = level * DL_LEVEL_INDENT
        out.append("\n%s<dt>%s%s</a>" % (indent, node.links[0], current[-1]))
        for link in node.links[1:]:
            out.append(",\n%s %s[Link]</a>" % (indent, link))
        previous = current
    out.append("\n")
    out.append("</dl>" * (level + 1))
    return "".join(out)
def format_nodes(nodes, columns=1):
    """Render nodes as HTML, optionally laid out in several table columns.

    With columns <= 1 the result is a single formatted column.  Otherwise
    the nodes are split with split_columns() and each column is wrapped in
    a <td> of a full-width one-row table, every cell getting an equal
    percentage width.  Returns the HTML as one string.
    """
    if columns <= 1:
        return format_column(nodes)
    colwidth = int(100 / columns)
    strings = ['<table width="100%"><tr valign="top">']
    append = strings.append
    for col in split_columns(nodes, columns):
        append('<td width="%d%%">\n' % colwidth)
        append(format_column(col))
        append("\n</td>")
    append("\n</tr></table>")
    return null_join(strings)
def format_letter(letter):
    """Return the HTML section heading for one letter group.

    The heading is an <hr /> followed by an <h2> whose id is the quoted
    attribute value "letter-" + letter.  '.' and '_' get descriptive
    captions; any other letter is shown capitalized.
    """
    special = {'.': ". (dot)", '_': "_ (underscore)"}
    if letter in special:
        lettername = special[letter]
    else:
        lettername = letter.capitalize()
    anchor = quoteattr("letter-" + letter)
    return "\n<hr />\n<h2 id=%s>%s</h2>\n\n" % (anchor, lettername)
def format_html_letters(nodes, columns, group_symbol_nodes):
    """Render nodes as HTML sections, one per first letter.

    The output starts with a centered bar of links to the per-letter
    anchors, followed by a heading and the formatted nodes for each letter
    group.  When group_symbol_nodes is true, the leading non-identifier
    groups are merged by group_symbols() first.
    """
    letter_groups = split_letters(nodes)
    if group_symbol_nodes:
        group_symbols(letter_groups)
    jump_links = ["<b><a href=\"#letter-%s\">%s</a></b>" % (letter, letter)
                  for letter, _ in letter_groups]
    pieces = ["<hr /><center>\n%s</center>\n" % " |\n".join(jump_links)]
    for letter, group in letter_groups:
        pieces.append(format_letter(letter))
        pieces.append(format_nodes(group, columns))
    return null_join(pieces)
def format_html(nodes, columns):
    """Render nodes as HTML without per-letter sections."""
    html = format_nodes(nodes, columns)
    return html
def collapse(nodes):
    """Collapse sequences of nodes with matching keys into a single node.

    Destructive: `nodes` is modified in place.  A node whose cmp_entry()
    against the most recently kept node returns a false value is dropped,
    and its first link is appended to that kept node's `links`.

    The surviving nodes are gathered in one pass and written back with a
    single slice assignment, avoiding the quadratic cost of deleting list
    elements one at a time.
    """
    if len(nodes) < 2:
        return
    kept = [nodes[0]]
    for node in nodes[1:]:
        prev = kept[-1]
        if node.cmp_entry(prev):
            kept.append(node)
        else:
            prev.links.append(node.links[0])
    # Slice assignment keeps the caller's list object.
    nodes[:] = kept
def dump(nodes, fp):
    """Write the serialized form of every node, in order, to fp."""
    write = fp.write
    for node in nodes:
        write(node.dump())
def process_nodes(nodes, columns, letters=0, group_symbol_nodes=0):
    """Sort and collapse nodes in place, then return them rendered as HTML.

    When `letters` is true the output is split into per-letter sections
    (optionally grouping symbols); otherwise a plain index is produced.
    """
    nodes.sort()
    collapse(nodes)
    if not letters:
        return format_html(nodes, columns)
    return format_html_letters(nodes, columns, group_symbol_nodes)
def main():
    """Command-line entry point: read index files and write the HTML index.

    Options:
      -c N, --columns=N      number of columns in the output (default 1)
      -l, --letters          split the index into per-letter sections
      -o FILE, --output=FILE write the HTML to FILE ("-" means stdout)
      --group-symbols        merge leading symbol groups (default)
      --dont-group-symbols   keep each symbol in its own group

    Positional arguments are input files; "-" (the default when none are
    given) reads standard input.  A summary line with the node count goes
    to stderr when the HTML is written to stdout, and to stdout otherwise.
    """
    import getopt
    ifn = "-"
    ofn = "-"
    columns = 1
    letters = 0
    group_symbol_nodes = 1
    opts, args = getopt.getopt(sys.argv[1:], "c:lo:",
                               ["columns=", "dont-group-symbols",
                                "group-symbols", "letters", "output="])
    for opt, val in opts:
        if opt in ("-o", "--output"):
            ofn = val
        elif opt in ("-c", "--columns"):
            columns = int(val, 10)
        elif opt in ("-l", "--letters"):
            letters = 1
        elif opt == "--group-symbols":
            group_symbol_nodes = 1
        elif opt == "--dont-group-symbols":
            group_symbol_nodes = 0
    if not args:
        args = [ifn]
    nodes = []
    for fn in args:
        if fn == "-":
            # "-" is the stdin placeholder, mirroring "-" for stdout below;
            # it must not be handed to open() as a file name.
            nodes.extend(load(sys.stdin))
        else:
            fp = open(fn)
            try:
                nodes.extend(load(fp))
            finally:
                fp.close()
    num_nodes = len(nodes)
    html = process_nodes(nodes, columns, letters, group_symbol_nodes)
    program = os.path.basename(sys.argv[0])
    if ofn == "-":
        sys.stdout.write(html)
        sys.stderr.write("\n%s: %d index nodes" % (program, num_nodes))
    else:
        fp = open(ofn, "w")
        try:
            fp.write(html)
        finally:
            fp.close()
        # Same output as a bare print followed by a print of the summary,
        # but valid on both Python 2 and Python 3.
        sys.stdout.write("\n%s: %d index nodes\n" % (program, num_nodes))
# Run the index builder only when executed as a script, not when imported.
if __name__ == "__main__":
    main()