Improved documentation.
[docutils/kirr.git] / sandbox / xhtml2rest / xhtml2rest.py
blob8c6b366db86bb311e9c48264a769bee7795c8593
1 #!/usr/bin/python
2 """
3 NAME
4 ====
6 xhtml2rest - Convert xhtml to reStructuredText
8 SYNOPSIS
9 ========
11 xhtml2rest *xhtmlfile* > *restfile*
13 DESCRIPTION
14 ===========
16 ``xhtml2rest``, which, far from being a decent and complete program, is
17 only something to begin with, hopefully processes the given UTF-8
18 xhtml file and produces reStructuredText "source code" in the standard
19 output. If your input is html and/or not in UTF-8, you can convert it
20 to UTF-8 xhtml using ``iconv`` and ``tidy``:
22 iconv -f *source_encoding* -t utf-8 *source_html* > *html_utf8*
24 tidy -utf8 -asxml -o *xhtmlfile* *html_utf8*
26 xhtml2rest *xhtmlfile* > *restfile*
28 Interestingly, since reStructuredText is not simple markup, but has
29 very strict rules with the intention that the source is perfectly
30 readable, it turns out that converting html to reStructuredText is
31 actually *rendering*. ``xhtml2rest`` is a small rendering engine. Since
32 I had no time to study how existing rendering engines work, I had to
33 reinvent the wheel. So although the code is clean (I actually wrote it
34 twice), I doubt that the core logic is adequate for future extensions.
35 But it's better than nothing. There is some documentation in the code,
36 but feel free to email me if you need more explanations.
38 LIMITATIONS
39 ===========
41 I created ``xhtml2rest`` for a very specific job. It does that job
42 correctly, but for your web page it might not work. It should not be
43 very hard, however, either to improve the code, or to determine what
44 it is in your web page that confuses ``xhtml2rest`` and remove it.
46 Other than that, there are the following limitations:
48 * No indented tables
50 * No multi-col or -row spans in tables
52 * No support for \<br>
54 * Not tested with nested tables (check http://www.w3m.org/story.html)
56 * \<th> support is quick and dirty
58 * If the same anchor text is met twice with different targets, the second anchor is rendered as plain text (no link)
60 * No indented \<pre> elements (but I'm not sure the HTML standard
61 allows them)
63 * Images are ignored
65 * The word HARDWIRED in the code indicates a hardwired hack which is
66 specific to the job I wanted ``xhtml2rest`` to do.
68 META
69 ====
71 ``xhtml2rest`` was created by Antonios Christofides,
72 anthony@itia.ntua.gr, May-June 2005.
74 Revision: $Revision$
76 The code and this text is hereby placed in the public domain.
77 """
79 import xml.dom.minidom
80 import re
81 import sys
82 import textwrap
83 import math
84 import UserList
85 import warnings
86 import codecs
# ---------------------------------------------------------------------------
# Module-level state.  (The original author apologises for it and asks that
# it be removed some day.)
#
# unindent
#     Hand-off channel between a list item and the first piece of text
#     rendered inside it.  A list item is always indented, but its first
#     line starts with the number or bullet instead of plain indentation.
#     The text may be deeply nested below the li node, so the li simply
#     stores the marker (for example "1.  ") here; the first text that gets
#     formatted puts it at the start of its first line and resets the
#     variable to the empty string.
#
# hyperlinks
#     Text -> target pairs collected from "a href" elements.
# ---------------------------------------------------------------------------
hyperlinks = dict()
unindent = str()
class Ditem:
    """A document item.

    Usually a node, but it can also be a block of text resulting from
    processing adjacent inline items.  If it is a node, it is usually the
    BlockDitem subclass; if it is text, it is normally a plain Ditem.
    """

    def __init__(self, text):
        self.text = text        # Contained text (empty for BlockDitem)
        self.type = ''          # Tag for block node, empty for inline
        self.indentlevel = 0    # 0 - unindented; 1 - indented; etc.

    def __repr__(self):
        return self.__class__.__name__ + '("""' + self.text + '""")'

    def propagate_indents(self):
        """Propagate the indent level recursively to children (none here)."""
        pass

    def maxwidth(self):
        """Return the width occupied if rendered on an infinitely wide page.

        Normalizes the white space of self.text as a side effect.
        """
        self.remove_white_space()
        return len(self.text) + 4 * self.indentlevel

    def minwidth(self):
        """Return the width occupied if wrapped as much as possible.

        That is the longest word plus the indentation, or 0 when there are
        no words at all.
        """
        wordlens = [len(x) for x in self.text.split()]
        if wordlens:
            return max(wordlens) + 4 * self.indentlevel
        return 0

    def format(self, width):
        """Return the text wrapped so as not to exceed width, if possible.

        The first output line starts with the module-level ``unindent``
        marker (right-aligned inside the indentation); the marker is
        cleared afterwards so that only one text block consumes it.
        Raises Exception if called on an item whose type is 'pre', or if
        no stand-in character for hyphens can be found.
        """
        global unindent
        if self.type == 'pre':
            raise Exception("What are we doing here?")
        self.remove_white_space()
        # Quick hack to fix a problem. Do we begin with '* ' or '- '?
        while len(self.text) >= 2 and self.text[1] == ' ' \
                and self.text[0] in '*-':
            # It may be mistaken for a bullet list. Strip it.
            self.text = self.text[2:]
        # Never wrap narrower than the longest word allows.
        if width < self.minwidth():
            width = self.minwidth()
        # The textwrap module has the nasty habit of breaking at hyphens.
        # Work around it: temporarily replace every hyphen with a character
        # that does not occur in the text and restore it after wrapping.
        # A few control characters serve as a last resort so that text which
        # happens to contain every printable candidate can still be wrapped.
        hyphensurrogate = ''
        for c in '!@#$%^&*~' + '\x01\x02\x03\x04':
            if self.text.find(c) < 0:
                hyphensurrogate = c
                break
        if not hyphensurrogate:
            raise Exception("Houston we have a problem")
        text = self.text.replace('-', hyphensurrogate)
        wrapper = textwrap.TextWrapper(
            initial_indent=((4 * self.indentlevel) - len(unindent)) * ' '
            + unindent,
            subsequent_indent=4 * self.indentlevel * ' ',
            width=width, break_long_words=False)
        unindent = ''
        text = wrapper.fill(text)
        text = text.replace(hyphensurrogate, '-')
        return text

    def empty(self):
        """Return true if this item contains nothing."""
        return not self.text

    def remove_white_space(self):
        """Collapse every run of white space to one space and trim the ends."""
        self.text = re.sub(r'\s+', ' ', self.text).strip()

    def canmerge(self):
        """Tell whether this Ditem may be merged with adjacent ones."""
        return True

    def merge(self, aditem):
        """Merge aditem, an adjacent Ditem that comes after this one.

        Returns True when the merge happened and False when either item
        refuses to be merged.
        """
        if not self.canmerge() or not aditem.canmerge():
            return False
        if len(self.text) > 0 and self.text[-1] == '_' \
                and len(aditem.text) > 0 \
                and aditem.text[0] not in """ \n\t:.,!=/|;"'?<>[]{}()""":
            # Leave space after link if not followed by punctuation
            self.text = self.text + ' ' + aditem.text
        else:
            self.text = self.text + aditem.text
        return True
class BlockDitem(Ditem):
    """A Ditem that holds other Ditems instead of text of its own."""

    def __init__(self, type):
        Ditem.__init__(self, '')
        self.children = []  # Contained Ditems
        self.type = type

    def __repr__(self):
        pieces = [self.__class__.__name__, '("', self.type,
                  '"); children = ', repr(self.children)]
        return ''.join(pieces)

    def maxwidth(self):
        """Return the largest maxwidth among the children (0 if none)."""
        widths = [child.maxwidth() for child in self.children]
        if not widths:
            return 0
        return max(widths) or 0

    def minwidth(self):
        """Return the largest minwidth among the children (0 if none)."""
        widths = [child.minwidth() for child in self.children]
        if not widths:
            return 0
        return max(widths) or 0

    def propagate_indents(self):
        """Copy this item's indent level to every child, recursively."""
        for child in self.children:
            child.indentlevel = self.indentlevel
            child.propagate_indents()

    def format(self, width):
        """Format every child and join the non-empty results.

        The results are separated by one blank line.  The width is raised
        to this item's minwidth when it is smaller than that.
        """
        width = max(width, self.minwidth())
        rendered = []
        for child in self.children:
            piece = child.format(width)
            if piece:
                rendered.append(piece)
        return "\n\n".join(rendered)

    def empty(self):
        """Return true if there are no children."""
        return len(self.children) == 0

    def canmerge(self):
        """Block items are never merged with their neighbours."""
        return False
class PreDitem(Ditem):
    """A Ditem representing a literal block; its text is kept verbatim."""

    def maxwidth(self):
        """Return the length of the longest line of the text."""
        return max(map(len, self.text.split('\n')))

    def minwidth(self):
        """Same as maxwidth: a literal block cannot be wrapped."""
        return self.maxwidth()

    def remove_white_space(self):
        """Do nothing; white space is significant in a literal block."""
        pass

    def format(self, width):
        """Return the text as a '::' literal block followed by a '..' line.

        The width argument is not used.
        """
        # NOTE(review): the per-line prefix is a single space in this copy
        # of the source; confirm against upstream that it was not wider.
        body = ''.join([' ' + line + '\n' for line in self.text.split('\n')])
        return '::\n\n' + body + '..\n\n'

    def canmerge(self):
        """Literal blocks are never merged with their neighbours."""
        return False
class HeadingDitem(BlockDitem):
    """A Ditem representing an h1, h2, ..., h9."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def minwidth(self):
        """Same as maxwidth: headings don't wrap."""
        return self.maxwidth()

    def format(self, width):
        """Return the heading text with an underline of the same length.

        The width argument is ignored; the single child is rendered on one
        (practically unlimited) line.  The underline character is chosen by
        the digit in the tag name.
        """
        assert(len(self.children) == 1)
        text = self.children[0].format(32767)
        # The tag name comes from the input document, so parse the digit
        # with int() rather than handing it to eval().
        level = int(self.type[1])
        underliner = "=-`'.~*+^"[level - 1]
        return text + '\n' + len(text) * underliner
class BlockQuoteDitem(BlockDitem):
    """A Ditem representing a blockquote: a block rendered one level deeper."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def propagate_indents(self):
        """Indent this item one more level, then pass the level down."""
        self.indentlevel += 1
        BlockDitem.propagate_indents(self)
class ListDitem(BlockDitem):
    """A Ditem representing an ol, ul, or dl."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        """Number the list items, then format like any other block.

        Every ListItemDitem child is told the type of this list and its
        1-based position among the list items; other children are left
        untouched.
        """
        items = [child for child in self.children
                 if isinstance(child, ListItemDitem)]
        for position, item in enumerate(items):
            item.listtype = self.type
            item.order = position + 1
        return BlockDitem.format(self, width)
class ListItemDitem(BlockDitem):
    """A Ditem representing a li, dt, or dd."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)
        self.order = 0          # Position in the list; set by ListDitem
        self.listtype = None    # Tag of the enclosing list; set by ListDitem

    def minwidth(self):
        """Return the minimum width; a dt is never wrapped."""
        if self.type != 'dt':
            return BlockDitem.minwidth(self)
        return self.maxwidth()

    def propagate_indents(self):
        """Indent li/dd items one more level, then pass the level down."""
        if self.type in ('li', 'ol', 'dd'):
            self.indentlevel += 1
        BlockDitem.propagate_indents(self)

    def format(self, width):
        """Publish the list marker in 'unindent' and format the contents.

        For an li inside an ol the marker is the item number; inside a ul
        it is a bullet.  The first text formatted below this item picks the
        marker up.
        """
        global unindent
        if self.type == 'li':
            if self.listtype == 'ol':
                unindent = ('%d. ' % (self.order)).ljust(4)
            elif self.listtype == 'ul':
                unindent = '* '
        return BlockDitem.format(self, width)
class RenderedColumn:
    """Width information about a table column being rendered."""

    def __init__(self, minwidth, maxwidth):
        self.minwidth = minwidth    # Narrowest width the column can take
        self.maxwidth = maxwidth    # Width needed to avoid any wrapping
        self.curwidth = maxwidth    # Width currently assigned
        self.fixedwidth = 0         # 1 once the column is pinned to minwidth

    def logwidth(self):
        """Return the natural logarithm of maxwidth (0 when maxwidth is 0)."""
        if self.maxwidth == 0:
            return 0
        return math.log(self.maxwidth)

    def update(self, minwidth, maxwidth):
        """Raise minwidth/maxwidth to the given values if those are greater.

        The current width is reset to the (possibly new) maxwidth.
        """
        # max() instead of the "cond and a or b" idiom, which silently
        # selects the old value whenever the new one is 0.
        self.minwidth = max(self.minwidth, minwidth)
        self.maxwidth = max(self.maxwidth, maxwidth)
        self.curwidth = self.maxwidth
class RenderedColumns(UserList.UserList):
    """A list of RenderedColumn."""

    def __init__(self, alist):
        # The given list is used directly as the backing store (not copied).
        self.data = alist

    def totalWidth(self):
        """Return the total table width.

        That is the sum of the current column widths plus one separator
        per column plus one closing separator.
        """
        return sum([z.curwidth for z in self.data]) + len(self.data) + 1

    def sumLogWidth(self):
        """Return the sum of logwidth over the non-fixed columns."""
        return sum([x.logwidth() * (1 - x.fixedwidth) for x in self.data])

    def distributeWidthDifference(self, width):
        """Step 4 of the w3m table rendering algorithm.

        Shrinks the columns so that the table fits in width.  The excess is
        shared among the columns in proportion to their logwidth; a column
        that would drop below its minwidth is pinned to minwidth instead
        and the remaining excess is shared again among the others.  Must
        only be called when the table is wider than width.
        """
        # Note: The use of math.ceil below is because I'd rather have a
        # suboptimal width (a few characters less than requested width)
        # rather than go find what to do with rounding.
        w = self.totalWidth() - width
        assert(w > 0)
        repeat_distribution = 1
        while repeat_distribution:
            repeat_distribution = 0
            logsum = self.sumLogWidth()
            if not logsum:
                # No remaining column has any width to give up; dividing by
                # the zero sum would raise ZeroDivisionError.
                break
            for x in self.data:
                if x.fixedwidth:
                    continue
                if x.curwidth - math.ceil(w * x.logwidth() / logsum) < \
                        x.minwidth:
                    x.curwidth = x.minwidth
                    x.fixedwidth = 1
                    w = self.totalWidth() - width
                    repeat_distribution = 1
                    break
        # Now that we have finished finding which columns need to be fixed
        # to their minimum width, perform the distribution once again,
        # without checking, and actually change the remaining column widths.
        logsum = self.sumLogWidth()
        if not logsum:
            return
        for x in self.data:
            if x.fixedwidth:
                continue
            x.curwidth = x.curwidth - math.ceil(w * x.logwidth() / logsum)
def tablehrule(colwidths, rule='-'):
    """Return a horizontal table separator for the given column widths.

    Each column contributes a run of the rule character as long as its
    width; the runs are separated and enclosed by '+'.
    """
    segments = [rule * colwidth + '+' for colwidth in colwidths]
    return '+' + ''.join(segments)
class TableDitem(BlockDitem):
    """A Ditem representing a table; its children are the rows."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        """Return the table rendered as a grid of '+', '-', '=' and '|'.

        Uses the table rendering algorithm of w3m
        (http://www.w3m.org/story.html), but ignoring the width attribute.
        Returns an empty string for a table without rows or without cells.
        """
        if not self.children:
            return ''
        # Step 1: find the minimum and maximum width of every column.  The
        # number of columns is that of the longest row, so that cells of a
        # row longer than the first one are not lost; shorter rows are
        # padded with blank cells when the rows are formatted.
        ncols = max([len(tr.children) for tr in self.children])
        if ncols == 0:
            return ''
        # A column can't be smaller than 1 character.
        columns = RenderedColumns([RenderedColumn(0, 1)
                                   for i in range(ncols)])
        for tr in self.children:
            for i, cell in enumerate(tr.children):
                columns[i].update(cell.minwidth(), cell.maxwidth())
        # Step 2 (width attribute) ignored
        # Step 3 (already done - columns start out at their maxwidth)
        # Step 4
        if columns.totalWidth() > width:
            columns.distributeWidthDifference(width)
        # OK, column widths are now calculated
        colwidths = [int(x.curwidth) for x in columns]
        parts = [tablehrule(colwidths) + '\n']
        usedheadbodysep = False
        lastrow = self.children[-1]
        for tr in self.children:
            parts.append(tr.format(colwidths))
            rule = '-'
            # The first row that starts with a th and is not the last row
            # of the table is closed with the '=' head/body separator.
            if not usedheadbodysep and tr.children \
                    and tr.children[0].type == 'th' and tr is not lastrow:
                rule = '='
                usedheadbodysep = True
            parts.append(tablehrule(colwidths, rule) + '\n')
        return ''.join(parts)
class TrDitem(BlockDitem):
    """A Ditem representing a table row; its children are the cells."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def maxwidth(self):
        """Return the unwrapped row width, cell separators included."""
        # sum() also copes with a row that has no cells.
        return sum([x.maxwidth() for x in self.children]) \
            + len(self.children) + 1

    def minwidth(self):
        """Return the fully wrapped row width, cell separators included."""
        return sum([x.minwidth() for x in self.children]) \
            + len(self.children) + 1

    def format(self, colwidths):
        """Return the row rendered as lines of '|'-separated cells.

        colwidths is the list of column widths (not a single width as in
        the other Ditems).  Cells missing from this row are rendered blank.
        Every returned line ends with a newline.
        """
        columns = []  # List of lists of lines, one list per column
        for i, colwidth in enumerate(colwidths):
            if i < len(self.children):
                lines = self.children[i].format(colwidth).split('\n')
            else:
                lines = ['']
            # Pad to column width
            columns.append([line.ljust(colwidth) for line in lines])
        # Number of lines in the vertically largest cell
        maxlinecount = max([len(lines) for lines in columns] + [0])
        # Pad vertically
        for lines, colwidth in zip(columns, colwidths):
            lines.extend([' ' * colwidth] * (maxlinecount - len(lines)))
        # Add vertical separators
        rows = []
        for n in range(maxlinecount):
            rows.append('|' + ''.join([lines[n] + '|' for lines in columns])
                        + '\n')
        return ''.join(rows)
def handleNodeList(nodelist):
    """Process the given nodes and return the resulting list of ditems.

    Each node is converted with handleNode.  A converted item is folded
    into the item before it whenever the two can be merged; otherwise the
    item collected so far is stored and the new one takes its place.  The
    last collected item is stored only if it is not empty.
    """
    ditems = []
    pending = Ditem('')
    for node in nodelist:
        candidate = handleNode(node)
        if not pending.merge(candidate):
            ditems.append(pending)
            pending = candidate
    if not pending.empty():
        ditems.append(pending)
    return ditems
def handleNode(node):
    """Convert a DOM node to a Ditem by dispatching on its kind.

    Text nodes and the known elements go to their dedicated handlers.  Any
    other node with children is replaced by its converted contents: the
    single resulting item, an empty Ditem, or a BlockDitem named after the
    node that holds all of them.  A node without children yields an empty
    Ditem.
    """
    if node.nodeType == node.TEXT_NODE:
        return handleText(node)
    name = node.nodeName
    if name == 'a':
        return handleAnchor(node)
    if re.match(r'h\d', name):
        return handleHeading(node)
    if name == 'div' and node.getAttribute('class') == 'cit':  # HARDWIRED
        return handleBlockQuote(node)
    if name in ('body', 'div', 'p', 'td', 'th'):
        return handleGenericBlock(node)
    if name in ('em', 'i'):
        return handleEmphasis(node)
    if name in ('strong', 'b'):
        return handleStrong(node)
    if name in ('ol', 'ul', 'dl'):
        return handleList(node)
    if name in ('li', 'dd', 'dt'):
        return handleListItem(node)
    # Plain equality: "name in ('table')" is a substring test on the string
    # 'table', not a membership test on a one-element tuple.
    if name == 'table':
        return handleTable(node)
    if name == 'tr':
        return handleTr(node)
    if name == 'pre':
        return handlePre(node)
    if node.hasChildNodes():
        contents = handleNodeList(node.childNodes)
        if len(contents) == 1:
            return contents[0]
        if len(contents) == 0:
            return Ditem('')
        result = BlockDitem(name)
        result.children = contents
        return result
    return Ditem('')
def processChildren(node):
    """Return the list of ditems for the children of node.

    The result is always a list (empty when the node has no children), so
    callers that store it as a block's children get one consistent type.
    """
    if node.hasChildNodes():
        return handleNodeList(node.childNodes)
    return []
def mergeChildren(node):
    """Return the children of node as one single Ditem.

    An empty Ditem is returned when processing the children yields
    nothing.  Raises Exception when it yields more than one item, i.e.
    when the children could not all be merged.
    """
    contents = processChildren(node)
    count = len(contents)
    if count > 1:
        raise Exception('Unexpected block elements')
    if count == 0:
        return Ditem('')
    return contents[0]
def handleText(node):
    """Wrap the character data of a text node in a plain Ditem."""
    text = node.data
    return Ditem(text)
def handleAnchor(node):
    """Convert an 'a' element to a Ditem.

    The link text becomes a `text`_ reference and its target is recorded
    in the module-level 'hyperlinks' dictionary under the lower-cased
    text.  The plain text is returned, without link markup, when the text
    is empty, when the href is empty or starts with '#', or when the same
    text has already been recorded with a different target.
    """
    result = mergeChildren(node)
    result.type = node.nodeName
    result.text = result.text.strip()
    if result.text == '':
        return result
    target = node.getAttribute('href').strip()
    if target == "" or target[0] == '#':
        return result  # Ignore internal links
    result.text = re.sub(r'\s+', ' ', result.text)
    key = result.text.lower()
    if key in hyperlinks and hyperlinks[key] != target:
        # The following try-except is a quick hack to ensure that the
        # program will not stop because of problems in the warning
        # mechanism. One such specific problem is a UnicodeEncodeError
        # when result.text contains difficult characters.  Only Exception
        # is caught, so that an interrupt or exit request is not swallowed.
        try:
            warnings.warn("Ignoring second appearance of anchor '"
                          + result.text + "' with different target")
        except Exception:
            pass
        return result
    hyperlinks[key] = target
    result.text = '`' + result.text + '`_'
    return result
def handleHeading(node):
    """Convert an h1..h9 element to a HeadingDitem.

    The merged contents become the heading's only child.  When the
    contents are empty they are returned as they are, without a heading.
    """
    contents = mergeChildren(node)
    if not contents.empty():
        heading = HeadingDitem(node.nodeName)
        heading.children.append(contents)
        return heading
    return contents
def handleEmphasis(node):
    """Convert an 'em' or 'i' element to a Ditem with *emphasis* markup.

    The markers are placed directly around the visible text; white space
    at either end of the contents is kept outside the markers, because
    reStructuredText does not recognise a start marker followed by white
    space or an end marker preceded by it.  Contents that are empty or
    consist of white space only are left unmarked.
    """
    result = mergeChildren(node)
    result.type = node.nodeName
    text = result.text
    core = text.strip()
    if core:
        leading = text[:len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]
        result.text = leading + '*' + core + '*' + trailing
    return result
def handleStrong(node):
    """Convert a 'strong' or 'b' element to a Ditem with **strong** markup.

    The markers are placed directly around the visible text; white space
    at either end of the contents is kept outside the markers, because
    reStructuredText does not recognise a start marker followed by white
    space or an end marker preceded by it.  Contents that are empty or
    consist of white space only are left unmarked.
    """
    result = mergeChildren(node)
    result.type = node.nodeName
    text = result.text
    core = text.strip()
    if core:
        leading = text[:len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]
        result.text = leading + '**' + core + '**' + trailing
    return result
def handleGenericBlock(node):
    """Convert a plain block element to a BlockDitem holding its children."""
    block = BlockDitem(node.nodeName)
    block.children = processChildren(node)
    return block
def handleBlockQuote(node):
    """Convert a quotation element to a BlockQuoteDitem holding its children."""
    quote = BlockQuoteDitem(node.nodeName)
    quote.children = processChildren(node)
    return quote
def handleList(node):
    """Convert an ol, ul or dl element to a ListDitem holding its children."""
    listing = ListDitem(node.nodeName)
    listing.children = processChildren(node)
    return listing
def handleListItem(node):
    """Convert a li, dd or dt element to a ListItemDitem with its children."""
    item = ListItemDitem(node.nodeName)
    item.children = processChildren(node)
    return item
def handleTable(node):
    """Convert a table element to a TableDitem.

    Only the converted children whose type is 'tr' are kept as rows;
    anything else found directly inside the table is ignored.
    """
    table = TableDitem(node.nodeName)
    rows = []
    for child in processChildren(node):
        if child.type == 'tr':
            rows.append(child)
    table.children = rows
    return table
def handleTr(node):
    """Convert a tr element to a TrDitem.

    Only the converted children whose type is 'th' or 'td' are kept as
    cells; anything else found directly inside the row is ignored.
    """
    row = TrDitem(node.nodeName)
    cells = []
    for child in processChildren(node):
        if child.type in ('th', 'td'):
            cells.append(child)
    row.children = cells
    return row
def handlePre(node):
    """Convert a pre element to a PreDitem carrying its merged text."""
    merged = mergeChildren(node)
    return PreDitem(merged.text)
# Driver: parse the xhtml file named by the first command line argument,
# convert its first body element and write the result to standard output
# as UTF-8.
dom1 = xml.dom.minidom.parse(sys.argv[1])
ditem = handleNode(dom1.getElementsByTagName("body")[0])
ditem.propagate_indents()

(utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup('utf-8')
outf = utf8_writer(sys.stdout)

# The document itself, formatted for 79 columns ...
outf.write(ditem.format(79) + '\n')
# ... followed by one hyperlink target per collected anchor text.
for h in hyperlinks.keys():
    outf.write(''.join(['\n.. _`', h, '`:\n ', hyperlinks[h], '\n']))