xhtml2rest - Convert xhtml to reStructuredText

xhtml2rest *xhtmlfile* > *restfile*
``xhtml2rest``, which, far from being a decent and complete program, is
only something to begin with, hopefully processes the given UTF-8
xhtml file and produces reStructuredText "source code" on the standard
output. If your input is html and/or not in UTF-8, you can convert it
to UTF-8 xhtml using ``iconv`` and ``tidy``:

    iconv -f *source_encoding* -t utf-8 *source_html* > *html_utf8*

    tidy -utf8 -asxml -o *xhtmlfile* *html_utf8*

    xhtml2rest *xhtmlfile* > *restfile*
Interestingly, since reStructuredText is not simple markup, but has
very strict rules with the intention that the source is perfectly
readable, it turns out that converting html to reStructuredText is
actually *rendering*. ``xhtml2rest`` is a small rendering engine. Since
I had no time to study how existing rendering engines work, I had to
reinvent the wheel. So although the code is clean (I actually wrote it
twice), I doubt that the core logic is adequate for future extensions.
But it's better than nothing. There is some documentation in the code,
but feel free to email me if you need more explanations.
I created ``xhtml2rest`` for a very specific job. It does that job
correctly, but for your web page it might not work. It should not be
very hard, however, either to improve the code, or to determine what
it is in your web page that confuses ``xhtml2rest`` and remove it.

Other than that, there are the following limitations:
* No multi-col or -row spans in tables

* No support for \<br>

* Not tested in nested tables (check http://www.w3m.org/story.html)

* \<th> support is quick and dirty

* If the same anchor text is met twice, the anchor is ignored

* No indented \<pre> elements (but I'm not sure the HTML standard
  allows them)

* The word HARDWIRED in the code indicates a hardwired hack which is
  specific to the job I wanted ``xhtml2rest`` to do.
``xhtml2rest`` was created by Antonios Christofides,
anthony@itia.ntua.gr, May-June 2005.

The code and this text are hereby placed in the public domain.
import codecs
import math
import re
import sys
import textwrap
import UserList
import warnings
import xml.dom.minidom
###############################################################################
# Global variables. I know. I'm terribly sorry. Please get rid of them.

# 'unindent' is used by list items. A li list item is always indented, but its
# first line is "unindented" and contains the number or bullet. However, it was
# difficult for the li node to tell its #text contents (which may be deeply
# nested) to use that. So it just places the number or bullet, which must be 4
# characters, like "1.  ", in "unindent". The first text to be rendered uses
# the unindent and then sets it to empty again.
# NOTE(review): the initialisation below was not visible in the damaged
# source; it is required because Ditem.format reads the global before any
# list item has assigned it.
unindent = ''

hyperlinks = {}  # text-target pairs found in "a href" elements
###############################################################################
class Ditem:
    """A document item; usually a node, but can be a block of text
    resulting from processing adjacent inline items. If it is a node,
    it is usually the BlockDitem subclass; if it is text, it is
    normally a plain Ditem."""

    def __init__(self, text):
        self.text = text        # Contained text (empty for BlockDitem)
        self.type = ''          # tag for block node, empty for inline
        self.indentlevel = 0    # 0 - unindented; 1 - indented; etc.

    def __repr__(self):
        return self.__class__.__name__ + '("""' + self.text + '""")'

    def propagate_indents(self):
        "Propagates indent level recursively to children"
        # A plain text item has no children, so there is nothing to do.
        pass

    def maxwidth(self):
        "Width it will occupy if allowed to render on infinite width"
        self.remove_white_space()
        return len(self.text) + 4 * self.indentlevel

    def minwidth(self):
        "Width it will occupy if wrapped as much as possible"
        wordlens = [len(word) for word in self.text.split()]
        if wordlens:
            return max(wordlens) + 4 * self.indentlevel
        return 0

    def format(self, width):
        """Returns contents formatted so as not to exceed specified
        width, if possible.

        Consumes the module-level 'unindent' marker: it is used as the
        tail of the first line's indentation and then reset to ''.
        Raises Exception for 'pre' items and when the text contains a
        hyphen but no surrogate character is available."""
        global unindent
        if self.type == 'pre':
            raise Exception("What are we doing here?")
        self.remove_white_space()
        # Quick hack to fix a problem. Do we begin with '* '?
        while (len(self.text) >= 2 and self.text[1] == ' '
               and self.text[0] in '*-'):
            # It may be mistaken for a bullet list. Strip it.
            self.text = self.text[2:]
        if width < self.minwidth():
            width = self.minwidth()
        # The textwrap module has the nasty habit of breaking at hyphens. So
        # we'll do a nasty hack: find a character that does not exist in the
        # text, replace all hyphens with that character, wrap, and put the
        # hyphens back. A surrogate is only needed when there is a hyphen.
        text = self.text
        hyphensurrogate = ''
        if '-' in text:
            for c in '!@#$%^&*~':
                if c not in text:
                    hyphensurrogate = c
                    break
            if not hyphensurrogate:
                raise Exception("Houston we have a problem")
            text = text.replace('-', hyphensurrogate)
        wrapper = textwrap.TextWrapper(
            initial_indent=((4 * self.indentlevel) - len(unindent)) * ' '
                           + unindent,
            subsequent_indent=4 * self.indentlevel * ' ',
            width=width, break_long_words=False)
        # The bullet/number has been used; later text must not repeat it.
        unindent = ''
        text = wrapper.fill(text)
        if hyphensurrogate:
            text = text.replace(hyphensurrogate, '-')
        return text

    def empty(self):
        "Returns true if contains nothing"
        return not self.text

    def remove_white_space(self):
        "Removes extra white space"
        self.text = re.sub(r'\s+', ' ', self.text).strip()

    def canmerge(self):
        "Tells whether it's possible to merge this Ditem with adjacent ones"
        return True

    def merge(self, aditem):
        """If possible, merges aditem, which should be an adjacent Ditem that
        comes after this one. Returns True if merged, False otherwise."""
        if not self.canmerge() or not aditem.canmerge():
            return False
        if (len(self.text) > 0 and self.text[-1] == '_'
                and len(aditem.text) > 0
                and aditem.text[0] not in """ \n\t:.,!=/|;"'?<>[]{}()"""):
            # Leave space after link if not followed by punctuation
            self.text = self.text + ' ' + aditem.text
        else:
            self.text = self.text + aditem.text
        return True
class BlockDitem(Ditem):
    "A Ditem which contains other Ditems"

    def __init__(self, type):
        Ditem.__init__(self, '')
        self.type = type        # Tag name of the node this block stands for
        self.children = []      # Contained Ditems

    def __repr__(self):
        return (self.__class__.__name__ + '("' + self.type
                + '"); children = ' + repr(self.children))

    def maxwidth(self):
        "Largest maxwidth among the children; 0 if there are none"
        return max([x.maxwidth() for x in self.children] or [0])

    def minwidth(self):
        "Largest minwidth among the children; 0 if there are none"
        return max([x.minwidth() for x in self.children] or [0])

    def propagate_indents(self):
        "Copies this block's indent level to every child, recursively"
        for child in self.children:
            child.indentlevel = self.indentlevel
            child.propagate_indents()

    def format(self, width):
        """Formats every child to the given width (widened to minwidth()
        if needed) and joins the non-empty results with blank lines."""
        width = max(width, self.minwidth())
        results = [child.format(width) for child in self.children]
        return "\n\n".join([r for r in results if r])

    def empty(self):
        "Returns true if there are no children"
        return not self.children

    def canmerge(self):
        "Blocks are never merged with adjacent items"
        return False
class PreDitem(Ditem):
    "A Ditem representing a literal block"

    def maxwidth(self):
        "Length of the longest line of the literal text"
        return max([len(line) for line in self.text.split('\n')])

    def minwidth(self):
        return self.maxwidth()              # Literal block; width's given

    def remove_white_space(self):
        "White space is significant in a literal block; leave it alone"
        pass

    def format(self, width):
        """Returns the text as a reStructuredText literal block; 'width'
        is ignored because literal text is never wrapped."""
        # NOTE(review): the opening '::' marker line was not visible in the
        # damaged source and the indent width may have been collapsed by
        # the extraction -- confirm both against the pristine file.
        result = '::\n\n'
        for line in self.text.split('\n'):
            result = result + ' ' + line + '\n'
        # An empty comment terminates the literal block.
        result = result + '..\n\n'
        return result

    def canmerge(self):
        "Literal blocks are never merged with adjacent items"
        return False
class HeadingDitem(BlockDitem):
    "A Ditem representing an h1, h2, ..., h9"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def minwidth(self):
        return self.maxwidth()              # Headings don't wrap

    def format(self, width):
        """Returns the heading text on one line, underlined with the
        character assigned to its level; 'width' is ignored."""
        assert(len(self.children) == 1)
        text = self.children[0].format(32767)
        # The level is the digit of the tag name ('h2' -> 2). int() is used
        # rather than eval() so the tag name is never executed as code.
        level = int(self.type[1])
        underliner = "=-`'.~*+^"[level - 1]
        return text + '\n' + len(text) * underliner
class BlockQuoteDitem(BlockDitem):
    "A Ditem representing a blockquote"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def propagate_indents(self):
        "Indents this block one level deeper, then passes that to children"
        self.indentlevel = self.indentlevel + 1
        BlockDitem.propagate_indents(self)
class ListDitem(BlockDitem):
    "A Ditem representing an ol, ul, or dl"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        "Tells each list item its list type and position, then formats"
        # First pass the list type and order to the children
        order = 1
        for child in self.children:
            if isinstance(child, ListItemDitem):
                child.listtype = self.type
                child.order = order
                order = order + 1
        # And then process normally
        return BlockDitem.format(self, width)
class ListItemDitem(BlockDitem):
    "A Ditem representing a li, dt, or dd"

    def __init__(self, type):
        BlockDitem.__init__(self, type)
        # Both are overwritten by the enclosing ListDitem before formatting.
        self.listtype = None    # 'ol', 'ul' or 'dl'
        self.order = 0          # 1-based position among the list's items

    def minwidth(self):
        if self.type == 'dt':
            return self.maxwidth()          # Don't wrap dt
        return BlockDitem.minwidth(self)

    def propagate_indents(self):
        "Indents li/dd items one level deeper, then passes that to children"
        if self.type in ('li', 'ol', 'dd'):
            self.indentlevel = self.indentlevel + 1
        BlockDitem.propagate_indents(self)

    def format(self, width):
        """Places the item's number or bullet in the global 'unindent', to
        be picked up by the first text rendered, then formats normally."""
        global unindent
        if self.type == 'li' and self.listtype == 'ol':
            unindent = ('%d. ' % (self.order)).ljust(4)
        elif self.type == 'li' and self.listtype == 'ul':
            # NOTE(review): the bullet assignment was not visible in the
            # damaged source; '*' padded to the 4 columns of one indent
            # level is assumed -- confirm against the pristine file.
            unindent = '* '.ljust(4)
        return BlockDitem.format(self, width)
class RenderedColumn:
    "Width information about a column being rendered"

    def __init__(self, minwidth, maxwidth):
        self.minwidth = minwidth
        self.maxwidth = maxwidth
        self.curwidth = maxwidth    # Width currently assigned to the column
        self.fixedwidth = 0         # 1 once pinned to its minimum width

    def logwidth(self):
        "Natural logarithm of maxwidth; 0 for a zero-width column"
        if self.maxwidth == 0:
            return 0
        return math.log(self.maxwidth)

    def update(self, minwidth, maxwidth):
        "Replaces minwidth/maxwidth if greater"
        self.minwidth = max(self.minwidth, minwidth)
        self.maxwidth = max(self.maxwidth, maxwidth)
        self.curwidth = self.maxwidth
class RenderedColumns(UserList.UserList):
    "A list of RenderedColumn"

    def __init__(self, alist):
        UserList.UserList.__init__(self, alist)

    def totalWidth(self):
        "Returns total table width"
        # NOTE(review): the border term (one separator per column plus one)
        # was on a line missing from the damaged source; it mirrors
        # TrDitem.maxwidth -- confirm against the pristine file.
        return sum([col.curwidth for col in self.data]) + len(self.data) + 1

    def sumLogWidth(self):
        "Returns sum of logwidth for nonfixed columns"
        return sum([col.logwidth() * (1 - col.fixedwidth)
                    for col in self.data])

    def _share(self, column, excess):
        "Part of 'excess' that 'column' must give up (rounded up)"
        total = self.sumLogWidth()
        if not total:
            # Every remaining column has zero log-width, so there is no
            # basis for a proportional split; avoid dividing by zero.
            return 0
        return math.ceil(excess * column.logwidth() / total)

    def distributeWidthDifference(self, width):
        "Step 4 of w3m table rendering algorithm"
        # Note: The use of math.ceil is because I'd rather have a
        # suboptimal width (a few characters less than requested width) rather
        # than go find what to do with rounding.
        w = self.totalWidth() - width
        repeat_distribution = 1
        while repeat_distribution:
            repeat_distribution = 0
            for col in self.data:
                if col.fixedwidth:
                    continue
                if col.curwidth - self._share(col, w) < col.minwidth:
                    # This column cannot shrink by its share: pin it to its
                    # minimum and start over with the remaining columns.
                    col.curwidth = col.minwidth
                    col.fixedwidth = 1
                    w = self.totalWidth() - width
                    repeat_distribution = 1
                    break
        # Now that we have finished finding which columns need to be fixed to
        # their minimum width, perform the distribution once again, without
        # checking, and actually change remaining column widths
        for col in self.data:
            if col.fixedwidth:
                continue
            col.curwidth = col.curwidth - self._share(col, w)
def tablehrule(colwidths, rule='-'):
    """Returns a horizontal table separator for given column widths.

    Each column contributes 'rule' repeated to its width; columns are
    delimited (and the line is framed) by '+'."""
    return '+' + ''.join([rule * width + '+' for width in colwidths])
class TableDitem(BlockDitem):
    "A Ditem representing a table; its children are the table's rows"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        """Returns the table as a reStructuredText grid table that tries
        not to exceed 'width'. A table without rows renders as ''."""
        # Uses table rendering algorithm of w3m
        # (http://www.w3m.org/story.html), but ignoring width attribute
        if not self.children:
            return ''
        # Step 1: the first row determines the number of columns
        columns = RenderedColumns([
            RenderedColumn(
                cell.minwidth(),
                max(cell.maxwidth(), 1)  # A column can't be smaller than 1
            ) for cell in self.children[0].children])
        for row in self.children:
            # zip() stops at the shorter sequence, so rows with fewer cells
            # leave the remaining columns untouched and extra cells are
            # ignored.
            for column, cell in zip(columns, row.children):
                column.update(cell.minwidth(), cell.maxwidth())
        # Step 2 (width attribute) ignored
        # Step 3 (already done - list was created with maxwidth)
        # Step 4
        if columns.totalWidth() > width:
            columns.distributeWidthDifference(width)
        # OK, column widths are now calculated
        colwidths = [int(column.curwidth) for column in columns]
        result = tablehrule(colwidths) + '\n'
        usedheadbodysep = False
        for tr in self.children:
            result = result + tr.format(colwidths)
            rule = '-'
            # The first header row that is not the last row of the table is
            # followed by the '=' head/body separator.
            if not usedheadbodysep and tr.children \
                    and tr.children[0].type == 'th' \
                    and tr != self.children[-1]:
                rule = '='
                usedheadbodysep = True
            result = result + tablehrule(colwidths, rule) + '\n'
        return result
class TrDitem(BlockDitem):
    "A Ditem representing a table row; its children are the row's cells"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def maxwidth(self):
        "Sum of the cells' maxwidths plus one border per cell plus one"
        return sum([x.maxwidth() for x in self.children]) \
            + len(self.children) + 1

    def minwidth(self):
        "Sum of the cells' minwidths plus one border per cell plus one"
        return sum([x.minwidth() for x in self.children]) \
            + len(self.children) + 1

    def format(self, colwidths):
        """Returns the row rendered as text lines, each framed and divided
        by '|'. Unlike other Ditems, takes a list of column widths."""
        columns = []            # List of lists of lines
        maxlinecount = 0        # Num of lines in vertically largest column
        for i, colwidth in enumerate(colwidths):
            if len(self.children) <= i:
                lines = ['']    # Row has fewer cells than the table
            else:
                lines = self.children[i].format(colwidth).split('\n')
            lines = [x.ljust(colwidth) for x in lines]  # Pad to col len
            maxlinecount = max(maxlinecount, len(lines))
            columns.append(lines)
        # Pad vertically so that every column has the same number of lines
        for i, colwidth in enumerate(colwidths):
            missing = maxlinecount - len(columns[i])
            columns[i].extend([' ' * colwidth] * missing)
        # Add vertical separators
        result = ''
        for i in range(maxlinecount):
            cells = ''.join([column[i] + '|' for column in columns])
            result = result + '|' + cells + '\n'
        return result
def handleNodeList(nodelist):
    "Processes given nodes; merges them if possible; returns ditem list"
    ditems = []
    # Accumulator into which adjacent mergeable (inline) items are folded.
    curditem = Ditem('')
    for node in nodelist:
        aditem = handleNode(node)
        if curditem.merge(aditem):
            continue
        # Could not merge: the accumulated item is complete; start anew.
        ditems.append(curditem)
        curditem = aditem
    if not curditem.empty():
        ditems.append(curditem)
    return ditems
def handleNode(node):
    """Converts a DOM node to a Ditem by dispatching on its node name.

    Elements without a dedicated handler are replaced by their contents:
    the single resulting item, an empty Ditem, or a generic BlockDitem
    holding all resulting items."""
    if node.nodeType == node.TEXT_NODE:
        return handleText(node)
    elif node.nodeName == 'a':
        return handleAnchor(node)
    elif re.match(r'h\d', node.nodeName):
        return handleHeading(node)
    elif node.nodeName == 'div' and node.getAttribute('class') == 'cit':
        # HARDWIRED
        return handleBlockQuote(node)
    elif node.nodeName in ('body', 'div', 'p', 'td', 'th'):
        return handleGenericBlock(node)
    elif node.nodeName in ('em', 'i'):
        return handleEmphasis(node)
    elif node.nodeName in ('strong', 'b'):
        return handleStrong(node)
    elif node.nodeName in ('ol', 'ul', 'dl'):
        return handleList(node)
    elif node.nodeName in ('li', 'dd', 'dt'):
        return handleListItem(node)
    # The next three are compared with == on purpose: "x in ('table')" is a
    # substring test against the string 'table', not tuple membership.
    elif node.nodeName == 'table':
        return handleTable(node)
    elif node.nodeName == 'tr':
        return handleTr(node)
    elif node.nodeName == 'pre':
        return handlePre(node)
    elif node.hasChildNodes():
        contents = handleNodeList(node.childNodes)
        if len(contents) == 1:
            return contents[0]
        if len(contents) == 0:
            return Ditem('')
        result = BlockDitem(node.nodeName)
        result.children = contents
        return result
    return Ditem('')
def processChildren(node):
    "Returns the list of Ditems for node's children; empty if it has none"
    if node.hasChildNodes():
        return handleNodeList(node.childNodes)
    # NOTE(review): the else-branch was not visible in the damaged source;
    # an empty list is assumed because callers take len() of the result
    # and store it as a children list.
    return []
def mergeChildren(node):
    """Returns the single Ditem that node's children merge into, or an
    empty Ditem if there are none. Raises Exception if the children do
    not merge into one item (i.e. they include block elements)."""
    contents = processChildren(node)
    if len(contents) > 1:
        raise Exception('Unexpected block elements')
    if contents:
        return contents[0]
    return Ditem('')
def handleText(node):
    "Returns a plain Ditem holding the text node's data"
    return Ditem(node.data)
def handleAnchor(node):
    """Returns the anchor's text as a Ditem, marked up as a named
    hyperlink reference when it has an external target.

    The target is recorded in the global 'hyperlinks' under the
    lower-cased link text. If the same text was already recorded with a
    different target, a warning is issued and the text is left plain."""
    result = mergeChildren(node)
    result.type = node.nodeName
    result.text = result.text.strip()
    if result.text == '':
        return result
    target = node.getAttribute('href').strip()
    if target == "" or target[0] == '#':
        return result                       # Ignore internal links
    result.text = re.sub(r'\s+', ' ', result.text)
    key = result.text.lower()
    if key in hyperlinks and hyperlinks[key] != target:
        # The following try-except is a quick hack to ensure that the
        # program will not stop because of problems in the warning
        # mechanism. One such specific problem is a UnicodeEncodeError
        # when result.text contains difficult characters.
        try:
            warnings.warn("Ignoring second appearance of anchor '"
                          + result.text + "' with different target")
        except Exception:
            pass
    else:
        hyperlinks[key] = target
        result.text = '`' + result.text + '`_'
    return result
def handleHeading(node):
    """Returns a HeadingDitem wrapping the heading's merged contents, or
    the (empty) contents themselves when the heading has no text."""
    contents = mergeChildren(node)
    if contents.empty():
        return contents
    result = HeadingDitem(node.nodeName)
    result.children.append(contents)
    return result
def handleEmphasis(node):
    "Returns the node's merged contents wrapped in *emphasis* markup"
    result = mergeChildren(node)
    result.type = node.nodeName
    # NOTE(review): this guard sits on a line missing from the damaged
    # source; it is assumed so that an empty element does not yield a bare
    # '**' -- confirm against the pristine file.
    if result.text:
        result.text = '*' + result.text + '*'
    return result
def handleStrong(node):
    "Returns the node's merged contents wrapped in **strong** markup"
    result = mergeChildren(node)
    result.type = node.nodeName
    # NOTE(review): this guard sits on a line missing from the damaged
    # source; it is assumed so that an empty element does not yield a bare
    # '****' -- confirm against the pristine file.
    if result.text:
        result.text = '**' + result.text + '**'
    return result
def handleGenericBlock(node):
    "Returns a BlockDitem named after the node, holding its children"
    result = BlockDitem(node.nodeName)
    result.children = processChildren(node)
    return result
def handleBlockQuote(node):
    "Returns a BlockQuoteDitem named after the node, holding its children"
    result = BlockQuoteDitem(node.nodeName)
    result.children = processChildren(node)
    return result
def handleList(node):
    "Returns a ListDitem named after the node, holding its children"
    result = ListDitem(node.nodeName)
    result.children = processChildren(node)
    return result
def handleListItem(node):
    "Returns a ListItemDitem named after the node, holding its children"
    result = ListItemDitem(node.nodeName)
    result.children = processChildren(node)
    return result
def handleTable(node):
    "Returns a TableDitem whose children are the node's tr children"
    result = TableDitem(node.nodeName)
    # Ignore table contents that are not tr
    result.children = [x for x in processChildren(node) if x.type == 'tr']
    return result
def handleTr(node):
    "Returns a TrDitem whose children are the node's th/td children"
    result = TrDitem(node.nodeName)
    # Ignore tr contents that are not th or td
    result.children = [x for x in processChildren(node)
                       if x.type in ('th', 'td')]
    return result
def handlePre(node):
    "Returns a PreDitem holding the merged text of the node's contents"
    return PreDitem(mergeChildren(node).text)
# Main program: parse the xhtml file named by the first command-line
# argument, render its body to at most 79 columns, and write the result,
# followed by the collected hyperlink targets, to standard output as UTF-8.
dom1 = xml.dom.minidom.parse(sys.argv[1])
ditem = handleNode(dom1.getElementsByTagName("body")[0])
ditem.propagate_indents()
(utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup('utf-8')
outf = utf8_writer(sys.stdout)
outf.write(ditem.format(79) + '\n')
for h in hyperlinks:
    # NOTE(review): the indent before the target may have been collapsed
    # by the extraction -- confirm against the pristine file.
    outf.write('\n.. _`' + h + '`:\n ' + hyperlinks[h] + '\n')