sandbox/xhtml2rest/xhtml2rest.py

   1 #!/usr/bin/python
   2 """
   3 NAME
   4 ====
   5
   6 xhtml2rest - Convert xhtml to reStructuredText
   7
   8 SYNOPSIS
   9 ========
  10
  11 xhtml2rest *xhtmlfile* > *restfile*
  12
  13 DESCRIPTION
  14 ===========
  15
  16 ``xhtml2rest``, which, far from being a decent and complete program, is
  17 only something to begin with, hopefully processes the given UTF-8
  18 xhtml file and produces reStructuredText "source code" in the standard
  19 output.  If your input is html and/or not in UTF-8, you can convert it
  20 to UTF-8 xhtml using ``iconv`` and ``tidy``:
  21
  22     iconv -f *source_encoding* -t utf-8 *source_html* > *html_utf8*
  23
  24     tidy -utf8 -asxml -o *xhtmlfile* *html_utf8*
  25
  26     xhtml2rest *xhtmlfile* > *restfile*
  27
  28 Interestingly, since reStructuredText is not simple markup, but has
  29 very strict rules with the intention that the source is perfectly
  30 readable, it turns out that converting html to reStructuredText is
  31 actually *rendering*. ``xhtml2rest`` is a small rendering engine. Since
  32 I had no time to study how existing rendering engines work, I had to
  33 reinvent the wheel. So although the code is clean (I actually wrote it
  34 twice), I doubt that the core logic is adequate for future extensions.
  35 But it's better than nothing. There is some documentation in the code,
  36 but feel free to email me if you need more explanations.
  37
  38 LIMITATIONS
  39 ===========
  40
  41 I created ``xhtml2rest`` for a very specific job. It does that job
  42 correctly, but for your web page it might not work. It should not be
  43 very hard, however, either to improve the code, or to determine what
  44 it is in your web page that confuses ``xhtml2rest`` and remove it.
  45
  46 Other than that, there are the following limitations:
  47
  48 * No indented tables
  49
  50 * No multi-col or -row spans in tables
  51
  52 * No support for \<br>
  53
  54 * Not tested in nested tables (check http://www.w3m.org/story.html)
  55
  56 * \<th> support is quick and dirty
  57
  58 * If the same anchor text is met twice, the anchor is ignored
  59
  60 * No indented \<pre> elements (but I'm not sure the HTML standard
  61   allows them)
  62
  63 * Images are ignored
  64
  65 * The word HARDWIRED in the code indicates a hardwired hack which is
  66   specific to the job I wanted ``xhtml2rest`` to do.
  67
  68 META
  69 ====
  70
  71 ``xhtml2rest`` was created by Antonios Christofides,
  72 anthony@itia.ntua.gr, May-June 2005.
  73
  74 Revision: $Revision$
  75
  76 The code and this text are hereby placed in the public domain.
  77 """
  78
  79 import xml.dom.minidom
  80 import re
  81 import sys
  82 import textwrap
  83 import math
  84 import UserList
  85 import warnings
  86 import codecs
  87
  88 ###############################################################################
  89 # Global variables. I know. I'm terribly sorry. Please get rid of them.
  90
  91 # 'unindent' is used by list items. A li list item is always indented, but its
  92 # first line is "unindented" and contains the number or bullet. However, it was
  93 # difficult for the li node to tell its #text contents (which may be deeply
  94 # nested) to use that.  So it just places the number or bullet, which must be 4
  95 # characters, like " 1. ", in "unindent". The first text to be rendered uses
  96 # the unindent and then sets it to empty again.
  97
  98 unindent = ''
  99 hyperlinks = {} # text-target pairs found in "a href" elements
 100 ###############################################################################
 101
 102 class Ditem:
 103     """A document item; usually a node, but can be a block of text
 104     resulting from processing adjacent inline items. If it is a node,
 105     it is usually the BlockDitem subclass; if it is text, it is
 106     normally a plain Ditem."""
 107     def __init__(self, text):
 108         self.text = text    # Contained text (empty for BlockDitem)
 109         self.type = ''      # tag for block node, empty for inline
 110         self.indentlevel = 0  # 0 - unindented; 1 - indented; etc.
 111     def __repr__(self):
 112         return self.__class__.__name__+'("""'+self.text+'""")'
 113     def propagate_indents(self):
 114         "Propagates indent level recursively to children"
 115         pass
 116     def maxwidth(self):
 117         "Width it will occupy if allowed to render on infinite width"
 118         self.remove_white_space()
 119         return len(self.text) + 4*self.indentlevel
 120     def minwidth(self):
 121         "Width it will occupy if wrapped as much as possible"
 122         wordlens = [len(x) for x in self.text.split()]
 123         if wordlens: return max(wordlens) + 4*self.indentlevel
 124         else: return 0
 125     def format(self, width):
 126         """Returns contents formatted so as not to exceed specified
 127         width, if possible"""
 128         global unindent
 129         if(self.type=='pre'): raise Exception, "What are we doing here?"
 130         self.remove_white_space()
 131         # Quick hack to fix a problem. Do we begin with '* '?
 132         while len(self.text)>=2 and self.text[1]==' ' and self.text[0] in '*-':
 133             # It may be mistaken for a bullet list. Strip it.
 134             self.text = self.text[2:]
 135         if width < self.minwidth(): width = self.minwidth()
 136         # The textwrap module has the nasty habit of breaking at hyphens. So
 137         # we'll do a nasty hack: find a character that does not exist in the
 138         # text, replace all hyphens with that character, ok, you get the point.
 139         hyphensurrogate = ''
 140         for c in '!@#$%^&*~':
 141             if self.text.find(c)<0:
 142                 hyphensurrogate = c
 143                 break
 144         if not hyphensurrogate: raise Exception, "Houston we have a problem"
 145         text = self.text.replace('-', hyphensurrogate)
 146         wrapper = textwrap.TextWrapper(
 147             initial_indent=((4*self.indentlevel)-len(unindent))*' '+unindent,
 148             subsequent_indent=4*self.indentlevel*' ',
 149             width=width, break_long_words = False)
 150         unindent = ''
 151         text = wrapper.fill(text)
 152         text = text.replace(hyphensurrogate, '-')
 153         return text
 154     def empty(self):
 155         "Returns true if contains nothing"
 156         return not self.text
 157     def remove_white_space(self):
 158         "Removes extra white space"
 159         self.text = re.sub('\s+', ' ', self.text).strip()
 160     def canmerge(self):
 161         "Tells whether it's possible to merge this Ditem with adjacent ones"
 162         return True
 163     def merge(self, aditem):
 164         """If possible, merges aditem, which should be an adjacent Ditem that
 165         comes after this one."""
 166         if not self.canmerge() or not aditem.canmerge(): return False
 167         if len(self.text)>0 and self.text[-1] == '_' and len(aditem.text)>0 \
 168             and aditem.text[0] not in """ \n\t:.,!=/|;"'?<>[]{}()""":
 169             # Leave space after link if not followed by punctuation
 170             self.text = self.text + ' ' + aditem.text
 171         else:
 172             self.text = self.text + aditem.text
 173         return True
 174
 175 class BlockDitem(Ditem):
 176     "A Ditem which contains other Ditems"
 177     def __init__(self, type):
 178         Ditem.__init__(self, '')
 179         self.type = type
 180         self.children = []  # Contained Ditems
 181     def __repr__(self):
 182         return self.__class__.__name__+'("'+self.type+'"); children = '+repr(self.children)
 183     def maxwidth(self):
 184         childmaxwidths = [x.maxwidth() for x in self.children]
 185         return childmaxwidths and max(childmaxwidths) or 0
 186     def minwidth(self):
 187         childminwidths = [x.minwidth() for x in self.children]
 188         return childminwidths and max(childminwidths) or 0
 189     def propagate_indents(self):
 190         for x in self.children:
 191             x.indentlevel = self.indentlevel
 192             x.propagate_indents()
 193     def format(self, width):
 194         if width < self.minwidth(): width = self.minwidth()
 195         results = [x.format(width) for x in self.children]
 196         results = [x for x in results if x]
 197         return "\n\n".join(results)
 198     def empty(self):
 199         return not (self.children)
 200     def canmerge(self):
 201         return False
 202
 203 class PreDitem(Ditem):
 204     "A Ditem representing a literal block"
 205     def maxwidth(self):
 206         return max([len(x) for x in self.text.split('\n')])
 207     def minwidth(self):
 208         return self.maxwidth() # Literal block; width's given
 209     def remove_white_space(self):
 210         pass
 211     def format(self, width):
 212         result = '::\n\n'
 213         for x in self.text.split('\n'):
 214             result = result + '    ' + x + '\n'
 215         result = result + '..\n\n'
 216         return result
 217     def canmerge(self):
 218         return False
 219
 220 class HeadingDitem(BlockDitem):
 221     "A Ditem representing an h1, h2, ..., h9"
 222     def __init__(self, type):
 223         BlockDitem.__init__(self, type)
 224     def minwidth(self):
 225         return self.maxwidth()  # Headings don't wrap
 226     def format(self, width):
 227         assert(len(self.children)==1)
 228         text = self.children[0].format(32767)
 229         level = eval(self.type[1])
 230         underliner = "=-`'.~*+^"[level-1]
 231         return text + '\n' + len(text)*underliner
 232
 233 class BlockQuoteDitem(BlockDitem):
 234     "A Ditem representing a blockquote"
 235     def __init__(self, type):
 236         BlockDitem.__init__(self, type)
 237     def propagate_indents(self):
 238         self.indentlevel = self.indentlevel + 1
 239         BlockDitem.propagate_indents(self)
 240
 241 class ListDitem(BlockDitem):
 242     "A Ditem representing an ol, ul, or dl"
 243     def __init__(self, type):
 244         BlockDitem.__init__(self, type)
 245     def format(self, width):
 246         # First pass the list type and order to the children
 247         order = 1
 248         for x in self.children:
 249             if isinstance(x, ListItemDitem):
 250                 x.listtype = self.type
 251                 x.order = order
 252                 order = order+1
 253         # And then process normally
 254         return BlockDitem.format(self, width)
 255
 256 class ListItemDitem(BlockDitem):
 257     "A Ditem representing a li, dt, or dd"
 258     def __init__(self, type):
 259         BlockDitem.__init__(self, type)
 260         self.listtype = None
 261         self.order = 0
 262     def minwidth(self):
 263         if self.type == 'dt': return self.maxwidth()  # Don't wrap dt
 264         else: return BlockDitem.minwidth(self)
 265     def propagate_indents(self):
 266         if self.type in ('li', 'ol', 'dd'):
 267             self.indentlevel = self.indentlevel + 1
 268         BlockDitem.propagate_indents(self)
 269     def format(self, width):
 270         global unindent
 271         if self.type == 'li' and self.listtype == 'ol':
 272             unindent = ('%d. ' % (self.order)).ljust(4)
 273         elif self.type == 'li' and self.listtype == 'ul':
 274             unindent = '*   '
 275         return BlockDitem.format(self, width)
 276
 277 class RenderedColumn:
 278     "Width information about a column being rendered"
 279     def __init__(self, minwidth, maxwidth):
 280         self.minwidth = minwidth
 281         self.maxwidth = maxwidth
 282         self.curwidth = maxwidth
 283         self.fixedwidth = 0
 284     def logwidth(self):
 285         if self.maxwidth==0: return 0
 286         else: return math.log(self.maxwidth)
 287     def update(self, minwidth, maxwidth):
 288         "Replaces minwidth/maxwidth if greater"
 289         self.minwidth = minwidth>self.minwidth and minwidth or self.minwidth
 290         self.maxwidth = maxwidth>self.maxwidth and maxwidth or self.maxwidth
 291         self.curwidth = self.maxwidth
 292
 293 class RenderedColumns(UserList.UserList):
 294     "A list of RenderedColumn"
 295     def __init__(self, alist):
 296         self.data = alist
 297     def totalWidth(self):
 298         "Returns total table width"
 299         return reduce(lambda x,y: x+y, [z.curwidth for z in self.data]) \
 300             + len(self.data) + 1
 301     def sumLogWidth(self):
 302         "Returns sum of logwidth for nonfixed columns"
 303         return reduce(lambda x,y: x+y,
 304             [x.logwidth()*(1-x.fixedwidth) for x in self.data])
 305     def distributeWidthDifference(self, width):
 306         "Step 4 of w3m table rendering algorithm"
 307         # Note: The use of math.ceil below is because I'd rather have a
 308         # suboptimal width (a few characters less than requested width) rather
 309         # than go find what to do with rounding.
 310         w = self.totalWidth() - width
 311         assert(w>0)
 312         repeat_distribution = 1
 313         while repeat_distribution:
 314             repeat_distribution = 0
 315             for x in self.data:
 316                 if x.fixedwidth: continue
 317                 if x.curwidth - math.ceil(w*x.logwidth()/self.sumLogWidth()) < \
 318                     x.minwidth:
 319                         x.curwidth = x.minwidth
 320                         x.fixedwidth = 1
 321                         w = self.totalWidth() - width
 322                         repeat_distribution=1
 323                         break
 324         # Now that the we finished finding which columns need to be fixed to
 325         # their minimum width, perform the distribution once again, without
 326         # checking, and actually change remaining column widths
 327         for x in self.data:
 328             if x.fixedwidth: continue
 329             x.curwidth = x.curwidth - math.ceil(w*x.logwidth()/self.sumLogWidth())
 330
 331 def tablehrule(colwidths, rule='-'):
 332     "Returns a horizontal table separator for given column widths"
 333     result = '+'
 334     for x in colwidths:
 335         result = result + rule * x + '+'
 336     return result
 337
 338 class TableDitem(BlockDitem):
 339     def __init__(self, type):
 340         BlockDitem.__init__(self, type)
 341     def format(self, width):
 342         # Uses table rendering algorithm of w3m
 343         # (http://www.w3m.org/story.html), but ignoring width attribute
 344         # Step 1
 345         columns = RenderedColumns([RenderedColumn(x.minwidth(),
 346             max(x.maxwidth(), 1)    # A column can't be smaller than 1 character
 347             ) for x in self.children[0].children])
 348         for x in self.children:
 349             for i in range(len(columns)):
 350                 if (len(x.children)<=i): continue # Skip empty columns
 351                 columns[i].update(x.children[i].minwidth(), x.children[i].maxwidth())
 352         # Step 2 (width attribute) ignored
 353         # Step 3 (already done - list was created with maxwidth)
 354         # Step 4
 355         if columns.totalWidth() > width: columns.distributeWidthDifference(width)
 356         # OK, column widths are now calculated
 357         colwidths = [int(x.curwidth) for x in columns]
 358         result = tablehrule(colwidths) + '\n'
 359         usedheadbodysep = False
 360         for tr in self.children:
 361             result = result + tr.format(colwidths)
 362             rule = '-'
 363             if not usedheadbodysep and tr.children[0].type == 'th' \
 364                                         and tr!=self.children[-1]:
 365                 rule = '='
 366                 usedheadbodysep = True
 367             result = result + tablehrule(colwidths, rule) + '\n'
 368         return result
 369
 370 class TrDitem(BlockDitem):
 371     def __init__(self, type):
 372         BlockDitem.__init__(self, type)
 373     def maxwidth(self):
 374         return reduce(lambda x,y: x+y,
 375             [x.maxwidth() for x in self.children]) + len(self.children) + 1
 376     def minwidth(self):
 377         return reduce(lambda x,y: x+y,
 378             [x.minwidth() for x in self.children]) + len(self.children) + 1
 379     def format(self, colwidths):
 380         columns = []       # List of lists of lines
 381         maxlinecount = 0   # Num of lines in vertically largest column
 382         for i in range(len(colwidths)):
 383             if len(self.children)<=i: lines = [ '' ]
 384             else: lines = self.children[i].format(colwidths[i]).split('\n')
 385             lines = [x + ' ' * (colwidths[i]-len(x)) for x in lines] # Pad to col len
 386             maxlinecount = max(maxlinecount, len(lines))
 387             columns.append(lines)
 388         # Pad vertically
 389         for i in range(len(columns)):
 390             for j in range(maxlinecount-len(columns[i])):
 391                 columns[i].append(' ' * colwidths[i])
 392         result = ''
 393         # Add vertical separators
 394         for i in range(maxlinecount):
 395             result = result + '|'
 396             for j in range(len(columns)):
 397                 result = result + columns[j][i] + '|'
 398             result = result + '\n'
 399         return result
 400
 401 def handleNodeList(nodelist):
 402     "Processes given nodes; merges them if possible; returns ditem list"
 403     ditems = []
 404     curditem = Ditem('')
 405     for node in nodelist:
 406         aditem = handleNode(node)
 407         if curditem.merge(aditem): continue
 408         ditems.append(curditem)
 409         curditem = aditem
 410     if not curditem.empty(): ditems.append(curditem)
 411     return ditems
 412
 413 def handleNode(node):
 414     if node.nodeType == node.TEXT_NODE:
 415         return handleText(node)
 416     elif node.nodeName=='a':
 417         return handleAnchor(node)
 418     elif re.match('h\d', node.nodeName):
 419         return handleHeading(node)
 420     elif node.nodeName=='div' and node.getAttribute('class')=='cit':  # HARDWIRED
 421         return handleBlockQuote(node)
 422     elif node.nodeName in ('body', 'div', 'p', 'td', 'th'):
 423         return handleGenericBlock(node)
 424     elif node.nodeName in ('em', 'i'):
 425         return handleEmphasis(node)
 426     elif node.nodeName in ('strong', 'b'):
 427         return handleStrong(node)
 428     elif node.nodeName in ('ol', 'ul', 'dl'):
 429         return handleList(node)
 430     elif node.nodeName in ('li', 'dd', 'dt'):
 431         return handleListItem(node)
 432     elif node.nodeName in ('table'):
 433         return handleTable(node)
 434     elif node.nodeName in ('tr'):
 435         return handleTr(node)
 436     elif node.nodeName in ('pre'):
 437         return handlePre(node)
 438     elif node.hasChildNodes():
 439         contents = handleNodeList(node.childNodes)
 440         if len(contents) == 1: return contents[0]
 441         if len(contents) == 0: return Ditem('')
 442         result = BlockDitem(node.nodeName)
 443         result.children = contents
 444         return result
 445     return Ditem('')
 446
 447 def processChildren(node):
 448     if node.hasChildNodes():
 449         return handleNodeList(node.childNodes)
 450     else:
 451         return ()
 452
 453 def mergeChildren(node):
 454     contents = processChildren(node)
 455     if len(contents)>1: raise Exception('Unexpected block elements')
 456     if contents: return contents[0]
 457     else: return Ditem('')
 458
 459 def handleText(node):
 460     return Ditem(node.data)
 461
 462 def handleAnchor(node):
 463     result = mergeChildren(node)
 464     result.type = node.nodeName
 465     result.text = result.text.strip()
 466     if result.text == '': return result
 467     target = node.getAttribute('href').strip()
 468     if target=="" or target[0]=='#': return result  # Ignore intrnl links
 469     result.text = re.sub('\s+', ' ', result.text)
 470     key = result.text.lower()
 471     if hyperlinks.has_key(key) and hyperlinks[key]!=target:
 472         # The following try-except is a quick hack to ensure that the
 473         # program will not stop because of problems in the warning
 474         # mechanism. One such specific problem is a UnicodeEncodeError
 475         # when result.text contains difficult characters.
 476         try:
 477             warnings.warn("Ignoring second appearance of anchor '" + result.text +
 478                                                     "' with different target")
 479         except:
 480             pass
 481         return result
 482     hyperlinks[key] = target
 483     result.text = '`'+result.text+'`_'
 484     return result
 485
 486 def handleHeading(node):
 487     contents = mergeChildren(node)
 488     if contents.empty(): return contents
 489     result = HeadingDitem(node.nodeName)
 490     result.children.append(contents)
 491     return result
 492
 493 def handleEmphasis(node):
 494     result = mergeChildren(node)
 495     result.type = node.nodeName
 496     if result.text:
 497         result.text = '*' + result.text + '*'
 498     return result
 499
 500 def handleStrong(node):
 501     result = mergeChildren(node)
 502     result.type = node.nodeName
 503     if result.text:
 504         result.text = '**' + result.text + '**'
 505     return result
 506
 507 def handleGenericBlock(node):
 508     result = BlockDitem(node.nodeName)
 509     result.children = processChildren(node)
 510     return result
 511
 512 def handleBlockQuote(node):
 513     result = BlockQuoteDitem(node.nodeName)
 514     result.children = processChildren(node)
 515     return result
 516
 517 def handleList(node):
 518     result = ListDitem(node.nodeName)
 519     result.children = processChildren(node)
 520     return result
 521
 522 def handleListItem(node):
 523     result = ListItemDitem(node.nodeName)
 524     result.children = processChildren(node)
 525     return result
 526
 527 def handleTable(node):
 528     result = TableDitem(node.nodeName)
 529     # Ignore table contents that are not tr
 530     result.children = [x
 531         for x in processChildren(node) if x.type=='tr']
 532     return result
 533
 534 def handleTr(node):
 535     result = TrDitem(node.nodeName)
 536     # Ignore tr contents that are not th or td
 537     result.children = [x
 538         for x in processChildren(node) if x.type in ('th', 'td')]
 539     return result
 540
 541 def handlePre(node):
 542     return PreDitem(mergeChildren(node).text)
 543
 544 dom1 = xml.dom.minidom.parse(sys.argv[1])
 545 ditem = handleNode(dom1.getElementsByTagName("body")[0])
 546 ditem.propagate_indents()
 547 (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup('utf-8')
 548 outf = utf8_writer(sys.stdout)
 549 outf.write(ditem.format(79) + '\n')
 550 for h in hyperlinks.keys():
 551     outf.write('\n.. _`' + h + '`:\n    ' + hyperlinks[h] + '\n')