objavi/xhtml_utils.py

   1 """Various things to do with [x]html that might be useful in more than
   2 one place."""
   3
   4 import lxml.html, lxml.html.clean
   5 from lxml import etree
   6
   7 import os
   8 import re
   9
  10 from urlparse import urlsplit
  11 from urllib2 import urlopen, HTTPError
  12
  13 from objavi.config import XHTMLNS, XHTML, IMG_CACHE, MARKER_CLASS_SPLIT, MARKER_CLASS_INFO
  14 from objavi.book_utils import log
  15
# Switch read by EpubChapter.prepare_for_epub(): the heading tags are
# only remapped when this is true.
ADJUST_HEADING_WEIGHT = False

# Tags passed as allow_tags to the lxml Cleaner built in BaseChapter.
# etree.Comment is included alongside the tag names, so comment nodes
# are on the allowed list as well.
OK_TAGS = set([
    "body", "head", "html", "title", "abbr", "acronym", "address",
    "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
    "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
    "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
    "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
    "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
    "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
    "link", "base",
    etree.Comment,
    ])


# Doctype and XML declaration that BaseChapter.as_xhtml() puts in
# front of the serialised tree (declaration first, then doctype).
XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Default prefix for ImageCache: appended to the cache directory and
# prepended to the filenames generated for fetched images.
IMG_PREFIX = 'static/'
  37
  38 def convert_tags(root, elmap):
  39     for el in root.iterdescendants():
  40         if el.tag in elmap:
  41             el.tag = elmap[el.tag]
  42
  43
  44 def url_to_filename(url, prefix=''):
  45     #XXX for TWIKI only
  46     #XXX slightly inefficient to do urlsplit so many times, but versatile
  47     fragments = urlsplit(url)
  48     base, ext = fragments.path.rsplit('.', 1)
  49     server = fragments.netloc.split('.', 1)[0] #en, fr, translate
  50     base = base.split('/pub/', 1)[1] #remove /floss/pub/ or /pub/
  51     base = re.sub(r'[^\w]+', '-',  '%s-%s' %(base, server))
  52     return '%s%s.%s' % (prefix, base, ext)
  53
  54
  55 class ImageCache(object):
  56     def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
  57         self._fetched = {}
  58         self.cache_dir = cache_dir
  59         self.prefix = prefix
  60         if not os.path.exists(cache_dir + prefix):
  61             os.makedirs(cache_dir + prefix)
  62
  63     def read_local_url(self, path):
  64         f = open(self.cache_dir + path)
  65         s = f.read()
  66         f.close()
  67         return s
  68
  69     def _save_local_url(self, path, data):
  70         f = open(self.cache_dir + path, 'w')
  71         f.write(data)
  72         f.close()
  73         #os.chmod(path, 0444)
  74
  75     def fetch_if_necessary(self, url, target=None, use_cache=True):
  76         if url in self._fetched:
  77             return self._fetched[url]
  78
  79         if target is None:
  80             target = url_to_filename(url, self.prefix)
  81
  82         if use_cache and os.path.exists(self.cache_dir + target):
  83             log("used cache for %s" % target)
  84             return target
  85
  86         try:
  87             f = urlopen(url)
  88             data = f.read()
  89             f.close()
  90         except HTTPError, e:
  91             # if it is missing, assume it will be missing every time
  92             # after, otherwise, you can get into endless waiting
  93             self._fetched[url] = None
  94             log("Wanting '%s', got error %s" %(url, e))
  95             return None
  96
  97         self._save_local_url(target, data)
  98         self._fetched[url] = target
  99         log("got %s as %s" % (url, target))
 100         return target
 101
 102
 103 class BaseChapter(object):
 104     parser = lxml.html.HTMLParser(encoding='utf-8')
 105     def as_html(self):
 106         """Serialise the tree as html."""
 107         return etree.tostring(self.tree, method='html', encoding='utf-8')
 108
 109     def as_xhtml(self):
 110         """Convert to xhtml and serialise."""
 111         try:
 112             root = self.tree.getroot()
 113         except AttributeError:
 114             root = self.tree
 115
 116         nsmap = {None: XHTML}
 117         xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)
 118
 119         def xhtml_copy(el, xel):
 120             xel.text = el.text
 121             for k, v in el.items():
 122                 xel.set(k, v)
 123             for child in el.iterchildren():
 124                 xchild = xel.makeelement(XHTMLNS + child.tag)
 125                 xel.append(xchild)
 126                 xhtml_copy(child, xchild)
 127             xel.tail = el.tail
 128
 129         xhtml_copy(root, xroot)
 130
 131         return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)
 132
 133     cleaner = lxml.html.clean.Cleaner(scripts=True,
 134                                       javascript=True,
 135                                       comments=False,
 136                                       style=True,
 137                                       links=True,
 138                                       meta=True,
 139                                       page_structure=False,
 140                                       processing_instructions=True,
 141                                       embedded=True,
 142                                       frames=True,
 143                                       forms=True,
 144                                       annoying_tags=True,
 145                                       allow_tags=OK_TAGS,
 146                                       remove_unknown_tags=False,
 147                                       safe_attrs_only=True,
 148                                       add_nofollow=False
 149                                       )
 150
 151     def remove_bad_tags(self):
 152         #for e in self.tree.iter():
 153         #    if not e.tag in OK_TAGS:
 154         #        log('found bad tag %s' % e.tag)
 155         self.cleaner(self.tree)
 156
 157
 158     def fix_bad_structure(self):
 159         """Attempt to match booki chapter conventions.  This doesn't
 160         care about xhtml correctness, just booki correctness.
 161
 162         This function's philosophy is to be aggressive, and be
 163         modified upon complaint."""
 164         #0. is the first element preceded by text?
 165         body = self.tree.iter('body').next()
 166         if body.text.strip():
 167             log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())
 168
 169         #0.5 Remove any <link>, <script>, and <style> tags
 170         #they are at best spurious
 171         for tag in ['link', 'style', 'script', etree.Comment]:
 172             for e in body.iter(tag):
 173                 log("BAD STRUCTURE: trying to remove '%s' (with tail %s)" %
 174                     (("%s" % e)[:60], e.tail))
 175                 parent = e.getparent()
 176                 if e.tail:
 177                     log("rescuing that tail")
 178                     p = e.getprevious()
 179                     if p is None:
 180                         parent.text = (parent.text or "") + e.tail
 181                     else:
 182                         p.tail = (p.tail or "") + e.tail
 183                 parent.remove(e)
 184
 185         #0.75 Remove style attribute from all elements!
 186         for e in body.iter():
 187             if e.get('style'):
 188                 del e.attrib['style']
 189
 190         # 1. is the first element an h1?
 191         el1 = body[0]
 192         if el1.tag == 'div' and len(body) == 1:
 193             #The body has a "content" div. we should compact it.
 194             log("DODGY STRUCTURE: containing div. ")
 195         if el1.tag != 'h1':
 196             log("BAD STRUCTURE: firstelement is %r " % el1.tag)
 197             if el1.tag in ('h2', 'h3', 'strong', 'b'):
 198                 log("converting %r to 'h1'" % el1.tag)
 199                 el1.tag = 'h1'
 200
 201         #2. how many <h1>s are there?
 202         h1s = list(body.iter('h1'))
 203         if not h1s:
 204             log("BAD STRUCTURE: no h1! making one up")
 205             h1 = body.makeelement('h1')
 206             h1.text = "Somebody Should Set The Title For This Chapter!"
 207             body.insert(0, h1)
 208         elif len(h1s) > 1:
 209             log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])
 210             for h1 in h1s[1:]:
 211                 h1.tag = 'h2'
 212
 213
 214     def _loadtree(self, html):
 215         try:
 216             try:
 217                 self.tree = lxml.html.document_fromstring(html, parser=self.parser)
 218             except UnicodeError, e:
 219                 log('failed to parse tree as unicode, got %s %r' % (e, e),
 220                     'trying again using default parser')
 221                 self.tree = lxml.html.document_fromstring(html)
 222         except etree.XMLSyntaxError, e:
 223             log('Could not parse html file %r, string %r... exception %s' %
 224                 (self.name, html[:40], e))
 225             self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
 226
 227
 228 class EpubChapter(BaseChapter):
 229     def __init__(self, server, book, chapter_name, html, use_cache=False,
 230                  cache_dir=None):
 231         self.server = server
 232         self.book = book
 233         self.name = chapter_name
 234         self._loadtree(html)
 235
 236     def prepare_for_epub(self):
 237         """Shift all headings down 2 places."""
 238         if ADJUST_HEADING_WEIGHT:
 239             # a question to resolve:
 240             # is it better (quicker) to have multiple, filtered iterations
 241             # converting in order (h4->h5, h3->h4, etc) or to do a single,
 242             # unfiltered pass and convert from a dict?
 243
 244             hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
 245             hmap['h5'] = 'h6'
 246             convert_tags(self.root, hmap)
 247
 248
 249
 250
 251 ###################################################
 252
 253
 254 class Section(object):
 255     def __init__(self, tree, ID=None, title=None):
 256         self.ID = ID
 257         self.tree = tree
 258         self.title = title
 259
 260     def __str__(self):
 261         return '<Section id: %r title: %r>' % (self.ID, self.title)
 262     __repr__ = __str__
 263
 264 def split_tree(tree):
 265     """If a document has special marker elements (hr tags with class
 266     of config.MARKER_CLASS_SPLIT) it will be broken into smaller
 267     documents using the markers as boundaries.  Each element in the
 268     new documents will be nested and ordered as before, though those
 269     on the new edges will obviously lack siblings they once may have
 270     had.
 271
 272     The new documents are returned as a list of Section objects (see
 273     above), which bundles the new tree with an ID and title if the
 274     marker elements contain those attributes.
 275
 276     The original tree will be destroyed or reused.
 277     """
 278     try:
 279         root = tree.getroot()
 280     except AttributeError:
 281         root = tree
 282
 283     # find the node lineages along which to split the document.
 284     # anything outside these lines (i.e., side branches) can be copied
 285     # wholesale, which speeds things up considerably.
 286     stacks = []
 287     for hr in root.iter(tag='hr'):
 288         klass = hr.get('class')
 289         if klass == MARKER_CLASS_SPLIT:
 290             stack = [hr]
 291             stack.extend(x for x in hr.iterancestors())
 292             stack.reverse()
 293             stacks.append(stack)
 294         elif klass == MARKER_CLASS_INFO:
 295             hr.getparent().remove(hr)
 296
 297     iterstacks = iter(stacks)
 298
 299     src = root
 300     dest = lxml.html.Element(root.tag, **dict(root.items()))
 301     doc = dest
 302     stack = iterstacks.next()
 303     marker = stack[-1]
 304
 305     chapters = []
 306     ID = 'unidentified-front-matter'
 307     title = None
 308     try:
 309         while True:
 310             for e in src:
 311                 if e not in stack:
 312                     #cut and paste branch
 313                     dest.append(e)
 314                 elif e is marker:
 315                     #got one.
 316                     chapters.append(Section(doc, ID, title))
 317                     #The ID and title are for the *next* section, so
 318                     #collect them before deleting the marker.
 319                     ID = e.get('id')
 320                     title = e.get('title')
 321                     src.remove(e)
 322                     src = root
 323                     dest = lxml.html.Element(root.tag, **dict(root.items()))
 324                     doc = dest
 325                     stack = iterstacks.next()
 326                     marker = stack[-1]
 327                     break
 328                 else:
 329                     #next level.
 330                     #It is safe to descend without leaving a trail,
 331                     #because side branches are not descended.
 332                     dest = etree.SubElement(dest, e.tag, **dict(e.items()))
 333                     dest.text = e.text
 334                     e.text = None
 335                     src = e
 336                     break
 337     except StopIteration:
 338         #stacks have run out -- the rest of the tree is the last section
 339         chapters.append(Section(src, ID, title))
 340     return chapters
 341
 342