objavi/xhtml_utils.py
1 """Various things to do with [x]html that might be useful in more than
2 one place."""
4 import lxml.html, lxml.html.clean
5 from lxml import etree
7 import os
8 import re
10 from urlparse import urlsplit
11 from urllib2 import urlopen, HTTPError
13 from objavi.config import XHTMLNS, XHTML, IMG_CACHE, MARKER_CLASS_SPLIT, MARKER_CLASS_INFO
14 from objavi.book_utils import log

ADJUST_HEADING_WEIGHT = False

OK_TAGS = set([
    "body", "head", "html", "title", "abbr", "acronym", "address",
    "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
    "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
    "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
    "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
    "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
    "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
    "link", "base",
    etree.Comment,
])

XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

IMG_PREFIX = 'static/'

def convert_tags(root, elmap):
    for el in root.iterdescendants():
        if el.tag in elmap:
            el.tag = elmap[el.tag]
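
# Illustrative use (not in the original source): e.g.
#
#   convert_tags(root, {'b': 'strong', 'i': 'em'})
#
# renames every <b> and <i> element below root.  The root element itself
# is not touched, since only descendants are iterated.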

def url_to_filename(url, prefix=''):
    #XXX for TWIKI only
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    fragments = urlsplit(url)
    base, ext = fragments.path.rsplit('.', 1)
    server = fragments.netloc.split('.', 1)[0] #en, fr, translate
    base = base.split('/pub/', 1)[1] #remove /floss/pub/ or /pub/
    base = re.sub(r'[^\w]+', '-', '%s-%s' % (base, server))
    return '%s%s.%s' % (prefix, base, ext)
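
# A worked example (hypothetical URL, for illustration only):
#
#   url_to_filename('http://en.flossmanuals.net/floss/pub/Book/pic.jpg',
#                   prefix='static/')
#
# keeps the 'jpg' extension, takes 'en' from the netloc, strips everything
# up to '/pub/', and squashes runs of non-word characters to '-', giving
# 'static/Book-pic-en.jpg'.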

class ImageCache(object):
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        self._fetched = {}
        self.cache_dir = cache_dir
        self.prefix = prefix
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        f = open(self.cache_dir + path)
        s = f.read()
        f.close()
        return s

    def _save_local_url(self, path, data):
        f = open(self.cache_dir + path, 'w')
        f.write(data)
        f.close()
        #os.chmod(path, 0444)

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            f = urlopen(url)
            data = f.read()
            f.close()
        except HTTPError, e:
            # If it is missing, assume it will be missing every time
            # afterwards; otherwise, you can get into endless waiting.
            self._fetched[url] = None
            log("Wanting '%s', got error %s" % (url, e))
            return None

        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        return target
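
    # Usage sketch (illustrative, not from the original source):
    #
    #   cache = ImageCache()
    #   local = cache.fetch_if_necessary('http://en.example.com/pub/Book/pic.jpg')
    #
    # returns a local path like 'static/Book-pic-en.jpg' on success, the
    # remembered answer on repeat calls, or None if the fetch failed.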

class BaseChapter(object):
    parser = lxml.html.HTMLParser(encoding='utf-8')

    def as_html(self):
        """Serialise the tree as html."""
        return etree.tostring(self.tree, method='html', encoding='utf-8')

    def as_xhtml(self):
        """Convert to xhtml and serialise."""
        try:
            root = self.tree.getroot()
        except AttributeError:
            root = self.tree

        nsmap = {None: XHTML}
        xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            xel.text = el.text
            for k, v in el.items():
                xel.set(k, v)
            for child in el.iterchildren():
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xel.append(xchild)
                xhtml_copy(child, xchild)
            xel.tail = el.tail

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)
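
    # For illustration (a rough sketch, not from the original source):
    # a parsed '<p>hi</p>' chapter serialises to something like
    #
    #   <?xml version="1.0" encoding="UTF-8"?>
    #   <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ...>
    #   <html xmlns="..."><head>...</head><body><p>hi</p></body></html>
    #
    # with every element copied into the XHTML namespace.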

    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      javascript=True,
                                      comments=False,
                                      style=True,
                                      links=True,
                                      meta=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      embedded=True,
                                      frames=True,
                                      forms=True,
                                      annoying_tags=True,
                                      allow_tags=OK_TAGS,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      add_nofollow=False
                                      )

    def remove_bad_tags(self):
        #for e in self.tree.iter():
        #    if not e.tag in OK_TAGS:
        #        log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)
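
    # Illustrative use (not in the original source):
    #
    #   chapter.remove_bad_tags()
    #
    # cleans self.tree in place: scripts, styles, forms, frames and
    # embedded content go away, and tags outside OK_TAGS are dropped
    # while their text is kept (remove_unknown_tags is False, so only
    # the allow_tags list applies).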

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions.  This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and to be
        modified upon complaint."""
        #0. is the first element preceded by text?
        body = self.tree.iter('body').next()
        if len(body) == 0:
            log("BAD STRUCTURE: empty html, adding something")
            etree.SubElement(body, 'span')
        if body.text and body.text.strip():
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())

        #0.5 remove any <link>, <script>, and <style> tags:
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            for e in body.iter(tag):
                log("BAD STRUCTURE: trying to remove '%s' (with tail %s)" %
                    (("%s" % e)[:60], e.tail))
                parent = e.getparent()
                if e.tail:
                    log("rescuing that tail")
                    p = e.getprevious()
                    if p is None:
                        parent.text = (parent.text or "") + e.tail
                    else:
                        p.tail = (p.tail or "") + e.tail
                parent.remove(e)

        #0.75 remove style and dir attributes from all elements:
        #style is usually doing bad things, and perhaps dir is too
        for e in body.iter():
            if e.get('style'):
                del e.attrib['style']
            if e.get('dir') and e.tag not in ('html', 'body'):
                del e.attrib['dir']

        #1. is the first element an h1?
        el1 = body[0]
        if el1.tag == 'div' and len(body) == 1:
            #The body has a containing "content" div; we should compact it.
            log("DODGY STRUCTURE: containing div.")
        if el1.tag != 'h1':
            log("BAD STRUCTURE: first element is %r" % el1.tag)
            if el1.tag in ('h2', 'h3', 'strong', 'b'):
                log("converting %r to 'h1'" % el1.tag)
                el1.tag = 'h1'

        #2. how many <h1>s are there?
        h1s = list(body.iter('h1'))
        if not h1s:
            log("BAD STRUCTURE: no h1! making one up")
            h1 = body.makeelement('h1')
            h1.text = "Somebody Should Set The Title For This Chapter!"
            body.insert(0, h1)
        elif len(h1s) > 1:
            log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])
            for h1 in h1s[1:]:
                h1.tag = 'h2'
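
    # Before/after illustration (hypothetical input, not from the source):
    # a chapter starting '<body><h2 style="color:red">Title</h2><p>...'
    # comes out with the style attribute removed and the leading h2
    # promoted to h1, so booki sees exactly one h1 chapter title.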

    def _loadtree(self, html):
        try:
            try:
                self.tree = lxml.html.document_fromstring(html, parser=self.parser)
            except UnicodeError, e:
                log('failed to parse tree as unicode, got %s %r' % (e, e),
                    'trying again using default parser')
                self.tree = lxml.html.document_fromstring(html)
        except etree.XMLSyntaxError, e:
            log('Could not parse html file %r, string %r... exception %s' %
                (self.name, html[:40], e))
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()

class EpubChapter(BaseChapter):
    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self._loadtree(html)

    def prepare_for_epub(self):
        """Shift all headings down 2 places."""
        if ADJUST_HEADING_WEIGHT:
            # a question to resolve:
            # is it better (quicker) to have multiple, filtered iterations
            # converting in order (h4->h5, h3->h4, etc) or to do a single,
            # unfiltered pass and convert from a dict?
            hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
            hmap['h5'] = 'h6'
            convert_tags(self.tree, hmap)  # self.root was undefined; use the parsed tree
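
    # Worked illustration (not in the original source): hmap comes out as
    #
    #   {'h1': 'h3', 'h2': 'h4', 'h3': 'h5', 'h4': 'h6', 'h5': 'h6'}
    #
    # so every heading drops two levels, saturating at h6.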


###################################################

class Section(object):
    def __init__(self, tree, ID=None, title=None):
        self.ID = ID
        self.tree = tree
        self.title = title

    def __str__(self):
        return '<Section id: %r title: %r>' % (self.ID, self.title)
    __repr__ = __str__

def split_tree(tree):
    """If a document has special marker elements (hr tags with a class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries.  Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), each of which bundles the new tree with an ID and title if
    the marker elements contain those attributes.

    The original tree will be destroyed or reused.
    """
    try:
        root = tree.getroot()
    except AttributeError:
        root = tree

    # Find the node lineages along which to split the document.
    # Anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    stacks = []
    for hr in root.iter(tag='hr'):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            stack = [hr]
            stack.extend(x for x in hr.iterancestors())
            stack.reverse()
            stacks.append(stack)
        elif klass == MARKER_CLASS_INFO:
            hr.getparent().remove(hr)

    iterstacks = iter(stacks)

    src = root
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    doc = dest
    stack = iterstacks.next()
    marker = stack[-1]

    chapters = []
    ID = 'unidentified-front-matter'
    title = None
    try:
        while True:
            for e in src:
                if e not in stack:
                    #cut and paste branch
                    dest.append(e)
                elif e is marker:
                    #got one.
                    chapters.append(Section(doc, ID, title))
                    #The ID and title are for the *next* section, so
                    #collect them before deleting the marker.
                    ID = e.get('id')
                    title = e.get('title')
                    src.remove(e)
                    src = root
                    dest = lxml.html.Element(root.tag, **dict(root.items()))
                    doc = dest
                    stack = iterstacks.next()
                    marker = stack[-1]
                    break
                else:
                    #next level.
                    #It is safe to descend without leaving a trail,
                    #because side branches are not descended.
                    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
                    dest.text = e.text
                    e.text = None
                    src = e
                    break
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(Section(src, ID, title))
    return chapters
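
# Usage sketch (hypothetical input, for illustration only):
#
#   marker = '<hr class="%s" id="chapter-2" title="Second" />' % MARKER_CLASS_SPLIT
#   tree = lxml.html.document_fromstring(
#       '<html><body><h1>One</h1><p>a</p>' + marker +
#       '<h1>Two</h1><p>b</p></body></html>')
#   for section in split_tree(tree):
#       print section
#
# would print two Sections: the first with the default
# 'unidentified-front-matter' ID and no title, the second with ID
# 'chapter-2' and title 'Second', each wrapping its own tree.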