# objavi/xhtml_utils.py
# commit a3e4827a37b51630c9bf58ced08cd39b7e1f58de
# "More aggressive cleaning of bad files"
1 """Various things to do with [x]html that might be useful in more than
2 one place."""
4 import lxml.html, lxml.html.clean
5 from lxml import etree
7 import os
8 import re
10 from urlparse import urlsplit
11 from urllib2 import urlopen, HTTPError
13 from objavi.config import XHTMLNS, XHTML, IMG_CACHE, MARKER_CLASS_SPLIT, MARKER_CLASS_INFO
14 from objavi.book_utils import log
# When enabled, headings are shifted down two levels for epub output
# (see EpubChapter.prepare_for_epub).
ADJUST_HEADING_WEIGHT = False

# Whitelist of tags allowed to survive cleaning (used as allow_tags for
# BaseChapter.cleaner).  etree.Comment is lxml's marker for comment nodes.
OK_TAGS = set([
    "body", "head", "html", "title", "abbr", "acronym", "address",
    "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
    "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
    "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
    "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
    "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
    "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
    "link", "base",
    etree.Comment,
])

# Prologue strings for serialising xhtml documents.
XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Cache-relative directory prefix under which fetched images are stored.
IMG_PREFIX = 'static/'
def convert_tags(root, elmap):
    """Rename every descendant of root whose tag appears as a key in
    the elmap mapping, replacing it with the mapped tag name.  Tags not
    mentioned in elmap are left untouched; root itself is not renamed."""
    for node in root.iterdescendants():
        try:
            node.tag = elmap[node.tag]
        except KeyError:
            pass
def url_to_filename(url, prefix=''):
    """Flatten a wiki image url into a single cache-friendly filename.

    The path below '/pub/' and the server subdomain are joined and every
    run of non-word characters is collapsed to '-'; the original file
    extension is kept.  Raises if the url has no '.' or no '/pub/'.
    """
    #XXX for TWIKI only
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    parts = urlsplit(url)
    stem, ext = parts.path.rsplit('.', 1)
    subdomain = parts.netloc.split('.', 1)[0]  #en, fr, translate
    stem = stem.split('/pub/', 1)[1]  #remove /floss/pub/ or /pub/
    slug = re.sub(r'[^\w]+', '-', stem + '-' + subdomain)
    return prefix + slug + '.' + ext
class ImageCache(object):
    """Download remote images and cache them on the local filesystem.

    Files live under ``cache_dir + prefix``; paths are joined by plain
    concatenation throughout, so cache_dir is expected to end with a
    path separator.
    """
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        # url -> cache-relative filename, or None for known-missing urls
        self._fetched = {}
        self.cache_dir = cache_dir
        self.prefix = prefix
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        """Return the contents of the cached file at a cache-relative path."""
        # with-statement closes the file even if read() raises
        with open(self.cache_dir + path) as f:
            return f.read()

    def _save_local_url(self, path, data):
        """Write data to the cache-relative path."""
        with open(self.cache_dir + path, 'w') as f:
            f.write(data)
        #os.chmod(path, 0444)

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        """Fetch url into the cache unless it is already there.

        Returns the cache-relative filename, or None if the url gave an
        HTTP error.  Results (including failures) are memoised for the
        lifetime of this object.
        """
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            f = urlopen(url)
            try:
                data = f.read()
            finally:
                # close the connection even when read() raises
                f.close()
        except HTTPError as e:
            # if it is missing, assume it will be missing every time
            # after, otherwise, you can get into endless waiting
            self._fetched[url] = None
            log("Wanting '%s', got error %s" %(url, e))
            return None

        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        return target
class BaseChapter(object):
    """Common chapter handling: parse html into an lxml tree, clean it,
    massage its structure toward booki conventions, and serialise it as
    html or xhtml."""
    parser = lxml.html.HTMLParser(encoding='utf-8')

    def as_html(self):
        """Serialise the tree as html."""
        return etree.tostring(self.tree, method='html', encoding='utf-8')

    def as_xhtml(self):
        """Convert to xhtml and serialise.

        Builds a parallel tree with every element shifted into the
        xhtml namespace, then returns it with XML declaration and
        XHTML 1.1 doctype prepended.
        """
        try:
            root = self.tree.getroot()
        except AttributeError:
            # self.tree is already an element, not an ElementTree
            root = self.tree

        nsmap = {None: XHTML}
        xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            # recursively copy el's text, attributes and children into
            # xel, prefixing each child tag with the xhtml namespace
            xel.text = el.text
            for k, v in el.items():
                xel.set(k, v)
            for child in el.iterchildren():
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xel.append(xchild)
                xhtml_copy(child, xchild)
            xel.tail = el.tail

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)

    # Shared cleaner: strips scripts, styles, forms, frames etc., and
    # anything not in OK_TAGS; keeps comments and page structure.
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      javascript=True,
                                      comments=False,
                                      style=True,
                                      links=True,
                                      meta=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      embedded=True,
                                      frames=True,
                                      forms=True,
                                      annoying_tags=True,
                                      allow_tags=OK_TAGS,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      add_nofollow=False
                                      )

    def remove_bad_tags(self):
        """Run the class cleaner over the whole tree in place."""
        #for e in self.tree.iter():
        #    if not e.tag in OK_TAGS:
        #        log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions. This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and be
        modified upon complaint."""
        #0. is the first element preceded by text?
        body = self.tree.iter('body').next()
        # body.text is None when body starts directly with an element
        if body.text and body.text.strip():
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())

        #0.5 Remove any <link>, <script>, and <style> tags
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            # snapshot with list(): removing elements from a live
            # iterator can make lxml skip siblings
            for e in list(body.iter(tag)):
                log("BAD STRUCTURE: trying remove %r (with tail %r)" % (e, e.tail))
                parent = e.getparent()
                if e.tail:
                    # reattach trailing text to the previous sibling,
                    # or to the parent when there is no sibling
                    log("rescuing that tail")
                    p = e.getprevious()
                    if p is None:
                        parent.text = (parent.text or "") + e.tail
                    else:
                        p.tail = (p.tail or "") + e.tail
                parent.remove(e)

        #0.75 Remove style attribute from all elements!
        for e in body.iter():
            if e.get('style'):
                del e.attrib['style']

        # 1. is the first element an h1?
        if len(body):  # an empty body would make body[0] raise IndexError
            el1 = body[0]
            if el1.tag != 'h1':
                log("BAD STRUCTURE: firstelement is %r " % el1.tag)
                if el1.tag in ('h2', 'h3', 'strong', 'b'):
                    log("converting %r to 'h1'" % el1.tag)
                    el1.tag = 'h1'

        #2. how many <h1>s are there?
        h1s = body.findall('h1')
        if not h1s:
            log("BAD STRUCTURE: no h1! making one up")
            h1 = body.makeelement('h1')
            h1.text = "Somebody Should Set The Title For This Chapter!"
            body.insert(0, h1)
        elif len(h1s) > 1:
            # only the first h1 survives; the rest are demoted
            log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])
            for h1 in h1s[1:]:
                h1.tag = 'h2'

    def _loadtree(self, html):
        """Parse html into self.tree, retrying with the default parser
        on unicode trouble and falling back to an empty document when
        parsing fails outright."""
        try:
            try:
                self.tree = lxml.html.document_fromstring(html, parser=self.parser)
            except UnicodeError as e:
                log('failed to parse tree as unicode, got %s %r' % (e, e),
                    'trying again using default parser')
                self.tree = lxml.html.document_fromstring(html)
        except etree.XMLSyntaxError as e:
            log('Could not parse html file %r, string %r... exception %s' %
                (self.name, html[:40], e))
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
class EpubChapter(BaseChapter):
    """A chapter of a book, destined for epub output."""
    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        # use_cache and cache_dir are accepted for interface
        # compatibility but are not currently used here.
        self.server = server
        self.book = book
        self.name = chapter_name
        self._loadtree(html)

    def prepare_for_epub(self):
        """Shift all headings down 2 places (no-op unless
        ADJUST_HEADING_WEIGHT is enabled)."""
        if ADJUST_HEADING_WEIGHT:
            # a question to resolve:
            # is it better (quicker) to have multiple, filtered iterations
            # converting in order (h4->h5, h3->h4, etc) or to do a single,
            # unfiltered pass and convert from a dict?

            # h1->h3 ... h4->h6; h5 can only drop one level, to h6.
            hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
            hmap['h5'] = 'h6'
            # BUG FIX: was convert_tags(self.root, hmap) -- no code ever
            # sets self.root; BaseChapter._loadtree stores the parsed
            # document as self.tree.
            convert_tags(self.tree, hmap)
248 ###################################################
class Section(object):
    """A fragment produced by split_tree: the subtree of one section,
    together with the ID and title taken from the marker element that
    introduced it (both may be None)."""

    def __init__(self, tree, ID=None, title=None):
        self.tree = tree
        self.ID = ID
        self.title = title

    def __str__(self):
        return '<Section id: {0!r} title: {1!r}>'.format(self.ID, self.title)

    __repr__ = __str__
def split_tree(tree):
    """If a document has special marker elements (hr tags with class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries. Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), which bundles the new tree with an ID and title if the
    marker elements contain those attributes.

    The original tree will be destroyed or reused.
    """
    try:
        root = tree.getroot()
    except AttributeError:
        # tree is already an element rather than an ElementTree
        root = tree

    # find the node lineages along which to split the document.
    # anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    stacks = []
    for hr in root.iter(tag='hr'):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            # stack becomes the ancestor chain from root down to the
            # marker itself (reversed so root comes first).
            stack = [hr]
            stack.extend(x for x in hr.iterancestors())
            stack.reverse()
            stacks.append(stack)
        elif klass == MARKER_CLASS_INFO:
            # metadata-only markers are simply dropped
            hr.getparent().remove(hr)

    iterstacks = iter(stacks)

    # src walks down the original tree; dest mirrors it in the document
    # currently being built; doc is the root of that document.
    src = root
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    doc = dest
    stack = iterstacks.next()
    marker = stack[-1]

    chapters = []
    ID = 'unidentified-front-matter'
    title = None
    try:
        while True:
            for e in src:
                if e not in stack:
                    #cut and paste branch
                    # NOTE(review): append() moves e out of src while
                    # src is mid-iteration; lxml appears to tolerate
                    # this pattern here -- confirm before refactoring.
                    dest.append(e)
                elif e is marker:
                    #got one.
                    chapters.append(Section(doc, ID, title))
                    #The ID and title are for the *next* section, so
                    #collect them before deleting the marker.
                    ID = e.get('id')
                    title = e.get('title')
                    src.remove(e)
                    # restart from the top with a fresh destination
                    # document and the next marker lineage.
                    src = root
                    dest = lxml.html.Element(root.tag, **dict(root.items()))
                    doc = dest
                    stack = iterstacks.next()
                    marker = stack[-1]
                    break
                else:
                    #next level.
                    #It is safe to descend without leaving a trail,
                    #because side branches are not descended.
                    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
                    dest.text = e.text
                    e.text = None
                    src = e
                    break
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(Section(src, ID, title))
    return chapters