1 """Various things to do with [x]html that might be useful in more than
2 one place."""
4 import lxml.html, lxml.html.clean
5 from lxml import etree
7 import os
8 import re
10 from urlparse import urlsplit
11 from urllib2 import urlopen, HTTPError
14 from objavi.book_utils import log
18 OK_TAGS = set([
19 "body", "head", "html", "title", "abbr", "acronym", "address",
20 "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
21 "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
22 "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
23 "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
24 "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
25 "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
26 "link", "base",
27 etree.Comment,
32 "">
33 '''
# XML declaration placed at the very start of the string returned by
# BaseChapter.as_xhtml().
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Default value of the ``prefix`` argument of ImageCache.
IMG_PREFIX = 'static/'
def convert_tags(root, elmap):
    """Rename the descendants of *root* according to *elmap*.

    Each descendant whose tag is a key of *elmap* gets the mapped value
    as its new tag.  The change is made in place in a single pass;
    *root* itself is not among the elements visited and nothing is
    returned.
    """
    for node in root.iterdescendants():
        if node.tag not in elmap:
            continue
        node.tag = elmap[node.tag]
def url_to_filename(url, prefix=''):
    """Turn a TWiki attachment url into a flat local file name.

    The part of the url path after the first '/pub/' (without its
    extension) is combined with the first label of the host name, every
    run of non-word characters is collapsed to '-', and the result is
    returned as ``prefix + name + '.' + extension``.

    Raises ValueError if the path contains no '.', and IndexError if it
    contains no '/pub/'.
    """
    #XXX for TWIKI only
    parts = urlsplit(url)
    stem, extension = parts.path.rsplit('.', 1)
    # first host label, e.g. en, fr, translate
    host_label = parts.netloc.split('.', 1)[0]
    # drop everything up to and including /floss/pub/ or /pub/
    stem = stem.split('/pub/', 1)[1]
    slug = re.sub(r'[^\w]+', '-', '%s-%s' % (stem, host_label))
    return '%s%s.%s' % (prefix, slug, extension)
class ImageCache(object):
    """Fetch remote files once and keep them in a local directory.

    Files are stored at ``cache_dir + target``, where *target* is a
    relative path that normally starts with ``prefix``.  Paths are built
    by plain string concatenation, so ``cache_dir`` presumably ends with
    a path separator.
    """

    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        """Remember the cache location and create the directory
        ``cache_dir + prefix`` if it does not exist yet."""
        # url -> target path, or None for a url whose download failed.
        self._fetched = {}
        self.cache_dir = cache_dir
        self.prefix = prefix
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        """Return the contents of the cached file *path* (relative to
        the cache directory)."""
        # Binary mode: the cache holds image data, which must not go
        # through newline translation.
        f = open(self.cache_dir + path, 'rb')
        try:
            return f.read()
        finally:
            f.close()

    def _save_local_url(self, path, data):
        """Write *data* to *path* (relative to the cache directory),
        replacing any existing file."""
        f = open(self.cache_dir + path, 'wb')
        try:
            f.write(data)
        finally:
            f.close()

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        """Make sure *url* is available locally and return its path.

        *target* is the path relative to the cache directory; when
        omitted it is derived with url_to_filename() and this cache's
        prefix.  With *use_cache* true, a file already present on disk
        is reused without touching the network.

        Returns the target path, or None if the download failed with an
        HTTPError.  Either result is remembered for the lifetime of this
        object, so a url is requested at most once.  Other exceptions
        raised while fetching or saving are not handled here.
        """
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            f = urlopen(url)
            try:
                data = f.read()
            finally:
                f.close()
        except HTTPError as e:
            # if it is missing, assume it will be missing every time
            # after, otherwise, you can get into endless waiting
            self._fetched[url] = None
            log("Wanting '%s', got error %s" %(url, e))
            return None

        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        return target
class BaseChapter(object):
    """Parsing, cleaning and serialising shared by chapter classes.

    The methods operate on ``self.tree``, which _loadtree() sets to
    either an lxml element or an lxml element tree.
    """
    # One parser instance shared by every chapter; it is configured to
    # treat the input as UTF-8.
    parser = lxml.html.HTMLParser(encoding='utf-8')

    def as_html(self):
        """Serialise the tree as html."""
        return etree.tostring(self.tree, method='html', encoding='utf-8')

    def as_xhtml(self):
        """Convert to xhtml and serialise.

        The tree is copied element by element into the XHTML namespace
        and returned with the xml declaration and XHTML 1.1 doctype
        prepended.  self.tree itself is not modified.
        """
        # self.tree may be an element tree or a bare element.
        try:
            root = self.tree.getroot()
        except AttributeError:
            root = self.tree

        nsmap = {None: XHTML}
        xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            """Recursively copy text, attributes, children and tail of
            *el* onto the namespaced element *xel*."""
            xel.text = el.text
            for k, v in el.items():
                xel.set(k, v)
            for child in el.iterchildren():
                # NOTE(review): comment and processing-instruction nodes
                # have a non-string tag, so this concatenation would
                # presumably fail for them -- confirm whether such nodes
                # can still be in the tree at this point.
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xel.append(xchild)
                xhtml_copy(child, xchild)
            xel.tail = el.tail

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)

    # Cleaner applied by remove_bad_tags(); shared by all instances.
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      javascript=True,
                                      comments=False,
                                      style=True,
                                      links=True,
                                      meta=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      embedded=True,
                                      frames=True,
                                      forms=True,
                                      annoying_tags=True,
                                      allow_tags=OK_TAGS,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      add_nofollow=False
                                      )

    def remove_bad_tags(self):
        """Run the class-level cleaner over self.tree, in place."""
        self.cleaner(self.tree)

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions.  This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and be
        modified upon complaint.

        The tree is modified in place: <link>, <style>, <script> and
        comment nodes are removed from the body (their tail text is
        kept), style attributes are dropped, a leading h2/h3/strong/b
        is promoted to h1, and the body ends up with at least one h1
        while any h1 after the first becomes an h2.
        """
        #0. is the first element preceded by text?
        body = next(self.tree.iter('body'))
        # body.text is None when the body starts directly with a tag.
        leading_text = (body.text or '').strip()
        if leading_text:
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % leading_text)

        #0.5 Remove any <link>, <script>, and <style> tags
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            # Iterate over a snapshot: nodes are removed inside the
            # loop, and mutating the tree under a live iterator is
            # fragile.
            for e in list(body.iter(tag)):
                log("BAD STRUCTURE: trying to remove '%s' (with tail %s)" %
                    (("%s" % e)[:60], e.tail))
                parent = e.getparent()
                if e.tail:
                    # remove() discards the tail along with the
                    # element, so reattach it to whatever precedes it.
                    log("rescuing that tail")
                    p = e.getprevious()
                    if p is None:
                        parent.text = (parent.text or "") + e.tail
                    else:
                        p.tail = (p.tail or "") + e.tail
                parent.remove(e)

        #0.75 Remove style attribute from all elements!
        for e in body.iter():
            # "is not None" so that an empty style="" is removed too.
            if e.get('style') is not None:
                del e.attrib['style']

        # 1. is the first element an h1?
        # An empty body (e.g. the fallback document made by _loadtree)
        # has no first element; step 2 will give it a title.
        if len(body):
            el1 = body[0]
            if el1.tag == 'div' and len(body) == 1:
                #The body has a "content" div. we should compact it.
                log("DODGY STRUCTURE: containing div. ")
            if el1.tag != 'h1':
                log("BAD STRUCTURE: firstelement is %r " % el1.tag)
                if el1.tag in ('h2', 'h3', 'strong', 'b'):
                    log("converting %r to 'h1'" % el1.tag)
                    el1.tag = 'h1'

        #2. how many <h1>s are there?
        h1s = list(body.iter('h1'))
        if not h1s:
            log("BAD STRUCTURE: no h1! making one up")
            h1 = body.makeelement('h1')
            h1.text = "Somebody Should Set The Title For This Chapter!"
            body.insert(0, h1)
        elif len(h1s) > 1:
            log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])
            for h1 in h1s[1:]:
                h1.tag = 'h2'

    def _loadtree(self, html):
        """Parse *html* and store the result in self.tree.

        The class-level utf-8 parser is tried first; on a UnicodeError
        the lxml default parser is used instead.  If the document cannot
        be parsed at all, the problem is logged and self.tree becomes an
        empty html document.
        """
        try:
            try:
                self.tree = lxml.html.document_fromstring(html, parser=self.parser)
            except UnicodeError as e:
                log('failed to parse tree as unicode, got %s %r' % (e, e),
                    'trying again using default parser')
                self.tree = lxml.html.document_fromstring(html)
        except (etree.XMLSyntaxError, etree.ParserError) as e:
            # ParserError is what lxml.html raises for an empty
            # document; treat it like any other unparseable input.
            # The chapter name is only for the log message and may not
            # be set on every subclass.
            log('Could not parse html file %r, string %r... exception %s' %
                (getattr(self, 'name', None), html[:40], e))
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
class EpubChapter(BaseChapter):
    """A chapter parsed from html, with epub-specific preparation."""

    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        """Store the identifying details and parse *html* into
        self.tree.

        *use_cache* and *cache_dir* are accepted but not used by this
        constructor.
        """
        self.server = server
        self.book = book
        self.name = chapter_name
        self._loadtree(html)

    def prepare_for_epub(self):
        """Shift all headings down 2 places.

        h1-h4 become h3-h6.  There is nothing below h6, so h5 is only
        moved down one place and h6 is left alone.
        """
        # A single pass with a lookup table, so that an element that
        # has just been renamed is not renamed a second time.
        hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
        hmap['h5'] = 'h6'

        # Nothing in this class sets self.root; honour it if something
        # else has, otherwise work on the root of the parsed tree.
        root = getattr(self, 'root', None)
        if root is None:
            try:
                root = self.tree.getroot()
            except AttributeError:
                root = self.tree
        convert_tags(root, hmap)
251 ###################################################
class Section(object):
    """A document fragment bundled with an optional ID and title."""

    def __init__(self, tree, ID=None, title=None):
        self.tree = tree
        self.ID = ID
        self.title = title

    def __repr__(self):
        """Short description showing the ID and title; the tree is not
        included."""
        return '<Section id: %r title: %r>' % (self.ID, self.title)

    # str() and repr() give the same text.
    __str__ = __repr__
def split_tree(tree):
    """If a document has special marker elements (hr tags with class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries.  Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), which bundles the new tree with an ID and title if the
    marker elements contain those attributes.  Content before the
    first marker gets the ID 'unidentified-front-matter'.  A document
    without any split marker is returned as a single such section.

    hr tags with class MARKER_CLASS_INFO are removed from the tree.

    The original tree will be destroyed or reused.
    """
    try:
        root = tree.getroot()
    except AttributeError:
        root = tree

    # find the node lineages along which to split the document.
    # anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    stacks = []
    # Snapshot the hr elements first, because info markers are removed
    # from the tree inside the loop.
    for hr in list(root.iter(tag='hr')):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            # lineage from the outermost ancestor down to the marker
            stack = [hr]
            stack.extend(x for x in hr.iterancestors())
            stack.reverse()
            stacks.append(stack)
        elif klass == MARKER_CLASS_INFO:
            hr.getparent().remove(hr)

    chapters = []
    ID = 'unidentified-front-matter'
    title = None

    if not stacks:
        # No split markers: the whole document is one section.  Without
        # this guard the priming next() below would leak StopIteration.
        chapters.append(Section(root, ID, title))
        return chapters

    iterstacks = iter(stacks)

    src = root
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    doc = dest
    stack = next(iterstacks)
    marker = stack[-1]

    try:
        while True:
            for e in src:
                if e not in stack:
                    #cut and paste branch
                    dest.append(e)
                elif e is marker:
                    #got one.
                    chapters.append(Section(doc, ID, title))
                    #The ID and title are for the *next* section, so
                    #collect them before deleting the marker.
                    ID = e.get('id')
                    title = e.get('title')
                    src.remove(e)
                    # start again from the top for the next marker
                    src = root
                    dest = lxml.html.Element(root.tag, **dict(root.items()))
                    doc = dest
                    # raises StopIteration after the last marker
                    stack = next(iterstacks)
                    marker = stack[-1]
                    break
                else:
                    #next level.
                    #It is safe to descend without leaving a trail,
                    #because side branches are not descended.
                    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
                    dest.text = e.text
                    e.text = None
                    src = e
                    break
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(Section(src, ID, title))
    return chapters