# objavi/xhtml_utils.py
# commit a3e4827a37b51630c9bf58ced08cd39b7e1f58de
# "More aggressive cleaning of bad files"
1 """Various things to do with [x]html that might be useful in more than
2 one place."""
4 import lxml.html, lxml.html.clean
5 from lxml import etree
7 import os
8 import re
10 from urlparse import urlsplit
11 from urllib2 import urlopen, HTTPError
13 from objavi.config import XHTMLNS, XHTML, IMG_CACHE, MARKER_CLASS_SPLIT, MARKER_CLASS_INFO
14 from objavi.book_utils import log
# When enabled, headings are shifted down two levels for epub output
# (see EpubChapter.prepare_for_epub).
ADJUST_HEADING_WEIGHT = False

# Whitelist of tags allowed to survive cleaning (used as allow_tags for
# BaseChapter.cleaner).  etree.Comment is lxml's marker for comment nodes.
OK_TAGS = set([
    "body", "head", "html", "title", "abbr", "acronym", "address",
    "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
    "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
    "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
    "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
    "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
    "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
    "link", "base",
    etree.Comment,
])

# Prologue strings for serialising xhtml documents.
XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Cache-relative directory prefix under which fetched images are stored.
IMG_PREFIX = 'static/'
def convert_tags(root, elmap):
    """Rename every descendant of root whose tag appears as a key in
    the elmap mapping, replacing it with the mapped tag name.  Tags not
    mentioned in elmap are left untouched; root itself is not renamed."""
    for node in root.iterdescendants():
        try:
            node.tag = elmap[node.tag]
        except KeyError:
            pass
def url_to_filename(url, prefix=''):
    """Flatten a wiki image url into a single cache-friendly filename.

    The path below '/pub/' and the server subdomain are joined and every
    run of non-word characters is collapsed to '-'; the original file
    extension is kept.  Raises if the url has no '.' or no '/pub/'.
    """
    #XXX for TWIKI only
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    parts = urlsplit(url)
    stem, ext = parts.path.rsplit('.', 1)
    subdomain = parts.netloc.split('.', 1)[0]  #en, fr, translate
    stem = stem.split('/pub/', 1)[1]  #remove /floss/pub/ or /pub/
    slug = re.sub(r'[^\w]+', '-', stem + '-' + subdomain)
    return prefix + slug + '.' + ext
class ImageCache(object):
    """Download remote images and cache them on the local filesystem.

    Files live under ``cache_dir + prefix``; paths are joined by plain
    concatenation throughout, so cache_dir is expected to end with a
    path separator.
    """
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        # url -> cache-relative filename, or None for known-missing urls
        self._fetched = {}
        self.cache_dir = cache_dir
        self.prefix = prefix
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        """Return the contents of the cached file at a cache-relative path."""
        # with-statement closes the file even if read() raises
        with open(self.cache_dir + path) as f:
            return f.read()

    def _save_local_url(self, path, data):
        """Write data to the cache-relative path."""
        with open(self.cache_dir + path, 'w') as f:
            f.write(data)
        #os.chmod(path, 0444)

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        """Fetch url into the cache unless it is already there.

        Returns the cache-relative filename, or None if the url gave an
        HTTP error.  Results (including failures) are memoised for the
        lifetime of this object.
        """
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            f = urlopen(url)
            try:
                data = f.read()
            finally:
                # close the connection even when read() raises
                f.close()
        except HTTPError as e:
            # if it is missing, assume it will be missing every time
            # after, otherwise, you can get into endless waiting
            self._fetched[url] = None
            log("Wanting '%s', got error %s" %(url, e))
            return None

        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        return target
class BaseChapter(object):
    """Common chapter handling: parse html into an lxml tree, clean it,
    massage its structure toward booki conventions, and serialise it as
    html or xhtml."""
    parser = lxml.html.HTMLParser(encoding='utf-8')

    def as_html(self):
        """Serialise the tree as html."""
        return etree.tostring(self.tree, method='html', encoding='utf-8')

    def as_xhtml(self):
        """Convert to xhtml and serialise.

        Builds a parallel tree with every element shifted into the
        xhtml namespace, then returns it with XML declaration and
        XHTML 1.1 doctype prepended.
        """
        try:
            root = self.tree.getroot()
        except AttributeError:
            # self.tree is already an element, not an ElementTree
            root = self.tree

        nsmap = {None: XHTML}
        xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            # recursively copy el's text, attributes and children into
            # xel, prefixing each child tag with the xhtml namespace
            xel.text = el.text
            for k, v in el.items():
                xel.set(k, v)
            for child in el.iterchildren():
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xel.append(xchild)
                xhtml_copy(child, xchild)
            xel.tail = el.tail

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)

    # Shared cleaner: strips scripts, styles, forms, frames etc., and
    # anything not in OK_TAGS; keeps comments and page structure.
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      javascript=True,
                                      comments=False,
                                      style=True,
                                      links=True,
                                      meta=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      embedded=True,
                                      frames=True,
                                      forms=True,
                                      annoying_tags=True,
                                      allow_tags=OK_TAGS,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      add_nofollow=False
                                      )

    def remove_bad_tags(self):
        """Run the class cleaner over the whole tree in place."""
        #for e in self.tree.iter():
        #    if not e.tag in OK_TAGS:
        #        log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions. This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and be
        modified upon complaint."""
        #0. is the first element preceded by text?
        body = self.tree.iter('body').next()
        # body.text is None when body starts directly with an element
        if body.text and body.text.strip():
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())

        #0.5 Remove any <link>, <script>, and <style> tags
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            # snapshot with list(): removing elements from a live
            # iterator can make lxml skip siblings
            for e in list(body.iter(tag)):
                log("BAD STRUCTURE: trying remove %r (with tail %r)" % (e, e.tail))
                parent = e.getparent()
                if e.tail:
                    # reattach trailing text to the previous sibling,
                    # or to the parent when there is no sibling
                    log("rescuing that tail")
                    p = e.getprevious()
                    if p is None:
                        parent.text = (parent.text or "") + e.tail
                    else:
                        p.tail = (p.tail or "") + e.tail
                parent.remove(e)

        #0.75 Remove style attribute from all elements!
        for e in body.iter():
            if e.get('style'):
                del e.attrib['style']

        # 1. is the first element an h1?
        if len(body):  # an empty body would make body[0] raise IndexError
            el1 = body[0]
            if el1.tag != 'h1':
                log("BAD STRUCTURE: firstelement is %r " % el1.tag)
                if el1.tag in ('h2', 'h3', 'strong', 'b'):
                    log("converting %r to 'h1'" % el1.tag)
                    el1.tag = 'h1'

        #2. how many <h1>s are there?
        h1s = body.findall('h1')
        if not h1s:
            log("BAD STRUCTURE: no h1! making one up")
            h1 = body.makeelement('h1')
            h1.text = "Somebody Should Set The Title For This Chapter!"
            body.insert(0, h1)
        elif len(h1s) > 1:
            # only the first h1 survives; the rest are demoted
            log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])
            for h1 in h1s[1:]:
                h1.tag = 'h2'

    def _loadtree(self, html):
        """Parse html into self.tree, retrying with the default parser
        on unicode trouble and falling back to an empty document when
        parsing fails outright."""
        try:
            try:
                self.tree = lxml.html.document_fromstring(html, parser=self.parser)
            except UnicodeError as e:
                log('failed to parse tree as unicode, got %s %r' % (e, e),
                    'trying again using default parser')
                self.tree = lxml.html.document_fromstring(html)
        except etree.XMLSyntaxError as e:
            log('Could not parse html file %r, string %r... exception %s' %
                (self.name, html[:40], e))
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
class EpubChapter(BaseChapter):
    """A chapter of a book, destined for epub output."""
    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        # use_cache and cache_dir are accepted for interface
        # compatibility but are not currently used here.
        self.server = server
        self.book = book
        self.name = chapter_name
        self._loadtree(html)

    def prepare_for_epub(self):
        """Shift all headings down 2 places (no-op unless
        ADJUST_HEADING_WEIGHT is enabled)."""
        if ADJUST_HEADING_WEIGHT:
            # a question to resolve:
            # is it better (quicker) to have multiple, filtered iterations
            # converting in order (h4->h5, h3->h4, etc) or to do a single,
            # unfiltered pass and convert from a dict?

            # h1->h3 ... h4->h6; h5 can only drop one level, to h6.
            hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
            hmap['h5'] = 'h6'
            # BUG FIX: was convert_tags(self.root, hmap) -- no code ever
            # sets self.root; BaseChapter._loadtree stores the parsed
            # document as self.tree.
            convert_tags(self.tree, hmap)
248 ###################################################
class Section(object):
    """A fragment produced by split_tree: the subtree of one section,
    together with the ID and title taken from the marker element that
    introduced it (both may be None)."""

    def __init__(self, tree, ID=None, title=None):
        self.tree = tree
        self.ID = ID
        self.title = title

    def __str__(self):
        return '<Section id: {0!r} title: {1!r}>'.format(self.ID, self.title)

    __repr__ = __str__
def split_tree(tree):
    """If a document has special marker elements (hr tags with class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries. Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), which bundles the new tree with an ID and title if the
    marker elements contain those attributes.

    The original tree will be destroyed or reused.
    """
    try:
        root = tree.getroot()
    except AttributeError:
        # tree is already an element rather than an ElementTree
        root = tree

    # find the node lineages along which to split the document.
    # anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    stacks = []
    for hr in root.iter(tag='hr'):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            # stack becomes the ancestor chain from root down to the
            # marker itself (reversed so root comes first).
            stack = [hr]
            stack.extend(x for x in hr.iterancestors())
            stack.reverse()
            stacks.append(stack)
        elif klass == MARKER_CLASS_INFO:
            # metadata-only markers are simply dropped
            hr.getparent().remove(hr)

    iterstacks = iter(stacks)

    # src walks down the original tree; dest mirrors it in the document
    # currently being built; doc is the root of that document.
    src = root
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    doc = dest
    stack = iterstacks.next()
    marker = stack[-1]

    chapters = []
    ID = 'unidentified-front-matter'
    title = None
    try:
        while True:
            for e in src:
                if e not in stack:
                    #cut and paste branch
                    # NOTE(review): append() moves e out of src while
                    # src is mid-iteration; lxml appears to tolerate
                    # this pattern here -- confirm before refactoring.
                    dest.append(e)
                elif e is marker:
                    #got one.
                    chapters.append(Section(doc, ID, title))
                    #The ID and title are for the *next* section, so
                    #collect them before deleting the marker.
                    ID = e.get('id')
                    title = e.get('title')
                    src.remove(e)
                    # restart from the top with a fresh destination
                    # document and the next marker lineage.
                    src = root
                    dest = lxml.html.Element(root.tag, **dict(root.items()))
                    doc = dest
                    stack = iterstacks.next()
                    marker = stack[-1]
                    break
                else:
                    #next level.
                    #It is safe to descend without leaving a trail,
                    #because side branches are not descended.
                    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
                    dest.text = e.text
                    e.text = None
                    src = e
                    break
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(Section(src, ID, title))
    return chapters