objavi/xhtml_utils.py
1 """Various things to do with [x]html that might be useful in more than
2 one place."""
4 import lxml.html, lxml.html.clean
5 from lxml import etree
7 import os
8 import re
10 from urlparse import urlsplit
11 from urllib2 import urlopen, HTTPError
13 from objavi.config import XHTMLNS, XHTML, IMG_CACHE, MARKER_CLASS_SPLIT, MARKER_CLASS_INFO
14 from objavi.book_utils import log

ADJUST_HEADING_WEIGHT = False

OK_TAGS = set([
    "body", "head", "html", "title", "abbr", "acronym", "address",
    "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
    "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
    "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
    "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
    "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
    "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
    "link", "base",
    etree.Comment,
])

XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

IMG_PREFIX = 'static/'

def convert_tags(root, elmap):
    for el in root.iterdescendants():
        if el.tag in elmap:
            el.tag = elmap[el.tag]
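
# Illustrative use (not in the original source): e.g.
#
#   convert_tags(root, {'b': 'strong', 'i': 'em'})
#
# renames every <b> and <i> element below root.  The root element itself
# is not touched, since only descendants are iterated.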

def url_to_filename(url, prefix=''):
    #XXX for TWIKI only
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    fragments = urlsplit(url)
    base, ext = fragments.path.rsplit('.', 1)
    server = fragments.netloc.split('.', 1)[0] #en, fr, translate
    base = base.split('/pub/', 1)[1] #remove /floss/pub/ or /pub/
    base = re.sub(r'[^\w]+', '-', '%s-%s' % (base, server))
    return '%s%s.%s' % (prefix, base, ext)
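
# A worked example (hypothetical URL, for illustration only):
#
#   url_to_filename('http://en.flossmanuals.net/floss/pub/Book/pic.jpg',
#                   prefix='static/')
#
# keeps the 'jpg' extension, takes 'en' from the netloc, strips everything
# up to '/pub/', and squashes runs of non-word characters to '-', giving
# 'static/Book-pic-en.jpg'.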

class ImageCache(object):
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        self._fetched = {}
        self.cache_dir = cache_dir
        self.prefix = prefix
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        f = open(self.cache_dir + path)
        s = f.read()
        f.close()
        return s

    def _save_local_url(self, path, data):
        f = open(self.cache_dir + path, 'w')
        f.write(data)
        f.close()
        #os.chmod(path, 0444)

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            f = urlopen(url)
            data = f.read()
            f.close()
        except HTTPError, e:
            # If it is missing, assume it will be missing every time
            # afterwards; otherwise, you can get into endless waiting.
            self._fetched[url] = None
            log("Wanting '%s', got error %s" % (url, e))
            return None

        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
        return target
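
    # Usage sketch (illustrative, not from the original source):
    #
    #   cache = ImageCache()
    #   local = cache.fetch_if_necessary('http://en.example.com/pub/Book/pic.jpg')
    #
    # returns a local path like 'static/Book-pic-en.jpg' on success, the
    # remembered answer on repeat calls, or None if the fetch failed.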

class BaseChapter(object):
    parser = lxml.html.HTMLParser(encoding='utf-8')

    def as_html(self):
        """Serialise the tree as html."""
        return etree.tostring(self.tree, method='html', encoding='utf-8')

    def as_xhtml(self):
        """Convert to xhtml and serialise."""
        try:
            root = self.tree.getroot()
        except AttributeError:
            root = self.tree

        nsmap = {None: XHTML}
        xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            xel.text = el.text
            for k, v in el.items():
                xel.set(k, v)
            for child in el.iterchildren():
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xel.append(xchild)
                xhtml_copy(child, xchild)
            xel.tail = el.tail

        xhtml_copy(root, xroot)

        return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)
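
    # For illustration (a rough sketch, not from the original source):
    # a parsed '<p>hi</p>' chapter serialises to something like
    #
    #   <?xml version="1.0" encoding="UTF-8"?>
    #   <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ...>
    #   <html xmlns="..."><head>...</head><body><p>hi</p></body></html>
    #
    # with every element copied into the XHTML namespace.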

    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      javascript=True,
                                      comments=False,
                                      style=True,
                                      links=True,
                                      meta=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      embedded=True,
                                      frames=True,
                                      forms=True,
                                      annoying_tags=True,
                                      allow_tags=OK_TAGS,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,
                                      add_nofollow=False
                                      )

    def remove_bad_tags(self):
        #for e in self.tree.iter():
        #    if not e.tag in OK_TAGS:
        #        log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)
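
    # Illustrative use (not in the original source):
    #
    #   chapter.remove_bad_tags()
    #
    # cleans self.tree in place: scripts, styles, forms, frames and
    # embedded content go away, and tags outside OK_TAGS are dropped
    # while their text is kept (remove_unknown_tags is False, so only
    # the allow_tags list applies).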

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions.  This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and to be
        modified upon complaint."""
        #0. is the first element preceded by text?
        body = self.tree.iter('body').next()
        if len(body) == 0:
            log("BAD STRUCTURE: empty html, adding something")
            etree.SubElement(body, 'span')
        if body.text and body.text.strip():
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())

        #0.5 remove any <link>, <script>, and <style> tags:
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            for e in body.iter(tag):
                log("BAD STRUCTURE: trying to remove '%s' (with tail %s)" %
                    (("%s" % e)[:60], e.tail))
                parent = e.getparent()
                if e.tail:
                    log("rescuing that tail")
                    p = e.getprevious()
                    if p is None:
                        parent.text = (parent.text or "") + e.tail
                    else:
                        p.tail = (p.tail or "") + e.tail
                parent.remove(e)

        #0.75 remove style and dir attributes from all elements:
        #style is usually doing bad things, and perhaps dir is too
        for e in body.iter():
            if e.get('style'):
                del e.attrib['style']
            if e.get('dir') and e.tag not in ('html', 'body'):
                del e.attrib['dir']

        #1. is the first element an h1?
        el1 = body[0]
        if el1.tag == 'div' and len(body) == 1:
            #The body has a containing "content" div; we should compact it.
            log("DODGY STRUCTURE: containing div.")
        if el1.tag != 'h1':
            log("BAD STRUCTURE: first element is %r" % el1.tag)
            if el1.tag in ('h2', 'h3', 'strong', 'b'):
                log("converting %r to 'h1'" % el1.tag)
                el1.tag = 'h1'

        #2. how many <h1>s are there?
        h1s = list(body.iter('h1'))
        if not h1s:
            log("BAD STRUCTURE: no h1! making one up")
            h1 = body.makeelement('h1')
            h1.text = "Somebody Should Set The Title For This Chapter!"
            body.insert(0, h1)
        elif len(h1s) > 1:
            log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])
            for h1 in h1s[1:]:
                h1.tag = 'h2'
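
    # Before/after illustration (hypothetical input, not from the source):
    # a chapter starting '<body><h2 style="color:red">Title</h2><p>...'
    # comes out with the style attribute removed and the leading h2
    # promoted to h1, so booki sees exactly one h1 chapter title.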

    def _loadtree(self, html):
        try:
            try:
                self.tree = lxml.html.document_fromstring(html, parser=self.parser)
            except UnicodeError, e:
                log('failed to parse tree as unicode, got %s %r' % (e, e),
                    'trying again using default parser')
                self.tree = lxml.html.document_fromstring(html)
        except etree.XMLSyntaxError, e:
            log('Could not parse html file %r, string %r... exception %s' %
                (self.name, html[:40], e))
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()

class EpubChapter(BaseChapter):
    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self._loadtree(html)

    def prepare_for_epub(self):
        """Shift all headings down 2 places."""
        if ADJUST_HEADING_WEIGHT:
            # a question to resolve:
            # is it better (quicker) to have multiple, filtered iterations
            # converting in order (h4->h5, h3->h4, etc) or to do a single,
            # unfiltered pass and convert from a dict?
            hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
            hmap['h5'] = 'h6'
            convert_tags(self.tree, hmap)  # self.root was undefined; use the parsed tree
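
    # Worked illustration (not in the original source): hmap comes out as
    #
    #   {'h1': 'h3', 'h2': 'h4', 'h3': 'h5', 'h4': 'h6', 'h5': 'h6'}
    #
    # so every heading drops two levels, saturating at h6.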


###################################################

class Section(object):
    def __init__(self, tree, ID=None, title=None):
        self.ID = ID
        self.tree = tree
        self.title = title

    def __str__(self):
        return '<Section id: %r title: %r>' % (self.ID, self.title)
    __repr__ = __str__

def split_tree(tree):
    """If a document has special marker elements (hr tags with a class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries.  Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), each of which bundles the new tree with an ID and title if
    the marker elements contain those attributes.

    The original tree will be destroyed or reused.
    """
    try:
        root = tree.getroot()
    except AttributeError:
        root = tree

    # Find the node lineages along which to split the document.
    # Anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    stacks = []
    for hr in root.iter(tag='hr'):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            stack = [hr]
            stack.extend(x for x in hr.iterancestors())
            stack.reverse()
            stacks.append(stack)
        elif klass == MARKER_CLASS_INFO:
            hr.getparent().remove(hr)

    iterstacks = iter(stacks)

    src = root
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    doc = dest
    stack = iterstacks.next()
    marker = stack[-1]

    chapters = []
    ID = 'unidentified-front-matter'
    title = None
    try:
        while True:
            for e in src:
                if e not in stack:
                    #cut and paste branch
                    dest.append(e)
                elif e is marker:
                    #got one.
                    chapters.append(Section(doc, ID, title))
                    #The ID and title are for the *next* section, so
                    #collect them before deleting the marker.
                    ID = e.get('id')
                    title = e.get('title')
                    src.remove(e)
                    src = root
                    dest = lxml.html.Element(root.tag, **dict(root.items()))
                    doc = dest
                    stack = iterstacks.next()
                    marker = stack[-1]
                    break
                else:
                    #next level.
                    #It is safe to descend without leaving a trail,
                    #because side branches are not descended.
                    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
                    dest.text = e.text
                    e.text = None
                    src = e
                    break
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(Section(src, ID, title))
    return chapters
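
# Usage sketch (hypothetical input, for illustration only):
#
#   marker = '<hr class="%s" id="chapter-2" title="Second" />' % MARKER_CLASS_SPLIT
#   tree = lxml.html.document_fromstring(
#       '<html><body><h1>One</h1><p>a</p>' + marker +
#       '<h1>Two</h1><p>b</p></body></html>')
#   for section in split_tree(tree):
#       print section
#
# would print two Sections: the first with the default
# 'unidentified-front-matter' ID and no title, the second with ID
# 'chapter-2' and title 'Second', each wrapping its own tree.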