1 """Various things to do with [x]html that might be useful in more than
4 import lxml
.html
, lxml
.html
.clean
10 from urlparse
import urlsplit
11 from urllib2
import urlopen
, HTTPError
13 from objavi
.config
import XHTMLNS
, XHTML
, IMG_CACHE
, MARKER_CLASS_SPLIT
, MARKER_CLASS_INFO
14 from objavi
.book_utils
import log
# When True, EpubChapter.prepare_for_epub() demotes headings by two
# levels (h1 -> h3, etc.) via convert_tags.
ADJUST_HEADING_WEIGHT = False
19 "body", "head", "html", "title", "abbr", "acronym", "address",
20 "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
21 "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
22 "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
23 "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
24 "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
25 "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
31 XHTML11_DOCTYPE
= '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
32 "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
# XML declaration prepended (before the doctype) when serialising xhtml.
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Subdirectory of the image cache in which fetched images are stored;
# also the prefix of the generated local filenames.
IMG_PREFIX = 'static/'
def convert_tags(root, elmap):
    """Rename the tags of root's descendants according to the elmap mapping."""
    for el in root.iterdescendants():
        # NOTE(review): an original line is elided from this chunk between the
        # loop header and this assignment -- presumably a guard such as
        # `if el.tag in elmap:` (without one, any descendant whose tag is
        # missing from elmap would raise KeyError). Confirm against full file.
        el.tag = elmap[el.tag]
def url_to_filename(url, prefix=''):
    """Flatten `url` into a single cache filename, prepending `prefix`.

    The path below '/pub/' and the first netloc label are joined and
    squashed into a hyphen-separated slug; the extension is preserved.
    """
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    parts = urlsplit(url)
    stem, suffix = parts.path.rsplit('.', 1)
    # first label of the host, e.g. en, fr, translate
    subdomain = parts.netloc.split('.', 1)[0]
    # drop the leading /floss/pub/ or /pub/ from the path
    stem = stem.split('/pub/', 1)[1]
    slug = re.sub(r'[^\w]+', '-', '%s-%s' %(stem, subdomain))
    return '%s%s.%s' % (prefix, slug, suffix)
class ImageCache(object):
    """Fetch remote images once and keep local copies under cache_dir.

    NOTE(review): several original lines are elided from this chunk; each
    elided region is marked inline and should be confirmed against the
    full file.
    """
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        # cache_dir: root directory of the on-disk cache.
        # prefix: subdirectory (e.g. 'static/') for generated filenames.
        self.cache_dir = cache_dir
        # NOTE(review): line(s) elided here -- presumably `self.prefix = prefix`
        # and the `self._fetched = {}` session memo, both of which are read by
        # fetch_if_necessary below. Confirm.
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        # Open a previously cached file, addressed relative to cache_dir.
        f = open(self.cache_dir + path)
        # NOTE(review): remainder of this method is elided -- presumably it
        # reads and returns the file's contents. Confirm.

    def _save_local_url(self, path, data):
        # Write `data` to the cache file at cache_dir + path.
        f = open(self.cache_dir + path, 'w')
        # NOTE(review): remainder elided -- presumably f.write(data); f.close().

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        # Resolve `url` to a local cache filename, downloading on first use.
        if url in self._fetched:
            # Already resolved this session (value may be None for a URL
            # that previously failed -- see the comment below).
            return self._fetched[url]
        # NOTE(review): a line is elided here -- presumably `if target is None:`
        # guarding the derivation of a default filename.
        target = url_to_filename(url, self.prefix)
        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
        # NOTE(review): the download itself (urlopen) and its try/except
        # HTTPError wrapper are elided from this chunk; the fragments below
        # belong to different branches of that structure (`e` and `data` are
        # bound on elided lines).
        # if it is missing, assume it will be missing every time
        # after, otherwise, you can get into endless waiting
        self._fetched[url] = None
        log("Wanting '%s', got error %s" %(url, e))
        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
class BaseChapter(object):
    """Holds a chapter's HTML as an lxml tree; serialises and cleans it.

    NOTE(review): many original lines are elided from this chunk; elided
    regions are marked inline and must be confirmed against the full file.
    """
    # Shared parser: chapter HTML is always decoded as utf-8.
    parser = lxml.html.HTMLParser(encoding='utf-8')

    # NOTE(review): the `def` line of the html-serialising method is elided.
        """Serialise the tree as html."""
        return etree.tostring(self.tree, method='html', encoding='utf-8')

    # NOTE(review): the `def` line of the xhtml-serialising method is elided.
        """Convert to xhtml and serialise."""
        # NOTE(review): a `try:` line is elided here.
        root = self.tree.getroot()
        except AttributeError:
        # NOTE(review): fallback branch body elided -- presumably the tree
        # is already a root element.
        # Build a parallel element tree in the XHTML namespace.
        nsmap = {None: XHTML}
        xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

        def xhtml_copy(el, xel):
            # Recursively copy el's attributes, text and children onto xel,
            # recreating every element under the XHTML namespace.
            # NOTE(review): the text/tail/attribute copying statements are
            # elided from this chunk.
            for k, v in el.items():
            for child in el.iterchildren():
                xchild = xel.makeelement(XHTMLNS + child.tag)
                xhtml_copy(child, xchild)

        xhtml_copy(root, xroot)
        return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)

    # Class-wide HTML cleaner used by remove_bad_tags.
    # NOTE(review): several keyword arguments of this call (and its closing
    # parenthesis) are on elided lines.
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,

    def remove_bad_tags(self):
        # Strip unwanted tags/attributes in place using the class Cleaner.
        #for e in self.tree.iter():
        #    if not e.tag in OK_TAGS:
        #        log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions. This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and be
        modified upon complaint."""
        #0. is the first element preceded by text?
        body = self.tree.iter('body').next()
        if body.text.strip():
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())

        #0.5 Remove any <link>, <script>, and <style> tags
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            for e in body.iter(tag):
                log("BAD STRUCTURE: trying remove %r (with tail %r)" % (e, e.tail))
                parent = e.getparent()
                # NOTE(review): the conditional structure around the tail
                # rescue, the binding of `p` (likely the previous sibling),
                # and the actual removal of `e` are elided from this chunk.
                log("rescuing that tail")
                parent.text = (parent.text or "") + e.tail
                p.tail = (p.tail or "") + e.tail

        #0.75 Remove style attribute from all elements!
        for e in body.iter():
            # NOTE(review): a guard line (likely `if 'style' in e.attrib:`)
            # is elided here; without it this would raise KeyError.
            del e.attrib['style']

        # 1. is the first element an h1?
        # NOTE(review): the binding of `el1` (the first element of body) is
        # elided from this chunk.
        log("BAD STRUCTURE: firstelement is %r " % el1.tag)
        if el1.tag in ('h2', 'h3', 'strong', 'b'):
            log("converting %r to 'h1'" % el1.tag)
            # NOTE(review): the tag conversion statement itself is elided.

        #2. how many <h1>s are there?
        h1s = body.findall('h1')
        # NOTE(review): the `if not h1s:` / `else:` frame around the
        # following fragments is elided; the made-up h1's insertion into
        # body is also elided.
        log("BAD STRUCTURE: no h1! making one up")
        h1 = body.makeelement('h1')
        h1.text = "Somebody Should Set The Title For This Chapter!"
        log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])

    def _loadtree(self, html):
        # Parse `html` into self.tree, degrading gracefully on bad input.
        # NOTE(review): the opening `try:` line is elided from this chunk.
        self.tree = lxml.html.document_fromstring(html, parser=self.parser)
        except UnicodeError, e:
            log('failed to parse tree as unicode, got %s %r' % (e, e),
                'trying again using default parser')
            self.tree = lxml.html.document_fromstring(html)
        except etree.XMLSyntaxError, e:
            log('Could not parse html file %r, string %r... exception %s' %
                (self.name, html[:40], e))
            # Last resort: an empty document, so callers always get a tree.
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
class EpubChapter(BaseChapter):
    """A chapter specialised for epub output.

    NOTE(review): parts of this class are elided from this chunk.
    """
    # NOTE(review): the __init__ signature continues on an elided line, and
    # most of its body is elided; only the name assignment is visible.
    def __init__(self, server, book, chapter_name, html, use_cache=False,
        self.name = chapter_name

    def prepare_for_epub(self):
        """Shift all headings down 2 places."""
        if ADJUST_HEADING_WEIGHT:
            # a question to resolve:
            # is it better (quicker) to have multiple, filtered iterations
            # converting in order (h4->h5, h3->h4, etc) or to do a single,
            # unfiltered pass and convert from a dict?
            # Maps h1->h3, h2->h4, h3->h5, h4->h6.
            hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
            # NOTE(review): self.root is set outside this chunk -- confirm it
            # exists on BaseChapter or is assigned in the elided __init__.
            convert_tags(self.root, hmap)
248 ###################################################
class Section(object):
    """One subtree produced by split_tree, with an optional ID and title."""
    def __init__(self, tree, ID=None, title=None):
        # NOTE(review): the attribute assignments (presumably self.tree,
        # self.ID, self.title) are elided from this chunk; __repr__ below
        # reads self.ID and self.title.

    # NOTE(review): the `def __repr__(self):` line is elided.
        return '<Section id: %r title: %r>' % (self.ID, self.title)
def split_tree(tree):
    """If a document has special marker elements (hr tags with class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries. Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), which bundles the new tree with an ID and title if the
    marker elements contain those attributes.

    The original tree will be destroyed or reused.
    """
    # NOTE(review): many lines of this function are elided from this chunk;
    # elided regions are marked inline. Confirm against the full file.
    # NOTE(review): a `try:` line is elided here.
    root = tree.getroot()
    except AttributeError:
    # NOTE(review): fallback branch elided -- presumably `tree` was already
    # a root element.

    # find the node lineages along which to split the document.
    # anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    # NOTE(review): initialisation of `stacks` (the list of ancestor chains)
    # is elided.
    for hr in root.iter(tag='hr'):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            # NOTE(review): per-marker stack setup elided; the ancestor chain
            # is collected so the walk below knows where to cut.
            stack.extend(x for x in hr.iterancestors())
            # NOTE(review): reversal/append of `stack` onto `stacks` elided.
        elif klass == MARKER_CLASS_INFO:
            # Info markers only carry metadata; remove them from the tree.
            hr.getparent().remove(hr)

    iterstacks = iter(stacks)

    # NOTE(review): the setup of chapters/ID/title and the frame of the tree
    # walk are elided; the fragments below come from inside that walk and
    # from several of its branches (`doc`, `e`, `src` are bound on elided
    # lines).
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    stack = iterstacks.next()
    # Content before the first split marker gets a synthetic ID.
    ID = 'unidentified-front-matter'
    #cut and paste branch
    chapters.append(Section(doc, ID, title))
    #The ID and title are for the *next* section, so
    #collect them before deleting the marker.
    title = e.get('title')
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    stack = iterstacks.next()
    #It is safe to descend without leaving a trail,
    #because side branches are not descended.
    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
    except StopIteration:
    #stacks have run out -- the rest of the tree is the last section
    chapters.append(Section(src, ID, title))