"""Various things to do with [x]html that might be useful in more than
one place.
"""
import os
import re

import lxml.html, lxml.html.clean
from lxml import etree

from urlparse import urlsplit
from urllib2 import urlopen, HTTPError

from objavi.config import XHTMLNS, XHTML, IMG_CACHE, MARKER_CLASS_SPLIT, MARKER_CLASS_INFO
from objavi.book_utils import log
# When True, prepare_for_epub shifts all headings down two levels.
ADJUST_HEADING_WEIGHT = False

# Whitelist of element tags considered acceptable; referenced by the
# commented-out loop in BaseChapter.remove_bad_tags.
# NOTE(review): the container expression around these names was truncated
# in this chunk; reconstructed here as a set built from the visible names.
OK_TAGS = set([
    "body", "head", "html", "title", "abbr", "acronym", "address",
    "blockquote", "br", "cite", "code", "dfn", "div", "em", "h1", "h2",
    "h3", "h4", "h5", "h6", "kbd", "p", "pre", "q", "samp", "span",
    "strong", "var", "a", "dl", "dt", "dd", "ol", "ul", "li", "object",
    "param", "b", "big", "hr", "i", "small", "sub", "sup", "tt", "del",
    "ins", "bdo", "caption", "col", "colgroup", "table", "tbody", "td",
    "tfoot", "th", "thead", "tr", "img", "area", "map", "meta", "style",
])

# Document type declaration prepended to serialised XHTML output.
# NOTE(review): the closing triple-quote was missing in this chunk;
# restored so the module parses.
XHTML11_DOCTYPE = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
'''

# XML declaration emitted ahead of the doctype in as_xhtml output.
XML_DEC = '<?xml version="1.0" encoding="UTF-8"?>\n'

# Sub-directory (inside the image cache dir) used for locally cached images.
IMG_PREFIX = 'static/'
def convert_tags(root, elmap):
    """Rename the tags of all descendants of *root* according to *elmap*.

    *elmap* maps old tag names to new tag names.  Elements whose tag has
    no entry in *elmap* are left untouched: the visible original indexed
    ``elmap[el.tag]`` unguarded, which raises KeyError for any unmapped
    tag (e.g. 'p' when only headings are being remapped), so an explicit
    membership check is used instead.
    """
    for el in root.iterdescendants():
        # Skip unmapped tags (and non-element nodes such as comments,
        # whose .tag is not a string key in elmap).
        if el.tag in elmap:
            el.tag = elmap[el.tag]
def url_to_filename(url, prefix=''):
    """Flatten *url* into a single safe cache filename.

    The path (minus its '/pub/' or '/floss/pub/' prefix) and the first
    label of the host name are joined and squashed to word characters
    separated by hyphens, keeping the original file extension.

    Raises ValueError/IndexError if the path has no '.' extension or no
    '/pub/' component — callers only feed it FLOSS Manuals image URLs.
    """
    #XXX slightly inefficient to do urlsplit so many times, but versatile
    pieces = urlsplit(url)
    # Separate the extension from the rest of the path.
    stem, extension = pieces.path.rsplit('.', 1)
    # First host label identifies the language server: en, fr, translate…
    server_label = pieces.netloc.split('.', 1)[0]
    # Drop everything up to and including '/pub/' (handles '/floss/pub/' too).
    stem = stem.split('/pub/', 1)[1]
    # Collapse every run of non-word characters into a single hyphen.
    slug = re.sub(r'[^\w]+', '-', '%s-%s' % (stem, server_label))
    return '%s%s.%s' % (prefix, slug, extension)
class ImageCache(object):
    """Fetches remote images and memoises them in a local cache directory.

    NOTE(review): several lines of this class are missing from this
    chunk of the file; the visible gaps are flagged inline below.
    """
    def __init__(self, cache_dir=IMG_CACHE, prefix=IMG_PREFIX):
        # NOTE(review): the assignments that presumably set self.prefix and
        # initialise the self._fetched memo dict (both read by
        # fetch_if_necessary below) are missing from this chunk.
        self.cache_dir = cache_dir
        # Make sure the cache directory exists before any fetch happens.
        if not os.path.exists(cache_dir + prefix):
            os.makedirs(cache_dir + prefix)

    def read_local_url(self, path):
        # Open the cached file for reading.
        # NOTE(review): the read/close/return lines are missing from this chunk.
        f = open(self.cache_dir + path)

    def _save_local_url(self, path, data):
        # Open the cache file for writing.
        # NOTE(review): the write/close lines are missing from this chunk.
        f = open(self.cache_dir + path, 'w')

    def fetch_if_necessary(self, url, target=None, use_cache=True):
        # A previous answer (which may be None, meaning "known missing")
        # short-circuits any further work for this url.
        if url in self._fetched:
            return self._fetched[url]
        # NOTE(review): a guard such as "if target is None:" appears to be
        # missing before this default-target computation.
        target = url_to_filename(url, self.prefix)
        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
        # NOTE(review): the urlopen(...) download and its HTTPError
        # try/except are missing; the next statements are clearly the
        # error branch ('e' is the caught exception).
        # if it is missing, assume it will be missing every time
        # after, otherwise, you can get into endless waiting
        self._fetched[url] = None
        log("Wanting '%s', got error %s" %(url, e))
        # Success path: persist the downloaded data and memoise the name.
        self._save_local_url(target, data)
        self._fetched[url] = target
        log("got %s as %s" % (url, target))
class BaseChapter(object):
    """Common machinery for chapter documents parsed with lxml.

    NOTE(review): several method headers and statements are missing from
    this chunk of the file; the visible gaps are flagged inline below.
    """
    # Shared parser instance; chapter bytes are always decoded as utf-8.
    parser = lxml.html.HTMLParser(encoding='utf-8')

    # NOTE(review): the enclosing "def as_html(self):" header appears to
    # be missing before this docstring.
    """Serialise the tree as html."""
    return etree.tostring(self.tree, method='html', encoding='utf-8')

    # NOTE(review): the enclosing "def as_xhtml(self):" header and the
    # "try:" guarding getroot() appear to be missing.
    """Convert to xhtml and serialise."""
    root = self.tree.getroot()
    except AttributeError:
        # NOTE(review): the fallback assignment (tree already being a
        # root element rather than an ElementTree) is missing here.
    # Build a parallel element tree in the XHTML namespace.
    nsmap = {None: XHTML}
    xroot = etree.Element(XHTMLNS + "html", nsmap=nsmap)

    def xhtml_copy(el, xel):
        # Copy attributes across.
        # NOTE(review): the body of this attribute loop (e.g. a
        # "xel.set(k, v)" call) is missing from this chunk.
        for k, v in el.items():
        for child in el.iterchildren():
            # Mirror each child under the XHTML namespace and recurse.
            xchild = xel.makeelement(XHTMLNS + child.tag)
            xhtml_copy(child, xchild)

    xhtml_copy(root, xroot)
    return XML_DEC + XHTML11_DOCTYPE + etree.tostring(xroot)

    # Sanitiser used by remove_bad_tags below.
    # NOTE(review): several keyword arguments and the closing parenthesis
    # of this call are missing from this chunk.
    cleaner = lxml.html.clean.Cleaner(scripts=True,
                                      page_structure=False,
                                      processing_instructions=True,
                                      remove_unknown_tags=False,
                                      safe_attrs_only=True,

    def remove_bad_tags(self):
        # Strip unwanted markup in place via the class-level cleaner.
        #for e in self.tree.iter():
        #    if not e.tag in OK_TAGS:
        #        log('found bad tag %s' % e.tag)
        self.cleaner(self.tree)

    def fix_bad_structure(self):
        """Attempt to match booki chapter conventions. This doesn't
        care about xhtml correctness, just booki correctness.

        This function's philosophy is to be aggressive, and be
        modified upon complaint."""
        #0. is the first element preceded by text?
        body = self.tree.iter('body').next()
        # NOTE(review): the emptiness check guarding this branch is
        # missing from this chunk.
        log("BAD STRUCTURE: empty html, adding something")
        etree.SubElement(body, 'span')
        if body.text.strip():
            log("BAD STRUCTURE: text %r before first tag (not fixing)" % body.text.strip())
        #0.5 Remove any <link>, <script>, and <style> tags
        #they are at best spurious
        for tag in ['link', 'style', 'script', etree.Comment]:
            for e in body.iter(tag):
                log("BAD STRUCTURE: trying to remove '%s' (with tail %s)" %
                    (("%s" % e)[:60], e.tail))
                parent = e.getparent()
                # NOTE(review): the conditions selecting between these two
                # tail-rescue branches (and the removal of e itself) are
                # missing from this chunk.
                log("rescuing that tail")
                parent.text = (parent.text or "") + e.tail
                p.tail = (p.tail or "") + e.tail
        #0.75 Remove style and dir attributes from all elements!
        #style is usually doing bad things, and perhaps dir is too.
        for e in body.iter():
            # NOTE(review): a guard checking that e actually has a 'style'
            # attribute appears to be missing before this del.
            del e.attrib['style']
            if e.get('dir') and e.tag not in ('html', 'body'):
        # 1. is the first element an h1?
        # NOTE(review): the assignment of el1 (the body's first element)
        # is missing from this chunk.
        if el1.tag == 'div' and len(body) == 1:
            #The body has a "content" div. we should compact it.
            log("DODGY STRUCTURE: containing div. ")
        log("BAD STRUCTURE: firstelement is %r " % el1.tag)
        if el1.tag in ('h2', 'h3', 'strong', 'b'):
            log("converting %r to 'h1'" % el1.tag)
        #2. how many <h1>s are there?
        h1s = list(body.iter('h1'))
        # NOTE(review): an "if not h1s:" guard appears to be missing
        # before this make-up-a-title branch.
        log("BAD STRUCTURE: no h1! making one up")
        h1 = body.makeelement('h1')
        h1.text = "Somebody Should Set The Title For This Chapter!"
        # NOTE(review): insertion of h1 into body, and the guard/loop
        # converting the extra h1s, are missing from this chunk.
        log("BAD STRUCTURE: found extra h1s: %s, converting to h2" % h1s[1:])

    def _loadtree(self, html):
        # Parse the raw chapter html into self.tree, degrading gracefully.
        # NOTE(review): the opening "try:" for this parse is missing.
        self.tree = lxml.html.document_fromstring(html, parser=self.parser)
        except UnicodeError, e:
            # The utf-8 parser choked; retry with lxml's default parser.
            log('failed to parse tree as unicode, got %s %r' % (e, e),
                'trying again using default parser')
            self.tree = lxml.html.document_fromstring(html)
        except etree.XMLSyntaxError, e:
            # Unparseable input: fall back to an empty document rather
            # than propagate the error.
            log('Could not parse html file %r, string %r... exception %s' %
                (self.name, html[:40], e))
            self.tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
class EpubChapter(BaseChapter):
    """A chapter destined for epub output.

    NOTE(review): parts of this class are missing from this chunk; the
    visible gaps are flagged inline below.
    """
    # NOTE(review): the continuation of this signature (after use_cache)
    # and most of the __init__ body are missing from this chunk.
    def __init__(self, server, book, chapter_name, html, use_cache=False,
        self.name = chapter_name

    def prepare_for_epub(self):
        """Shift all headings down 2 places."""
        if ADJUST_HEADING_WEIGHT:
            # a question to resolve:
            # is it better (quicker) to have multiple, filtered iterations
            # converting in order (h4->h5, h3->h4, etc) or to do a single,
            # unfiltered pass and convert from a dict?
            # Maps h4->h6, h3->h5, h2->h4, h1->h3 (built in that order).
            hmap = dict(('h%s' % x, 'h%s' % (x + 2)) for x in range(4, 0, -1))
            # NOTE(review): self.root is not set in the visible code;
            # presumably assigned in the missing part of __init__.
            convert_tags(self.root, hmap)
257 ###################################################
class Section(object):
    """Bundle of a document subtree with its ID and title, as produced
    by split_tree below.

    NOTE(review): the body of __init__ (presumably storing tree, ID and
    title on self) and the "def __repr__(self):" header are missing
    from this chunk.
    """
    def __init__(self, tree, ID=None, title=None):

    return '<Section id: %r title: %r>' % (self.ID, self.title)
def split_tree(tree):
    """If a document has special marker elements (hr tags with class
    of config.MARKER_CLASS_SPLIT) it will be broken into smaller
    documents using the markers as boundaries.  Each element in the
    new documents will be nested and ordered as before, though those
    on the new edges will obviously lack siblings they once may have
    had.

    The new documents are returned as a list of Section objects (see
    above), which bundles the new tree with an ID and title if the
    marker elements contain those attributes.

    The original tree will be destroyed or reused.

    NOTE(review): large parts of this function are missing from this
    chunk; the visible gaps are flagged inline below.
    """
    # NOTE(review): a "try:" guarding getroot() is missing here.
    root = tree.getroot()
    except AttributeError:
        # NOTE(review): the fallback (tree was already a root element)
        # is missing from this chunk.
    # find the node lineages along which to split the document.
    # anything outside these lines (i.e., side branches) can be copied
    # wholesale, which speeds things up considerably.
    # NOTE(review): the initialisation of the stacks list (and, later
    # used, the chapters list) is missing from this chunk.
    for hr in root.iter(tag='hr'):
        klass = hr.get('class')
        if klass == MARKER_CLASS_SPLIT:
            # NOTE(review): the start of this lineage (e.g. "stack = [hr]")
            # and its append onto stacks are missing from this chunk.
            stack.extend(x for x in hr.iterancestors())
        elif klass == MARKER_CLASS_INFO:
            # Info markers carry metadata only; drop them from the tree.
            hr.getparent().remove(hr)

    iterstacks = iter(stacks)

    # Fresh destination document mirroring the root element's attributes.
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    stack = iterstacks.next()
    # Content before the first split marker gets this default ID.
    ID = 'unidentified-front-matter'

    # NOTE(review): the main walk over the source tree is largely missing
    # from this chunk; only fragments of its body are visible below.
    #cut and paste branch
    chapters.append(Section(doc, ID, title))
    #The ID and title are for the *next* section, so
    #collect them before deleting the marker.
    title = e.get('title')
    # Start the next destination document from a fresh mirrored root.
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    stack = iterstacks.next()
    #It is safe to descend without leaving a trail,
    #because side branches are not descended.
    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(Section(src, ID, title))