1 """Module for dealing with epub -> booki conversions."""
3 import os, sys
4 from pprint import pprint
5 import zipfile
6 from cStringIO import StringIO
8 try:
9 from json import dumps
10 except ImportError:
11 from simplejson import dumps
13 import lxml, lxml.html, lxml.etree, lxml.cssselect
15 XMLNS = '{http://www.w3.org/XML/1998/namespace}'
16 DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
17 OPFNS = '{http://www.idpf.org/2007/opf}'
18 CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
20 DC = "http://purl.org/dc/elements/1.1/"

NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',  #dublin core
    'tei': 'http://www.tei-c.org/ns/1.0',
    'dcterms': 'http://purl.org/dc/terms/',
    'nzetc': 'http://www.nzetc.org/structure',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}


def log(*messages, **kwargs):
    """Print each message to stderr, falling back to repr() when a
    message cannot be printed directly."""
    for m in messages:
        try:
            print >> sys.stderr, m
        except Exception:
            print >> sys.stderr, repr(m)


class EpubError(Exception):
    pass


class Epub(object):
    """
    Abstract Container:

    META-INF/
       container.xml
       [manifest.xml]
       [metadata.xml]
       [signatures.xml]
       [encryption.xml]
       [rights.xml]
    OEBPS/
       Great Expectations.opf
       cover.html
       chapters/
          chapter01.html
          chapter02.html
       <other HTML files for the remaining chapters>

    (A minimal usage sketch follows the class definition.)
    """
    def load(self, src):
        # XXX if zip variability proves a problem, we should just use
        # an `unzip` subprocess
        if isinstance(src, str):
            # Should end with PK<05><06> + 18 more bytes (the end-of-central-directory record).
            # Some zips contain 'comments' after that, which breaks ZipFile
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                log('Bad zipfile?')
                src = src[:zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()

    def gettree(self, name):
        """get an etree from the given zip filename"""
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = lxml.etree.parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes.  We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        If there is only one (as is common), forget the media-type.

        Other files are allowed in META-INF, but none of them are much
        use.  They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """
        The opf file is arranged like this:
        <package>
          <metadata />
          <manifest />
          <spine />
          <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        pwd = os.path.dirname(self.opf_file)  #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')
        #there is also an optional guide section, which we ignore

        self.metadata = parse_metadata(metadata)
        self.files = parse_manifest(manifest, pwd)
        ncxid, self.order = parse_spine(spine)
        self.ncxfile = self.files[ncxid][0]

    def parse_ncx(self):
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)

    def raw_json(self):
        """get all the known metadata and nav data as json."""
        data = {
            'metadata': self.metadata,
            'manifest': self.files,
            'spine': self.order,
            'ncx': self.ncxdata,
        }
        return dumps(data, indent=2)

    def find_language(self):
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it?  Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself.  But if the metadata lacks language, should it be
        # used instead?  At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        return opflang[0]
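

# A minimal usage sketch for the Epub class above (the file name is
# hypothetical; this is not part of the original module, just an
# illustration of the intended call order):
#
#   e = Epub()
#   e.load(open('book.epub', 'rb').read())
#   e.parse_meta()     # find the OPF file via META-INF/container.xml
#   e.parse_opf()      # fills in metadata, files, order and ncxfile
#   e.parse_ncx()      # fills in ncxdata
#   print e.raw_json()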


def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    An example of the returned structure follows this function.
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes.  None and opf probably map to the same thing.  'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = tuple((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively).  Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
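
# For orientation only (the sample values are invented): parse_metadata()
# returns nested dicts keyed first by namespace URI, then by tag name, with
# each value a list of (text, extra-attributes) pairs, e.g.
#
#   {'http://purl.org/dc/elements/1.1/':
#        {'title': [('Great Expectations', ())],
#         'language': [('en', ())]},
#    'http://www.idpf.org/2007/opf': {},
#   }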


def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file itself (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <manifest>
      <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
      <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
      <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    </manifest>
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        media_type = t.get('media-type')
        items[id] = (href, media_type)  #XXX does media-type matter?

    return items
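
# Illustrative return value for the manifest shown in the docstring above,
# assuming pwd == 'OEBPS' (the ids and paths come from that made-up example):
#
#   {'ncx': ('OEBPS/toc.ncx', 'application/x-dtbncx+xml'),
#    'WHume_NatureC01': ('OEBPS/Hume_NatureC01.html', 'application/xhtml+xml'),
#    'cover': ('OEBPS/cover.jpg', 'image/jpeg')}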


def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle).  Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes').  If an item is linear, it is
    in the main stream of the book.  Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')

    return toc, items
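
# For example (hypothetical manifest ids), a spine such as
#
#   <spine toc="ncx"><itemref idref="ch1"/><itemref idref="ch2"/></spine>
#
# yields ('ncx', ['ch1', 'ch2']): the ncx manifest id plus the reading order.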


def get_ncxtext(e):
    #get text from an <xx><text>...</text></xx> construct
    t = e.find(DAISYNS + 'text')
    if t is not None:
        return t.text
    return ''  # or leave it at None?


def get_labels(e, tag=DAISYNS + 'navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags.  navInfo is unlikely, but
    # navLabel is ubiquitous.  There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        labels[lang] = get_ncxtext(label)
    return labels
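
# For a navPoint carrying <navLabel xml:lang="en"><text>Chapter One</text></navLabel>,
# get_labels() returns {'en': 'Chapter One'} (label text invented for
# illustration; the key is None when no xml:lang attribute is present).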


def parse_ncx(ncx):
    """
    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).

    (A sketch of the returned dict follows this function.)
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>

    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    ret = {
        'headers': headers,
        'navmap': parse_navmap(root.find(DAISYNS + 'navMap')),
    }

    #Try adding these bits, even though no one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
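
# Rough shape of the dict returned by parse_ncx() (sample values invented;
# 'pagelist' and 'navlist' appear only when the ncx actually has them):
#
#   {'headers': {'docTitle': [('Great Expectations', None)], ...},
#    'navmap': {'info': {},
#               'labels': {None: '...'},
#               'points': ({'id': 'navpoint-1',
#                           'play_order': 1,
#                           'content_src': 'chapters/chapter01.html',
#                           'labels': {None: 'Chapter One'},
#                           'points': ()},
#                          ...)}}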


def parse_navmap(e):
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint')),
    }


def parse_navpoint(e):
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }


def parse_pagelist(e):
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.findall(DAISYNS + 'pageTarget')),
    }


def parse_pagetarget(e):
    #<!ELEMENT pageTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret


def parse_navlist(e):
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.findall(DAISYNS + 'navTarget')),
    }


def parse_navtarget(e):
    #<!ELEMENT navTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret