1 """Module for dealing with epub -> booki conversions."""
4 from pprint
import pprint
6 from cStringIO
import StringIO
11 from simplejson
import dumps
13 import lxml
, lxml
.html
, lxml
.etree
, lxml
.cssselect
# Namespace prefixes in Clark notation ('{uri}'), ready to be joined
# directly onto a local tag or attribute name for element lookups.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'

# Bare Dublin Core namespace URI (no braces); the parsed metadata
# mapping is keyed by bare URIs like this one.
DC = 'http://purl.org/dc/elements/1.1/'
23 'opf': 'http://www.idpf.org/2007/opf',
24 'dc': 'http://purl.org/dc/elements/1.1/', #dublin core
25 'tei': 'http://www.tei-c.org/ns/1.0',
26 'dcterms': 'http://purl.org/dc/terms/',
27 'nzetc': 'http://www.nzetc.org/structure',
28 'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
31 def log(*messages
, **kwargs
):
34 print >> sys
.stderr
, m
36 print >> sys
.stderr
, repr(m
)
40 class EpubError(Exception):
55 Great Expectations.opf
60 <other HTML files for the remaining chapters>
64 #XXX if zip variability proves a problem, we should just use
65 #an `unzip` subprocess
66 if isinstance(src
, str):
67 # Should end with PK<05><06> + 18 more.
68 # Some zips contain 'comments' after that, which breaks ZipFile
69 zipend
= src
.rfind('PK\x05\x06') + 22
70 if len(src
) != zipend
:
74 self
.zip = zipfile
.ZipFile(src
, 'r', compression
=zipfile
.ZIP_DEFLATED
, allowZip64
=True)
75 self
.names
= self
.zip.namelist()
76 self
.info
= self
.zip.infolist()
78 def gettree(self
, name
):
79 """get an etree from the given zip filename"""
80 #Note: python 2.6 (not 2.5) has zipfile.open
81 s
= self
.zip.read(name
)
83 tree
= lxml
.etree
.parse(f
)
88 '''META-INF/container.xml contains one or more <rootfile>
89 nodes. We want the "application/oebps-package+xml" one.
91 <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
92 <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />
94 If there is only one (as is common), forget the media-type.
96 Other files are allowed in META-INF, but none of them are much
97 use. They are manifest.xml, metadata.xml, signatures.xml,
98 encryption.xml, and rights.xml.
100 tree
= self
.gettree('META-INF/container.xml')
101 for r
in tree
.getiterator(CONTAINERNS
+ 'rootfile'):
102 if r
.get('media-type') == "application/oebps-package+xml":
103 rootfile
= r
.get('full-path')
106 raise EpubError("No OPF rootfile found")
108 self
.opf_file
= rootfile
112 The opf file is arranged like this:
120 Metadata, manifest and spine are parsed in separate helper
123 pwd
= os
.path
.dirname(self
.opf_file
) #needed for manifest parsing
124 tree
= self
.gettree(self
.opf_file
)
125 root
= tree
.getroot()
126 metadata
= root
.find(OPFNS
+ 'metadata')
127 manifest
= root
.find(OPFNS
+ 'manifest')
128 spine
= root
.find(OPFNS
+ 'spine')
129 #there is also an optional guide section, which we ignore
131 self
.metadata
= parse_metadata(metadata
)
132 self
.files
= parse_manifest(manifest
, pwd
)
133 ncxid
, self
.order
= parse_spine(spine
)
134 self
.ncxfile
= self
.files
[ncxid
][0]
137 ncx
= self
.gettree(self
.ncxfile
)
138 self
.ncxdata
= parse_ncx(ncx
)
141 """get all the known metadata and nav data as json."""
143 'metadata': self
.metadata
,
144 'manifest': self
.files
,
148 return dumps(data
, indent
=2)
150 def find_language(self
):
151 opflang
= [x
[0].lower() for x
in
152 self
.metadata
.get(DC
, {}).get('language', ())]
154 # XXX Should the ncx language enter into it? Being xml:lang,
155 # it is in theory just the language of the ncx document
156 # itself. But if the metadata lacks language, should it be
157 # used instead? At present, NO.
158 #ncxlang = self.ncxdata['headers'].get('lang', ())
160 # XXX also, for now, ignoring case of badly formed language
161 # codes, conflicting or supplementary languages, etc.
162 opflang
= [x
for x
in opflang
if x
not in ('und', '')]
170 def parse_metadata(metadata
):
171 """metadata is an OPF metadata node, as defined at
172 http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
173 (or a dc-metadata or x-metadata child thereof).
176 # the node probably has at least 'dc', 'opf', and None namespace
177 # prefixes. None and opf probably map to the same thing. 'dc' is
179 nsmap
= metadata
.nsmap
180 nstags
= dict((k
, '{%s}' % v
) for k
, v
in nsmap
.iteritems())
181 default_ns
= nstags
[None]
183 # Collect element data in namespace-bins, and map prefixes to
184 # those bins for convenience
185 nsdict
= dict((v
, {}) for v
in nsmap
.values())
187 def add_item(ns
, tag
, value
, extra
):
188 #any key can be duplicate, so store in a list
191 values
= nsdict
[ns
].setdefault(tag
, [])
192 values
.append((value
, extra
))
194 for t
in metadata
.iterdescendants():
195 #look for special OPF tags
196 if t
.tag
== default_ns
+ 'meta':
197 #meta tags <meta name="" content="" />
199 content
= t
.get('content')
200 others
= tuple((k
, v
) for k
, v
in t
.items() if k
not in ('name', 'content'))
202 # the meta tag is using xml namespaces in attribute values.
203 prefix
, name
= name
.split(':', 1)
206 add_item(t
.nsmap
[prefix
], name
, content
, others
)
209 if t
.tag
in (default_ns
+ 'dc-metadata', default_ns
+ 'x-metadata'):
210 # Subelements of these deprecated elements are in either
211 # DC or non-DC namespace (respectively). Of course, this
212 # is true of any element anyway, so it is sufficient to
213 # ignore this (unless we want to cause pedantic errors).
214 log("found a live %s tag; descending into but otherwise ignoring it"
215 % t
.tag
[len(default_ns
):])
218 tag
= t
.tag
[t
.tag
.rfind('}') + 1:]
219 add_item(t
.nsmap
[t
.prefix
], tag
, t
.text
,
220 tuple((k
.replace(default_ns
, ''), v
) for k
, v
in t
.items()))
def parse_manifest(manifest, pwd):
    """Map each manifest item id to its (path, media-type) pair.

    `manifest` is the OPF <manifest> element.  It only contains
    <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    Typical items look like this:

    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />

    `pwd` is the directory of the opf file within the zip; every href
    is joined onto it.

    Returns a dict of {id: (path, media_type)}.
    """
    # Zip member names always use '/' whatever the host OS, so join
    # with posixpath rather than os.path.
    import posixpath

    # Take the namespace from the element itself rather than from
    # nsmap[None]: a document that uses a prefix (<opf:manifest>)
    # instead of a default namespace has no None entry in its nsmap.
    ns = manifest.tag[:manifest.tag.rfind('}') + 1]

    items = {}
    # findall with a plain tag only matches direct children.
    for t in manifest.findall(ns + 'item'):
        item_id = t.get('id')
        href = posixpath.join(pwd, t.get('href'))
        media_type = t.get('media-type')
        items[item_id] = (href, media_type)  #XXX does media-type matter?
    return items
def parse_spine(spine):
    """Read the reading order and the ncx id from an OPF <spine>.

    The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle).  Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes').  If an item is linear, it is
    in the main stream of the book.  Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).

    Returns a (toc, items) pair: the value of the toc attribute (None
    if absent) and the list of itemref idrefs in document order.
    """
    # Derive the namespace from the element's own tag so that
    # prefix-only documents (no default xmlns, hence no nsmap[None])
    # are handled as well.
    ns = spine.tag[:spine.tag.rfind('}') + 1]

    # findall with a plain tag only matches direct children.
    items = [t.get('idref') for t in spine.findall(ns + 'itemref')]
    toc = spine.get('toc')
    return toc, items
275 #get text from an <xx><text>...</text></xx> construct
276 t
= e
.find(DAISYNS
+ 'text')
279 return '' # or leave it at None?
def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels.

    `e` is the NCX element whose direct children are searched, and
    `tag` is the fully qualified child tag to read (navLabel by
    default; callers pass the navInfo tag to collect info text).

    Returns a dict keyed by each child's xml:lang value (None when the
    attribute is absent).  If two children share a language, the later
    one wins.
    """
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # they are collected in a dict keyed by language.
    labels = {}
    # Honour `tag`: hard-coding navLabel here made every navInfo
    # lookup return the navLabel data instead.
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        # The <text> child belongs to the label, not to `e`.
        labels[lang] = get_ncxtext(label)
    return labels
def parse_ncx(ncx):
    """Extract the headers and navigation structure from an NCX tree.

    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).

    `ncx` is the parsed NCX document tree; find, findall and getroot
    are called on it.

    Returns a dict with 'headers' and 'navmap' keys, plus 'pagelist'
    and 'navlist' when the document contains those sections.  Each
    header maps to a list of (content, scheme) tuples.
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>

    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    # The head holds <meta> children; tolerate a document without one.
    if head is not None:
        for meta in head.findall(DAISYNS + 'meta'):
            #whatever 'scheme' is
            setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            setheader(t, get_ncxtext(e))

    # Text direction and language are attributes of the root element.
    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    ret = {
        'headers': headers,
        'navmap': parse_navmap(root.find(DAISYNS + 'navMap')),
    }

    #Try adding these bits, even though no one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
341 #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
343 'info': get_labels(e
, DAISYNS
+ 'navInfo'),
344 'labels': get_labels(e
),
345 'points': tuple(parse_navpoint(x
) for x
in e
.findall(DAISYNS
+ 'navPoint')),
348 def parse_navpoint(e
):
349 #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
350 c
= e
.find(DAISYNS
+ 'content')
351 subpoints
= tuple(parse_navpoint(x
) for x
in e
.findall(DAISYNS
+ 'navPoint'))
354 'play_order': int(e
.get('playOrder')),
355 #'content_id': c.get('id'),
356 'content_src': c
.get('src'),
357 'labels': get_labels(e
),
def parse_pagelist(e):
    """Summarise an NCX <pageList> element as a dict.

    The result has three keys: 'info' (navInfo text by language),
    'labels' (navLabel text by language) and 'targets' (a tuple with
    one parsed entry per pageTarget child).
    """
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    info = get_labels(e, DAISYNS + 'navInfo')
    labels = get_labels(e)
    page_targets = e.findall(DAISYNS + 'pageTarget')
    targets = tuple(parse_pagetarget(target) for target in page_targets)
    return {'info': info, 'labels': labels, 'targets': targets}
370 def parse_pagetarget(e
):
371 #<!ELEMENT pageTarget (navLabel+, content)>
372 labels
= get_labels(e
)
373 c
= e
.find(DAISYNS
+ 'content')
376 'type': e
.get('type'),
377 'play_order': int(e
.get('playOrder')),
378 'content_src': c
.get('src'),
379 'labels': get_labels(e
),
381 value
= e
.get('value')
382 if value
is not None:
def parse_navlist(e):
    """Summarise an NCX <navList> element as a dict.

    The result has three keys: 'info' (navInfo text by language),
    'labels' (navLabel text by language) and 'targets' (a tuple with
    one parsed entry per navTarget child).
    """
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        # navTarget children go through parse_navtarget; the
        # pageTarget parser also reads a 'type' attribute.
        'targets': tuple(parse_navtarget(x) for x in e.findall(DAISYNS + 'navTarget')),
    }
394 def parse_navtarget(e
):
395 #<!ELEMENT navTarget (navLabel+, content)>
396 labels
= get_labels(e
)
397 c
= e
.find(DAISYNS
+ 'content')
400 'play_order': int(e
.get('playOrder')),
401 'content_src': c
.get('src'),
402 'labels': get_labels(e
),
404 value
= e
.get('value')
405 if value
is not None: