1 """Module for dealing with epub -> booki conversions."""
4 from pprint
import pprint
6 from cStringIO
import StringIO
10 from json
import dumps
12 from simplejson
import dumps
14 import lxml
, lxml
.html
, lxml
.cssselect
15 from lxml
import etree
17 from booki
.xhtml_utils
import BookiZip
#XML namespaces. The *NS variants are in {curly brackets} for Clark's notation.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
XHTMLNS = '{http://www.w3.org/1999/xhtml}'

XHTML = 'http://www.w3.org/1999/xhtml'
DC = "http://purl.org/dc/elements/1.1/"

MARKUP_TYPES = ('application/xhtml+xml', 'text/html', "application/x-dtbncx+xml")
HTML_TYPES = ('application/xhtml+xml', 'text/html')
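
# The *NS constants are meant for lxml's Clark-notation lookups; an
# illustrative sketch (assuming `opf_root` and `container_tree` are parsed
# elements/trees):
#   opf_root.find(OPFNS + 'manifest')
#   container_tree.getiterator(CONTAINERNS + 'rootfile')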

def log(*messages, **kwargs):
    for m in messages:
        try:
            print >> sys.stderr, m
        except Exception:
            print >> sys.stderr, repr(m)

html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")

def _xhtml_parse(*args, **kwargs):
    kwargs['parser'] = xhtml_parser
    return lxml.html.parse(*args, **kwargs)

def _html_parse(*args, **kwargs):
    kwargs['parser'] = html_parser
    return lxml.html.parse(*args, **kwargs)

def new_doc(guts="", version="1.1", lang=None):
    xmldec = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
    }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    doc = ('<html xmlns="%s" version="XHTML %s" %s>'
           '<head></head><body>%s</body></html>'
           % (XHTML, version, langdec, guts))

    f = StringIO(xmldec + doctypes.get(version, '') + doc)
    tree = lxml.html.parse(f)
    return tree
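
# A minimal usage sketch (illustrative):
#   tree = new_doc(guts='<p>Hello</p>', lang='en')
#   print lxml.etree.tostring(tree.getroot())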

class EpubError(Exception):
    pass


class Epub(object):
    """Abstract interface to an epub zip archive, laid out something
    like this:

        mimetype
        META-INF/container.xml
        Great Expectations.opf
        <the HTML file for the first chapter>
        <other HTML files for the remaining chapters>
    """

    def load(self, src):
        # Zip is a variable format, and zipfile is limited.  If that
        # becomes a problem we will have to use an `unzip` subprocess,
        # but it hasn't been so far.
        if isinstance(src, str):
            # Should end with PK\x05\x06 + 18 more bytes.
            # Some zips contain 'comments' after that, which breaks ZipFile.
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                # trailing data after the end-of-archive record: drop it
                src = src[:zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED,
                                   allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()

    def gettree(self, name=None, id=None, parse=etree.parse):
        """get an XML tree from the given zip filename or manifest ID"""
        if name is None:
            name, mimetype = self.manifest[id]
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes.  We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        Other files are allowed in META-INF, but none of them are much
        use.  They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """The opf file is arranged like this:

        <package>
          <metadata />
          <manifest />
          <spine />
          <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file) #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')
        #there is also an optional guide section, which we ignore

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)
        # mapping of filenames to new filenames.  This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

    def parse_ncx(self):
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)
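
    # Illustrative shape of the resulting media_map (hypothetical paths):
    #   {'OEBPS/images/cover.jpg': 'static/cover.jpg',
    #    'OEBPS/css/style.css':    'static/style.css'}
    # i.e. every non-markup manifest file is renamed into a flat static/ dir.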
194 """get all the known metadata and nav data as json."""
196 'metadata': self
.metadata
,
197 'manifest': self
.manifest
,
201 return dumps(data
, indent
=2)

    def find_language(self):
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it?  Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself.  But if the metadata lacks language, should it be
        # used instead?  At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        if len(set(opflang)) > 1:
            log('%s metadata has more than one language: %s -- using first one'
                % (self.origin, opflang))
        return opflang[0]

    def find_probable_chapters(self):
        """Try to find the real chapters from the NCX file.  The
        problem is that different epubs all use their own level of
        nesting."""
        # the Black Arrow has (book 1 (c1, c2, c3), book2 (c4, c5, c6..))
        # and FM books have (section 1 (c1, c2,..),..)
        # i.e. super-chapter blocks
        # some have (((c1, c2, c3))) -- deeply nested chapters
        # some have no real chapters, but stupid structure
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        chapter_depth, serial_points, splits = get_chapter_breaks(points, pwd)
        return chapter_depth, serial_points, splits

    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        chapter_depth, serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img',
                                            src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                first_el = _find_tag(root, 'body')[0]
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(
                    os.path.join(self.opfdir, x), x))

            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, 'espri-chapter-%(id)s' % point,
                           title=labels.get(lang, '\n'.join(labels.values())),
                           subsections=str(bool(point['points'])))

            add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc

    def make_bookizip(self, zfn):
        """Split up the document and construct a booki-toc for it."""
        doc = self.concat_document()
        bz = BookiZip(zfn)

        chapters = split_document(doc)
        real_chapters = drop_empty_chapters(chapters)
        spine = []

        for id, title, tree in real_chapters:
            try:
                root = tree.getroot()
            except AttributeError:
                root = tree
            #give the chapter a head and a title before serialising it
            head = root.makeelement('head')
            _title = etree.SubElement(head, 'title')
            _title.text = title
            root.insert(0, head)
            blob = etree.tostring(tree)
            bz.add_to_package(id, '%s.html' % id,
                              blob, mediatype='text/html')
            spine.append(id)

        #add the images and other non-html data unchanged.
        for id, data in self.manifest.iteritems():
            fn, mimetype = data
            if mimetype not in MARKUP_TYPES:
                blob = self.zip.read(fn)
                bz.add_to_package(id, self.media_map[fn], blob, mimetype)

        #now to construct a table of contents
        toc = []

        def write_toc(point, section):
            ID = point['id']
            if ID in spine:
                section.append((ID, ID + '.html'))
            else:
                section.append((ID, None))
            subsection = []
            for child in point['points']:
                write_toc(child, subsection)
            if subsection:
                section.append(subsection)

        points = self.ncxdata['navmap']['points']
        for p in points:
            write_toc(p, toc)

        bz.info = {
            'metadata': self.metadata["http://purl.org/dc/elements/1.1/"],
            'copyright': {'The Contributors': [(x, 'primary') for x in spine]},
            #the 'spine' and 'TOC' keys, and finish(), are assumed bookizip conventions
            'spine': spine,
            'TOC': toc,
            }
        bz.finish()
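
# A rough end-to-end sketch (paths are hypothetical; Epub() is assumed to
# take no constructor arguments, and the methods are called in the order
# they are defined above):
#   e = Epub()
#   e.load(open('/tmp/example.epub', 'rb').read())
#   e.parse_meta()
#   e.parse_opf()
#   e.parse_ncx()
#   e.make_bookizip('/tmp/example.bookizip')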

def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it.  Content is defined
    as images or text."""
    good_chapters = []
    for c in chapters:
        good = False
        for e in c[2].iter():
            if ((e.text and e.text.strip()) or
                (e.tail and e.tail.strip()) or
                e.tag == 'img'):
                good = True
                break
        if good:
            good_chapters.append(c)
    return good_chapters

def copy_element(src, create):
    """Return a copy of the src element, with all its attributes and
    tail, using create to make the copy.  create is probably an
    Element._makeelement method, to associate the copy with the right
    tree, but it could be etree.HTMLElement."""
    if isinstance(src.tag, basestring):
        dest = create(src.tag)
    else:
        dest = copy.copy(src)

    for k, v in src.items():
        dest.set(k, v)
    dest.tail = src.tail
    return dest

def split_document(doc):
    """Split the document along chapter boundaries."""
    try:
        root = doc.getroot()
    except AttributeError:
        root = doc

    front_matter = copy_element(root, lxml.html.Element)
    chapters = [('espri-unindexed-front-matter',
                 'Unindexed Front Matter',
                 front_matter)]
    _climb_and_split(root, front_matter, chapters)
    return chapters

def _climb_and_split(src, dest, chapters):
    for child in src.iterchildren():
        if child.tag == 'hr' and child.get('class') == MARKER_CLASS:
            ID = child.get('id')
            if ID.startswith('espri-chapter-'):
                title = child.get('title') or ID
                #start a new chapter tree: copy src and each of its
                #ancestors, so the new chapter keeps the same nesting
                new = copy_element(src, lxml.html.Element)
                root = new
                for a in src.iterancestors():
                    a2 = copy_element(a, root.makeelement)
                    a2.append(root)
                    root = a2
                chapters.append((ID[14:], title, root))

                #climb out of the old destination tree...
                for a in dest.iterancestors():
                    dest = a
                #...and copy subsequent siblings into the new one
                dest = new
            else:
                log("skipping %s" % etree.tostring(child))
        else:
            new = copy_element(child, dest.makeelement)
            new.text = child.text
            dest.append(new)
            _climb_and_split(child, new, chapters)

def save_chapters(chapters):
    for id, tree in chapters.items():
        string = lxml.html.tostring(tree, method='html')
        f = open('/tmp/x%s.html' % id, 'w')
        f.write(string)
        f.close()

def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another.  The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    if len(dbody):
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    for x in sbody.iterchildren():
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None

def _find_tag(doc, tag):
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace(' ', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    try:
        return doc.iter(XHTMLNS + tag).next()
    except StopIteration:
        log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()

MARKER_CLASS = "espri-marker"

def add_marker(el, ID, **kwargs):
    """Add a marker before the element"""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', MARKER_CLASS)
    for k, v in kwargs.items():
        marker.set(k, v)
    parent = el.getparent()
    index = parent.index(el)
    parent.insert(index, marker)

def get_chapter_breaks(points, pwd):
    # first go is quite naive: go to deepest level that is in
    # every branch, not counting top level divisions (which may be
    # cover, prologue, etc).
    serial_points = []

    #lcd == lowest common depth (> 1)
    def serialise(p, depth):
        serial_points.append((depth, p))
        # log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return depth
        lcd = 1e9
        for child in p['points']:
            bottom = serialise(child, depth + 1)
            lcd = min(bottom, lcd)
        return lcd

    lcd = 1e9
    for p in points:
        depth = serialise(p, 1)
        lcd = min(depth, lcd)
    if lcd == 1:
        #don't split on top level divisions only (cover, prologue, etc)
        lcd = 2

    # The book should now be split on all the points at chapter depth
    # (lcd), and all higher points but not if the higher point is at
    # the same location as the chapter.  If the chapter start url has
    # a fragment id (e.g. "something.html#chapter-6"), then the split
    # is internal to the chapter.  What the book serialiser needs is a
    # mapping from file names to the split-ids in that chapter, so
    # that is what is built here.
    splits = {}
    for depth, p in serial_points:
        if depth > lcd:
            continue #ignore the sub-sections
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return lcd, serial_points, splits
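
# Illustrative return values (hypothetical file names and ids):
#   lcd == 2
#   splits == {'OEBPS/part1.html': [(2, 'chapter-6', <navpoint dict>), ...]}
# i.e. a mapping from each content file to the (depth, fragment-id, point)
# breaks that fall within it.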

def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    Returns a dict keyed by namespace uri, mapping element names to
    lists of (value, other-attributes) pairs."""
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes.  None and opf probably map to the same thing.  'dc' is
    # the Dublin Core namespace.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = tuple((k, v) for k, v in t.items()
                           if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively).  Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
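
# Illustrative output of parse_metadata (hypothetical values):
#   {'http://purl.org/dc/elements/1.1/':
#        {'title': [('Great Expectations', ())],
#         'language': [('en', ())]},
#    'http://www.idpf.org/2007/opf': {},
#    ...}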

def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        media_type = t.get('media-type')
        items[id] = (href, media_type) #XXX does media-type matter?

    return items
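
# With the <item>s shown in the docstring and pwd='OEBPS' (hypothetical),
# parse_manifest returns:
#   {'ncx': ('OEBPS/toc.ncx', 'application/x-dtbncx+xml'),
#    'WHume_NatureC01': ('OEBPS/Hume_NatureC01.html', 'application/xhtml+xml'),
#    'cover': ('OEBPS/cover.jpg', 'image/jpeg')}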

def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle).  Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes').  If an item is linear, it is
    in the main stream of the book.  Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')

    return toc, items
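
# For a spine like
#   <spine toc="ncx"><itemref idref="cover"/><itemref idref="WHume_NatureC01"/></spine>
# parse_spine returns ('ncx', ['cover', 'WHume_NatureC01']).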
627 """get text content from an <xx><text>...</text></xx> construct,
628 as is common in NCX files."""
629 # there will only be one <text>, but for...iter is still easiest
630 for t
in e
.iter(DAISYNS
+ 'text'):
632 return '' # or leave it at None?

def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags.  navInfo is unlikely, but
    # navLabel is ubiquitous.  There can be one for each language, so
    # a dict mapping language to label text is returned.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        labels[lang] = get_ncxtext(label)
    return labels

def parse_ncx(ncx):
    """
    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}

    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
    }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
695 #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
696 #XXX move info and labels out of navmap, and into headers?
698 'info': get_labels(e
, DAISYNS
+ 'navInfo'),
699 'labels': get_labels(e
),
700 'points': tuple(parse_navpoint(x
) for x
in e
.findall(DAISYNS
+ 'navPoint')),

def parse_navpoint(e):
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }
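
# For a navPoint such as (illustrative)
#   <navPoint id="ch1" class="chapter" playOrder="1">
#     <navLabel><text>Chapter One</text></navLabel>
#     <content src="chapter1.html"/>
#   </navPoint>
# parse_navpoint returns roughly:
#   {'id': 'ch1', 'class': 'chapter', 'play_order': 1,
#    'content_src': 'chapter1.html', 'labels': {None: 'Chapter One'},
#    'points': ()}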

def parse_pagelist(e):
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.findall(DAISYNS + 'pageTarget')),
    }

def parse_pagetarget(e):
    #<!ELEMENT pageTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret

def parse_navlist(e):
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.findall(DAISYNS + 'navTarget')),
    }

def parse_navtarget(e):
    #<!ELEMENT navTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret