1 """Module for dealing with epub -> booki conversions."""
5 from cStringIO
import StringIO
10 from simplejson
import dumps
12 import lxml
.html
, lxml
.cssselect
13 from lxml
import etree
15 from objavi
.xhtml_utils
import split_tree
16 from objavi
.book_utils
import log
17 from objavi
.config
import DC
, XHTML
, XHTMLNS
, FM
, MARKER_CLASS_INFO
, MARKER_CLASS_SPLIT
18 from booki
.bookizip
import BookiZip

# XML namespaces. The *NS variants are in {curly brackets} for Clark's syntax.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'

MARKUP_TYPES = ('application/xhtml+xml', 'text/html', 'application/x-dtbncx+xml')
HTML_TYPES = ('application/xhtml+xml', 'text/html')

ADD_INFO_MARKERS = False

html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")


def _xhtml_parse(*args, **kwargs):
    kwargs['parser'] = xhtml_parser
    return lxml.html.parse(*args, **kwargs)


def _html_parse(*args, **kwargs):
    kwargs['parser'] = html_parser
    return lxml.html.parse(*args, **kwargs)


def new_doc(guts="", version="1.1", lang=None):
    xmldec = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
    }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    doc = ('<html xmlns="%s" version="XHTML %s" %s>'
           '<head></head><body>%s</body></html>'
           % (XHTML, version, langdec, guts))

    f = StringIO(xmldec + doctypes.get(version, '') + doc)
    tree = lxml.html.parse(f)
    return tree
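
# For example, new_doc('<p>hi</p>', lang='en') returns an lxml tree for a
# document roughly like:
#   <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html ...>
#   <html xmlns="..." version="XHTML 1.1" xml:lang="en" lang="en">
#   <head></head><body><p>hi</p></body></html>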


class EpubError(Exception):
    pass


class Epub(object):
    """Wrap an epub zip file, which is laid out something like:

    META-INF/container.xml
    OEBPS/Great Expectations.opf
    OEBPS/toc.ncx
    <other HTML files for the remaining chapters>
    """

    def load(self, src):
        # Zip is a variable format, and zipfile is limited. If that
        # becomes a problem we will have to use an `unzip` subprocess,
        # but it hasn't been so far.
        if isinstance(src, str):
            # Should end with PK<05><06> + 18 more bytes.
            # Some zips contain 'comments' after that, which breaks ZipFile.
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                src = src[:zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED,
                                   allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()

    def gettree(self, name=None, id=None, parse=etree.parse):
        """get an XML tree from the given zip filename or manifest ID"""
        if name is None:
            name, mimetype = self.manifest[id]
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes. We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        Other files are allowed in META-INF, but none of them are much
        use. They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """The opf file is arranged like this:

        <package>
          <metadata />
          <manifest />
          <spine />
          <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file)  #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)

        # mapping of filenames to new filenames. This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if isinstance(fn, unicode):
                log('Stupid unicode: %r' % fn)

            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while 'static/%s' % fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

        #there is also an optional guide section, which we ignore
        guide = root.find(OPFNS + 'guide')
        if guide is not None:
            self.guide = parse_guide(guide)
        else:
            self.guide = None

    def parse_ncx(self):
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)
196 """get all the known metadata and nav data as json."""
198 'metadata': self
.metadata
,
199 'manifest': self
.manifest
,
203 if self
.guide
is not None:
204 data
['guide'] = self
.guide
205 return dumps(data
, indent
=2)

    def find_language(self):
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it? Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself. But if the metadata lacks language, should it be
        # used instead? At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        if len(set(opflang)) > 1:
            log('%s metadata has more than one language: %s -- using first one'
                % (self.origin, opflang))
        return opflang[0]

    def find_probable_chapters(self):
        """Try to find the real chapters from the NCX file. The
        problem is that different epubs all use their own level of
        nesting."""
        # the Black Arrow has (book 1 (c1, c2, c3), book 2 (c4, c5, c6..))
        # and FM books have (section 1 (c1, c2,..),..)
        # i.e. super-chapter blocks
        # some have (((c1, c2, c3))) -- deeply nested chapters
        # some have no real chapters, but stupid structure
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        serial_points, splits = get_chapter_breaks(points, pwd)
        return serial_points, splits

    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                # an image in the spine: wrap it in a tiny html document
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img',
                                            src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                body = _find_tag(root, 'body')
                if not len(body) and ADD_INFO_MARKERS:
                    add_marker(body, 'espri-empty-file-%s' % ID, title=fn, child=True)
                first_el = body[0] if len(body) else body
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(
                    os.path.join(self.opfdir, x), x))

            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, '%(id)s' % point,
                           klass=MARKER_CLASS_SPLIT,
                           title=find_good_label(labels, lang),
                           subsections=str(bool(point['points'])))

            if ADD_INFO_MARKERS:
                add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc

    def make_bookizip(self, zfn):
        """Split up the document and construct a booki-toc for it."""
        doc = self.concat_document()
        bz = BookiZip(zfn)

        chapters = split_tree(doc)  #destroys doc.
        real_chapters = drop_empty_chapters(chapters)
        rightsholders = [c for c, extra in self.metadata[DC].get('creator', ())]
        contributors = rightsholders + [c for c, extra in self.metadata[DC].get('contributor', ())]

        spine = []
        for c in real_chapters:
            root = c.tree.getroot()
            try:
                # the xhtml declarations just confuse the html serialiser
                del root.attrib['xmlns']
                del root.attrib['version']
                del root.attrib['xml:lang']
            except KeyError:
                pass
            head = root.makeelement('head')
            _title = etree.SubElement(head, 'title')
            _title.text = c.title
            root.insert(0, head)
            blob = lxml.html.tostring(c.tree)
            bz.add_to_package(c.ID, '%s.html' % c.ID, blob, mediatype='text/html',
                              contributors=contributors,
                              rightsholders=rightsholders)
            spine.append(c.ID)

        #add the images and other non-html data unchanged.
        for id, data in self.manifest.iteritems():
            fn, mimetype = data
            if isinstance(fn, unicode):
                log("Hateful unicode: %r" % fn)
            if mimetype not in MARKUP_TYPES:
                blob = self.zip.read(fn)
                bz.add_to_package(id, self.media_map[fn], blob, mimetype,
                                  contributors=contributors,
                                  rightsholders=rightsholders)

        #now to construct a table of contents
        lang = self.find_language()

        deferred_urls = []

        def write_toc(point, section):
            tocpoint = {}
            title = find_good_label(point['labels'], lang)
            if title:
                tocpoint['title'] = title
            ID = point['id']
            if ID in self.spine:
                tocpoint['url'] = self.manifest.get(ID, ID + '.html')
                while deferred_urls:
                    tp = deferred_urls.pop()
                    tp['url'] = tocpoint['url']
                    log('%r has deferred url: %r' % (tp['title'], tp['url']))
            else:
                #this point has no url of its own; borrow the next one found
                deferred_urls.append(tocpoint)
            if point['points']:
                tocpoint['children'] = []
                for child in point['points']:
                    write_toc(child, tocpoint['children'])

            section.append(tocpoint)

        toc = []
        points = self.ncxdata['navmap']['points']
        for p in points:
            write_toc(p, toc)

        metadata = {FM: {'book': {},
                         'server': {},
                         },
                    DC: {}}

        for namespace, keys in self.metadata.items():
            if namespace not in metadata:
                metadata[namespace] = {}
            for key, values in keys.items():
                dest = metadata[namespace].setdefault(key, {})
                for value, extra in values:
                    #extra is a dict (from <meta>) or a tuple of
                    #attribute pairs (from a DC element)
                    extra = dict(extra)
                    scheme = ''
                    for x in ('scheme', 'role'):
                        if x in extra:
                            scheme = extra[x]
                            break
                    dest.setdefault(scheme, []).append(value)

        if not metadata[FM]['book']:
            metadata[FM]['book'][''] = [''.join(x for x in str(metadata[DC]['identifier'][''][0]) if x.isalnum())]
        if not metadata[FM]['server']:
            metadata[FM]['server'][''] = ['booki.flossmanuals.net']

        bz.info = {
            'spine': spine,
            'TOC': toc,
            'metadata': metadata,
            'version': '1',
        }
        bz.finish()
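

# A sketch of the intended call sequence, assuming the caller holds the raw
# epub bytes and a no-argument constructor (not defined in this module):
#
#   e = Epub()
#   e.load(open('book.epub', 'rb').read())
#   e.parse_meta()     # locate the opf file via META-INF/container.xml
#   e.parse_opf()      # metadata, manifest, spine, guide
#   e.parse_ncx()      # navigation / TOC data
#   e.make_bookizip('book.zip')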


def find_good_label(labels, lang=None):
    """Try to find a suitable label from a dictionary mapping
    languages to labels, resorting to a random label if need be."""
    #XXX not taking into account language sub-tags ("en_GB")
    for x in [lang, None]:
        if x in labels:
            return labels[x]
    if labels:
        #return random.choice(labels.values())
        return ' | '.join(labels.values())
    return None
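
# e.g. find_good_label({'en': 'One', 'fr': 'Un'}, 'fr') gives 'Un', while
# find_good_label({'en': 'One', 'fr': 'Un'}) falls back to joining all the
# values: 'One | Un' (or 'Un | One', depending on dict order).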


def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it. Content is defined
    as images or text."""
    good_chapters = []
    for c in chapters:
        good = False
        for e in c.tree.iter():
            if ((e.text and e.text.strip()) or
                (e.tail and e.tail.strip()) or
                e.tag == 'img'):
                good = True
                break
        if good:
            good_chapters.append(c)
    return good_chapters


def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another. The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    if len(dbody):
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    for x in sbody:
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None


def _find_tag(doc, tag):
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace(' ', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    try:
        return doc.iter(XHTMLNS + tag).next()
    except StopIteration:
        log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()


def add_marker(el, ID, child=False, klass=MARKER_CLASS_INFO, **kwargs):
    """Add a marker before the element, or inside it if child is true"""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', klass)
    for k, v in kwargs.items():
        marker.set(k, v)
    if child:
        parent = el
        index = 0
    else:
        parent = el.getparent()
        index = parent.index(el)
    parent.insert(index, marker)
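
# So add_marker(el, 'point3', klass=MARKER_CLASS_SPLIT, title='Chapter 3',
# subsections='False') would insert, immediately before el, something like:
#   <hr id="point3" class="..." title="Chapter 3" subsections="False" />
# (the id and title here are hypothetical); split_tree() later cuts the
# concatenated document at these markers.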


def get_chapter_breaks(points, pwd):
    # First go was overly complex, trying to guess which sections were
    # really chapters. Now, every ncx navpoint is a chapter break.
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        # log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return
        for child in p['points']:
            serialise(child, depth + 1)

    for p in points:
        serialise(p, 1)

    splits = {}
    for depth, p in serial_points:
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return serial_points, splits
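
# For example (hypothetical values), a navpoint at depth 2 whose
# content_src is 'ch1.html#s2', with pwd 'OEBPS', contributes:
#   splits['OEBPS/ch1.html'] == [(2, 's2', <that point's dict>)]
# while a fragment-less navpoint gets an ID of None.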


def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes. None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively). Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
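
# The return value maps namespace URIs to {tag: [(value, extra), ...]},
# e.g. (schematic, with hypothetical values):
#   {'http://purl.org/dc/elements/1.1/': {
#        'title':   [('Great Expectations', ())],
#        'creator': [('Charles Dickens', (('role', 'aut'),))],
#    }, ...}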


def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        if isinstance(href, unicode):
            log('damn unicode: %r' % href)
            log(etree.tostring(t))
        media_type = t.get('media-type')
        items[id] = (href, media_type)  #XXX does media-type matter?

    return items


def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle). Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes'). If an item is linear, it is
    in the main stream of the book. Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')
    return toc, items
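
# e.g. <spine toc="ncx"><itemref idref="ch1"/><itemref idref="ch2"/></spine>
# parses to ('ncx', ['ch1', 'ch2']).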


def parse_guide(guide):
    """Parse the guide from the opf file."""
    items = []
    ns = '{%s}' % guide.nsmap[None]
    for r in guide.iterchildren(ns + 'reference'):
        items.append((r.get('href'), r.get('type'), r.get('title'),))

    return items
626 """get text content from an <xx><text>...</text></xx> construct,
627 as is common in NCX files."""
628 # there will only be one <text>, but for...iter is still easiest
629 for t
in e
.iter(DAISYNS
+ 'text'):
631 return '' # or leave it at None?


def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.iterchildren(tag):
        lang = label.get(XMLNS + 'lang')
        labels[lang] = get_ncxtext(label)
    return labels
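
# e.g. a navPoint holding <navLabel xml:lang="en"><text>One</text></navLabel>
# and an unmarked <navLabel><text>Uno</text></navLabel> yields
#   {'en': 'One', None: 'Uno'}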


def parse_ncx(ncx):
    """
    The NCX file is the closest thing to FLOSS Manuals TOC.txt. It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}

    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.iterchildren(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    root = ncx.getroot()
    for t in ('docTitle', 'docAuthor'):
        for e in root.iterchildren(DAISYNS + t):
            setheader(t, get_ncxtext(e))

    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
    }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret


def parse_navmap(e):
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    #XXX move info and labels out of navmap, and into headers?
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': tuple(parse_navpoint(x) for x in e.iterchildren(DAISYNS + 'navPoint')),
    }


def parse_navpoint(e):
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.iterchildren(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }
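
# A navPoint like
#   <navPoint id="np-1" class="chapter" playOrder="1">
#     <navLabel><text>Chapter One</text></navLabel>
#     <content src="ch1.html"/>
#   </navPoint>
# comes out as (schematically):
#   {'id': 'np-1', 'class': 'chapter', 'play_order': 1,
#    'content_src': 'ch1.html', 'labels': {None: 'Chapter One'},
#    'points': ()}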


def parse_pagelist(e):
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.iterchildren(DAISYNS + 'pageTarget')),
    }


def parse_pagetarget(e):
    #<!ELEMENT pageTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret


def parse_navlist(e):
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.iterchildren(DAISYNS + 'navTarget')),
    }


def parse_navtarget(e):
    #<!ELEMENT navTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret