if no id, make up a good one, not an empty string
[objavi2.git] / objavi / epub.py
blob3cce6baf72edff064521d5140294dca78c80cbc3
1 """Module for dealing with epub -> booki conversions."""
3 import os, sys
4 import zipfile
5 from cStringIO import StringIO
6 import copy
8 try:
9 from json import dumps
10 except ImportError:
11 from simplejson import dumps
13 import lxml, lxml.html, lxml.cssselect
14 from lxml import etree
16 from objavi.config import DC, XHTML, XHTMLNS, FM
17 from booki.bookizip import BookiZip
# XML namespaces.  The *NS variants are in {curly brackets} for Clark's
# notation, ready to be prefixed onto tag names for lxml matching.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'

# Media types that carry document markup (concatenated and re-split by
# this module), as opposed to images etc., which are copied unchanged.
MARKUP_TYPES = ('application/xhtml+xml', 'text/html', "application/x-dtbncx+xml")
HTML_TYPES = ('application/xhtml+xml', 'text/html')
def log(*messages, **kwargs):
    """Write each message to stderr on its own line.

    Objects that cannot be stringified (e.g. unicode that will not
    encode) are shown via repr() instead.  **kwargs is accepted for
    historical call compatibility but is ignored.
    """
    for m in messages:
        try:
            sys.stderr.write(str(m) + '\n')
        except Exception:
            # str() failed (or the write did); repr() always succeeds.
            sys.stderr.write(repr(m) + '\n')
# Module-wide parser singletons; the encoding is pinned to utf-8 rather
# than relying on lxml's own detection.
html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")
def _xhtml_parse(*args, **kwargs):
    """lxml.html.parse with the module's XHTML parser forced in."""
    combined = dict(kwargs, parser=xhtml_parser)
    return lxml.html.parse(*args, **combined)
def _html_parse(*args, **kwargs):
    """lxml.html.parse with the module's HTML parser forced in."""
    combined = dict(kwargs, parser=html_parser)
    return lxml.html.parse(*args, **combined)
def new_doc(guts="", version="1.1", lang=None):
    """Build a new XHTML document tree.

    guts goes inside <body>; version picks the doctype ('1.1' or
    '1.0'); lang, when meaningful, sets xml:lang/lang on <html>.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
        }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    markup = ('<html xmlns="%s" version="XHTML %s" %s>'
              '<head></head><body>%s</body></html>'
              % (XHTML, version, langdec, guts))

    f = StringIO(declaration + doctypes.get(version, '') + markup)
    tree = lxml.html.parse(f)
    f.close()
    return tree
class EpubError(Exception):
    """Raised when an epub container is missing required pieces."""
    pass
class Epub(object):
    """Wrapper around a loaded epub zip, with parsers for its parts.

    Abstract Container layout:

       META-INF/
          container.xml
          [manifest.xml]
          [metadata.xml]
          [signatures.xml]
          [encryption.xml]
          [rights.xml]
       OEBPS/
          Great Expectations.opf
          cover.html
          chapters/
             chapter01.html
             chapter02.html
          <other HTML files for the remaining chapters>

    """
94 def load(self, src):
95 # Zip is a variable format, and zipfile is limited. If that
96 # becomes a problem we will have to ise an `unzip` subprocess,
97 # but it hasn't been so far.
98 if isinstance(src, str):
99 # Should end with PK<06><05> + 18 more.
100 # Some zips contain 'comments' after that, which breaks ZipFile
101 zipend = src.rfind('PK\x05\x06') + 22
102 if len(src) != zipend:
103 log('Bad zipfile?')
104 src = src[: zipend]
105 src = StringIO(src)
106 self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
107 self.names = self.zip.namelist()
108 self.info = self.zip.infolist()
109 self.origin = src
110 log(self.names)
112 def gettree(self, name=None, id=None, parse=etree.parse):
113 """get an XML tree from the given zip filename or manifest ID"""
114 if name is None:
115 name, mimetype = self.manifest[id]
116 #Note: python 2.6 (not 2.5) has zipfile.open
117 s = self.zip.read(name)
118 f = StringIO(s)
119 tree = parse(f)
120 f.close()
121 return tree
123 def parse_meta(self):
124 '''META-INF/container.xml contains one or more <rootfile>
125 nodes. We want the "application/oepbs-package+xml" one.
127 <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
128 <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />
130 Other files are allowed in META-INF, but none of them are much
131 use. They are manifest.xml, metadata.xml, signatures.xml,
132 encryption.xml, and rights.xml.
134 tree = self.gettree('META-INF/container.xml')
135 for r in tree.getiterator(CONTAINERNS + 'rootfile'):
136 if r.get('media-type') == "application/oebps-package+xml":
137 rootfile = r.get('full-path')
138 break
139 else:
140 raise EpubError("No OPF rootfile found")
142 self.opf_file = rootfile
    def parse_opf(self):
        """Parse the OPF package file into metadata, manifest, spine,
        media_map, ncxfile and guide attributes.

        The opf file is arranged like this:

           <package>
             <metadata />
             <manifest />
             <spine />
             <guide />
           </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file)  #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)
        # mapping of filenames to new filenames. This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                # flatten the path, then prefix underscores until the
                # flattened name is unique among the new names
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

        #there is also an optional guide section, which we ignore
        guide = root.find(OPFNS + 'guide')
        if guide is not None:
            self.guide = parse_guide(guide)
        else:
            self.guide = None
193 def parse_ncx(self):
194 ncx = self.gettree(self.ncxfile)
195 self.ncxdata = parse_ncx(ncx)
197 def raw_json(self):
198 """get all the known metadata and nav data as json."""
199 data = {
200 'metadata': self.metadata,
201 'manifest': self.manifest,
202 'spine': self.spine,
203 'ncx': self.ncxdata
205 if self.guide is not None:
206 data['guide'] = self.guide
207 return dumps(data, indent=2)
209 def find_language(self):
210 opflang = [x[0].lower() for x in
211 self.metadata.get(DC, {}).get('language', ())]
213 # XXX Should the ncx language enter into it? Being xml:lang,
214 # it is in theory just the language of the ncx document
215 # itself. But if the metadata lacks language, should it be
216 # used instead? At present, NO.
217 #ncxlang = self.ncxdata['headers'].get('lang', ())
219 # XXX also, for now, ignoring case of badly formed language
220 # codes, conflicting or supplementary languages, etc.
221 opflang = [x for x in opflang if x not in ('und', '')]
222 if not opflang:
223 return None
224 if len(set(opflang)) > 1:
225 log('%s metadata has more than one language: %s -- using first one'
226 % (self.origin, opflang))
227 return opflang[0]
229 def find_probable_chapters(self):
230 """Try to find the real chapters from the NCX file. The
231 problem is that different epubs all use their own level of
232 nesting."""
233 # the Black Arrow has (book 1 (c1, c2, c3), book2 (c4, c5, c6..))
234 # and FM books have (section 1 (c1, c2,..),..)
235 # i.e super-chapter blocks
236 # some have (((c1, c2, c3))) -- deeply nested chapters
237 # some have no real chapters, but stupid structure
238 points = self.ncxdata['navmap']['points']
239 pwd = os.path.dirname(self.ncxfile)
240 serial_points, splits = get_chapter_breaks(points, pwd)
241 return serial_points, splits
    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                # an image in the spine: wrap it in a minimal html page
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img', src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                # NOTE(review): assumes <body> has at least one child
                # element -- an empty body would raise IndexError here.
                first_el = _find_tag(root, 'body')[0]
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(os.path.join(self.opfdir, x), x))

            # drop a chapter marker at each NCX break in this file,
            # either at the referenced fragment or at the file start
            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, 'espri-chapter-%(id)s' % point,
                           title=find_good_label(labels, lang),
                           subsections=str(bool(point['points'])))

            add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc
281 def make_bookizip(self, zfn):
282 """Split up the document and construct a booki-toc for it."""
283 doc = self.concat_document()
284 bz = BookiZip(zfn)
286 chapters = split_document(doc)
287 real_chapters = drop_empty_chapters(chapters)
288 rightsholders = [c for c, extra in self.metadata[DC].get('creator', ())]
289 contributors = rightsholders + [c for c, extra in self.metadata[DC].get('contributor', ())]
291 spine = []
292 for id, title, tree in real_chapters:
293 try:
294 root = tree.getroot()
295 except:
296 root = tree
297 try:
298 del root.attrib['xmlns']
299 del root.attrib['version']
300 del root.attrib['xml:lang']
301 except KeyError,e:
302 log(e)
303 if title:
304 head = root.makeelement('head')
305 _title = etree.SubElement(head, 'title')
306 _title.text = title
307 root.insert(0, head)
308 #blob = etree.tostring(tree)
309 blob = lxml.html.tostring(tree)
310 bz.add_to_package(id, '%s.html' % id, blob, mediatype='text/html',
311 contributors=contributors,
312 rightsholders=rightsholders)
313 spine.append(id)
315 #add the images and other non-html data unchanged.
316 for id, data in self.manifest.iteritems():
317 fn, mimetype = data
318 if mimetype not in MARKUP_TYPES:
319 blob = self.zip.read(fn)
320 bz.add_to_package(id, self.media_map[fn], blob, mimetype,
321 contributors=contributors,
322 rightsholders=rightsholders
325 #now to construct a table of contents
326 lang = self.find_language()
328 deferred_urls = []
329 def write_toc(point, section):
330 tocpoint = {}
331 title = find_good_label(point['labels'], lang),
332 if title and title[0]:
333 tocpoint['title'] = title[0]
334 ID = point['id']
335 if ID in spine:
336 tocpoint['url'] = self.manifest.get(ID, ID + '.html')
337 while deferred_urls:
338 tp = deferred_urls.pop()
339 tp['url'] = tocpoint['url']
340 log('%r has deferred url: %r' % (tp['title'], tp['url']))
341 else:
342 deferred_urls.append(tocpoint)
343 if point['points']:
344 tocpoint['children'] = []
345 for child in point['points']:
346 write_toc(child, tocpoint['children'])
348 section.append(tocpoint)
350 toc = []
351 points = self.ncxdata['navmap']['points']
352 for p in points:
353 write_toc(p, toc)
355 metadata = {FM: {'book':{},
356 'server': {},
358 DC: {}}
360 for namespace, keys in self.metadata.items():
361 if 'namespace' not in metadata:
362 metadata[namespace] = {}
363 log(keys)
364 for key, values in keys.items():
365 dest = metadata[namespace].setdefault(key, {})
366 for value, extra in values:
367 scheme = ''
368 if extra:
369 for x in ('scheme', 'role'):
370 if x in extra:
371 scheme = extra[x]
372 break
373 dest.setdefault(scheme, []).append(value)
375 if not metadata[FM]['book']:
376 metadata[FM]['book'][''] = [''.join(x for x in str(metadata[DC]['identifier'][''][0]) if x.isalnum())]
377 if not metadata[FM]['server']:
378 metadata[FM]['server'][''] = ['booki.flossmanuals.net']
380 log(metadata)
382 bz.info = {
383 'spine': spine,
384 'TOC': toc,
385 'metadata': metadata,
386 'version': '1',
389 bz.finish()
def find_good_label(labels, lang=None):
    """Pick a label out of a {language: label} mapping.

    Prefer the requested language, then the language-less entry, then
    fall back to joining whatever labels exist; None when empty."""
    #XXX not taking into account language sub-tags ("en_GB")
    for candidate in (lang, None):
        if candidate in labels:
            return labels[candidate]
    if not labels:
        return None
    #return random.choice(labels.values())
    return ' | '.join(labels.values())
def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it.  Content is defined
    as images or text."""
    def has_content(tree):
        for el in tree.iter():
            if el.tag in ('img',):
                return True
            if (el.text and el.text.strip()) or (el.tail and el.tail.strip()):
                return True
        return False

    return [chapter for chapter in chapters if has_content(chapter[2])]
def copy_element(src, create):
    """Return a copy of the src element, with all its attributes and
    tail, using create to make the copy. create is probably an
    Element._makeelement method, to associate the copy with the right
    tree, but it could be etree.HTMLElement."""
    if isinstance(src.tag, basestring):
        dest = create(src.tag)
    else:
        # comments/PIs have non-string tags; clone those outright
        dest = copy.copy(src)

    for attr, value in src.items():
        dest.set(attr, value)
    dest.tail = src.tail
    return dest
def split_document(doc):
    """Split the document along chapter boundaries."""
    try:
        root = doc.getroot()
    except AttributeError:
        root = doc

    # anything found before the first chapter marker goes here
    front_matter = copy_element(root, lxml.html.Element)
    chapters = [('espri-unindexed-front-matter',
                 'Unindexed Front Matter',
                 front_matter)]
    _climb_and_split(root, front_matter, chapters)
    return chapters
def _climb_and_split(src, dest, chapters):
    # Walk the source tree, copying nodes into `dest`; whenever a
    # chapter-marker <hr> is met, start a fresh chapter tree and make it
    # the new copy target.
    for child in src.iterchildren():
        if child.tag == 'hr' and child.get('class') == MARKER_CLASS:
            ID = child.get('id')
            if ID.startswith('espri-chapter-'):
                title = child.get('title') or ID
                new = copy_element(src, lxml.html.Element)
                root = new
                # rebuild the chain of ancestors above the split point,
                # so the new chapter tree has the same structure
                for a in src.iterancestors():
                    a2 = copy_element(a, root.makeelement)
                    a2.append(root)
                    root = a2

                # 14 == len('espri-chapter-'): strip the marker prefix
                chapters.append((ID[14:], title, root))

                # trim trailing text off the chapter we just closed
                dest.tail = None
                for a in dest.iterancestors():
                    a.tail = None
                dest = new
            else:
                # non-chapter markers (e.g. 'espri-new-file-*') are
                # dropped here, not copied into any chapter
                log("skipping %s" % etree.tostring(child))
        else:
            new = copy_element(child, dest.makeelement)
            new.text = child.text
            dest.append(new)
            _climb_and_split(child, new, chapters)
def save_chapters(chapters):
    """Debugging helper: write each chapter tree to /tmp/x<id>.html.

    chapters maps chapter ids to lxml trees.
    """
    for id, tree in chapters.items():
        string = lxml.html.tostring(tree, method='html')
        f = open('/tmp/x%s.html' % id, 'w')
        try:
            f.write(string)
        finally:
            # BUG FIX: the handle used to leak if the write failed
            f.close()
def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another. The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    if len(dbody):
        # graft the source body's leading text onto the last existing
        # child's tail, so no text is lost at the join
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    # NOTE(review): lxml append() reparents nodes, so this loop moves
    # the children out of sbody while iterating it -- relies on lxml
    # tolerating mutation during iteration.
    for x in sbody:
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None
def _find_tag(doc, tag):
    """Return the first element matching `tag`, trying the XHTML
    namespace first when the document declares namespaces at all."""
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace('&#13;', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    if doc.nsmap:
        for el in doc.iter(XHTMLNS + tag):
            return el
        log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()
MARKER_CLASS = "espri-marker"

def add_marker(el, ID, **kwargs):
    """Insert a marker <hr> element immediately before el.

    Extra keyword arguments become attributes on the marker."""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', MARKER_CLASS)
    for attr, value in kwargs.items():
        marker.set(attr, value)
    parent = el.getparent()
    parent.insert(parent.index(el), marker)
def get_chapter_breaks(points, pwd):
    """Flatten the NCX navpoints and group them by content file.

    First go was overly complex, trying to guess which sections were
    really chapters.  Now, every ncx navpoint is a chapter break.

    Returns (serial_points, splits): serial_points is a flat list of
    (depth, point) pairs; splits maps each content url to a list of
    (depth, fragment-id-or-None, point) tuples."""
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        #if p['class']:
        #    log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return depth
        for child in p['points']:
            bottom = serialise(child, depth + 1)

    for p in points:
        serialise(p, 1)

    splits = {}
    for depth, p in serial_points:
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        splits.setdefault(url, []).append((depth, ID, p))

    return serial_points, splits
def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    Returns a dict mapping namespace uri to
    {tag: [(value, extra), ...]} dictionaries.
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes. None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        # any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        # look for special OPF tags
        if t.tag == default_ns + 'meta':
            # meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively). Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        # ordinary element: strip the clark-notation namespace from the
        # tag and record its text plus its (namespace-stripped) attributes
        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
def parse_manifest(manifest, pwd):
    """Parse an OPF <manifest> into {id: (href, media-type)}.

    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the pbf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

       <manifest>
         <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
         <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
         <item id="cover" href="cover.jpg" media-type="image/jpeg" />
       </manifest>
    """
    ns = '{%s}' % manifest.nsmap[None]
    items = {}
    for item in manifest.iterchildren(ns + 'item'):
        # hrefs are relative to the OPF file's directory
        items[item.get('id')] = (os.path.join(pwd, item.get('href')),
                                 item.get('media-type'))  #XXX does media-type matter?
    return items
def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle). Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes'). If an item is linear, it is
    in the main stream of the book. Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).

    Returns (toc-id, [idref, ...])."""
    ns = '{%s}' % spine.nsmap[None]
    order = [itemref.get('idref')
             for itemref in spine.iterchildren(ns + 'itemref')]
    return spine.get('toc'), order
def parse_guide(guide):
    """Parse the guide from the opf file into a list of
    (href, type, title) tuples."""
    ns = '{%s}' % guide.nsmap[None]
    return [(ref.get('href'), ref.get('type'), ref.get('title'),)
            for ref in guide.iterchildren(ns + 'reference')]
def get_ncxtext(e):
    """get text content from an <xx><text>...</text></xx> construct,
    as is common in NCX files."""
    # there will only be one <text>; return the first one found
    for node in e.iter(DAISYNS + 'text'):
        return node.text
    return ''  # or leave it at None?
def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        # BUG FIX: this used to call get_ncxtext(e) -- the parent --
        # so every language was mapped to the first label's text.
        labels[lang] = get_ncxtext(label)
    return labels
def parse_ncx(ncx):
    """Parse an NCX tree into a nested dict of headers and nav points.

    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    # document-level direction and language live on the root element
    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
        }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
def parse_navmap(e):
    """Parse a navMap element into its info, labels and navPoints."""
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    #XXX move info and labels out of navmap, and into headers?
    points = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': points,
        }
def parse_navpoint(e):
    """Recursively parse a navPoint element into a dict."""
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    content = e.find(DAISYNS + 'content')
    children = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        # NOTE(review): raises TypeError if playOrder is absent --
        # presumably mandatory in the NCX files seen so far
        'play_order': int(e.get('playOrder')),
        #'content_id': content.get('id'),
        'content_src': content.get('src'),
        'labels': get_labels(e),
        'points': children,
        }
def parse_pagelist(e):
    """Parse a pageList element into its info, labels and targets."""
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    targets = tuple(parse_pagetarget(t) for t in e.findall(DAISYNS + 'pageTarget'))
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': targets,
        }
def parse_pagetarget(e):
    """Parse a pageTarget element into a dict.

    The optional 'value' attribute is only included when present.
    """
    #<!ELEMENT pageTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,  # fixed: was recomputed by a second get_labels(e) call
        }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret
def parse_navlist(e):
    """Parse a navList element into its info, labels and targets."""
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    targets = tuple(parse_navtarget(t) for t in e.findall(DAISYNS + 'navTarget'))
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': targets,
        }
def parse_navtarget(e):
    """Parse a navTarget element into a dict.

    The optional 'value' attribute is only included when present.
    """
    #<!ELEMENT navTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,  # fixed: was recomputed by a second get_labels(e) call
        }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret