1 """Module for dealing with epub -> booki conversions."""
5 from cStringIO
import StringIO
10 from simplejson
import dumps
12 import lxml
.html
, lxml
.cssselect
13 from lxml
import etree
15 from objavi
.xhtml_utils
import split_tree
16 from objavi
.book_utils
import log
17 from objavi
.config
import DC
, XHTML
, XHTMLNS
, FM
, MARKER_CLASS_INFO
, MARKER_CLASS_SPLIT
18 from booki
.bookizip
import BookiZip

# XML namespaces. The *NS variants are in {curly brackets} for Clark's syntax.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'

MARKUP_TYPES = ('application/xhtml+xml', 'text/html', 'application/x-dtbncx+xml')
HTML_TYPES = ('application/xhtml+xml', 'text/html')

ADD_INFO_MARKERS = False

html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")


def _xhtml_parse(*args, **kwargs):
    kwargs['parser'] = xhtml_parser
    return lxml.html.parse(*args, **kwargs)


def _html_parse(*args, **kwargs):
    kwargs['parser'] = html_parser
    return lxml.html.parse(*args, **kwargs)


def new_doc(guts="", version="1.1", lang=None):
    xmldec = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
    }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    doc = ('<html xmlns="%s" version="XHTML %s" %s>'
           '<head></head><body>%s</body></html>'
           % (XHTML, version, langdec, guts))

    f = StringIO(xmldec + doctypes.get(version, '') + doc)
    tree = lxml.html.parse(f)
    return tree
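
# For example, new_doc('<p>hi</p>', lang='en') returns an lxml tree for a
# document roughly like:
#   <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html ...>
#   <html xmlns="..." version="XHTML 1.1" xml:lang="en" lang="en">
#   <head></head><body><p>hi</p></body></html>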


class EpubError(Exception):
    pass


class Epub(object):
    """Wrap an epub zip file, which is laid out something like:

    META-INF/container.xml
    OEBPS/Great Expectations.opf
    OEBPS/toc.ncx
    <other HTML files for the remaining chapters>
    """

    def load(self, src):
        # Zip is a variable format, and zipfile is limited. If that
        # becomes a problem we will have to use an `unzip` subprocess,
        # but it hasn't been so far.
        if isinstance(src, str):
            # Should end with PK<05><06> + 18 more bytes.
            # Some zips contain 'comments' after that, which breaks ZipFile.
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                src = src[:zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED,
                                   allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()

    def gettree(self, name=None, id=None, parse=etree.parse):
        """get an XML tree from the given zip filename or manifest ID"""
        if name is None:
            name, mimetype = self.manifest[id]
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes. We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        Other files are allowed in META-INF, but none of them are much
        use. They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """The opf file is arranged like this:

        <package>
          <metadata />
          <manifest />
          <spine />
          <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file)  #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)

        # mapping of filenames to new filenames. This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if isinstance(fn, unicode):
                log('Stupid unicode: %r' % fn)

            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while 'static/%s' % fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

        #there is also an optional guide section, which we ignore
        guide = root.find(OPFNS + 'guide')
        if guide is not None:
            self.guide = parse_guide(guide)
        else:
            self.guide = None

    def parse_ncx(self):
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)
196 """get all the known metadata and nav data as json."""
198 'metadata': self
.metadata
,
199 'manifest': self
.manifest
,
203 if self
.guide
is not None:
204 data
['guide'] = self
.guide
205 return dumps(data
, indent
=2)

    def find_language(self):
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it? Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself. But if the metadata lacks language, should it be
        # used instead? At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        if len(set(opflang)) > 1:
            log('%s metadata has more than one language: %s -- using first one'
                % (self.origin, opflang))
        return opflang[0]

    def find_probable_chapters(self):
        """Try to find the real chapters from the NCX file. The
        problem is that different epubs all use their own level of
        nesting."""
        # the Black Arrow has (book 1 (c1, c2, c3), book 2 (c4, c5, c6..))
        # and FM books have (section 1 (c1, c2,..),..)
        # i.e. super-chapter blocks
        # some have (((c1, c2, c3))) -- deeply nested chapters
        # some have no real chapters, but stupid structure
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        serial_points, splits = get_chapter_breaks(points, pwd)
        return serial_points, splits

    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                # an image in the spine: wrap it in a tiny html document
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img',
                                            src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                body = _find_tag(root, 'body')
                if not len(body) and ADD_INFO_MARKERS:
                    add_marker(body, 'espri-empty-file-%s' % ID, title=fn, child=True)
                first_el = body[0] if len(body) else body
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(
                    os.path.join(self.opfdir, x), x))

            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, '%(id)s' % point,
                           klass=MARKER_CLASS_SPLIT,
                           title=find_good_label(labels, lang),
                           subsections=str(bool(point['points'])))

            if ADD_INFO_MARKERS:
                add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc

    def make_bookizip(self, zfn):
        """Split up the document and construct a booki-toc for it."""
        doc = self.concat_document()
        bz = BookiZip(zfn)

        chapters = split_tree(doc)  #destroys doc.
        real_chapters = drop_empty_chapters(chapters)
        rightsholders = [c for c, extra in self.metadata[DC].get('creator', ())]
        contributors = rightsholders + [c for c, extra in self.metadata[DC].get('contributor', ())]

        spine = []
        for c in real_chapters:
            root = c.tree.getroot()
            try:
                # the xhtml declarations just confuse the html serialiser
                del root.attrib['xmlns']
                del root.attrib['version']
                del root.attrib['xml:lang']
            except KeyError:
                pass
            head = root.makeelement('head')
            _title = etree.SubElement(head, 'title')
            _title.text = c.title
            root.insert(0, head)
            blob = lxml.html.tostring(c.tree)
            bz.add_to_package(c.ID, '%s.html' % c.ID, blob, mediatype='text/html',
                              contributors=contributors,
                              rightsholders=rightsholders)
            spine.append(c.ID)

        #add the images and other non-html data unchanged.
        for id, data in self.manifest.iteritems():
            fn, mimetype = data
            if isinstance(fn, unicode):
                log("Hateful unicode: %r" % fn)
            if mimetype not in MARKUP_TYPES:
                blob = self.zip.read(fn)
                bz.add_to_package(id, self.media_map[fn], blob, mimetype,
                                  contributors=contributors,
                                  rightsholders=rightsholders)

        #now to construct a table of contents
        lang = self.find_language()

        deferred_urls = []

        def write_toc(point, section):
            tocpoint = {}
            title = find_good_label(point['labels'], lang)
            if title:
                tocpoint['title'] = title
            ID = point['id']
            if ID in self.spine:
                tocpoint['url'] = self.manifest.get(ID, ID + '.html')
                while deferred_urls:
                    tp = deferred_urls.pop()
                    tp['url'] = tocpoint['url']
                    log('%r has deferred url: %r' % (tp['title'], tp['url']))
            else:
                #this point has no url of its own; borrow the next one found
                deferred_urls.append(tocpoint)
            if point['points']:
                tocpoint['children'] = []
                for child in point['points']:
                    write_toc(child, tocpoint['children'])

            section.append(tocpoint)

        toc = []
        points = self.ncxdata['navmap']['points']
        for p in points:
            write_toc(p, toc)

        metadata = {FM: {'book': {},
                         'server': {},
                         },
                    DC: {}}

        for namespace, keys in self.metadata.items():
            if namespace not in metadata:
                metadata[namespace] = {}
            for key, values in keys.items():
                dest = metadata[namespace].setdefault(key, {})
                for value, extra in values:
                    #extra is a dict (from <meta>) or a tuple of
                    #attribute pairs (from a DC element)
                    extra = dict(extra)
                    scheme = ''
                    for x in ('scheme', 'role'):
                        if x in extra:
                            scheme = extra[x]
                            break
                    dest.setdefault(scheme, []).append(value)

        if not metadata[FM]['book']:
            metadata[FM]['book'][''] = [''.join(x for x in str(metadata[DC]['identifier'][''][0]) if x.isalnum())]
        if not metadata[FM]['server']:
            metadata[FM]['server'][''] = ['booki.flossmanuals.net']

        bz.info = {
            'spine': spine,
            'TOC': toc,
            'metadata': metadata,
            'version': '1',
        }
        bz.finish()
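

# A sketch of the intended call sequence, assuming the caller holds the raw
# epub bytes and a no-argument constructor (not defined in this module):
#
#   e = Epub()
#   e.load(open('book.epub', 'rb').read())
#   e.parse_meta()     # locate the opf file via META-INF/container.xml
#   e.parse_opf()      # metadata, manifest, spine, guide
#   e.parse_ncx()      # navigation / TOC data
#   e.make_bookizip('book.zip')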


def find_good_label(labels, lang=None):
    """Try to find a suitable label from a dictionary mapping
    languages to labels, resorting to a random label if need be."""
    #XXX not taking into account language sub-tags ("en_GB")
    for x in [lang, None]:
        if x in labels:
            return labels[x]
    if labels:
        #return random.choice(labels.values())
        return ' | '.join(labels.values())
    return None
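
# e.g. find_good_label({'en': 'One', 'fr': 'Un'}, 'fr') gives 'Un', while
# find_good_label({'en': 'One', 'fr': 'Un'}) falls back to joining all the
# values: 'One | Un' (or 'Un | One', depending on dict order).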


def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it. Content is defined
    as images or text."""
    good_chapters = []
    for c in chapters:
        good = False
        for e in c.tree.iter():
            if ((e.text and e.text.strip()) or
                (e.tail and e.tail.strip()) or
                e.tag == 'img'):
                good = True
                break
        if good:
            good_chapters.append(c)
    return good_chapters


def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another. The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    if len(dbody):
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    for x in sbody:
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None


def _find_tag(doc, tag):
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace(' ', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    try:
        return doc.iter(XHTMLNS + tag).next()
    except StopIteration:
        log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()


def add_marker(el, ID, child=False, klass=MARKER_CLASS_INFO, **kwargs):
    """Add a marker before the element, or inside it if child is true"""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', klass)
    for k, v in kwargs.items():
        marker.set(k, v)
    if child:
        parent = el
        index = 0
    else:
        parent = el.getparent()
        index = parent.index(el)
    parent.insert(index, marker)
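
# So add_marker(el, 'point3', klass=MARKER_CLASS_SPLIT, title='Chapter 3',
# subsections='False') would insert, immediately before el, something like:
#   <hr id="point3" class="..." title="Chapter 3" subsections="False" />
# (the id and title here are hypothetical); split_tree() later cuts the
# concatenated document at these markers.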


def get_chapter_breaks(points, pwd):
    # First go was overly complex, trying to guess which sections were
    # really chapters. Now, every ncx navpoint is a chapter break.
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        # log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return
        for child in p['points']:
            serialise(child, depth + 1)

    for p in points:
        serialise(p, 1)

    splits = {}
    for depth, p in serial_points:
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return serial_points, splits
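
# For example (hypothetical values), a navpoint at depth 2 whose
# content_src is 'ch1.html#s2', with pwd 'OEBPS', contributes:
#   splits['OEBPS/ch1.html'] == [(2, 's2', <that point's dict>)]
# while a fragment-less navpoint gets an ID of None.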


def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes. None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively). Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
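
# The return value maps namespace URIs to {tag: [(value, extra), ...]},
# e.g. (schematic, with hypothetical values):
#   {'http://purl.org/dc/elements/1.1/': {
#        'title':   [('Great Expectations', ())],
#        'creator': [('Charles Dickens', (('role', 'aut'),))],
#    }, ...}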


def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        if isinstance(href, unicode):
            log('damn unicode: %r' % href)
            log(etree.tostring(t))
        media_type = t.get('media-type')
        items[id] = (href, media_type)  #XXX does media-type matter?

    return items


def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle). Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes'). If an item is linear, it is
    in the main stream of the book. Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')
    return toc, items
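
# e.g. <spine toc="ncx"><itemref idref="ch1"/><itemref idref="ch2"/></spine>
# parses to ('ncx', ['ch1', 'ch2']).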


def parse_guide(guide):
    """Parse the guide from the opf file."""
    items = []
    ns = '{%s}' % guide.nsmap[None]
    for r in guide.iterchildren(ns + 'reference'):
        items.append((r.get('href'), r.get('type'), r.get('title'),))

    return items
626 """get text content from an <xx><text>...</text></xx> construct,
627 as is common in NCX files."""
628 # there will only be one <text>, but for...iter is still easiest
629 for t
in e
.iter(DAISYNS
+ 'text'):
631 return '' # or leave it at None?


def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.iterchildren(tag):
        lang = label.get(XMLNS + 'lang')
        labels[lang] = get_ncxtext(label)
    return labels
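
# e.g. a navPoint holding <navLabel xml:lang="en"><text>One</text></navLabel>
# and an unmarked <navLabel><text>Uno</text></navLabel> yields
#   {'en': 'One', None: 'Uno'}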


def parse_ncx(ncx):
    """
    The NCX file is the closest thing to FLOSS Manuals TOC.txt. It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}

    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.iterchildren(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    root = ncx.getroot()
    for t in ('docTitle', 'docAuthor'):
        for e in root.iterchildren(DAISYNS + t):
            setheader(t, get_ncxtext(e))

    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
    }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret


def parse_navmap(e):
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    #XXX move info and labels out of navmap, and into headers?
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': tuple(parse_navpoint(x) for x in e.iterchildren(DAISYNS + 'navPoint')),
    }


def parse_navpoint(e):
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.iterchildren(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }
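
# A navPoint like
#   <navPoint id="np-1" class="chapter" playOrder="1">
#     <navLabel><text>Chapter One</text></navLabel>
#     <content src="ch1.html"/>
#   </navPoint>
# comes out as (schematically):
#   {'id': 'np-1', 'class': 'chapter', 'play_order': 1,
#    'content_src': 'ch1.html', 'labels': {None: 'Chapter One'},
#    'points': ()}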


def parse_pagelist(e):
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.iterchildren(DAISYNS + 'pageTarget')),
    }


def parse_pagetarget(e):
    #<!ELEMENT pageTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret


def parse_navlist(e):
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.iterchildren(DAISYNS + 'navTarget')),
    }


def parse_navtarget(e):
    #<!ELEMENT navTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret