1 """Module for dealing with epub -> booki conversions."""
4 from pprint
import pprint
6 from cStringIO
import StringIO
10 from json
import dumps
12 from simplejson
import dumps
14 import lxml
, lxml
.html
, lxml
.cssselect
15 from lxml
import etree
17 from booki
.xhtml_utils
import BookiZip
#XML namespaces. The *NS variants are in {curly brackets} for Clark's notation.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
XHTMLNS = '{http://www.w3.org/1999/xhtml}'

XHTML = 'http://www.w3.org/1999/xhtml'
DC = "http://purl.org/dc/elements/1.1/"

MARKUP_TYPES = ('application/xhtml+xml', 'text/html', "application/x-dtbncx+xml")
HTML_TYPES = ('application/xhtml+xml', 'text/html')
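
# The *NS constants are meant for lxml's Clark-notation lookups; an
# illustrative sketch (assuming `opf_root` and `container_tree` are parsed
# elements/trees):
#   opf_root.find(OPFNS + 'manifest')
#   container_tree.getiterator(CONTAINERNS + 'rootfile')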

def log(*messages, **kwargs):
    for m in messages:
        try:
            print >> sys.stderr, m
        except Exception:
            print >> sys.stderr, repr(m)

html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")

def _xhtml_parse(*args, **kwargs):
    kwargs['parser'] = xhtml_parser
    return lxml.html.parse(*args, **kwargs)

def _html_parse(*args, **kwargs):
    kwargs['parser'] = html_parser
    return lxml.html.parse(*args, **kwargs)

def new_doc(guts="", version="1.1", lang=None):
    xmldec = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
    }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    doc = ('<html xmlns="%s" version="XHTML %s" %s>'
           '<head></head><body>%s</body></html>'
           % (XHTML, version, langdec, guts))

    f = StringIO(xmldec + doctypes.get(version, '') + doc)
    tree = lxml.html.parse(f)
    return tree
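
# A minimal usage sketch (illustrative):
#   tree = new_doc(guts='<p>Hello</p>', lang='en')
#   print lxml.etree.tostring(tree.getroot())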

class EpubError(Exception):
    pass


class Epub(object):
    """Abstract interface to an epub zip archive, laid out something
    like this:

        mimetype
        META-INF/container.xml
        Great Expectations.opf
        <the HTML file for the first chapter>
        <other HTML files for the remaining chapters>
    """

    def load(self, src):
        # Zip is a variable format, and zipfile is limited.  If that
        # becomes a problem we will have to use an `unzip` subprocess,
        # but it hasn't been so far.
        if isinstance(src, str):
            # Should end with PK\x05\x06 + 18 more bytes.
            # Some zips contain 'comments' after that, which breaks ZipFile.
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                # trailing data after the end-of-archive record: drop it
                src = src[:zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED,
                                   allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()

    def gettree(self, name=None, id=None, parse=etree.parse):
        """get an XML tree from the given zip filename or manifest ID"""
        if name is None:
            name, mimetype = self.manifest[id]
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes.  We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        Other files are allowed in META-INF, but none of them are much
        use.  They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """The opf file is arranged like this:

        <package>
          <metadata />
          <manifest />
          <spine />
          <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file) #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')
        #there is also an optional guide section, which we ignore

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)
        # mapping of filenames to new filenames.  This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

    def parse_ncx(self):
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)
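
    # Illustrative shape of the resulting media_map (hypothetical paths):
    #   {'OEBPS/images/cover.jpg': 'static/cover.jpg',
    #    'OEBPS/css/style.css':    'static/style.css'}
    # i.e. every non-markup manifest file is renamed into a flat static/ dir.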
194 """get all the known metadata and nav data as json."""
196 'metadata': self
.metadata
,
197 'manifest': self
.manifest
,
201 return dumps(data
, indent
=2)

    def find_language(self):
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it?  Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself.  But if the metadata lacks language, should it be
        # used instead?  At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        if len(set(opflang)) > 1:
            log('%s metadata has more than one language: %s -- using first one'
                % (self.origin, opflang))
        return opflang[0]

    def find_probable_chapters(self):
        """Try to find the real chapters from the NCX file.  The
        problem is that different epubs all use their own level of
        nesting."""
        # the Black Arrow has (book 1 (c1, c2, c3), book2 (c4, c5, c6..))
        # and FM books have (section 1 (c1, c2,..),..)
        # i.e. super-chapter blocks
        # some have (((c1, c2, c3))) -- deeply nested chapters
        # some have no real chapters, but stupid structure
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        chapter_depth, serial_points, splits = get_chapter_breaks(points, pwd)
        return chapter_depth, serial_points, splits

    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        chapter_depth, serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img',
                                            src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                first_el = _find_tag(root, 'body')[0]
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(
                    os.path.join(self.opfdir, x), x))

            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, 'espri-chapter-%(id)s' % point,
                           title=labels.get(lang, '\n'.join(labels.values())),
                           subsections=str(bool(point['points'])))

            add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc

    def make_bookizip(self, zfn):
        """Split up the document and construct a booki-toc for it."""
        doc = self.concat_document()
        bz = BookiZip(zfn)

        chapters = split_document(doc)
        real_chapters = drop_empty_chapters(chapters)
        spine = []

        for id, title, tree in real_chapters:
            try:
                root = tree.getroot()
            except AttributeError:
                root = tree
            #give the chapter a head and a title before serialising it
            head = root.makeelement('head')
            _title = etree.SubElement(head, 'title')
            _title.text = title
            root.insert(0, head)
            blob = etree.tostring(tree)
            bz.add_to_package(id, '%s.html' % id,
                              blob, mediatype='text/html')
            spine.append(id)

        #add the images and other non-html data unchanged.
        for id, data in self.manifest.iteritems():
            fn, mimetype = data
            if mimetype not in MARKUP_TYPES:
                blob = self.zip.read(fn)
                bz.add_to_package(id, self.media_map[fn], blob, mimetype)

        #now to construct a table of contents
        toc = []

        def write_toc(point, section):
            ID = point['id']
            if ID in spine:
                section.append((ID, ID + '.html'))
            else:
                section.append((ID, None))
            subsection = []
            for child in point['points']:
                write_toc(child, subsection)
            if subsection:
                section.append(subsection)

        points = self.ncxdata['navmap']['points']
        for p in points:
            write_toc(p, toc)

        bz.info = {
            'metadata': self.metadata["http://purl.org/dc/elements/1.1/"],
            'copyright': {'The Contributors': [(x, 'primary') for x in spine]},
            #the 'spine' and 'TOC' keys, and finish(), are assumed bookizip conventions
            'spine': spine,
            'TOC': toc,
            }
        bz.finish()
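
# A rough end-to-end sketch (paths are hypothetical; Epub() is assumed to
# take no constructor arguments, and the methods are called in the order
# they are defined above):
#   e = Epub()
#   e.load(open('/tmp/example.epub', 'rb').read())
#   e.parse_meta()
#   e.parse_opf()
#   e.parse_ncx()
#   e.make_bookizip('/tmp/example.bookizip')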

def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it.  Content is defined
    as images or text."""
    good_chapters = []
    for c in chapters:
        good = False
        for e in c[2].iter():
            if ((e.text and e.text.strip()) or
                (e.tail and e.tail.strip()) or
                e.tag == 'img'):
                good = True
                break
        if good:
            good_chapters.append(c)
    return good_chapters

def copy_element(src, create):
    """Return a copy of the src element, with all its attributes and
    tail, using create to make the copy.  create is probably an
    Element._makeelement method, to associate the copy with the right
    tree, but it could be etree.HTMLElement."""
    if isinstance(src.tag, basestring):
        dest = create(src.tag)
    else:
        dest = copy.copy(src)

    for k, v in src.items():
        dest.set(k, v)
    dest.tail = src.tail
    return dest

def split_document(doc):
    """Split the document along chapter boundaries."""
    try:
        root = doc.getroot()
    except AttributeError:
        root = doc

    front_matter = copy_element(root, lxml.html.Element)
    chapters = [('espri-unindexed-front-matter',
                 'Unindexed Front Matter',
                 front_matter)]
    _climb_and_split(root, front_matter, chapters)
    return chapters

def _climb_and_split(src, dest, chapters):
    for child in src.iterchildren():
        if child.tag == 'hr' and child.get('class') == MARKER_CLASS:
            ID = child.get('id')
            if ID.startswith('espri-chapter-'):
                title = child.get('title') or ID
                #start a new chapter tree: copy src and each of its
                #ancestors, so the new chapter keeps the same nesting
                new = copy_element(src, lxml.html.Element)
                root = new
                for a in src.iterancestors():
                    a2 = copy_element(a, root.makeelement)
                    a2.append(root)
                    root = a2
                chapters.append((ID[14:], title, root))

                #climb out of the old destination tree...
                for a in dest.iterancestors():
                    dest = a
                #...and copy subsequent siblings into the new one
                dest = new
            else:
                log("skipping %s" % etree.tostring(child))
        else:
            new = copy_element(child, dest.makeelement)
            new.text = child.text
            dest.append(new)
            _climb_and_split(child, new, chapters)

def save_chapters(chapters):
    for id, tree in chapters.items():
        string = lxml.html.tostring(tree, method='html')
        f = open('/tmp/x%s.html' % id, 'w')
        f.write(string)
        f.close()

def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another.  The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    if len(dbody):
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    for x in sbody.iterchildren():
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None

def _find_tag(doc, tag):
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace(' ', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    try:
        return doc.iter(XHTMLNS + tag).next()
    except StopIteration:
        log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()

MARKER_CLASS = "espri-marker"

def add_marker(el, ID, **kwargs):
    """Add a marker before the element"""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', MARKER_CLASS)
    for k, v in kwargs.items():
        marker.set(k, v)
    parent = el.getparent()
    index = parent.index(el)
    parent.insert(index, marker)

def get_chapter_breaks(points, pwd):
    # first go is quite naive: go to deepest level that is in
    # every branch, not counting top level divisions (which may be
    # cover, prologue, etc).
    serial_points = []

    #lcd == lowest common depth (> 1)
    def serialise(p, depth):
        serial_points.append((depth, p))
        # log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return depth
        lcd = 1e9
        for child in p['points']:
            bottom = serialise(child, depth + 1)
            lcd = min(bottom, lcd)
        return lcd

    lcd = 1e9
    for p in points:
        depth = serialise(p, 1)
        lcd = min(depth, lcd)
    if lcd == 1:
        #don't split on top level divisions only (cover, prologue, etc)
        lcd = 2

    # The book should now be split on all the points at chapter depth
    # (lcd), and all higher points but not if the higher point is at
    # the same location as the chapter.  If the chapter start url has
    # a fragment id (e.g. "something.html#chapter-6"), then the split
    # is internal to the chapter.  What the book serialiser needs is a
    # mapping from file names to the split-ids in that chapter, so
    # that is what is built here.
    splits = {}
    for depth, p in serial_points:
        if depth > lcd:
            continue #ignore the sub-sections
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return lcd, serial_points, splits
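
# Illustrative return values (hypothetical file names and ids):
#   lcd == 2
#   splits == {'OEBPS/part1.html': [(2, 'chapter-6', <navpoint dict>), ...]}
# i.e. a mapping from each content file to the (depth, fragment-id, point)
# breaks that fall within it.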

def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    Returns a dict keyed by namespace uri, mapping element names to
    lists of (value, other-attributes) pairs."""
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes.  None and opf probably map to the same thing.  'dc' is
    # the Dublin Core namespace.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = tuple((k, v) for k, v in t.items()
                           if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively).  Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
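
# Illustrative output of parse_metadata (hypothetical values):
#   {'http://purl.org/dc/elements/1.1/':
#        {'title': [('Great Expectations', ())],
#         'language': [('en', ())]},
#    'http://www.idpf.org/2007/opf': {},
#    ...}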

def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        media_type = t.get('media-type')
        items[id] = (href, media_type) #XXX does media-type matter?

    return items
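
# With the <item>s shown in the docstring and pwd='OEBPS' (hypothetical),
# parse_manifest returns:
#   {'ncx': ('OEBPS/toc.ncx', 'application/x-dtbncx+xml'),
#    'WHume_NatureC01': ('OEBPS/Hume_NatureC01.html', 'application/xhtml+xml'),
#    'cover': ('OEBPS/cover.jpg', 'image/jpeg')}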

def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle).  Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes').  If an item is linear, it is
    in the main stream of the book.  Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')

    return toc, items
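
# For a spine like
#   <spine toc="ncx"><itemref idref="cover"/><itemref idref="WHume_NatureC01"/></spine>
# parse_spine returns ('ncx', ['cover', 'WHume_NatureC01']).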
627 """get text content from an <xx><text>...</text></xx> construct,
628 as is common in NCX files."""
629 # there will only be one <text>, but for...iter is still easiest
630 for t
in e
.iter(DAISYNS
+ 'text'):
632 return '' # or leave it at None?

def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags.  navInfo is unlikely, but
    # navLabel is ubiquitous.  There can be one for each language, so
    # a dict mapping language to label text is returned.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        labels[lang] = get_ncxtext(label)
    return labels

def parse_ncx(ncx):
    """
    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}

    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
    }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
695 #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
696 #XXX move info and labels out of navmap, and into headers?
698 'info': get_labels(e
, DAISYNS
+ 'navInfo'),
699 'labels': get_labels(e
),
700 'points': tuple(parse_navpoint(x
) for x
in e
.findall(DAISYNS
+ 'navPoint')),

def parse_navpoint(e):
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }
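
# For a navPoint such as (illustrative)
#   <navPoint id="ch1" class="chapter" playOrder="1">
#     <navLabel><text>Chapter One</text></navLabel>
#     <content src="chapter1.html"/>
#   </navPoint>
# parse_navpoint returns roughly:
#   {'id': 'ch1', 'class': 'chapter', 'play_order': 1,
#    'content_src': 'chapter1.html', 'labels': {None: 'Chapter One'},
#    'points': ()}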

def parse_pagelist(e):
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.findall(DAISYNS + 'pageTarget')),
    }

def parse_pagetarget(e):
    #<!ELEMENT pageTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret

def parse_navlist(e):
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.findall(DAISYNS + 'navTarget')),
    }

def parse_navtarget(e):
    #<!ELEMENT navTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret