if no id, make up a good one, not an empty string
[objavi2.git] / objavi / epub.py
blob3cce6baf72edff064521d5140294dca78c80cbc3
1 """Module for dealing with epub -> booki conversions."""
3 import os, sys
4 import zipfile
5 from cStringIO import StringIO
6 import copy
8 try:
9 from json import dumps
10 except ImportError:
11 from simplejson import dumps
13 import lxml, lxml.html, lxml.cssselect
14 from lxml import etree
16 from objavi.config import DC, XHTML, XHTMLNS, FM
17 from booki.bookizip import BookiZip
# XML namespaces.  The *NS variants are in {curly brackets} for Clark's
# notation, ready to be prefixed onto tag names for lxml matching.
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'

# Media types that carry document markup (concatenated and re-split by
# this module), as opposed to images etc., which are copied unchanged.
MARKUP_TYPES = ('application/xhtml+xml', 'text/html', "application/x-dtbncx+xml")
HTML_TYPES = ('application/xhtml+xml', 'text/html')
def log(*messages, **kwargs):
    """Write each message to stderr on its own line.

    Objects that cannot be stringified (e.g. unicode that will not
    encode) are shown via repr() instead.  **kwargs is accepted for
    historical call compatibility but is ignored.
    """
    for m in messages:
        try:
            sys.stderr.write(str(m) + '\n')
        except Exception:
            # str() failed (or the write did); repr() always succeeds.
            sys.stderr.write(repr(m) + '\n')
# Module-wide parser singletons; the encoding is pinned to utf-8 rather
# than relying on lxml's own detection.
html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")
def _xhtml_parse(*args, **kwargs):
    """lxml.html.parse with the module's XHTML parser forced in."""
    combined = dict(kwargs, parser=xhtml_parser)
    return lxml.html.parse(*args, **combined)
def _html_parse(*args, **kwargs):
    """lxml.html.parse with the module's HTML parser forced in."""
    combined = dict(kwargs, parser=html_parser)
    return lxml.html.parse(*args, **combined)
def new_doc(guts="", version="1.1", lang=None):
    """Build a new XHTML document tree.

    guts goes inside <body>; version picks the doctype ('1.1' or
    '1.0'); lang, when meaningful, sets xml:lang/lang on <html>.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
        }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    markup = ('<html xmlns="%s" version="XHTML %s" %s>'
              '<head></head><body>%s</body></html>'
              % (XHTML, version, langdec, guts))

    f = StringIO(declaration + doctypes.get(version, '') + markup)
    tree = lxml.html.parse(f)
    f.close()
    return tree
class EpubError(Exception):
    """Raised when an epub container is missing required pieces."""
    pass
class Epub(object):
    """Wrapper around a loaded epub zip, with parsers for its parts.

    Abstract Container layout:

       META-INF/
          container.xml
          [manifest.xml]
          [metadata.xml]
          [signatures.xml]
          [encryption.xml]
          [rights.xml]
       OEBPS/
          Great Expectations.opf
          cover.html
          chapters/
             chapter01.html
             chapter02.html
          <other HTML files for the remaining chapters>

    """
94 def load(self, src):
95 # Zip is a variable format, and zipfile is limited. If that
96 # becomes a problem we will have to ise an `unzip` subprocess,
97 # but it hasn't been so far.
98 if isinstance(src, str):
99 # Should end with PK<06><05> + 18 more.
100 # Some zips contain 'comments' after that, which breaks ZipFile
101 zipend = src.rfind('PK\x05\x06') + 22
102 if len(src) != zipend:
103 log('Bad zipfile?')
104 src = src[: zipend]
105 src = StringIO(src)
106 self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
107 self.names = self.zip.namelist()
108 self.info = self.zip.infolist()
109 self.origin = src
110 log(self.names)
112 def gettree(self, name=None, id=None, parse=etree.parse):
113 """get an XML tree from the given zip filename or manifest ID"""
114 if name is None:
115 name, mimetype = self.manifest[id]
116 #Note: python 2.6 (not 2.5) has zipfile.open
117 s = self.zip.read(name)
118 f = StringIO(s)
119 tree = parse(f)
120 f.close()
121 return tree
123 def parse_meta(self):
124 '''META-INF/container.xml contains one or more <rootfile>
125 nodes. We want the "application/oepbs-package+xml" one.
127 <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
128 <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />
130 Other files are allowed in META-INF, but none of them are much
131 use. They are manifest.xml, metadata.xml, signatures.xml,
132 encryption.xml, and rights.xml.
134 tree = self.gettree('META-INF/container.xml')
135 for r in tree.getiterator(CONTAINERNS + 'rootfile'):
136 if r.get('media-type') == "application/oebps-package+xml":
137 rootfile = r.get('full-path')
138 break
139 else:
140 raise EpubError("No OPF rootfile found")
142 self.opf_file = rootfile
    def parse_opf(self):
        """Parse the OPF package file into metadata, manifest, spine,
        media_map, ncxfile and guide attributes.

        The opf file is arranged like this:

           <package>
             <metadata />
             <manifest />
             <spine />
             <guide />
           </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file)  #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)
        # mapping of filenames to new filenames. This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                # flatten the path, then prefix underscores until the
                # flattened name is unique among the new names
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

        #there is also an optional guide section, which we ignore
        guide = root.find(OPFNS + 'guide')
        if guide is not None:
            self.guide = parse_guide(guide)
        else:
            self.guide = None
193 def parse_ncx(self):
194 ncx = self.gettree(self.ncxfile)
195 self.ncxdata = parse_ncx(ncx)
197 def raw_json(self):
198 """get all the known metadata and nav data as json."""
199 data = {
200 'metadata': self.metadata,
201 'manifest': self.manifest,
202 'spine': self.spine,
203 'ncx': self.ncxdata
205 if self.guide is not None:
206 data['guide'] = self.guide
207 return dumps(data, indent=2)
209 def find_language(self):
210 opflang = [x[0].lower() for x in
211 self.metadata.get(DC, {}).get('language', ())]
213 # XXX Should the ncx language enter into it? Being xml:lang,
214 # it is in theory just the language of the ncx document
215 # itself. But if the metadata lacks language, should it be
216 # used instead? At present, NO.
217 #ncxlang = self.ncxdata['headers'].get('lang', ())
219 # XXX also, for now, ignoring case of badly formed language
220 # codes, conflicting or supplementary languages, etc.
221 opflang = [x for x in opflang if x not in ('und', '')]
222 if not opflang:
223 return None
224 if len(set(opflang)) > 1:
225 log('%s metadata has more than one language: %s -- using first one'
226 % (self.origin, opflang))
227 return opflang[0]
229 def find_probable_chapters(self):
230 """Try to find the real chapters from the NCX file. The
231 problem is that different epubs all use their own level of
232 nesting."""
233 # the Black Arrow has (book 1 (c1, c2, c3), book2 (c4, c5, c6..))
234 # and FM books have (section 1 (c1, c2,..),..)
235 # i.e super-chapter blocks
236 # some have (((c1, c2, c3))) -- deeply nested chapters
237 # some have no real chapters, but stupid structure
238 points = self.ncxdata['navmap']['points']
239 pwd = os.path.dirname(self.ncxfile)
240 serial_points, splits = get_chapter_breaks(points, pwd)
241 return serial_points, splits
    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                # an image in the spine: wrap it in a minimal html page
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img', src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                # NOTE(review): assumes <body> has at least one child
                # element -- an empty body would raise IndexError here.
                first_el = _find_tag(root, 'body')[0]
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(os.path.join(self.opfdir, x), x))

            # drop a chapter marker at each NCX break in this file,
            # either at the referenced fragment or at the file start
            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, 'espri-chapter-%(id)s' % point,
                           title=find_good_label(labels, lang),
                           subsections=str(bool(point['points'])))

            add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc
281 def make_bookizip(self, zfn):
282 """Split up the document and construct a booki-toc for it."""
283 doc = self.concat_document()
284 bz = BookiZip(zfn)
286 chapters = split_document(doc)
287 real_chapters = drop_empty_chapters(chapters)
288 rightsholders = [c for c, extra in self.metadata[DC].get('creator', ())]
289 contributors = rightsholders + [c for c, extra in self.metadata[DC].get('contributor', ())]
291 spine = []
292 for id, title, tree in real_chapters:
293 try:
294 root = tree.getroot()
295 except:
296 root = tree
297 try:
298 del root.attrib['xmlns']
299 del root.attrib['version']
300 del root.attrib['xml:lang']
301 except KeyError,e:
302 log(e)
303 if title:
304 head = root.makeelement('head')
305 _title = etree.SubElement(head, 'title')
306 _title.text = title
307 root.insert(0, head)
308 #blob = etree.tostring(tree)
309 blob = lxml.html.tostring(tree)
310 bz.add_to_package(id, '%s.html' % id, blob, mediatype='text/html',
311 contributors=contributors,
312 rightsholders=rightsholders)
313 spine.append(id)
315 #add the images and other non-html data unchanged.
316 for id, data in self.manifest.iteritems():
317 fn, mimetype = data
318 if mimetype not in MARKUP_TYPES:
319 blob = self.zip.read(fn)
320 bz.add_to_package(id, self.media_map[fn], blob, mimetype,
321 contributors=contributors,
322 rightsholders=rightsholders
325 #now to construct a table of contents
326 lang = self.find_language()
328 deferred_urls = []
329 def write_toc(point, section):
330 tocpoint = {}
331 title = find_good_label(point['labels'], lang),
332 if title and title[0]:
333 tocpoint['title'] = title[0]
334 ID = point['id']
335 if ID in spine:
336 tocpoint['url'] = self.manifest.get(ID, ID + '.html')
337 while deferred_urls:
338 tp = deferred_urls.pop()
339 tp['url'] = tocpoint['url']
340 log('%r has deferred url: %r' % (tp['title'], tp['url']))
341 else:
342 deferred_urls.append(tocpoint)
343 if point['points']:
344 tocpoint['children'] = []
345 for child in point['points']:
346 write_toc(child, tocpoint['children'])
348 section.append(tocpoint)
350 toc = []
351 points = self.ncxdata['navmap']['points']
352 for p in points:
353 write_toc(p, toc)
355 metadata = {FM: {'book':{},
356 'server': {},
358 DC: {}}
360 for namespace, keys in self.metadata.items():
361 if 'namespace' not in metadata:
362 metadata[namespace] = {}
363 log(keys)
364 for key, values in keys.items():
365 dest = metadata[namespace].setdefault(key, {})
366 for value, extra in values:
367 scheme = ''
368 if extra:
369 for x in ('scheme', 'role'):
370 if x in extra:
371 scheme = extra[x]
372 break
373 dest.setdefault(scheme, []).append(value)
375 if not metadata[FM]['book']:
376 metadata[FM]['book'][''] = [''.join(x for x in str(metadata[DC]['identifier'][''][0]) if x.isalnum())]
377 if not metadata[FM]['server']:
378 metadata[FM]['server'][''] = ['booki.flossmanuals.net']
380 log(metadata)
382 bz.info = {
383 'spine': spine,
384 'TOC': toc,
385 'metadata': metadata,
386 'version': '1',
389 bz.finish()
def find_good_label(labels, lang=None):
    """Pick a label out of a {language: label} mapping.

    Prefer the requested language, then the language-less entry, then
    fall back to joining whatever labels exist; None when empty."""
    #XXX not taking into account language sub-tags ("en_GB")
    for candidate in (lang, None):
        if candidate in labels:
            return labels[candidate]
    if not labels:
        return None
    #return random.choice(labels.values())
    return ' | '.join(labels.values())
def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it.  Content is defined
    as images or text."""
    def has_content(tree):
        for el in tree.iter():
            if el.tag in ('img',):
                return True
            if (el.text and el.text.strip()) or (el.tail and el.tail.strip()):
                return True
        return False

    return [chapter for chapter in chapters if has_content(chapter[2])]
def copy_element(src, create):
    """Return a copy of the src element, with all its attributes and
    tail, using create to make the copy. create is probably an
    Element._makeelement method, to associate the copy with the right
    tree, but it could be etree.HTMLElement."""
    if isinstance(src.tag, basestring):
        dest = create(src.tag)
    else:
        # comments/PIs have non-string tags; clone those outright
        dest = copy.copy(src)

    for attr, value in src.items():
        dest.set(attr, value)
    dest.tail = src.tail
    return dest
def split_document(doc):
    """Split the document along chapter boundaries."""
    try:
        root = doc.getroot()
    except AttributeError:
        root = doc

    # anything found before the first chapter marker goes here
    front_matter = copy_element(root, lxml.html.Element)
    chapters = [('espri-unindexed-front-matter',
                 'Unindexed Front Matter',
                 front_matter)]
    _climb_and_split(root, front_matter, chapters)
    return chapters
def _climb_and_split(src, dest, chapters):
    # Walk the source tree, copying nodes into `dest`; whenever a
    # chapter-marker <hr> is met, start a fresh chapter tree and make it
    # the new copy target.
    for child in src.iterchildren():
        if child.tag == 'hr' and child.get('class') == MARKER_CLASS:
            ID = child.get('id')
            if ID.startswith('espri-chapter-'):
                title = child.get('title') or ID
                new = copy_element(src, lxml.html.Element)
                root = new
                # rebuild the chain of ancestors above the split point,
                # so the new chapter tree has the same structure
                for a in src.iterancestors():
                    a2 = copy_element(a, root.makeelement)
                    a2.append(root)
                    root = a2

                # 14 == len('espri-chapter-'): strip the marker prefix
                chapters.append((ID[14:], title, root))

                # trim trailing text off the chapter we just closed
                dest.tail = None
                for a in dest.iterancestors():
                    a.tail = None
                dest = new
            else:
                # non-chapter markers (e.g. 'espri-new-file-*') are
                # dropped here, not copied into any chapter
                log("skipping %s" % etree.tostring(child))
        else:
            new = copy_element(child, dest.makeelement)
            new.text = child.text
            dest.append(new)
            _climb_and_split(child, new, chapters)
def save_chapters(chapters):
    """Debugging helper: write each chapter tree to /tmp/x<id>.html.

    chapters maps chapter ids to lxml trees.
    """
    for id, tree in chapters.items():
        string = lxml.html.tostring(tree, method='html')
        f = open('/tmp/x%s.html' % id, 'w')
        try:
            f.write(string)
        finally:
            # BUG FIX: the handle used to leak if the write failed
            f.close()
def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another. The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    if len(dbody):
        # graft the source body's leading text onto the last existing
        # child's tail, so no text is lost at the join
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    # NOTE(review): lxml append() reparents nodes, so this loop moves
    # the children out of sbody while iterating it -- relies on lxml
    # tolerating mutation during iteration.
    for x in sbody:
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None
def _find_tag(doc, tag):
    """Return the first element matching `tag`, trying the XHTML
    namespace first when the document declares namespaces at all."""
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace('&#13;', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    if doc.nsmap:
        for el in doc.iter(XHTMLNS + tag):
            return el
        log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()
MARKER_CLASS = "espri-marker"

def add_marker(el, ID, **kwargs):
    """Insert a marker <hr> element immediately before el.

    Extra keyword arguments become attributes on the marker."""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', MARKER_CLASS)
    for attr, value in kwargs.items():
        marker.set(attr, value)
    parent = el.getparent()
    parent.insert(parent.index(el), marker)
def get_chapter_breaks(points, pwd):
    """Flatten the NCX navpoints and group them by content file.

    First go was overly complex, trying to guess which sections were
    really chapters.  Now, every ncx navpoint is a chapter break.

    Returns (serial_points, splits): serial_points is a flat list of
    (depth, point) pairs; splits maps each content url to a list of
    (depth, fragment-id-or-None, point) tuples."""
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        #if p['class']:
        #    log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return depth
        for child in p['points']:
            bottom = serialise(child, depth + 1)

    for p in points:
        serialise(p, 1)

    splits = {}
    for depth, p in serial_points:
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        splits.setdefault(url, []).append((depth, ID, p))

    return serial_points, splits
def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    Returns a dict mapping namespace uri to
    {tag: [(value, extra), ...]} dictionaries.
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes. None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        # any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        # look for special OPF tags
        if t.tag == default_ns + 'meta':
            # meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively). Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        # ordinary element: strip the clark-notation namespace from the
        # tag and record its text plus its (namespace-stripped) attributes
        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
def parse_manifest(manifest, pwd):
    """Parse an OPF <manifest> into {id: (href, media-type)}.

    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the pbf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

       <manifest>
         <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
         <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
         <item id="cover" href="cover.jpg" media-type="image/jpeg" />
       </manifest>
    """
    ns = '{%s}' % manifest.nsmap[None]
    items = {}
    for item in manifest.iterchildren(ns + 'item'):
        # hrefs are relative to the OPF file's directory
        items[item.get('id')] = (os.path.join(pwd, item.get('href')),
                                 item.get('media-type'))  #XXX does media-type matter?
    return items
def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle). Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes'). If an item is linear, it is
    in the main stream of the book. Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).

    Returns (toc-id, [idref, ...])."""
    ns = '{%s}' % spine.nsmap[None]
    order = [itemref.get('idref')
             for itemref in spine.iterchildren(ns + 'itemref')]
    return spine.get('toc'), order
def parse_guide(guide):
    """Parse the guide from the opf file into a list of
    (href, type, title) tuples."""
    ns = '{%s}' % guide.nsmap[None]
    return [(ref.get('href'), ref.get('type'), ref.get('title'),)
            for ref in guide.iterchildren(ns + 'reference')]
def get_ncxtext(e):
    """get text content from an <xx><text>...</text></xx> construct,
    as is common in NCX files."""
    # there will only be one <text>; return the first one found
    for node in e.iter(DAISYNS + 'text'):
        return node.text
    return ''  # or leave it at None?
def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        # BUG FIX: this used to call get_ncxtext(e) -- the parent --
        # so every language was mapped to the first label's text.
        labels[lang] = get_ncxtext(label)
    return labels
def parse_ncx(ncx):
    """Parse an NCX tree into a nested dict of headers and nav points.

    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    # document-level direction and language live on the root element
    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
        }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
def parse_navmap(e):
    """Parse a navMap element into its info, labels and navPoints."""
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    #XXX move info and labels out of navmap, and into headers?
    points = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': points,
        }
def parse_navpoint(e):
    """Recursively parse a navPoint element into a dict."""
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    content = e.find(DAISYNS + 'content')
    children = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        # NOTE(review): raises TypeError if playOrder is absent --
        # presumably mandatory in the NCX files seen so far
        'play_order': int(e.get('playOrder')),
        #'content_id': content.get('id'),
        'content_src': content.get('src'),
        'labels': get_labels(e),
        'points': children,
        }
def parse_pagelist(e):
    """Parse a pageList element into its info, labels and targets."""
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    targets = tuple(parse_pagetarget(t) for t in e.findall(DAISYNS + 'pageTarget'))
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': targets,
        }
def parse_pagetarget(e):
    """Parse a pageTarget element into a dict.

    The optional 'value' attribute is only included when present.
    """
    #<!ELEMENT pageTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,  # fixed: was recomputed by a second get_labels(e) call
        }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret
def parse_navlist(e):
    """Parse a navList element into its info, labels and targets."""
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    targets = tuple(parse_navtarget(t) for t in e.findall(DAISYNS + 'navTarget'))
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': targets,
        }
def parse_navtarget(e):
    """Parse a navTarget element into a dict.

    The optional 'value' attribute is only included when present.
    """
    #<!ELEMENT navTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,  # fixed: was recomputed by a second get_labels(e) call
        }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret