1 """Module for dealing with epub -> booki conversions."""
3 import os, sys
4 from pprint import pprint
5 import zipfile
6 from cStringIO import StringIO
8 try:
9 from json import dumps
10 except ImportError:
11 from simplejson import dumps
13 import lxml, lxml.html, lxml.etree, lxml.cssselect
15 XMLNS = '{http://www.w3.org/XML/1998/namespace}'
16 DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
17 OPFNS = '{http://www.idpf.org/2007/opf}'
18 CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
20 DC = "http://purl.org/dc/elements/1.1/"

NAMESPACES = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',  #dublin core
    'tei': 'http://www.tei-c.org/ns/1.0',
    'dcterms': 'http://purl.org/dc/terms/',
    'nzetc': 'http://www.nzetc.org/structure',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}


def log(*messages, **kwargs):
    """Print each message to stderr, falling back to repr() when a
    message cannot be printed directly."""
    for m in messages:
        try:
            print >> sys.stderr, m
        except Exception:
            print >> sys.stderr, repr(m)


class EpubError(Exception):
    pass


class Epub(object):
    """
    Abstract Container:

    META-INF/
       container.xml
       [manifest.xml]
       [metadata.xml]
       [signatures.xml]
       [encryption.xml]
       [rights.xml]
    OEBPS/
       Great Expectations.opf
       cover.html
       chapters/
          chapter01.html
          chapter02.html
       <other HTML files for the remaining chapters>

    (A minimal usage sketch follows the class definition.)
    """
    def load(self, src):
        # XXX if zip variability proves a problem, we should just use
        # an `unzip` subprocess
        if isinstance(src, str):
            # Should end with PK<05><06> + 18 more bytes (the end-of-central-directory record).
            # Some zips contain 'comments' after that, which breaks ZipFile
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                log('Bad zipfile?')
                src = src[:zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()

    def gettree(self, name):
        """get an etree from the given zip filename"""
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = lxml.etree.parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes.  We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        If there is only one (as is common), forget the media-type.

        Other files are allowed in META-INF, but none of them are much
        use.  They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """
        The opf file is arranged like this:
        <package>
          <metadata />
          <manifest />
          <spine />
          <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        pwd = os.path.dirname(self.opf_file)  #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')
        #there is also an optional guide section, which we ignore

        self.metadata = parse_metadata(metadata)
        self.files = parse_manifest(manifest, pwd)
        ncxid, self.order = parse_spine(spine)
        self.ncxfile = self.files[ncxid][0]

    def parse_ncx(self):
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)

    def raw_json(self):
        """get all the known metadata and nav data as json."""
        data = {
            'metadata': self.metadata,
            'manifest': self.files,
            'spine': self.order,
            'ncx': self.ncxdata,
        }
        return dumps(data, indent=2)

    def find_language(self):
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it?  Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself.  But if the metadata lacks language, should it be
        # used instead?  At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        return opflang[0]
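

# A minimal usage sketch for the Epub class above (the file name is
# hypothetical; this is not part of the original module, just an
# illustration of the intended call order):
#
#   e = Epub()
#   e.load(open('book.epub', 'rb').read())
#   e.parse_meta()     # find the OPF file via META-INF/container.xml
#   e.parse_opf()      # fills in metadata, files, order and ncxfile
#   e.parse_ncx()      # fills in ncxdata
#   print e.raw_json()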


def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    An example of the returned structure follows this function.
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes.  None and opf probably map to the same thing.  'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = tuple((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively).  Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
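
# For orientation only (the sample values are invented): parse_metadata()
# returns nested dicts keyed first by namespace URI, then by tag name, with
# each value a list of (text, extra-attributes) pairs, e.g.
#
#   {'http://purl.org/dc/elements/1.1/':
#        {'title': [('Great Expectations', ())],
#         'language': [('en', ())]},
#    'http://www.idpf.org/2007/opf': {},
#   }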


def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file itself (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <manifest>
      <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
      <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
      <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    </manifest>
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        media_type = t.get('media-type')
        items[id] = (href, media_type)  #XXX does media-type matter?

    return items
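
# Illustrative return value for the manifest shown in the docstring above,
# assuming pwd == 'OEBPS' (the ids and paths come from that made-up example):
#
#   {'ncx': ('OEBPS/toc.ncx', 'application/x-dtbncx+xml'),
#    'WHume_NatureC01': ('OEBPS/Hume_NatureC01.html', 'application/xhtml+xml'),
#    'cover': ('OEBPS/cover.jpg', 'image/jpeg')}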


def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle).  Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes').  If an item is linear, it is
    in the main stream of the book.  Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')

    return toc, items
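
# For example (hypothetical manifest ids), a spine such as
#
#   <spine toc="ncx"><itemref idref="ch1"/><itemref idref="ch2"/></spine>
#
# yields ('ncx', ['ch1', 'ch2']): the ncx manifest id plus the reading order.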


def get_ncxtext(e):
    #get text from an <xx><text>...</text></xx> construct
    t = e.find(DAISYNS + 'text')
    if t is not None:
        return t.text
    return ''  # or leave it at None?


def get_labels(e, tag=DAISYNS + 'navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags.  navInfo is unlikely, but
    # navLabel is ubiquitous.  There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        labels[lang] = get_ncxtext(label)
    return labels
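
# For a navPoint carrying <navLabel xml:lang="en"><text>Chapter One</text></navLabel>,
# get_labels() returns {'en': 'Chapter One'} (label text invented for
# illustration; the key is None when no xml:lang attribute is present).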


def parse_ncx(ncx):
    """
    The NCX file is the closest thing to FLOSS Manuals TOC.txt.  It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).

    (A sketch of the returned dict follows this function.)
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>

    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    ret = {
        'headers': headers,
        'navmap': parse_navmap(root.find(DAISYNS + 'navMap')),
    }

    #Try adding these bits, even though no one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
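
# Rough shape of the dict returned by parse_ncx() (sample values invented;
# 'pagelist' and 'navlist' appear only when the ncx actually has them):
#
#   {'headers': {'docTitle': [('Great Expectations', None)], ...},
#    'navmap': {'info': {},
#               'labels': {None: '...'},
#               'points': ({'id': 'navpoint-1',
#                           'play_order': 1,
#                           'content_src': 'chapters/chapter01.html',
#                           'labels': {None: 'Chapter One'},
#                           'points': ()},
#                          ...)}}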


def parse_navmap(e):
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint')),
    }


def parse_navpoint(e):
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }


def parse_pagelist(e):
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.findall(DAISYNS + 'pageTarget')),
    }


def parse_pagetarget(e):
    #<!ELEMENT pageTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret


def parse_navlist(e):
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.findall(DAISYNS + 'navTarget')),
    }


def parse_navtarget(e):
    #<!ELEMENT navTarget (navLabel+, content)>
    labels = get_labels(e)
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': labels,
    }
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret