tests/_epub.py

   1 #!/usr/bin/python
   2
   3 """tests for epub.py"""
   4
   5 import os, sys
   6 import tempfile
   7 from pprint import pprint, pformat
   8 import epub
   9
  10 import lxml
  11
  12 DC = "http://purl.org/dc/elements/1.1/"
  13 TEST_FILE_DIR = 'tests/epub-examples/'
  14 TEST_FILES =  sorted( TEST_FILE_DIR + x for x in os.listdir(TEST_FILE_DIR) if x.endswith('.epub'))
  15 #print '\n'.join(os.path.basename(x) for x in TEST_FILES)
  16
  17 ## Best_of_TOC.epub
  18 ## Bonaparte.epub
  19 ## Conrad - Heart of Darkness.epub
  20 ## Cowan-Kimberly.epub
  21 ## Doctorow - I, Robot.epub
  22 ## Doyle - The Adventures of Sherlock Holmes.epub
  23 ## GalCuri.epub
  24 ## Grimm - Grimm's Fairy Tales.epub
  25 ## Hume_Nature.epub
  26 ## Lang - The Arabian Nights.epub
  27 ## LittleBrother.epub
  28 ## McGSome.epub
  29 ## Melville - Moby-Dick.epub
  30 ## Stevenson - Treasure Island.epub
  31 ## Tolstoy - Ivan the Fool.epub
  32 ## Treasure_Island.epub
  33 ## Twain - The Adventures of Huckleberry Finn.epub
  34 ## Wells - The War of the Worlds.epub
  35 ## Wilde - The Importance of Being Earnest.epub
  36 ## beaglehole-letter-61.epub
  37 ## cowan-to-fildes.epub
  38 ## ctaquarterly13197477chic.epub
  39 ## cyclopedia-wellington.epub
  40 ## darwin-autobiography-of-charles-darwin.epub
  41 ## early-life-notes.epub
  42 ## halfhoursinfarno00newy.epub
  43 ## ia-abroad.epub
  44 ## ia-huckfin.epub
  45 ## ia-letters-from-cat.epub
  46 ## ia-old-french.epub
  47 ## ia-tomsawyer.epub
  48 ## littleroadstoryo00hick.epub
  49 ## pg2292.epub
  50 ## pg29904-images.epub
  51 ## pg829-images.epub
  52 ## pg829.epub
  53 ## sample.epub
  54 ## songssourdough00servuoft.epub
  55 ## stevenson-black-arrow.epub
  56 ## swift-a-modest-proposal.epub
  57 ## takitimu.epub
  58 ## twain-adventures-of-huckleberry-finn.epub
  59 ## war-economy-recipes.epub
  60 ## wells-calibre-pathological.epub
  61
  62 def _test_file(x):
  63     if isinstance(x, int):
  64         return TEST_FILES[x]
  65     elif x in TEST_FILES:
  66         return x
  67     elif isinstance(x, basestring):
  68         for fn in TEST_FILES:
  69             if x in fn.rsplit('/', 1)[1]:
  70                 return fn
  71
  72
  73 #TEMPDIR = tempfile.mkdtemp(prefix='epub-')
  74
  75 def _load_epub(filename, verbose=False):
  76     fn = _test_file(filename)
  77     if fn is None:
  78         raise ValueError("'%s' doesn't refer to a known file" % filename)
  79     if verbose:
  80         print fn
  81     e = epub.Epub()
  82     e.load(open(fn).read())
  83     return e
  84
  85 def _get_elements(book, elements):
  86     e = _load_epub(book)
  87     e.parse_meta()
  88     tree = e.gettree(e.opf_file)
  89     ns = '{http://www.idpf.org/2007/opf}'
  90     return [tree.find(ns + x) for x in elements] + [e]
  91
  92
  93
  94 def test_load():
  95     fn = _test_file('Treasure_Island')
  96     #print fn
  97     e = epub.Epub()
  98     e.load(open(fn).read())
  99     assert e.names
 100     assert e.info
 101
 102
 103 def test_meta():
 104     for book, root in [('Hume', 'OPS/content.opf'),
 105                        ('Treasure_Island', 'OEBPS/volume.opf'),
 106                        ('black-arrow', "OPS/epb.opf"),
 107                        ('LittleBrother', "metadata.opf"),
 108                        ]:
 109         #print book
 110         e = _load_epub(book)
 111         e.parse_meta()
 112         assert e.opf_file == root
 113
 114
 115 def test_opf():
 116     for book in ['ctaquarterly', 'letter-61',
 117                  'early-life', 'LittleBrother'
 118                  ]:
 119         e = _load_epub(book)
 120         e.parse_meta()
 121         e.parse_opf()
 122
 123         for a, t in [('metadata', dict),
 124                      ('manifest', dict),
 125                      ('spine', list),
 126                      ('ncxfile', basestring),
 127             ]:
 128             assert hasattr(e, a)
 129             assert isinstance(getattr(e, a), t)
 130
 131
 132 def test_metadata_count():
 133     counts = {}
 134     for book in TEST_FILES:
 135         #print book
 136         e = _load_epub(book)
 137         e.parse_meta()
 138         e.parse_opf()
 139         md = e.metadata
 140         for ns, values in md.items():
 141             nsdict = counts.setdefault(ns, {})
 142             for k, v in values.items():
 143                 name = v[0]
 144                 if name:
 145                     nsdict[k] = nsdict.get(k, 0) + 1
 146
 147     pprint(counts)
 148     #sys.exit()
 149
 150 def test_metadata_conformance():
 151     #at least one:
 152     #identifier  title  language
 153     for book in TEST_FILES:
 154         #print book
 155         e = _load_epub(book)
 156         e.parse_meta()
 157         e.parse_opf()
 158         md = e.metadata
 159         dc = md[DC]
 160         for x in ('identifier', 'title', 'language'):
 161             assert dc.get(x)
 162
 163
 164     #the unique-identifier attribute of the package element is a
 165     #correct XML IDREF to a Dublin Core identifier element; and
 166
 167     #any extended values specified for the Dublin Core creator and
 168     #contributor elements' OPF role attribute must be taken from the
 169     #registered MARC Relator Code list or must begin with oth.; and
 170
 171
 172 def test_example_ncx():
 173     import lxml
 174     f = open('tests/example.ncx')
 175     tree = lxml.etree.parse(f)
 176     f.close()
 177     data = epub.parse_ncx(tree)
 178     #pprint(data)
 179     f = open('tests/example.ncx.result')
 180     answer = eval(f.read())
 181     f.close()
 182     assert data == answer
 183
 184 def test_new_doc():
 185     #XXX not very comprehensive.
 186     for guts in ('', "hello", "<h1>HELLO!</h1>"):
 187         doc = epub.new_doc(guts=guts)
 188         try:
 189             body = doc.iter(epub.XHTMLNS + 'body').next()
 190             print "got %s" % body
 191         except StopIteration:
 192             body = doc.iter('body').next()
 193         guts2 = body.text or ''
 194         for x in body:
 195             guts2 += lxml.etree.tostring(x)
 196         if body.tail is not None:
 197             guts2 += body.tail
 198         assert guts == guts2
 199
 200
 201 ## def test_parse_ncx():
 202 ##     for book in TEST_FILES:
 203 ##         print book
 204 ##         e = _load_epub(book, verbose=True)
 205 ##         e.parse_meta()
 206 ##         e.parse_opf()
 207 ##         e.parse_ncx()
 208 ##        #pprint(e.ncxdata)
 209
 210 def test_raw_json():
 211     for book in TEST_FILES:
 212         e = _load_epub(book)
 213         e.parse_meta()
 214         e.parse_opf()
 215         e.parse_ncx()
 216         js = e.raw_json()
 217         f = open('tests/json/' + os.path.basename(book) + '.js', 'w')
 218         print >> f, '/* %s */' % book
 219         print >> f, js
 220         f.close()
 221
 222         #print js
 223
 224 def test_find_language():
 225     for book in TEST_FILES:
 226         e = _load_epub(book)
 227         e.parse_meta()
 228         e.parse_opf()
 229         e.parse_ncx()
 230         print e.find_language(), book
 231
 232
 233
 234 def test_parse_metadata():
 235     #XXX check unicode!
 236
 237     print "TESTING metadata"
 238     import lxml
 239     f = open('tests/metadata.xml')
 240     tree = lxml.etree.parse(f)
 241     f.close()
 242     #nsmap = {'opf': 'http://www.idpf.org/2007/opf'}
 243     #metadata = root.xpath('.//opf:metadata', namespaces=nsmap)[0]
 244     results = []
 245     for metadata in tree.iter('{http://www.idpf.org/2007/opf}metadata'):
 246         results.append(epub.parse_metadata(metadata))
 247
 248     f = open('tests/metadata.result')
 249     correct = eval(f.read())
 250     f.close()
 251
 252     if results != correct:
 253         # how to do semantic diff of dicts?
 254         from difflib import unified_diff
 255         print '\n'.join(unified_diff(pformat(results).split('\n'), pformat(correct).split('\n')))
 256         raise AssertionError('bad metadata parsing')
 257
 258
 259
 260
 261 def test_parse_manifest():
 262     # manifest should be dict of ids pointing to name, mime-type pairs
 263     # names should be found in zipfile
 264     all_mimetypes = {}
 265     for book in TEST_FILES:
 266         manifest, e = _get_elements(book, ['manifest'])
 267         pwd = os.path.dirname(e.opf_file)
 268         files = epub.parse_manifest(manifest, pwd)
 269         #print book
 270         mimetypes = set()
 271         filenames = e.names
 272
 273         for name, mimetype in files.values():
 274             assert isinstance(name, basestring)
 275             assert isinstance(mimetype, basestring)
 276             mimetypes.add(mimetype)
 277             all_mimetypes[mimetype] = all_mimetypes.get(mimetype, 0) + 1
 278             if  name not in filenames:
 279                 print book, name, filenames
 280             assert name in filenames
 281
 282         print "%s: %s files, %s different types" % (book, len(files), len(mimetypes))
 283
 284     for x in all_mimetypes.items():
 285         print "%30s: %s" % x
 286
 287
 288 def test_parse_spine():
 289     #every item in the spine should be a string
 290     # the toc should be a string
 291     #no duplicates
 292     for book in TEST_FILES:
 293         spine, e = _get_elements(book, ('spine',))
 294         toc, order = epub.parse_spine(spine)
 295         assert isinstance(order, (list, tuple))
 296         if not isinstance(toc, basestring):
 297             print book, toc, basestring
 298
 299         assert isinstance(toc, basestring)
 300         assert all(isinstance(x, basestring) for x in order)
 301         assert len(order) == len(set(order))
 302
 303
 304 #XXX turned off because the archive.org ones fail, but I can't just dismiss them.
 305 def _test_spine_manifest_match():
 306     #every item in the spine should be in the manifest (thence in the zip, tested above)
 307     #every xhtml in the manifest should be in the spine. (XXX unless there are fallbacks)
 308     bad_spine_files = []
 309     for book in TEST_FILES:
 310         #print book
 311         spine, manifest, e = _get_elements(book, ('spine', 'manifest'))
 312         toc, order = epub.parse_spine(spine)
 313         pwd = os.path.dirname(e.opf_file)
 314         files = epub.parse_manifest(manifest, pwd)
 315
 316         assert toc not in order
 317         xhtmls = set(order)
 318         for x in order:
 319             name, mimetype = files.pop(x)
 320             if mimetype != 'application/xhtml+xml':
 321                 bad_spine_files.append((book, name, mimetype))
 322
 323         name, mimetype = files.pop(toc)
 324         assert mimetype == 'application/x-dtbncx+xml'
 325         remaining = (x[1] for x in files.values())
 326         if any(x in ('application/x-dtbncx+xml', 'application/xhtml+xml') for x in remaining):
 327             print book, set(remaining)
 328
 329         assert not any(x in ('application/x-dtbncx+xml', 'application/xhtml+xml') for x in remaining)
 330
 331     if bad_spine_files:
 332         bsf = {}
 333         for book, fn, mt in bad_spine_files:
 334             mimecount = bsf.setdefault(book, {})
 335             mimecount[mt] = mimecount.get(mt, 0) + 1
 336
 337         pprint(bsf)
 338
 339         raise AssertionError('bad spine files in %s' % bsf.keys())
 340
 341
 342