call a spine a spine, and quieter testing
[objavi2.git] / tests / _epub.py
blobd36b81f51a3f7bec7fd3960a101ade71873d6e32
1 #!/usr/bin/python
3 """tests for epub.py"""
5 import os, sys
6 import tempfile
7 from pprint import pprint, pformat
8 import epub
10 import lxml
# Dublin Core element namespace; used below as a key into parsed metadata.
DC = "http://purl.org/dc/elements/1.1/"

# Directory that is scanned for sample books (note the trailing slash: paths
# are built by plain string concatenation).
TEST_FILE_DIR = 'tests/epub-examples/'

# Sorted paths of every '*.epub' entry found in TEST_FILE_DIR at import time.
TEST_FILES = sorted(TEST_FILE_DIR + name
                    for name in os.listdir(TEST_FILE_DIR)
                    if name.endswith('.epub'))
15 #print '\n'.join(os.path.basename(x) for x in TEST_FILES)
17 ## Best_of_TOC.epub
18 ## Bonaparte.epub
19 ## Conrad - Heart of Darkness.epub
20 ## Cowan-Kimberly.epub
21 ## Doctorow - I, Robot.epub
22 ## Doyle - The Adventures of Sherlock Holmes.epub
23 ## GalCuri.epub
24 ## Grimm - Grimm's Fairy Tales.epub
25 ## Hume_Nature.epub
26 ## Lang - The Arabian Nights.epub
27 ## LittleBrother.epub
28 ## McGSome.epub
29 ## Melville - Moby-Dick.epub
30 ## Stevenson - Treasure Island.epub
31 ## Tolstoy - Ivan the Fool.epub
32 ## Treasure_Island.epub
33 ## Twain - The Adventures of Huckleberry Finn.epub
34 ## Wells - The War of the Worlds.epub
35 ## Wilde - The Importance of Being Earnest.epub
36 ## beaglehole-letter-61.epub
37 ## cowan-to-fildes.epub
38 ## ctaquarterly13197477chic.epub
39 ## cyclopedia-wellington.epub
40 ## darwin-autobiography-of-charles-darwin.epub
41 ## early-life-notes.epub
42 ## halfhoursinfarno00newy.epub
43 ## ia-abroad.epub
44 ## ia-huckfin.epub
45 ## ia-letters-from-cat.epub
46 ## ia-old-french.epub
47 ## ia-tomsawyer.epub
48 ## littleroadstoryo00hick.epub
49 ## pg2292.epub
50 ## pg29904-images.epub
51 ## pg829-images.epub
52 ## pg829.epub
53 ## sample.epub
54 ## songssourdough00servuoft.epub
55 ## stevenson-black-arrow.epub
56 ## swift-a-modest-proposal.epub
57 ## takitimu.epub
58 ## twain-adventures-of-huckleberry-finn.epub
59 ## war-economy-recipes.epub
60 ## wells-calibre-pathological.epub
def _test_file(x):
    """Resolve *x* to one of the paths in TEST_FILES.

    *x* may be an integer index into TEST_FILES, an exact entry of
    TEST_FILES, or a string that occurs somewhere in the file-name part
    (the text after the last '/') of an entry; the first matching entry
    wins.  Returns None when nothing matches.
    """
    if isinstance(x, int):
        return TEST_FILES[x]
    if x in TEST_FILES:
        return x
    if not isinstance(x, basestring):
        return None
    for path in TEST_FILES:
        basename = path.rsplit('/', 1)[1]
        if x in basename:
            return path
    return None
73 #TEMPDIR = tempfile.mkdtemp(prefix='epub-')
def _load_epub(filename, verbose=False):
    """Load one of the sample books into a fresh epub.Epub object.

    filename -- anything _test_file() accepts (index, full path, or a
                fragment of the file name).
    verbose  -- when true, print the resolved path before loading.

    Returns the Epub object after its load() method has been given the
    complete contents of the file.
    Raises ValueError if *filename* does not resolve to a known file.
    """
    fn = _test_file(filename)
    if fn is None:
        # (filename,) keeps the formatting correct even for tuple arguments.
        raise ValueError("'%s' doesn't refer to a known file" % (filename,))
    if verbose:
        print(fn)
    # EPUB files are zip archives: read them in binary mode, and close the
    # handle explicitly rather than leaving it to the garbage collector.
    f = open(fn, 'rb')
    try:
        data = f.read()
    finally:
        f.close()
    e = epub.Epub()
    e.load(data)
    return e
def _get_elements(book, elements):
    """Return selected elements of a book's OPF tree, then the book itself.

    The book is loaded with _load_epub() and parse_meta() is run on it.
    Each name in *elements* is looked up with find() on the tree of the
    book's opf_file, qualified with the OPF namespace.  The result is a
    list of those lookups, in order, with the Epub object appended last.
    """
    opf_ns = '{http://www.idpf.org/2007/opf}'
    e = _load_epub(book)
    e.parse_meta()
    tree = e.gettree(e.opf_file)
    found = []
    for tag in elements:
        found.append(tree.find(opf_ns + tag))
    found.append(e)
    return found
def test_load():
    """Loading a sample book should leave non-empty `names` and `info`."""
    fn = _test_file('Treasure_Island')
    # Fail with a clear message instead of a TypeError from open(None).
    assert fn is not None, "sample book 'Treasure_Island' not found"
    # EPUB files are zip archives: read in binary mode and always close.
    f = open(fn, 'rb')
    try:
        data = f.read()
    finally:
        f.close()
    e = epub.Epub()
    e.load(data)
    assert e.names
    assert e.info
def test_meta():
    """parse_meta() should set opf_file to the expected path for known books."""
    expected_roots = [
        ('Hume', 'OPS/content.opf'),
        ('Treasure_Island', 'OEBPS/volume.opf'),
        ('black-arrow', "OPS/epb.opf"),
        ('LittleBrother', "metadata.opf"),
    ]
    for book, root in expected_roots:
        e = _load_epub(book)
        e.parse_meta()
        # Name the book in the failure so the culprit is obvious.
        assert e.opf_file == root, (
            "%s: opf_file is %r, expected %r" % (book, e.opf_file, root))
def test_opf():
    """After parse_opf() the book should expose attributes of the right types."""
    books = [
        'ctaquarterly', 'letter-61',
        'early-life', 'LittleBrother',
    ]
    expected_attrs = [
        ('metadata', dict),
        ('manifest', dict),
        ('spine', list),
        ('ncxfile', basestring),
    ]
    for book in books:
        e = _load_epub(book)
        e.parse_meta()
        e.parse_opf()

        for a, t in expected_attrs:
            # Messages identify which book/attribute broke the expectation.
            assert hasattr(e, a), "%s: missing attribute %r" % (book, a)
            assert isinstance(getattr(e, a), t), (
                "%s: attribute %r is not an instance of %r" % (book, a, t))
def test_metadata_count():
    """Tally metadata keys across every sample book and pretty-print the tally.

    For each namespace in a book's metadata, a key is counted once per book
    when the first item of its value is truthy.  Nothing is asserted; the
    test only fails if loading or parsing raises.
    """
    tally = {}
    for book in TEST_FILES:
        e = _load_epub(book)
        e.parse_meta()
        e.parse_opf()
        for namespace, fields in e.metadata.items():
            # The namespace entry is created even if nothing gets counted.
            per_namespace = tally.setdefault(namespace, {})
            for key, value in fields.items():
                if value[0]:
                    per_namespace[key] = per_namespace.get(key, 0) + 1

    pprint(tally)
    #sys.exit()
def test_metadata_conformance():
    """Every sample book must carry Dublin Core identifier, title and language."""
    #at least one:
    #identifier title language
    required = ('identifier', 'title', 'language')
    for book in TEST_FILES:
        e = _load_epub(book)
        e.parse_meta()
        e.parse_opf()
        dc = e.metadata[DC]
        for x in required:
            # With dozens of books in the loop, say which one is at fault.
            assert dc.get(x), "%s: no Dublin Core %r in metadata" % (book, x)
164 #the unique-identifier attribute of the package element is a
165 #correct XML IDREF to a Dublin Core identifier element; and
167 #any extended values specified for the Dublin Core creator and
168 #contributor elements' OPF role attribute must be taken from the
169 #registered MARC Relator Code list or must begin with oth.; and
def test_example_ncx():
    """epub.parse_ncx() on tests/example.ncx must equal the stored result."""
    # `import lxml` alone does not guarantee the etree submodule is loaded.
    import lxml.etree
    f = open('tests/example.ncx')
    try:
        tree = lxml.etree.parse(f)
    finally:
        f.close()
    data = epub.parse_ncx(tree)
    #pprint(data)
    f = open('tests/example.ncx.result')
    try:
        # NOTE: eval() executes whatever the fixture file contains; that is
        # only acceptable because the file is part of this repository.
        answer = eval(f.read())
    finally:
        f.close()
    assert data == answer
def test_new_doc():
    """The body of epub.new_doc(guts=...) should serialise back to *guts*."""
    #XXX not very comprehensive.
    for guts in ('', "hello", "<h1>HELLO!</h1>"):
        doc = epub.new_doc(guts=guts)
        try:
            # Look for a namespaced <body> first ...
            body = doc.iter(epub.XHTMLNS + 'body').next()
        except StopIteration:
            # ... and fall back to a plain, un-namespaced one.
            body = doc.iter('body').next()
        else:
            print("got %s" % body)

        # Rebuild the body's content: leading text, serialised children,
        # then the body's own tail if it has one.
        children = ''.join(lxml.etree.tostring(child) for child in body)
        guts2 = (body.text or '') + children
        if body.tail is not None:
            guts2 += body.tail
        assert guts == guts2
201 ## def test_parse_ncx():
202 ## for book in TEST_FILES:
203 ## print book
204 ## e = _load_epub(book, verbose=True)
205 ## e.parse_meta()
206 ## e.parse_opf()
207 ## e.parse_ncx()
208 ## #pprint(e.ncxdata)
def test_raw_json():
    """Write raw_json() of every sample book to tests/json/<book name>.js.

    Nothing is asserted; the test only fails if loading, parsing or
    writing raises.
    """
    for book in TEST_FILES:
        e = _load_epub(book)
        e.parse_meta()
        e.parse_opf()
        e.parse_ncx()
        js = e.raw_json()
        f = open('tests/json/' + os.path.basename(book) + '.js', 'w')
        try:
            # First line is a comment naming the source book, then the JSON.
            f.write('/* %s */\n' % book)
            f.write('%s\n' % (js,))
        finally:
            # Close the output even if a write fails.
            f.close()
222 #print js
def test_find_language():
    """Print find_language() next to the path of every sample book.

    Nothing is asserted; the test only fails if loading or parsing raises.
    """
    for book in TEST_FILES:
        e = _load_epub(book)
        # find_language() is called after all three parse steps have run.
        for parse_step in (e.parse_meta, e.parse_opf, e.parse_ncx):
            parse_step()
        print("%s %s" % (e.find_language(), book))
def test_parse_metadata():
    """epub.parse_metadata() over tests/metadata.xml must match the stored result.

    Every OPF <metadata> element in the file is parsed; the list of results
    is compared with the literal stored in tests/metadata.result.  On
    mismatch a unified diff of the pretty-printed values is printed before
    AssertionError is raised.
    """
    #XXX check unicode!
    print("TESTING metadata")
    # `import lxml` alone does not guarantee the etree submodule is loaded.
    import lxml.etree
    f = open('tests/metadata.xml')
    try:
        tree = lxml.etree.parse(f)
    finally:
        f.close()
    #nsmap = {'opf': 'http://www.idpf.org/2007/opf'}
    #metadata = root.xpath('.//opf:metadata', namespaces=nsmap)[0]
    results = [epub.parse_metadata(metadata)
               for metadata in tree.iter('{http://www.idpf.org/2007/opf}metadata')]

    f = open('tests/metadata.result')
    try:
        # NOTE: eval() executes whatever the fixture file contains; that is
        # only acceptable because the file is part of this repository.
        correct = eval(f.read())
    finally:
        f.close()

    if results != correct:
        # how to do semantic diff of dicts?
        from difflib import unified_diff
        print('\n'.join(unified_diff(pformat(results).split('\n'),
                                     pformat(correct).split('\n'))))
        raise AssertionError('bad metadata parsing')
def test_parse_manifest():
    """Check epub.parse_manifest() output for every sample book.

    Each manifest value must be a (name, mimetype) pair of strings, and
    each name must be present in the book's `names`.  Per-book and overall
    mimetype statistics are printed along the way.
    """
    # manifest should be dict of ids pointing to name, mime-type pairs
    # names should be found in zipfile
    mimetype_totals = {}
    for book in TEST_FILES:
        manifest, e = _get_elements(book, ['manifest'])
        # Manifest entries are parsed relative to the OPF file's directory.
        files = epub.parse_manifest(manifest, os.path.dirname(e.opf_file))
        seen_types = set()
        archive_names = e.names

        for name, mimetype in files.values():
            assert isinstance(name, basestring)
            assert isinstance(mimetype, basestring)
            seen_types.add(mimetype)
            mimetype_totals[mimetype] = mimetype_totals.get(mimetype, 0) + 1
            if name not in archive_names:
                # Show the context before the assertion below fires.
                print("%s %s %s" % (book, name, archive_names))
            assert name in archive_names

        print("%s: %s files, %s different types"
              % (book, len(files), len(seen_types)))

    for entry in mimetype_totals.items():
        print("%30s: %s" % entry)
def test_parse_spine():
    """Check the (toc, order) pair from epub.parse_spine() for every book."""
    #every item in the spine should be a string
    # the toc should be a string
    #no duplicates
    for book in TEST_FILES:
        spine, e = _get_elements(book, ('spine',))
        toc, order = epub.parse_spine(spine)
        assert isinstance(order, (list, tuple))

        toc_is_string = isinstance(toc, basestring)
        if not toc_is_string:
            # Show the offending value before the assertion below fires.
            print("%s %s %s" % (book, toc, basestring))
        assert isinstance(toc, basestring)

        non_strings = [x for x in order if not isinstance(x, basestring)]
        assert not non_strings
        assert len(set(order)) == len(order)
304 #XXX turned off because the archive.org ones fail, but I can't just dismiss them.
def _test_spine_manifest_match():
    """Cross-check the spine against the manifest for every sample book.

    (The leading underscore in the name is kept from the original.)

    For each book:
      * the toc id must not appear in the spine order;
      * every spine id must be in the manifest (files.pop raises KeyError
        otherwise); spine items that are not 'application/xhtml+xml' are
        collected and reported at the end;
      * the toc entry must be 'application/x-dtbncx+xml';
      * no NCX or XHTML manifest entry may be left over once the spine and
        the toc have been removed.

    Raises AssertionError on any violation.
    """
    #every item in the spine should be in the manifest (thence in the zip, tested above)
    #every xhtml in the manifest should be in the spine. (XXX unless there are fallbacks)
    spine_types = ('application/x-dtbncx+xml', 'application/xhtml+xml')
    bad_spine_files = []
    for book in TEST_FILES:
        spine, manifest, e = _get_elements(book, ('spine', 'manifest'))
        toc, order = epub.parse_spine(spine)
        pwd = os.path.dirname(e.opf_file)
        files = epub.parse_manifest(manifest, pwd)

        assert toc not in order
        for x in order:
            name, mimetype = files.pop(x)
            if mimetype != 'application/xhtml+xml':
                bad_spine_files.append((book, name, mimetype))

        name, mimetype = files.pop(toc)
        assert mimetype == 'application/x-dtbncx+xml'

        # This must be a real list: a generator would be consumed by the
        # first any() below, leaving the printed set incomplete and letting
        # the final assertion pass on an exhausted iterator.
        remaining = [x[1] for x in files.values()]
        has_leftovers = any(x in spine_types for x in remaining)
        if has_leftovers:
            print("%s %s" % (book, set(remaining)))

        assert not has_leftovers

    if bad_spine_files:
        # Summarise per book how many spine items had each wrong mimetype.
        bsf = {}
        for book, fn, mt in bad_spine_files:
            mimecount = bsf.setdefault(book, {})
            mimecount[mt] = mimecount.get(mt, 0) + 1

        pprint(bsf)

        raise AssertionError('bad spine files in %s' % bsf.keys())