3 """tests for epub.py"""
7 from pprint
import pprint
, pformat
12 DC
= "http://purl.org/dc/elements/1.1/"
13 TEST_FILE_DIR
= 'tests/epub-examples/'
14 TEST_FILES
= sorted( TEST_FILE_DIR
+ x
for x
in os
.listdir(TEST_FILE_DIR
) if x
.endswith('.epub'))
15 #print '\n'.join(os.path.basename(x) for x in TEST_FILES)
19 ## Conrad - Heart of Darkness.epub
20 ## Cowan-Kimberly.epub
21 ## Doctorow - I, Robot.epub
22 ## Doyle - The Adventures of Sherlock Holmes.epub
24 ## Grimm - Grimm's Fairy Tales.epub
26 ## Lang - The Arabian Nights.epub
29 ## Melville - Moby-Dick.epub
30 ## Stevenson - Treasure Island.epub
31 ## Tolstoy - Ivan the Fool.epub
32 ## Treasure_Island.epub
33 ## Twain - The Adventures of Huckleberry Finn.epub
34 ## Wells - The War of the Worlds.epub
35 ## Wilde - The Importance of Being Earnest.epub
36 ## beaglehole-letter-61.epub
37 ## cowan-to-fildes.epub
38 ## ctaquarterly13197477chic.epub
39 ## cyclopedia-wellington.epub
40 ## darwin-autobiography-of-charles-darwin.epub
41 ## early-life-notes.epub
42 ## halfhoursinfarno00newy.epub
45 ## ia-letters-from-cat.epub
48 ## littleroadstoryo00hick.epub
50 ## pg29904-images.epub
54 ## songssourdough00servuoft.epub
55 ## stevenson-black-arrow.epub
56 ## swift-a-modest-proposal.epub
58 ## twain-adventures-of-huckleberry-finn.epub
59 ## war-economy-recipes.epub
60 ## wells-calibre-pathological.epub
63 if isinstance(x
, int):
67 elif isinstance(x
, basestring
):
69 if x
in fn
.rsplit('/', 1)[1]:
73 #TEMPDIR = tempfile.mkdtemp(prefix='epub-')
75 def _load_epub(filename
, verbose
=False):
76 fn
= _test_file(filename
)
78 raise ValueError("'%s' doesn't refer to a known file" % filename
)
82 e
.load(open(fn
).read())
85 def _get_elements(book
, elements
):
88 tree
= e
.gettree(e
.opf_file
)
89 ns
= '{http://www.idpf.org/2007/opf}'
90 return [tree
.find(ns
+ x
) for x
in elements
] + [e
]
95 fn
= _test_file('Treasure_Island')
98 e
.load(open(fn
).read())
104 for book
, root
in [('Hume', 'OPS/content.opf'),
105 ('Treasure_Island', 'OEBPS/volume.opf'),
106 ('black-arrow', "OPS/epb.opf"),
107 ('LittleBrother', "metadata.opf"),
112 assert e
.opf_file
== root
116 for book
in ['ctaquarterly', 'letter-61',
117 'early-life', 'LittleBrother'
123 for a
, t
in [('metadata', dict),
126 ('ncxfile', basestring
),
129 assert isinstance(getattr(e
, a
), t
)
132 def test_metadata_count():
134 for book
in TEST_FILES
:
140 for ns
, values
in md
.items():
141 nsdict
= counts
.setdefault(ns
, {})
142 for k
, v
in values
.items():
145 nsdict
[k
] = nsdict
.get(k
, 0) + 1
150 def test_metadata_conformance():
152 #identifier title language
153 for book
in TEST_FILES
:
160 for x
in ('identifier', 'title', 'language'):
164 #the unique-identifier attribute of the package element is a
165 #correct XML IDREF to a Dublin Core identifier element; and
167 #any extended values specified for the Dublin Core creator and
168 #contributor elements' OPF role attribute must be taken from the
169 #registered MARC Relator Code list or must begin with oth.; and
172 def test_example_ncx():
174 f
= open('tests/example.ncx')
175 tree
= lxml
.etree
.parse(f
)
177 data
= epub
.parse_ncx(tree
)
179 f
= open('tests/example.ncx.result')
180 answer
= eval(f
.read())
182 assert data
== answer
185 #XXX not very comprehensive.
186 for guts
in ('', "hello", "<h1>HELLO!</h1>"):
187 doc
= epub
.new_doc(guts
=guts
)
189 body
= doc
.iter(epub
.XHTMLNS
+ 'body').next()
190 print "got %s" % body
191 except StopIteration:
192 body
= doc
.iter('body').next()
193 guts2
= body
.text
or ''
195 guts2
+= lxml
.etree
.tostring(x
)
196 if body
.tail
is not None:
201 ## def test_parse_ncx():
202 ## for book in TEST_FILES:
204 ## e = _load_epub(book, verbose=True)
208 ## #pprint(e.ncxdata)
211 for book
in TEST_FILES
:
217 f
= open('tests/json/' + os
.path
.basename(book
) + '.js', 'w')
218 print >> f
, '/* %s */' % book
224 def test_find_language():
225 for book
in TEST_FILES
:
230 print e
.find_language(), book
234 def test_parse_metadata():
237 print "TESTING metadata"
239 f
= open('tests/metadata.xml')
240 tree
= lxml
.etree
.parse(f
)
242 #nsmap = {'opf': 'http://www.idpf.org/2007/opf'}
243 #metadata = root.xpath('.//opf:metadata', namespaces=nsmap)[0]
245 for metadata
in tree
.iter('{http://www.idpf.org/2007/opf}metadata'):
246 results
.append(epub
.parse_metadata(metadata
))
248 f
= open('tests/metadata.result')
249 correct
= eval(f
.read())
252 if results
!= correct
:
253 # how to do semantic diff of dicts?
254 from difflib
import unified_diff
255 print '\n'.join(unified_diff(pformat(results
).split('\n'), pformat(correct
).split('\n')))
256 raise AssertionError('bad metadata parsing')
261 def test_parse_manifest():
262 # manifest should be dict of ids pointing to name, mime-type pairs
263 # names should be found in zipfile
265 for book
in TEST_FILES
:
266 manifest
, e
= _get_elements(book
, ['manifest'])
267 pwd
= os
.path
.dirname(e
.opf_file
)
268 files
= epub
.parse_manifest(manifest
, pwd
)
273 for name
, mimetype
in files
.values():
274 assert isinstance(name
, basestring
)
275 assert isinstance(mimetype
, basestring
)
276 mimetypes
.add(mimetype
)
277 all_mimetypes
[mimetype
] = all_mimetypes
.get(mimetype
, 0) + 1
278 if name
not in filenames
:
279 print book
, name
, filenames
280 assert name
in filenames
282 print "%s: %s files, %s different types" % (book
, len(files
), len(mimetypes
))
284 for x
in all_mimetypes
.items():
288 def test_parse_spine():
289 #every item in the spine should be a string
290 # the toc should be a string
292 for book
in TEST_FILES
:
293 spine
, e
= _get_elements(book
, ('spine',))
294 toc
, order
= epub
.parse_spine(spine
)
295 assert isinstance(order
, (list, tuple))
296 if not isinstance(toc
, basestring
):
297 print book
, toc
, basestring
299 assert isinstance(toc
, basestring
)
300 assert all(isinstance(x
, basestring
) for x
in order
)
301 assert len(order
) == len(set(order
))
304 #XXX turned off because the archive.org ones fail, but I can't just dismiss them.
305 def _test_spine_manifest_match():
306 #every item in the spine should be in the manifest (thence in the zip, tested above)
307 #every xhtml in the manifest should be in the spine. (XXX unless there are fallbacks)
309 for book
in TEST_FILES
:
311 spine
, manifest
, e
= _get_elements(book
, ('spine', 'manifest'))
312 toc
, order
= epub
.parse_spine(spine
)
313 pwd
= os
.path
.dirname(e
.opf_file
)
314 files
= epub
.parse_manifest(manifest
, pwd
)
316 assert toc
not in order
319 name
, mimetype
= files
.pop(x
)
320 if mimetype
!= 'application/xhtml+xml':
321 bad_spine_files
.append((book
, name
, mimetype
))
323 name
, mimetype
= files
.pop(toc
)
324 assert mimetype
== 'application/x-dtbncx+xml'
325 remaining
= (x
[1] for x
in files
.values())
326 if any(x
in ('application/x-dtbncx+xml', 'application/xhtml+xml') for x
in remaining
):
327 print book
, set(remaining
)
329 assert not any(x
in ('application/x-dtbncx+xml', 'application/xhtml+xml') for x
in remaining
)
333 for book
, fn
, mt
in bad_spine_files
:
334 mimecount
= bsf
.setdefault(book
, {})
335 mimecount
[mt
] = mimecount
.get(mt
, 0) + 1
339 raise AssertionError('bad spine files in %s' % bsf
.keys())