# filter out chapters that have no text
# [objavi2.git] / epub.py
# blob 12e562169311fa8b89ccd34bfa4d246cc0d15f77
1 """Module for dealing with epub -> booki conversions."""
3 import os, sys
4 from pprint import pprint
5 import zipfile
6 from cStringIO import StringIO
7 import copy
9 try:
10 from json import dumps
11 except ImportError:
12 from simplejson import dumps
14 import lxml, lxml.html, lxml.cssselect
15 from lxml import etree
17 from booki.xhtml_utils import BookiZip
#XML namespaces. The *NS variants are in {curly brackets} for Clark's syntax
XMLNS = '{http://www.w3.org/XML/1998/namespace}'
DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
OPFNS = '{http://www.idpf.org/2007/opf}'
CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
XHTMLNS = '{http://www.w3.org/1999/xhtml}'

#plain (un-bracketed) namespace URIs, used for document generation
XHTML = 'http://www.w3.org/1999/xhtml'
DC = "http://purl.org/dc/elements/1.1/"

#media-types treated as markup (parsed/rewritten/split); everything
#else in the manifest is copied through as a static file.
MARKUP_TYPES = ('application/xhtml+xml', 'text/html', "application/x-dtbncx+xml")
HTML_TYPES = ('application/xhtml+xml', 'text/html')
def log(*messages, **kwargs):
    """Print each message to stderr.

    Falls back to repr() for objects that cannot be printed directly
    (e.g. unicode strings that don't fit the stream's encoding).
    kwargs is accepted but currently unused.
    """
    for m in messages:
        try:
            print >> sys.stderr, m
        except Exception:
            print >> sys.stderr, repr(m)
# Shared parser instances, forced to UTF-8 so byte input from the zip
# is decoded consistently regardless of HTTP/meta hints.
html_parser = lxml.html.HTMLParser(encoding="utf-8")
xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")
def _xhtml_parse(*args, **kwargs):
    """lxml.html.parse, with the module's UTF-8 XHTML parser forced."""
    kwargs = dict(kwargs, parser=xhtml_parser)
    return lxml.html.parse(*args, **kwargs)
def _html_parse(*args, **kwargs):
    """lxml.html.parse, with the module's UTF-8 HTML parser forced."""
    kwargs = dict(kwargs, parser=html_parser)
    return lxml.html.parse(*args, **kwargs)
def new_doc(guts="", version="1.1", lang=None):
    """Create a new XHTML document tree.

    guts    -- markup to place inside the <body>
    version -- '1.1' or '1.0' (selects the doctype; unknown versions
               get no doctype)
    lang    -- language code for xml:lang/lang attributes; None or
               'und'/'UND' means no language declaration.

    Returns an lxml tree.
    """
    xmldec = '<?xml version="1.0" encoding="UTF-8"?>'
    # NOTE(review): the extracted source had lost the closing brace of
    # this dict; restored here.
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'),
    }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    doc = ('<html xmlns="%s" version="XHTML %s" %s>'
           '<head></head><body>%s</body></html>'
           % (XHTML, version, langdec, guts))

    f = StringIO(xmldec + doctypes.get(version, '') + doc)
    tree = lxml.html.parse(f)
    f.close()
    return tree
class EpubError(Exception):
    """Raised when an epub container cannot be parsed as expected."""
    pass
class Epub(object):
    """Read an epub file and convert it into a booki-zip.

    An epub is a zip archive with this abstract structure:

    META-INF/
        container.xml
        [manifest.xml]
        [metadata.xml]
        [signatures.xml]
        [encryption.xml]
        [rights.xml]
    OEBPS/
        Great Expectations.opf
        cover.html
        chapters/
            chapter01.html
            chapter02.html
            <other HTML files for the remaining chapters>
    """
    def load(self, src):
        """Open the epub.  src is a filename, a file-like object, or a
        bytestring containing the whole zip."""
        # Zip is a variable format, and zipfile is limited. If that
        # becomes a problem we will have to use an `unzip` subprocess,
        # but it hasn't been so far.
        if isinstance(src, str):
            # Should end with PK\x05\x06 + 18 more bytes.
            # Some zips contain 'comments' after that, which breaks ZipFile
            zipend = src.rfind('PK\x05\x06') + 22
            if len(src) != zipend:
                log('Bad zipfile?')
                src = src[: zipend]
            src = StringIO(src)
        self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
        self.names = self.zip.namelist()
        self.info = self.zip.infolist()
        self.origin = src

    def gettree(self, name=None, id=None, parse=etree.parse):
        """get an XML tree from the given zip filename or manifest ID"""
        if name is None:
            name, mimetype = self.manifest[id]
        #Note: python 2.6 (not 2.5) has zipfile.open
        s = self.zip.read(name)
        f = StringIO(s)
        tree = parse(f)
        f.close()
        return tree

    def parse_meta(self):
        '''META-INF/container.xml contains one or more <rootfile>
        nodes. We want the "application/oebps-package+xml" one.

        <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
        <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />

        Other files are allowed in META-INF, but none of them are much
        use. They are manifest.xml, metadata.xml, signatures.xml,
        encryption.xml, and rights.xml.
        '''
        tree = self.gettree('META-INF/container.xml')
        for r in tree.getiterator(CONTAINERNS + 'rootfile'):
            if r.get('media-type') == "application/oebps-package+xml":
                rootfile = r.get('full-path')
                break
        else:
            raise EpubError("No OPF rootfile found")

        self.opf_file = rootfile

    def parse_opf(self):
        """
        The opf file is arranged like this:
        <package>
        <metadata />
        <manifest />
        <spine />
        <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file) #needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')
        #there is also an optional guide section, which we ignore

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)

        # mapping of filenames to new filenames. This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                # compare the *prefixed* name against stored values;
                # comparing the bare basename could never match, so
                # clashes went undetected.
                newfn = 'static/%s' % fn
                while newfn in self.media_map.values():
                    fn = '_' + fn
                    newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

    def parse_ncx(self):
        """Parse the NCX file (hierarchical table of contents)."""
        ncx = self.gettree(self.ncxfile)
        self.ncxdata = parse_ncx(ncx)

    def raw_json(self):
        """get all the known metadata and nav data as json."""
        data = {
            'metadata': self.metadata,
            'manifest': self.manifest,
            'spine': self.spine,
            'ncx': self.ncxdata,
        }
        return dumps(data, indent=2)

    def find_language(self):
        """Return the book's primary language code, or None."""
        opflang = [x[0].lower() for x in
                   self.metadata.get(DC, {}).get('language', ())]

        # XXX Should the ncx language enter into it? Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself. But if the metadata lacks language, should it be
        # used instead? At present, NO.
        #ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ('und', '')]
        if not opflang:
            return None
        if len(set(opflang)) > 1:
            log('%s metadata has more than one language: %s -- using first one'
                % (self.origin, opflang))
        return opflang[0]

    def find_probable_chapters(self):
        """Try to find the real chapters from the NCX file. The
        problem is that different epubs all use their own level of
        nesting."""
        # the Black Arrow has (book 1 (c1, c2, c3), book2 (c4, c5, c6..))
        # and FM books have (section 1 (c1, c2,..),..)
        # i.e super-chapter blocks
        # some have (((c1, c2, c3))) -- deeply nested chapters
        # some have no real chapters, but stupid structure
        points = self.ncxdata['navmap']['points']
        # get_chapter_breaks needs the ncx directory to resolve
        # relative urls (it was previously called without it, which
        # raised a TypeError).
        pwd = os.path.dirname(self.ncxfile)
        chapter_depth, serial_points, splits = get_chapter_breaks(points, pwd)
        return chapter_depth, serial_points, splits

    def concat_document(self):
        """Join all the xhtml files together, putting in markers
        indicating where the splits should be.
        """
        lang = self.find_language()
        points = self.ncxdata['navmap']['points']
        pwd = os.path.dirname(self.ncxfile)
        chapter_depth, serial_points, chapter_markers = get_chapter_breaks(points, pwd)
        doc = new_doc(lang=lang)
        #log(chapter_markers)
        for ID in self.spine:
            fn, mimetype = self.manifest[ID]
            if mimetype.startswith('image'):
                #an image in the spine gets its own wrapper document
                root = lxml.html.Element('html')
                body = etree.SubElement(root, 'body')
                first_el = etree.SubElement(body, 'img', src=self.media_map.get(fn, fn), alt='')
            else:
                tree = self.gettree(fn, parse=_html_parse)
                root = tree.getroot()
                first_el = _find_tag(root, 'body')[0]
                #point the links to the new names. XXX probably fragile
                root.rewrite_links(lambda x: self.media_map.get(os.path.join(self.opfdir, x), x))

            for depth, fragment, point in chapter_markers.get(fn, ()):
                if fragment:
                    start = root.xpath("//*[@id='%s']" % fragment)[0]
                else:
                    start = first_el
                labels = point['labels']
                add_marker(start, 'espri-chapter-%(id)s' % point,
                           title=labels.get(lang, '\n'.join(labels.values())),
                           subsections=str(bool(point['points'])))

            add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
            add_guts(root, doc)
        return doc

    def make_bookizip(self, zfn):
        """Split up the document and construct a booki-toc for it."""
        doc = self.concat_document()
        bz = BookiZip(zfn)

        chapters = split_document(doc)
        real_chapters = drop_empty_chapters(chapters)

        spine = []
        for id, title, tree in real_chapters:
            if title:
                # getroot() only exists on trees, not elements
                try:
                    root = tree.getroot()
                except AttributeError:
                    root = tree
                head = root.makeelement('head')
                _title = etree.SubElement(head, 'title')
                _title.text = title
                root.insert(0, head)
            blob = etree.tostring(tree)
            bz.add_to_package(id, '%s.html' % id,
                              blob, mediatype='text/html')
            spine.append(id)

        #add the images and other non-html data unchanged.
        for id, data in self.manifest.items():
            fn, mimetype = data
            if mimetype not in MARKUP_TYPES:
                blob = self.zip.read(fn)
                bz.add_to_package(id, self.media_map[fn], blob, mimetype)

        #now to construct a table of contents
        def write_toc(point, section):
            #one (ID, filename-or-None) pair per navpoint, with nested
            #lists for subsections
            ID = point['id']
            if ID in spine:
                section.append((ID, ID + '.html'))
            else:
                section.append((ID, None))
            subsection = []
            for child in point['points']:
                write_toc(child, subsection)
            if subsection:
                section.append(subsection)

        toc = []
        points = self.ncxdata['navmap']['points']
        for p in points:
            write_toc(p, toc)

        bz.info = {
            'spine': spine,
            'TOC': toc,
            'metadata': self.metadata[DC],
            'copyright': {'The Contributors': [(x, 'primary') for x in spine]},
        }
        bz.finish()
def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it. Content is defined
    as images or text."""
    def has_content(tree):
        #any element with stripped text, a stripped tail, or an <img>
        #counts as content
        return any((el.text and el.text.strip()) or
                   (el.tail and el.tail.strip()) or
                   el.tag in ('img',)
                   for el in tree.iter())

    return [chapter for chapter in chapters if has_content(chapter[2])]
def copy_element(src, create):
    """Return a copy of the src element, with all its attributes and
    tail, using create to make the copy. create is probably an
    Element._makeelement method, to associate the copy with the right
    tree, but it could be etree.HTMLElement."""
    if isinstance(src.tag, basestring):
        dest = create(src.tag)
    else:
        # non-element nodes (comments, processing instructions) have a
        # callable tag; clone them wholesale instead.
        dest = copy.copy(src)

    for k, v in src.items():
        dest.set(k, v)
    dest.tail = src.tail
    return dest
def split_document(doc):
    """Split the document along chapter boundaries."""
    try:
        root = doc.getroot()
    except AttributeError:
        root = doc

    # anything appearing before the first chapter marker collects in
    # this synthetic first chapter
    front_matter = copy_element(root, lxml.html.Element)
    chapters = [
        ('espri-unindexed-front-matter', 'Unindexed Front Matter', front_matter),
    ]
    _climb_and_split(root, front_matter, chapters)
    return chapters
def _climb_and_split(src, dest, chapters):
    # Recursively copy src's children into dest.  When a chapter
    # marker <hr> is met, start a fresh destination tree (rebuilding
    # the current chain of ancestors) and append it to chapters; all
    # subsequent content flows into the new tree.
    for child in src.iterchildren():
        if child.tag == 'hr' and child.get('class') == MARKER_CLASS:
            ID = child.get('id')
            if ID.startswith('espri-chapter-'):
                title = child.get('title') or ID
                new = copy_element(src, lxml.html.Element)
                root = new

                # wrap copies of all ancestors around the new element
                # so the chapter is a complete document fragment
                for a in src.iterancestors():
                    a2 = copy_element(a, root.makeelement)
                    a2.append(root)
                    root = a2

                # len('espri-chapter-') == 14; strip the prefix
                chapters.append((ID[14:], title, root))

                # clear trailing text on the old destination chain
                # (copy_element copied the tails, which belong to the
                # next chapter, not this one)
                dest.tail = None
                for a in dest.iterancestors():
                    a.tail = None

                dest = new
            else:
                # a marker we don't split on (e.g. new-file markers)
                log("skipping %s" % etree.tostring(child))

        else:
            new = copy_element(child, dest.makeelement)
            new.text = child.text
            dest.append(new)
            _climb_and_split(child, new, chapters)
def save_chapters(chapters):
    """Debugging helper: write each chapter tree to /tmp/x<id>.html.

    chapters is a mapping of chapter id -> lxml tree.
    """
    for id, tree in chapters.items():
        string = lxml.html.tostring(tree, method='html')
        # use a context manager so the file handle is closed even if
        # write() fails (the original leaked the handle)
        with open('/tmp/x%s.html' % id, 'w') as f:
            f.write(string)
def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another. The source tree will be emptied."""
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    # the source body's leading text needs a home: the tail of the
    # destination's last child, or the destination body's own text if
    # it has no children yet
    if len(dbody):
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    # appending an lxml element moves it, so this empties sbody
    for x in sbody:
        dbody.append(x)

    # carry over any text trailing the source body
    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None
def _find_tag(doc, tag):
    """Return the first element with the given tag name.

    doc may be a tree or an element.  If the document carries
    namespaces, the XHTML-namespaced tag is tried first, falling back
    to the bare tag.  Raises StopIteration if no match exists at all.
    """
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    if doc.nsmap:
        try:
            # next() builtin instead of the Python-2-only .next()
            # iterator method; works on 2.6+ and 3.x alike
            return next(doc.iter(XHTMLNS + tag))
        except StopIteration:
            log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return next(doc.iter(tag))
MARKER_CLASS = "espri-marker"

def add_marker(el, ID, **kwargs):
    """Insert an <hr> marker element immediately before el.

    Extra keyword arguments become attributes on the marker.
    """
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', MARKER_CLASS)
    for attr, value in kwargs.items():
        marker.set(attr, value)
    parent = el.getparent()
    parent.insert(parent.index(el), marker)
def get_chapter_breaks(points, pwd=''):
    """Decide which NCX navpoints should start new chapters.

    points -- a sequence of navpoint dicts (as built by parse_navpoint)
    pwd    -- directory of the ncx file, used to resolve the relative
              content urls; defaults to '' (leave urls as-is) so the
              function can also be called with just the points.

    Returns (chapter_depth, serial_points, splits) where splits maps
    each content file name to a list of (depth, fragment-id, point)
    break markers inside that file.
    """
    # first go is quite naive: go to deepest level that is in
    # every branch, not counting top level divisions (which may be
    # cover, prologue, etc).
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        if not p.get('points'):
            return depth
        #lcd == lowest common depth (> 1)
        lcd = 1e999
        for child in p['points']:
            bottom = serialise(child, depth + 1)
            lcd = min(bottom, lcd)
        return lcd

    lcd = 999
    depths = []
    for p in points:
        depth = serialise(p, 1)
        depths.append(depth)
        if 1 < depth < lcd:
            lcd = depth
    if lcd == 999:
        #completely flat navmap: break at the top level
        lcd = 1

    # The book should now be split on all the points at chapter depth
    # (lcd), and all higher points, but not if the higher point is at
    # the same location as the chapter. If the chapter start url has
    # a fragment id (e.g. "something.html#chapter-6"), then the split
    # is internal to the chapter. What the book serialiser needs is a
    # mapping from file names to the split-ids in that chapter, so
    # construct that.
    splits = {}
    for depth, p in serial_points:
        if depth > lcd:
            continue #ignore the sub-sections
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return lcd, serial_points, splits
def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    Returns a dict mapping namespace uris to {tag: [(value, extra), ...]}
    dicts.
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes. None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.items())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            if name is None:
                #not a name/content meta (e.g. an EPUB3 property
                #meta); skip it rather than crash on the ':' test
                continue
            others = tuple((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively). Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
def parse_manifest(manifest, pwd):
    """Parse an OPF <manifest> element into {id: (href, media-type)}.

    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the opf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    </manifest>

    hrefs are joined onto pwd (the directory of the opf file).
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        media_type = t.get('media-type')
        items[id] = (href, media_type) #XXX does media-type matter?

    return items
def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle). Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes'). If an item is linear, it is
    in the main stream of the book. Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).

    Returns (toc_id, [idref, ...]).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')

    return toc, items
def get_ncxtext(e):
    """get text content from an <xx><text>...</text></xx> construct,
    as is common in NCX files."""
    # the first (and normally only) <text> descendant wins; the empty
    # string is the fallback when no <text> exists at all
    return next((t.text for t in e.iter(DAISYNS + 'text')), '')
def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.findall(tag):
        lang = label.get(XMLNS + 'lang')
        # read the text of *this* label -- the original passed the
        # parent element, which made every language map to the first
        # label's text.
        labels[lang] = get_ncxtext(label)
    return labels
def parse_ncx(ncx):
    '''
    The NCX file is the closest thing to FLOSS Manuals TOC.txt. It
    describes the hierarchical structure of the document (whereas the
    spine describes its 'physical' structure).

    Returns a dict with 'headers' and 'navmap' keys, plus 'pagelist'
    and 'navlist' if present.
    '''
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.findall(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    for t in ('docTitle', 'docAuthor'):
        for e in ncx.findall(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    root = ncx.getroot()
    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
    }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
def parse_navmap(e):
    """Parse the NCX <navMap> into a dict of labels and nested points."""
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    #XXX move info and labels out of navmap, and into headers?
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint')),
    }
def parse_navpoint(e):
    """Parse a <navPoint> element (recursively) into a dict."""
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.findall(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }
def parse_pagelist(e):
    """Parse a <pageList> element into a dict."""
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.findall(DAISYNS + 'pageTarget')),
    }
def parse_pagetarget(e):
    """Parse a <pageTarget> element into a dict."""
    #<!ELEMENT pageTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    #'value' is optional, so only add it when present
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret
def parse_navlist(e):
    """Parse a <navList> element into a dict."""
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        #use the navTarget parser -- the original fed navTargets
        #through parse_pagetarget, leaving parse_navtarget unused and
        #inventing a meaningless 'type' key.
        'targets': tuple(parse_navtarget(x) for x in e.findall(DAISYNS + 'navTarget')),
    }
def parse_navtarget(e):
    """Parse a <navTarget> element into a dict."""
    #<!ELEMENT navTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    #'value' is optional, so only add it when present
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret