catch None urls in another place
[objavi2.git] / objavi / epub.py
blob180ede8c7823460e55431ca29f0f83690fe161f9
1 """Module for dealing with epub -> booki conversions."""
3 import os, sys
4 import zipfile
5 from cStringIO import StringIO
7 try:
8 from json import dumps
9 except ImportError:
10 from simplejson import dumps
12 import lxml.html, lxml.cssselect
13 from lxml import etree
15 from objavi.xhtml_utils import split_tree
16 from objavi.book_utils import log
17 from objavi.config import DC, XHTML, XHTMLNS, FM, MARKER_CLASS_INFO, MARKER_CLASS_SPLIT
18 from booki.bookizip import BookiZip
20 #XML namespaces. The *NS varients are in {curly brackets} for clark's syntax
21 XMLNS = '{http://www.w3.org/XML/1998/namespace}'
22 DAISYNS = '{http://www.daisy.org/z3986/2005/ncx/}'
23 OPFNS = '{http://www.idpf.org/2007/opf}'
24 CONTAINERNS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
26 MARKUP_TYPES = ('application/xhtml+xml', 'text/html', "application/x-dtbncx+xml")
27 HTML_TYPES = ('application/xhtml+xml', 'text/html')
29 ADD_INFO_MARKERS = False
32 html_parser = lxml.html.HTMLParser(encoding="utf-8")
33 xhtml_parser = lxml.html.XHTMLParser(encoding="utf-8")
def _xhtml_parse(*args, **kwargs):
    """lxml.html.parse, forced to use the utf-8 XHTML parser."""
    options = dict(kwargs, parser=xhtml_parser)
    return lxml.html.parse(*args, **options)
def _html_parse(*args, **kwargs):
    """lxml.html.parse, forced to use the utf-8 HTML parser."""
    options = dict(kwargs, parser=html_parser)
    return lxml.html.parse(*args, **options)
def new_doc(guts="", version="1.1", lang=None):
    """Create a new XHTML document tree.

    guts    -- markup to place inside <body>
    version -- XHTML version ('1.1' or '1.0'); selects the doctype
    lang    -- language code, or None/'und' for no language declaration

    Returns an lxml tree.
    """
    xmldec = '<?xml version="1.0" encoding="UTF-8"?>'
    doctypes = {
        '1.1': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'),
        '1.0': ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n')
    }

    if lang in (None, 'und', 'UND'):
        langdec = ''
    else:
        langdec = 'xml:lang="%s" lang="%s"' % (lang, lang)

    doc = ('<html xmlns="%s" version="XHTML %s" %s>'
           '<head></head><body>%s</body></html>'
           % (XHTML, version, langdec, guts))

    f = StringIO(xmldec + doctypes.get(version, '') + doc)
    tree = lxml.html.parse(f)
    f.close()
    return tree
class EpubError(Exception):
    """Raised when an epub container is malformed or missing pieces."""
70 class Epub(object):
71 """
73 Abstract Container:
74 META-INF/
75 container.xml
76 [manifest.xml]
77 [metadata.xml]
78 [signatures.xml]
79 [encryption.xml]
80 [rights.xml]
81 OEBPS/
82 Great Expectations.opf
83 cover.html
84 chapters/
85 chapter01.html
86 chapter02.html
87 <other HTML files for the remaining chapters>
89 """
90 def load(self, src):
91 # Zip is a variable format, and zipfile is limited. If that
92 # becomes a problem we will have to ise an `unzip` subprocess,
93 # but it hasn't been so far.
94 if isinstance(src, str):
95 # Should end with PK<06><05> + 18 more.
96 # Some zips contain 'comments' after that, which breaks ZipFile
97 zipend = src.rfind('PK\x05\x06') + 22
98 if len(src) != zipend:
99 log('Bad zipfile?')
100 src = src[: zipend]
101 src = StringIO(src)
102 self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
103 self.names = self.zip.namelist()
104 self.info = self.zip.infolist()
105 self.origin = src
107 def gettree(self, name=None, id=None, parse=etree.parse):
108 """get an XML tree from the given zip filename or manifest ID"""
109 if name is None:
110 name, mimetype = self.manifest[id]
111 #Note: python 2.6 (not 2.5) has zipfile.open
112 s = self.zip.read(name)
113 f = StringIO(s)
114 tree = parse(f)
115 f.close()
116 return tree
118 def parse_meta(self):
119 '''META-INF/container.xml contains one or more <rootfile>
120 nodes. We want the "application/oepbs-package+xml" one.
122 <rootfile full-path="OEBPS/Great Expectations.opf" media-type="application/oebps-package+xml" />
123 <rootfile full-path="PDF/Great Expectations.pdf" media-type="application/pdf" />
125 Other files are allowed in META-INF, but none of them are much
126 use. They are manifest.xml, metadata.xml, signatures.xml,
127 encryption.xml, and rights.xml.
129 tree = self.gettree('META-INF/container.xml')
130 for r in tree.getiterator(CONTAINERNS + 'rootfile'):
131 if r.get('media-type') == "application/oebps-package+xml":
132 rootfile = r.get('full-path')
133 break
134 else:
135 raise EpubError("No OPF rootfile found")
137 self.opf_file = rootfile
139 def parse_opf(self):
141 The opf file is arranged like this:
142 <package>
143 <metadata />
144 <manifest />
145 <spine />
146 <guide />
147 </package>
149 Metadata, manifest and spine are parsed in separate helper
150 functions.
152 self.opfdir = os.path.dirname(self.opf_file) #needed for manifest parsing
153 tree = self.gettree(self.opf_file)
154 root = tree.getroot()
155 metadata = root.find(OPFNS + 'metadata')
156 manifest = root.find(OPFNS + 'manifest')
157 spine = root.find(OPFNS + 'spine')
159 self.metadata = parse_metadata(metadata)
160 self.manifest = parse_manifest(manifest, self.opfdir)
161 # mapping of filenames to new filenames. This needs to be
162 # done early to detect clashes (e.g. '/images/hello.jpg' and
163 # '/images/big/hello.jpg' would both reduce to
164 # 'static/hello.jpg').
165 self.media_map = {}
166 for k, v in self.manifest.items():
167 fn, mimetype = v
168 if isinstance(fn, unicode):
169 log('Stupid unicode: %r' % fn)
171 if mimetype not in MARKUP_TYPES:
172 oldfn = fn
173 if '/' in fn:
174 fn = fn.rsplit('/', 1)[1]
175 while fn in self.media_map.values():
176 fn = '_' + fn
177 newfn = 'static/%s' % fn
178 self.media_map[oldfn] = newfn
180 ncxid, self.spine = parse_spine(spine)
181 self.ncxfile = self.manifest[ncxid][0]
183 #there is also an optional guide section, which we ignore
184 guide = root.find(OPFNS + 'guide')
185 if guide is not None:
186 self.guide = parse_guide(guide)
187 else:
188 self.guide = None
191 def parse_ncx(self):
192 ncx = self.gettree(self.ncxfile)
193 self.ncxdata = parse_ncx(ncx)
195 def raw_json(self):
196 """get all the known metadata and nav data as json."""
197 data = {
198 'metadata': self.metadata,
199 'manifest': self.manifest,
200 'spine': self.spine,
201 'ncx': self.ncxdata
203 if self.guide is not None:
204 data['guide'] = self.guide
205 return dumps(data, indent=2)
207 def find_language(self):
208 opflang = [x[0].lower() for x in
209 self.metadata.get(DC, {}).get('language', ())]
211 # XXX Should the ncx language enter into it? Being xml:lang,
212 # it is in theory just the language of the ncx document
213 # itself. But if the metadata lacks language, should it be
214 # used instead? At present, NO.
215 #ncxlang = self.ncxdata['headers'].get('lang', ())
217 # XXX also, for now, ignoring case of badly formed language
218 # codes, conflicting or supplementary languages, etc.
219 opflang = [x for x in opflang if x not in ('und', '')]
220 if not opflang:
221 return None
222 if len(set(opflang)) > 1:
223 log('%s metadata has more than one language: %s -- using first one'
224 % (self.origin, opflang))
225 return opflang[0]
227 def find_probable_chapters(self):
228 """Try to find the real chapters from the NCX file. The
229 problem is that different epubs all use their own level of
230 nesting."""
231 # the Black Arrow has (book 1 (c1, c2, c3), book2 (c4, c5, c6..))
232 # and FM books have (section 1 (c1, c2,..),..)
233 # i.e super-chapter blocks
234 # some have (((c1, c2, c3))) -- deeply nested chapters
235 # some have no real chapters, but stupid structure
236 points = self.ncxdata['navmap']['points']
237 pwd = os.path.dirname(self.ncxfile)
238 serial_points, splits = get_chapter_breaks(points, pwd)
239 return serial_points, splits
241 def concat_document(self):
242 """Join all the xhtml files together, putting in markers
243 indicating where the splits should be.
245 lang = self.find_language()
246 points = self.ncxdata['navmap']['points']
247 pwd = os.path.dirname(self.ncxfile)
248 serial_points, chapter_markers = get_chapter_breaks(points, pwd)
249 doc = new_doc(lang=lang)
250 #log(chapter_markers)
251 for ID in self.spine:
252 fn, mimetype = self.manifest[ID]
253 if mimetype.startswith('image'):
254 root = lxml.html.Element('html')
255 body = etree.SubElement(root, 'body')
256 first_el = etree.SubElement(body, 'img', src=self.media_map.get(fn, fn), alt='')
257 else:
258 tree = self.gettree(fn, parse=_html_parse)
259 root = tree.getroot()
260 body = _find_tag(root, 'body')
261 if not len(body) and ADD_INFO_MARKERS:
262 add_marker(body, 'espri-empty-file-%s' % ID, title=fn, child=True)
263 first_el = body[0]
264 #point the links to the new names. XXX probably fragile
265 root.rewrite_links(lambda x: self.media_map.get(os.path.join(self.opfdir, x), x))
267 for depth, fragment, point in chapter_markers.get(fn, ()):
268 if fragment:
269 start = root.xpath("//*[@id='%s']" % fragment)[0]
270 else:
271 start = first_el
272 labels = point['labels']
273 add_marker(start, '%(id)s' % point,
274 klass=MARKER_CLASS_SPLIT,
275 title=find_good_label(labels, lang),
276 subsections=str(bool(point['points'])))
278 if ADD_INFO_MARKERS:
279 add_marker(first_el, 'espri-new-file-%s' % ID, title=fn)
280 add_guts(root, doc)
281 return doc
284 def make_bookizip(self, zfn):
285 """Split up the document and construct a booki-toc for it."""
286 doc = self.concat_document()
287 bz = BookiZip(zfn)
289 chapters = split_tree(doc) #destroys doc.
290 real_chapters = drop_empty_chapters(chapters)
291 rightsholders = [c for c, extra in self.metadata[DC].get('creator', ())]
292 contributors = rightsholders + [c for c, extra in self.metadata[DC].get('contributor', ())]
294 spine = []
295 for c in real_chapters:
296 try:
297 root = c.tree.getroot()
298 except Exception:
299 root = c.tree
300 try:
301 del root.attrib['xmlns']
302 del root.attrib['version']
303 del root.attrib['xml:lang']
304 except KeyError, e:
305 log(e)
306 if c.title:
307 head = root.makeelement('head')
308 _title = etree.SubElement(head, 'title')
309 _title.text = c.title
310 root.insert(0, head)
311 blob = lxml.html.tostring(c.tree)
312 bz.add_to_package(c.ID, '%s.html' % c.ID, blob, mediatype='text/html',
313 contributors=contributors,
314 rightsholders=rightsholders)
315 spine.append(c.ID)
317 #add the images and other non-html data unchanged.
318 for id, data in self.manifest.iteritems():
319 fn, mimetype = data
320 if isinstance(fn, unicode):
321 log("Hateful unicode: %r" % fn)
322 if mimetype not in MARKUP_TYPES:
323 blob = self.zip.read(fn)
324 bz.add_to_package(id, self.media_map[fn], blob, mimetype,
325 contributors=contributors,
326 rightsholders=rightsholders
329 #now to construct a table of contents
330 lang = self.find_language()
332 deferred_urls = []
333 def write_toc(point, section):
334 tocpoint = {}
335 title = find_good_label(point['labels'], lang),
336 if title and title[0]:
337 tocpoint['title'] = title[0]
338 ID = point['id']
339 if ID in spine:
340 tocpoint['url'] = self.manifest.get(ID, ID + '.html')
341 while deferred_urls:
342 tp = deferred_urls.pop()
343 tp['url'] = tocpoint['url']
344 log('%r has deferred url: %r' % (tp['title'], tp['url']))
345 else:
346 deferred_urls.append(tocpoint)
347 if point['points']:
348 tocpoint['children'] = []
349 for child in point['points']:
350 write_toc(child, tocpoint['children'])
352 section.append(tocpoint)
354 toc = []
355 points = self.ncxdata['navmap']['points']
356 for p in points:
357 write_toc(p, toc)
359 metadata = {FM: {'book':{},
360 'server': {},
362 DC: {}}
364 for namespace, keys in self.metadata.items():
365 if 'namespace' not in metadata:
366 metadata[namespace] = {}
367 log(keys)
368 for key, values in keys.items():
369 dest = metadata[namespace].setdefault(key, {})
370 for value, extra in values:
371 scheme = ''
372 if extra:
373 for x in ('scheme', 'role'):
374 if x in extra:
375 scheme = extra[x]
376 break
377 dest.setdefault(scheme, []).append(value)
379 if not metadata[FM]['book']:
380 metadata[FM]['book'][''] = [''.join(x for x in str(metadata[DC]['identifier'][''][0]) if x.isalnum())]
381 if not metadata[FM]['server']:
382 metadata[FM]['server'][''] = ['booki.flossmanuals.net']
384 log(metadata)
386 bz.info = {
387 'spine': spine,
388 'TOC': toc,
389 'metadata': metadata,
390 'version': '1',
393 bz.finish()
def find_good_label(labels, lang=None):
    """Try to find a suitable label from a dictionary mapping
    languages to labels, resorting to a random label if need be.

    Regional sub-tags are now handled: if e.g. 'en_GB' or 'en-GB' is
    requested but absent, a plain 'en' label is used before falling
    back to the None key or an arbitrary join of all labels.
    """
    candidates = [lang]
    if lang:
        # strip a regional sub-tag ("en_GB"/"en-GB" -> "en")
        base = lang.replace('-', '_').split('_', 1)[0]
        if base != lang:
            candidates.append(base)
    candidates.append(None)
    for x in candidates:
        if x in labels:
            return labels[x]
    if labels:
        #return random.choice(labels.values())
        return ' | '.join(labels.values())
    return None
409 #labels.get(lang, '\n'.join(labels.values())),
def drop_empty_chapters(chapters):
    """If the chapter has no content, ignore it. Content is defined
    as images or text."""
    def _has_content(chapter):
        # any non-whitespace text, tail text, or an image counts
        for node in chapter.tree.iter():
            if node.tag in ('img',):
                return True
            if (node.text and node.text.strip()) or \
               (node.tail and node.tail.strip()):
                return True
        return False

    return [c for c in chapters if _has_content(c)]
def add_guts(src, dest):
    """Append the contents of the <body> of one tree onto that of
    another. The source tree will be emptied."""
    #print lxml.etree.tostring(src)
    sbody = _find_tag(src, 'body')
    dbody = _find_tag(dest, 'body')
    # merge the source body's leading text into the destination
    if len(dbody):
        dbody[-1].tail = ((dbody[-1].tail or '') +
                          (sbody.text or '')) or None
    else:
        dbody.text = sbody.text

    # BUG FIX: iterate over a snapshot.  lxml's append() *moves* the
    # element out of sbody, and mutating sbody while iterating it
    # skips every other child, leaving half the content behind.
    for x in list(sbody):
        dbody.append(x)

    dbody.tail = ((dbody.tail or '') +
                  (sbody.tail or '')) or None
def _find_tag(doc, tag):
    """Return the first element matching `tag` in a tree or element,
    preferring the XHTML namespace when the document uses namespaces."""
    #log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace('&#13;', ''))
    if hasattr(doc, 'getroot'):
        doc = doc.getroot()
    if doc.nsmap:
        try:
            return doc.iter(XHTMLNS + tag).next()
        except StopIteration:
            log('doc had nsmap %s, but did not seem to be xhtml (looking for %s)' % (doc.nsmap, tag))
    return doc.iter(tag).next()
def add_marker(el, ID, child=False, klass=MARKER_CLASS_INFO, **kwargs):
    """Add a marker before the element, or inside it if child is true"""
    marker = el.makeelement('hr')
    marker.set('id', ID)
    marker.set('class', klass)
    for attr, value in kwargs.items():
        marker.set(attr, value)
    if child:
        # first child of the element itself
        el.insert(0, marker)
    else:
        # sibling immediately before the element
        parent = el.getparent()
        parent.insert(parent.index(el), marker)
def get_chapter_breaks(points, pwd):
    """Flatten the ncx navpoints and group them by target file.

    Returns (serial_points, splits): serial_points is a list of
    (depth, point) tuples in document order; splits maps each content
    URL (joined with pwd) to a list of (depth, fragment-id, point).
    """
    # First go was overly complex, trying to guess which sections were
    # really chapters. Now, every ncx navpoint is a chapter break.
    serial_points = []
    def flatten(point, depth):
        serial_points.append((depth, point))
        #if point['class']:
        #    log("found class=='%s' at depth %s" % (point['class'], depth))
        for sub in point.get('points') or ():
            flatten(sub, depth + 1)

    for point in points:
        flatten(point, 1)

    splits = {}
    for depth, point in serial_points:
        url = os.path.join(pwd, point['content_src'])
        fragment = None
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, fragment = url.split('#', 1)
        splits.setdefault(url, []).append((depth, fragment, point))

    return serial_points, splits
def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    Returns a dict mapping XML namespace URIs to dicts that map tag
    names to lists of (value, extra-attributes) tuples.
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes. None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if name is None:
                # a <meta> with no name attribute (e.g. an OPF3
                # property meta) used to crash on the ':' test below.
                continue
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively). Of course, this
            # is true of any element anyway, so it is sufficent to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
def parse_manifest(manifest, pwd):
    """Parse an OPF <manifest> into a dict mapping item IDs to
    (href, media-type) tuples; hrefs are joined onto pwd.

    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the pbf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    </manifest>
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        if isinstance(href, unicode):
            log('damn unicode: %r' % href)
            log(etree.tostring(t))
        media_type = t.get('media-type')
        items[id] = (href, media_type) #XXX does media-type matter?

    return items
def parse_spine(spine):
    """The spine is an ordered list of xhtml documents (or dtbook, but
    Booki can't edit that, or manifest items that 'fallback' to xhtml,
    which Espri doesn't yet handle). Also, anything in the manifest
    that can be in the spine, must be.

    Spine itemrefs can have a 'linear' attribute, with a value of
    'yes' or 'no' (defaulting to 'yes'). If an item is linear, it is
    in the main stream of the book. Reader software is allowed to
    ignore this distinction, as Espri does.

    The toc attribute points to the ncx file (via manifest id).

    Returns (toc-manifest-id, [itemref idrefs in order]).
    """
    items = []
    ns = '{%s}' % spine.nsmap[None]
    for t in spine.iterchildren(ns + 'itemref'):
        items.append(t.get('idref'))

    toc = spine.get('toc')

    return toc, items
def parse_guide(guide):
    """Parse the guide from the opf file."""
    ns = '{%s}' % guide.nsmap[None]
    return [(ref.get('href'), ref.get('type'), ref.get('title'))
            for ref in guide.iterchildren(ns + 'reference')]
def get_ncxtext(e):
    """get text content from an <xx><text>...</text></xx> construct,
    as is common in NCX files."""
    # there will only be one <text>, but for...iter is still easiest;
    # return on the first hit, '' when there is none.
    for node in e.iter(DAISYNS + 'text'):
        return node.text
    return ''  # or leave it at None?
def get_labels(e, tag='{http://www.daisy.org/z3986/2005/ncx/}navLabel'):
    """Make a mapping of languages to labels."""
    # This reads navInfo or navLabel tags. navInfo is unlikely, but
    # navLabel is ubiquitous. There can be one for each language, so
    # construct a dict.
    labels = {}
    for label in e.iterchildren(tag):
        lang = label.get(XMLNS + 'lang')
        # BUG FIX: was get_ncxtext(e), which returned the first <text>
        # found anywhere under the *parent*, so every language got the
        # same string.  Read the text of this label instead.
        labels[lang] = get_ncxtext(label)
    return labels
def parse_ncx(ncx):
    """Parse an NCX tree into a dict of headers and navigation data.

    The NCX file is the closest thing to FLOSS Manuals TOC.txt. It
    describes the heirarchical structure of the document (wheras the
    spine describes its 'physical' structure).
    """
    #<!ELEMENT ncx (head, docTitle, docAuthor*, navMap, pageList?, navList*)>
    headers = {}
    #if a header is set multiple times, keep all
    def setheader(name, content, scheme=None):
        values = headers.setdefault(name, [])
        values.append((content, scheme))

    head = ncx.find(DAISYNS + 'head')
    #<!ELEMENT head (meta+)>
    for meta in head.iterchildren(DAISYNS + 'meta'):
        #whatever 'scheme' is
        setheader(meta.get('name'), meta.get('content'), meta.get('scheme'))

    root = ncx.getroot()
    for t in ('docTitle', 'docAuthor'):
        for e in root.iterchildren(DAISYNS + t):
            if e is not None:
                setheader(t, get_ncxtext(e))

    for attr, header in (('dir', 'dir'),
                         (XMLNS + 'lang', 'lang')):
        value = root.get(attr)
        if value is not None:
            setheader(header, value)

    navmap = root.find(DAISYNS + 'navMap')
    ret = {
        'headers': headers,
        'navmap': parse_navmap(navmap),
    }

    #Try adding these bits, even though no-one has them and they are no use.
    pagelist = ncx.find(DAISYNS + 'pageList')
    navlist = ncx.find(DAISYNS + 'navList')
    if pagelist is not None:
        ret['pagelist'] = parse_pagelist(pagelist)
    if navlist is not None:
        ret['navlist'] = parse_navlist(navlist)

    return ret
def parse_navmap(e):
    """Parse an NCX <navMap> into a dict of info, labels and points."""
    #<!ELEMENT navMap (navInfo*, navLabel*, navPoint+)>
    #XXX move info and labels out of navmap, and into headers?
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'points': tuple(parse_navpoint(x) for x in e.iterchildren(DAISYNS + 'navPoint')),
    }
def parse_navpoint(e):
    """Parse one NCX <navPoint> into a dict, recursing into children."""
    #<!ELEMENT navPoint (navLabel+, content, navPoint*)>
    c = e.find(DAISYNS + 'content')
    subpoints = tuple(parse_navpoint(x) for x in e.iterchildren(DAISYNS + 'navPoint'))
    return {
        'id': e.get('id'),
        'class': e.get('class'),
        'play_order': int(e.get('playOrder')),
        #'content_id': c.get('id'),
        'content_src': c.get('src'),
        'labels': get_labels(e),
        'points': subpoints,
    }
def parse_pagelist(e):
    """Parse an NCX <pageList> into a dict of info, labels and targets."""
    # <!ELEMENT pageList (navInfo*, navLabel*, pageTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_pagetarget(x) for x in e.iterchildren(DAISYNS + 'pageTarget')),
    }
def parse_pagetarget(e):
    """Parse one NCX <pageTarget> into a dict."""
    #<!ELEMENT pageTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'type': e.get('type'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    # 'value' is optional, so only include it when present
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret
def parse_navlist(e):
    """Parse an NCX <navList> into a dict of info, labels and targets."""
    #<!ELEMENT navList (navInfo*, navLabel+, navTarget+)>
    return {
        'info': get_labels(e, DAISYNS + 'navInfo'),
        'labels': get_labels(e),
        'targets': tuple(parse_navtarget(x) for x in e.iterchildren(DAISYNS + 'navTarget')),
    }
def parse_navtarget(e):
    """Parse one NCX <navTarget> into a dict."""
    #<!ELEMENT navTarget (navLabel+, content)>
    c = e.find(DAISYNS + 'content')
    ret = {
        'id': e.get('id'),
        'play_order': int(e.get('playOrder')),
        'content_src': c.get('src'),
        'labels': get_labels(e),
    }
    # 'value' is optional, so only include it when present
    value = e.get('value')
    if value is not None:
        ret['value'] = value
    return ret