Don't crash if a TOC item has no URL
[objavi2.git] / objavi / fmbook.py
blob5f3ab6c4b4b21e22cf2669241350bf1ad0736d30
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 import copy
28 from subprocess import Popen, check_call, PIPE
29 from cStringIO import StringIO
30 from urllib2 import urlopen, HTTPError
31 import zipfile
32 import traceback
33 from string import ascii_letters
34 from pprint import pformat
36 try:
37 import json
38 except ImportError:
39 import simplejson as json
41 import lxml.html
42 from lxml import etree
44 from objavi import config, epub_utils
45 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
46 from objavi.book_utils import ObjaviError, log_types
47 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
48 from objavi.epub import add_guts, _find_tag
49 from objavi.xhtml_utils import EpubChapter, split_tree
50 from objavi.cgi_utils import url2path, path2url
52 from iarchive import epub as ia_epub
53 from booki.bookizip import get_metadata, add_metadata
# Working directory for intermediate build products (absolute, from config).
TMPDIR = os.path.abspath(config.TMPDIR)
# Web server document root; falls back to the configured htdocs path when
# DOCUMENT_ROOT is not set (e.g. when run outside CGI).
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Host of the current HTTP request; empty string when not under a web server.
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the (S3 upload URL, details page URL) pair for the
    archive.org item that holds this book."""
    item = 'booki-%s' % bookid
    s3url = 'http://s3.us.archive.org/%s/%s' % (item, bookname)
    detailsurl = 'http://archive.org/details/%s' % item
    return (s3url, detailsurl)
64 def _get_best_title(tocpoint):
65 if 'html_title' in tocpoint:
66 return tocpoint['html_title']
67 if 'title' in tocpoint:
68 return tocpoint['title']
69 return 'Untitled'
72 def _add_initial_number(e, n):
73 """Put a styled chapter number n at the beginning of element e."""
74 initial = e.makeelement("strong", Class="initial")
75 e.insert(0, initial)
76 initial.tail = ' '
77 if e.text is not None:
78 initial.tail += e.text
79 e.text = ''
80 initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Annotate every TOC point, recursively, with convenience fields:
    'depth', 'filename', 'fragment' and a serial 'index'.

    Returns the next unused index so recursive calls can continue the
    numbering where their parent left off.
    """
    for point in toc:
        # A TOC point may have no URL at all; treat that as ''.
        url = point.get('url', '').lstrip('/')
        if '#' in url:
            filename, fragment = url.split('#', 1)
        else:
            filename, fragment = url, None
        point['depth'] = depth
        point['filename'] = filename
        point['fragment'] = fragment
        point['index'] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
98 def _serialise(rtoc, stoc, depth):
99 for item in rtoc:
100 url = item['url'].lstrip('/')
101 bits = url.split('#', 1)
102 filename = bits[0]
103 fragment = (bits[1] if len(bits) == 2 else None)
104 stoc.append({"depth": depth,
105 "title": item['title'],
106 "url": url,
107 "filename": filename,
108 "fragment": fragment,
109 "type": item['type']
111 if 'children' in item:
112 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a flat list of
    serial points, each annotated with its overall 'position'."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each chapter filename to the list of TOC points that live in
    that file, in document order (all nesting depths flattened)."""
    tocmap = {}

    def walk(points):
        for point in points:
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                walk(point['children'])

    walk(rtoc)
    return tocmap
def save_data(fn, data):
    """Save *data* to the file *fn* without tripping up on unicode.

    Unicode text is encoded as UTF-8 (unencodable characters dropped);
    byte strings are written as-is.
    """
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    # 'with' guarantees the file is closed even if the write fails.
    with open(fn, 'w') as f:
        f.write(data)
class Book(object):
    """A book being converted from a Booki 'bookizip' into PDF/ODT/epub.

    The constructor fetches the zip, validates info.json, and sets up a
    temporary working directory plus output paths; the make_* methods
    drive the individual conversion pipelines.
    """
    # Passed to maker.number_pdf as numbers=...; 'latin' is presumably
    # plain 1, 2, 3 numbering -- see objavi.pdf for the actual semantics.
    page_numbers = 'latin'
    # Numbering style for the preamble (inside cover + contents);
    # presumably roman numerals i, ii, iii -- see objavi.pdf.
    preamble_page_numbers = 'roman'
149 def notify_watcher(self, message=None):
150 if self.watchers:
151 if message is None:
152 #message is the name of the caller
153 message = traceback.extract_stack(None, 2)[0][2]
154 log("notify_watcher called with '%s'" % message)
155 for w in self.watchers:
156 w(message)
158 def __enter__(self):
159 return self
161 def __exit__(self, exc_type, exc_value, tb):
162 self.notify_watcher(config.FINISHED_MESSAGE)
163 self.cleanup()
164 #could deal with exceptions here and return true
    def __init__(self, book, server, bookname,
                 page_settings=None, watchers=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        """Fetch the bookizip for *book* from *server* and prepare a
        temporary working directory for conversion.

        book          -- the book's identifier on the server
        server        -- hostname of the originating server
        bookname      -- filename under which results are published
        page_settings -- dict of keyword args for pdf.PageSettings
        watchers      -- callables notified after each processing stage
        isbn, license, title -- optional metadata overrides
        max_age       -- cache tolerance passed to fetch_zip

        Raises ObjaviError if info.json lacks a vital element; exits the
        process if the bookizip cannot be fetched at all.
        """
        log("*** Starting new book %s ***" % bookname)
        self.watchers = set()
        if watchers is not None:
            self.watchers.update(watchers)
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        # Random cookie used to make up unique fragment IDs in concat_html.
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
        except HTTPError, e:
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            #raise 502 Bad Gateway ?
            sys.exit()
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        # A bookizip missing any of these keys cannot be processed at all.
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
        #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
        if not self.dir:
            self.dir = guess_text_dir(server, book)

        #Patch in the extra metadata. (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
            (isbn, 'id', 'ISBN', config.DC),
            (license, 'rights', 'License', config.DC),
            (title, 'title', '', config.DC),
            (self.lang, 'language', '', config.DC),
            (self.dir, 'dir', '', config.FM),
            ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)

        # Paths of the intermediate and final build products in the workdir.
        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        # Set by make_end_matter_pdf when there is an ISBN barcode page.
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')
        self.outline_file = self.filepath('outline.txt')

        self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        if title is not None:
            self.title = title
        else:
            titles = get_metadata(self.metadata, 'title')
            if titles:
                self.title = titles[0]
            else:
                self.title = 'A Book About ' + self.book
        if isinstance(self.title, unicode):
            self.title = self.title.encode('utf-8')

        self.notify_watcher()
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            # Last-ditch removal of the workdir if cleanup() never ran.
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
279 def get_tree_by_id(self, id):
280 """get an HTML tree from the given manifest ID"""
281 name = self.manifest[id]['url']
282 mimetype = self.manifest[id]['mimetype']
283 s = self.store.read(name)
284 f = StringIO(s)
285 if mimetype == 'text/html':
286 try:
287 tree = lxml.html.parse(f)
288 except etree.XMLSyntaxError, e:
289 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
290 (id, name, s[:20], e))
291 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
292 elif 'xml' in mimetype: #XXX or is this just asking for trouble?
293 tree = etree.parse(f)
294 else:
295 tree = f.read()
296 f.close()
297 return tree
299 def filepath(self, fn):
300 return os.path.join(self.workdir, fn)
302 def save_tempfile(self, fn, data):
303 """Save the data in a temporary directory that will be cleaned
304 up when all is done. Return the absolute file path."""
305 fn = self.filepath(fn)
306 save_data(fn, data)
307 return fn
309 def make_oo_doc(self):
310 """Make an openoffice document, using the html2odt script."""
311 self.wait_for_xvfb()
312 html_text = etree.tostring(self.tree, method="html")
313 save_data(self.body_html_file, html_text)
314 run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
315 log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
316 os.rename(self.body_odt_file, self.publish_file)
317 self.notify_watcher()
319 def extract_pdf_outline(self):
320 """Get the outline (table of contents) for the PDF, which
321 wkhtmltopdf should have written to a file. If that file
322 doesn't exist (or config says not to use it), fall back to
323 using self._extract_pdf_outline_the_old_way, below.
325 if config.USE_DUMP_OUTLINE:
326 try:
327 self.outline_contents, number_of_pages = \
328 parse_extracted_outline(self.outline_file)
330 except Exception, e:
331 traceback.print_exc()
332 number_of_pages = self._extract_pdf_outline_the_old_way()
333 else:
334 number_of_pages = self._extract_pdf_outline_the_old_way()
336 self.notify_watcher()
337 return number_of_pages
    def _extract_pdf_outline_the_old_way(self):
        """Try to get the PDF outline using pdftk. This doesn't work
        well with all scripts.

        Fills in self.outline_contents as (title, depth, pageno) tuples
        and returns the page count of the body PDF.
        """
        debugf = self.filepath('extracted-outline.txt')
        self.outline_contents, number_of_pages = \
            parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            # Replace every heading with a plain-ascii key (e.g. 'h2_3'),
            # remembering which real title each key stands for.
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        # h1s get the chapter-number wrapper, matching how
                        # the real rendering styles them.
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" % (key, e.text, titlemap[key]))

            # Render the ascii-headed copy to a throwaway PDF and extract
            # its outline instead.
            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii-extracted-outline.txt')
            ascii_contents, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            # Map the ascii keys found in the outline back to real titles.
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    # Keep only the last word; the key follows the chapter
                    # number inserted above.
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))

        return number_of_pages
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit.

        Writes self.body_pdf_file, fills in self.outline_contents via
        extract_pdf_outline(), then reshapes and numbers the pages.
        """
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')

        #3. extract the outline (and with it, the page count)
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()
    def make_preamble_pdf(self):
        """Build the preamble PDF: the inside front cover followed by the
        table of contents, numbered in the preamble style."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log_types(self.dir, self.css_url, self.title, inside_cover_html,
                  self.toc_header, contents, self.title)

        # The trailing invisible div forces a final page break; the book
        # title is hidden in a comment inside it.
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     self.toc_header, contents, self.title)
        save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        # number_start=-2 shifts the numbering back two pages -- presumably
        # so the cover pages are uncounted; confirm in pdf.number_pdf.
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()
    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover. If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            # The barcode page is concatenated separately in make_book_pdf.
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        #log(end_matter)
        save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()
    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps: build the body,
        preamble and end-matter PDFs, then concatenate them into
        self.pdf_file."""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        # isbn_pdf_file is None unless make_end_matter_pdf made a barcode page.
        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
    def make_templated_html(self, template=None, zip=False, index=config.TEMPLATING_INDEX_FIRST):
        """Make a templated html version of the book.

        template -- an html string to use as the page template; if empty,
                    the configured default template file is used.
        zip      -- accepted but not used in this method; verify callers.
        index    -- templating mode key; decides whether the contents
                    page or the first chapter becomes the index file.
        """
        #set up the directory and static files
        self.unpack_static()
        destdir = self.filepath('html')
        os.mkdir(destdir)
        # NOTE(review): destdir is already absolute, so the outer
        # filepath() call is effectively a no-op on the join result.
        os.rename(self.filepath('static'), self.filepath(os.path.join(destdir, 'static')))

        if not template:
            template_tree = lxml.html.parse(config.TEMPLATING_DEFAULT_TEMPLATE).getroot()
        else:
            template_tree = lxml.html.document_fromstring(template)

        tocmap = filename_toc_map(self.toc)
        contents_name, first_name = config.TEMPLATING_INDEX_MODES[index]

        #build a contents page and a contents menu
        #We can't make this in the same pass because the menu needs to
        #go in every page (i.e., into the template)
        menu = etree.Element('ul', Class=config.TEMPLATING_MENU_ELEMENT)
        contents = etree.Element('div', Class=config.TEMPLATING_REPLACED_ELEMENT)

        booktitle = etree.Element('div', Class=config.TEMPLATING_BOOK_TITLE_ELEMENT)
        log(self.title)
        booktitle.text = self.title.decode('utf-8')

        etree.SubElement(contents, 'h1').text = self.title.decode('utf-8')

        # savename: the filename the next chapter link should point at;
        # the first link points at first_name (e.g. index.html).
        savename = first_name
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            #handle any TOC points in this file.
            for point in tocmap[filename]:
                if point['type'] == 'booki-section':
                    etree.SubElement(contents, 'h2').text = point['title']
                    etree.SubElement(menu, 'li', Class='booki-section').text = point['title']
                else:
                    if savename is None:
                        savename = filename
                    div = etree.SubElement(contents, 'div')
                    etree.SubElement(div, 'a', href=savename).text = point['title']
                    li = etree.SubElement(menu, 'li')
                    li.tail = '\n'
                    etree.SubElement(li, 'a', href=savename).text = point['title']
                    savename = None
        #put the menu and book title into the template (if it wants it)
        for e in template_tree.iterdescendants(config.TEMPLATING_MENU_ELEMENT):
            e.getparent().replace(e, copy.deepcopy(menu))
        for e in template_tree.iterdescendants(config.TEMPLATING_BOOK_TITLE_ELEMENT):
            e.getparent().replace(e, copy.deepcopy(booktitle))

        #function to template content and write to disk
        def save_content(content, title, filename):
            # NOTE(review): this mutates the shared template_tree (the
            # chapter-title replacement) as well as a deep copy of it, so
            # call order matters; verify whether the chapter title is
            # intended to land in template_tree rather than dest.
            if not isinstance(title, unicode):
                title = title.decode('utf-8')
            content.set('id', config.TEMPLATING_CONTENTS_ID)
            content.tag = 'div'
            dest = copy.deepcopy(template_tree)
            dest.set('dir', self.dir)
            for e in dest.iterdescendants(config.TEMPLATING_REPLACED_ELEMENT):
                #copy only if there are more than 2
                if content.getparent() is not None:
                    content = copy.deepcopy(content)
                e.getparent().replace(e, content)

            chaptertitle = etree.Element('div', Class=config.TEMPLATING_CHAPTER_TITLE_ELEMENT)
            chaptertitle.text = title
            for e in template_tree.iterdescendants(config.TEMPLATING_CHAPTER_TITLE_ELEMENT):
                e.getparent().replace(e, copy.deepcopy(chaptertitle))
            for e in dest.iterdescendants('title'):
                #log(type(title), title)
                e.text = title
            self.save_tempfile(os.path.join(destdir, filename), lxml.html.tostring(dest))

        #write the contents to a file. (either index.html or contents.html)
        save_content(contents, self.title, contents_name)

        savename = first_name
        #and now write each chapter to a file
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            try:
                root = self.get_tree_by_id(ID).getroot()
                body = root.find('body')
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e, ID))
                body = etree.Element('body')

            #handle any TOC points in this file. There should only be one!
            for point in tocmap[filename]:
                if point['type'] != 'booki-section':
                    title = point['title']
                    break
            else:
                title = self.title

            if savename is None:
                savename = filename
            save_content(body, title, savename)
            savename = None
        log(destdir, self.publish_file)
        os.rename(destdir, self.publish_file)
        self.notify_watcher()
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page. This is used for multicolumn newspapers and for
        web-destined pdfs.

        mode -- 'web' skips the gutter, page reshaping and page numbers;
                any other value produces print-style output.
        """
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title.decode('utf-8')
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()
612 def rotate180(self):
613 """Rotate the pdf 180 degrees so an RTL book can print on LTR
614 presses."""
615 rotated = self.filepath('final-rotate.pdf')
616 unrotated = self.filepath('final-pre-rotate.pdf')
617 #leave the unrotated pdf intact at first, in case of error.
618 rotate_pdf(self.pdf_file, rotated)
619 os.rename(self.pdf_file, unrotated)
620 os.rename(rotated, self.pdf_file)
621 self.notify_watcher()
623 def publish_pdf(self):
624 """Move the finished PDF to its final resting place"""
625 log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
626 os.rename(self.pdf_file, self.publish_file)
627 self.notify_watcher()
629 def publish_bookizip(self):
630 """Publish the bookizip. For this, copy rather than move,
631 because the bookizip might be used by further processing. If
632 possible, a hard link is created."""
633 log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
634 try:
635 run(['cp', '-l', self.bookizip_file, self.publish_file])
636 except OSError:
637 run(['cp', self.bookizip_file, self.publish_file])
638 self.notify_watcher()
    def concat_html(self):
        """Join all the chapters together into one tree. Keep the TOC
        up-to-date along the way.

        Returns the combined lxml document; every TOC point gains an
        'html_id' naming an anchor in that document.
        """
        #each manifest item looks like:
        #{'contributors': []
        #'license': [],
        #'mimetype': '',
        #'rightsholders': []
        #'url': ''}
        doc = lxml.html.document_fromstring('<html dir="%s"><body dir="%s"></body></html>'
                                            % (self.dir, self.dir))
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            # ACO changed this (original comment: "ACO MIJENJAO")
            try:
                root = self.get_tree_by_id(ID).getroot()
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
                continue
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            #log('chapter has title "%s", found html title "%s"' %
                            #    (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        # No reusable first element: prepend an invisible
                        # marker div carrying the fragment id.
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                point['html_id'] = fragment

            add_guts(root, doc)
        return doc
695 def unpack_static(self):
696 """Extract static files from the zip for the html to refer to."""
697 static_files = [x['url'] for x in self.manifest.values()
698 if x['url'].startswith('static')]
699 if static_files:
700 os.mkdir(self.filepath('static'))
702 for name in static_files:
703 s = self.store.read(name)
704 f = open(self.filepath(name), 'w')
705 f.write(s)
706 f.close()
707 self.notify_watcher()
709 def load_book(self):
710 """"""
711 #XXX concatenate the HTML to match how TWiki version worked.
712 # This is perhaps foolishly early -- throwing away useful boundaries.
713 self.unpack_static()
714 self.tree = self.concat_html()
715 self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
717 self.headings = [x for x in self.tree.cssselect('h1')]
718 if self.headings:
719 self.headings[0].set('class', "first-heading")
720 for h1 in self.headings:
721 h1.title = h1.text_content().strip()
722 self.notify_watcher()
724 def make_contents(self):
725 """Generate HTML containing the table of contents. This can
726 only be done after the main PDF has been made, because the
727 page numbers are contained in the PDF outline."""
728 header = '<table class="toc">\n'
729 row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
730 '<td class="pagenumber">%s</td></tr>\n')
731 empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
732 section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
733 footer = '\n</table>'
735 contents = []
737 chapter = 1
738 page_num = 1
739 #log(self.outline_contents)
740 outline_contents = iter(self.outline_contents)
742 for section in self.toc:
743 if not section.get('children'):
744 contents.append(empty_section_tmpl % section['title'])
745 continue
746 contents.append(section_tmpl % section['title'])
748 for point in section['children']:
749 try:
750 level = 99
751 while level > 1:
752 h1_text, level, page_num = outline_contents.next()
753 except StopIteration:
754 log("contents data not found for %s. Stopping" % (point,))
755 break
756 contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
757 chapter += 1
759 doc = header + '\n'.join(contents) + footer
760 if isinstance(doc, unicode):
761 doc = doc.encode('utf-8')
762 self.notify_watcher()
763 return doc
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies. These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        chapter = 1
        section = None
        #log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        # Chapter title was scraped from its first heading
                        # in concat_html; number that heading in place.
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#'+ child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
                # Insert the subsection page just before the section's anchor.
                location = self.tree.cssselect('#'+ t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()
801 def add_css(self, css=None, mode='book'):
802 """If css looks like a url, use it as a stylesheet link.
803 Otherwise it is the CSS itself, which is saved to a temporary file
804 and linked to."""
805 log("css is %r" % css)
806 htmltree = self.tree
807 if css is None or not css.strip():
808 css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
809 if css_default is None:
810 #guess from language -- this should come first
811 css_modes = config.LANGUAGE_CSS.get(self.lang,
812 config.LANGUAGE_CSS['en'])
813 css_default = css_modes.get(mode, css_modes[None])
814 url = css_default
815 elif not re.match(r'^http://\S+$', css):
816 url = path2url(self.save_tempfile('objavi.css', css), full=True)
817 else:
818 url = css
820 #find the head -- it's probably first child but lets not assume.
821 for child in htmltree:
822 if child.tag == 'head':
823 head = child
824 break
825 else:
826 head = htmltree.makeelement('head')
827 htmltree.insert(0, head)
829 link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
830 self.css_url = url
831 self.notify_watcher()
832 return url
835 def _read_localised_template(self, template, fallbacks=['en']):
836 """Try to get the template in the approriate language, otherwise in english."""
837 for lang in [self.lang] + fallbacks:
838 try:
839 fn = template % (lang)
840 f = open(fn)
841 break
842 except IOError, e:
843 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
844 log(e)
845 template = f.read()
846 f.close()
847 return template
849 def compose_inside_cover(self):
850 """create the markup for the preamble inside cover."""
851 template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
853 if self.isbn:
854 isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
855 else:
856 isbn_text = ''
858 return template % {'date': time.strftime('%Y-%m-%d'),
859 'isbn': isbn_text,
860 'license': self.license,
864 def compose_end_matter(self):
865 """create the markup for the end_matter inside cover. If
866 self.isbn is not set, the html will result in a pdf that
867 spills onto two pages.
869 template = self._read_localised_template(config.END_MATTER_TEMPLATE)
871 d = {'css_url': self.css_url,
872 'title': self.title
875 if self.isbn:
876 d['inside_cover_style'] = ''
877 else:
878 d['inside_cover_style'] = 'page-break-after: always'
880 return template % d
    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive.

        HTML chapters are converted to xhtml (and split when too big for
        some readers); everything else is copied through as-is.
        """
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        def add_file(ID, filename, mediatype, content):
            # Register one content file with the epub container.
            ebook.add_content({'media-type': mediatype.encode('utf-8'),
                               'id': ID.encode('utf-8'),
                               'href': filename.encode('utf-8'),
                               }, content)

        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        spinemap = {} #map IDs to multi-file chapters
        for ID in self.manifest:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            content = self.store.read(fn)
            if mediatype == 'text/html':
                #convert to application/xhtml+xml, and perhaps split
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                if fn[-5:] == '.html':
                    fnbase = fn[:-5]
                else:
                    fnbase = fn
                fnx = fnbase + '.xhtml'
                mediatype = 'application/xhtml+xml'

                fragments = split_html(c.as_xhtml(),
                                       compressed_size=self.store.getinfo(fn).compress_size)

                #add the first one as if it is the whole thing (as it often is)
                add_file(ID, fnx, mediatype, fragments[0])
                filemap[fn] = fnx
                if len(fragments) > 1:
                    spine_ids = [ID]
                    spinemap[ID] = spine_ids
                    #add any extras
                    for i in range(1, len(fragments)):
                        # XXX it is possible for duplicates if another
                        # file happens to have this name. Ignore for now
                        _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                        spine_ids.append(_id)
                        add_file(_id,
                                 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                                 mediatype, fragments[i])

            else:
                add_file(ID, fn, mediatype, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            if ID in spinemap:
                for x in spinemap[ID]:
                    ebook.add_spine_item({'idref': x})
            else:
                ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                            }
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}
                        # NOTE(review): reconstructed line -- the scraped
                        # source lost it, but the item must be collected
                        # for make_opf to see any metadata at all.
                        meta_info_items.append(item)

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            # NOTE(review): this reads metadata[DC]['creator'] in the very
            # branch where 'creator' was absent -- looks like it would
            # raise KeyError; verify the intended metadata shape.
            authors = []
            for x in self.metadata[DC]['creator'].values():
                authors.extend(x)

            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                                   )

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()
        self.notify_watcher()
def publish_s3(self):
    """Push the book's epub to archive.org, using S3.

    Reads the S3 credentials from the files named by config, builds a
    curl command with the archive.org metadata headers, and uploads
    self.publish_file.  Returns a (details_url, s3_url) tuple.
    """
    #XXX why only epub?
    secrets = {}
    for x in ('S3_SECRET', 'S3_ACCESSKEY'):
        fn = getattr(config, x)
        # `with` guarantees the key file is closed even if read() fails
        # (the original leaked the handle on error)
        with open(fn) as f:
            secrets[x] = f.read().strip()

    now = time.strftime('%F')
    s3output = self.filepath('s3-output.txt')
    s3url, detailsurl = find_archive_urls(self.book, self.bookname)
    headers = [
        'x-amz-auto-make-bucket:1',
        "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
        'x-archive-meta-mediatype:texts',
        'x-archive-meta-collection:opensource',
        'x-archive-meta-title:%s' % (self.book,),
        'x-archive-meta-date:%s' % (now,),
        'x-archive-meta-creator:FLOSS Manuals Contributors',
        ]

    if self.license in config.LICENSES:
        headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

    argv = ['curl', '--location', '-s', '-o', s3output]
    for h in headers:
        argv.extend(('--header', h))
    argv.extend(('--upload-file', self.publish_file, s3url,))

    log(' '.join(repr(x) for x in argv))
    check_call(argv, stdout=sys.stderr)
    self.notify_watcher()
    return detailsurl, s3url
def publish_shared(self, group=None, user=None):
    """Make symlinks from the BOOKI_SHARED_DIRECTORY to the
    published file, so that a virtual host can be set up to
    publish the files from a static location. If group is set, it
    is used as a subdirectory, otherwise a virtual group like
    'lonely-user-XXX' is used."""
    if group is None:
        if user is None:
            # neither group nor user: nothing to publish under
            return
        group = config.BOOKI_SHARED_LONELY_USER_PREFIX + user
    # sanitise the group into a safe directory name
    group = group.replace('..', '+').replace('/', '+')
    group = re.sub(r"[^\w%.,-]+", "_", group)[:250]
    groupdir = os.path.join(config.BOOKI_SHARED_DIRECTORY, group)

    # strip the timestamp, giving a stable "latest" name for the link
    generic_name = re.sub(r'-\d{4}\.\d\d\.\d\d\-\d\d\.\d\d\.\d\d', '', self.bookname)
    log(self.bookname, generic_name)

    if not os.path.exists(groupdir):
        os.mkdir(groupdir)

    #change directory, for least symlink confusion
    pwd = os.getcwd()
    os.chdir(groupdir)
    try:
        if os.path.exists(generic_name):
            os.unlink(generic_name)
        os.symlink(os.path.abspath(self.publish_file), generic_name)
    finally:
        # always restore the working directory, even when unlink or
        # symlink raises (the original left the process in groupdir)
        os.chdir(pwd)
def spawn_x(self):
    """Start an Xvfb instance, using a new server number. A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.
    """
    #Find an unused server number (in case two cgis are running at once)
    # NOTE(review): checking the lock file then starting Xvfb is
    # inherently racy between processes -- acceptable here, but worth
    # confirming for heavier concurrency.
    while True:
        servernum = random.randrange(50, 500)
        if not os.path.exists('/tmp/.X%s-lock' % servernum):
            break

    self.xserver_no = ':%s' % servernum

    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    # build the X auth cookie from assorted process-local entropy
    from hashlib import md5
    m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
    mcookie = m.hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    self.xvfb = Popen(['Xvfb', self.xserver_no,
                       '-screen', '0', '1024x768x24',
                       '-pixdepths', '32',
                       #'-blackpixel', '0',
                       #'-whitepixel', str(2 ** 24 -1),
                       #'+extension', 'Composite',
                       '-dpi', '96',
                       #'-kb',
                       '-nolisten', 'tcp',
                       ])

    # We need to wait a bit before the Xvfb is ready. but the
    # downloads are so slow that that probably doesn't matter
    # (wait_for_xvfb() sleeps until this deadline)
    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
def wait_for_xvfb(self):
    """Sleep until the deadline set by spawn_x() has passed, giving
    Xvfb time to start properly, then ping the progress watcher."""
    if hasattr(self, 'xvfb'):
        remaining = self.xvfb_ready_time - time.time()
        if remaining > 0:
            time.sleep(remaining)
    self.notify_watcher()
def cleanup_x(self):
    """Try very hard to kill off Xvfb. In addition to killing
    this instance's xvfb, occasionally (randomly) search for
    escaped Xvfb instances and kill those too."""
    # nothing to do if spawn_x() was never called
    if not hasattr(self, 'xvfb'):
        return
    check_call(['xauth', 'remove', self.xserver_no])
    p = self.xvfb
    log("trying to kill Xvfb %s" % p.pid)
    # SIGTERM first, giving Xvfb a chance to exit cleanly
    os.kill(p.pid, 15)
    for i in range(10):
        # poll() returns the exit status once the process is dead
        if p.poll() is not None:
            log("%s died with %s" % (p.pid, p.poll()))
            break
        log("%s not dead yet" % p.pid)
        time.sleep(0.2)
    else:
        # for/else: only reached when the loop never hit `break`,
        # i.e. Xvfb ignored SIGTERM for ~2 seconds
        log("Xvfb would not die! kill -9! kill -9!")
        try:
            os.kill(p.pid, 9)
        except OSError, e:
            # it may have died between the last poll and the SIGKILL
            log(e)

    if random.random() < 0.1:
        # occasionally kill old xvfbs and soffices, if there are any.
        self.kill_old_processes()
def kill_old_processes(self):
    """Sometimes, despite everything, Xvfb or soffice instances
    hang around well after they are wanted -- for example if the
    cgi process dies particularly badly. So kill them if they have
    been running for a long time.

    Uses `ps -C ... -o pid,etime` to find candidates, SIGTERMs any
    that have run for more than ~50 minutes, then SIGKILLs survivors.
    """
    log("running kill_old_processes")
    killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
                               os.path.basename(config.HTML2ODT),
                               os.path.basename(config.WKHTMLTOPDF),
                               ])
    p = Popen(['ps', '-C', killable_names,
               '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
    data = p.communicate()[0].strip()
    if data:
        lines = data.split('\n')
        pids = []
        for line in lines:
            log('dealing with ps output "%s"' % line)
            try:
                # etime looks like [[dd-]hh:]mm:ss
                pid, days, hours, minutes, seconds \
                    = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
            except AttributeError:
                log("Couldn't parse that line!")
                # BUG FIX: without this `continue`, the code below ran
                # with unbound (first iteration) or stale pid/days/etc.
                continue
            # 50 minutes should be enough xvfb time for anyone
            if days or hours or int(minutes) > 50:
                pid = int(pid)
                log("going to kill pid %s" % pid)
                os.kill(pid, 15)
                pids.append(pid)

        time.sleep(1.0)
        for pid in pids:
            #try again in case any are lingerers
            try:
                os.kill(int(pid), 9)
            except OSError as e:
                # already gone -- SIGTERM worked
                log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                continue
            log('killing %s with -9' % pid)
    self.notify_watcher()
def cleanup(self):
    """Shut down the X server and dispose of the temporary working
    directory (unless configured to keep it for debugging)."""
    self.cleanup_x()
    if config.KEEP_TEMP_FILES:
        log("NOT removing '%s', containing the following files:" % self.workdir)
        log(*os.listdir(self.workdir))
    else:
        for fn in os.listdir(self.workdir):
            os.remove(os.path.join(self.workdir, fn))
        os.rmdir(self.workdir)

    self.notify_watcher()
def use_cache():
    """Return True when this host is configured to always use cached
    booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
def _read_cached_zip(server, book, max_age):
    """Return (blob, filename) for the freshest cached booki-zip of
    this book, or None when there is no cached zip younger than
    max_age minutes (or none at all)."""
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    # names embed the timestamp, so lexical sort is chronological
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            # read in binary mode: a zip is not text (the original
            # used the default text mode), and close the handle even
            # if read() raises
            f = open(zipname, 'rb')
            try:
                blob = f.read()
            finally:
                f.close()
            return blob, zipname
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError) as e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book, preferring a cached copy when
    max_age allows it.

    server -- source server name (keys config.SERVER_DEFAULTS).
    book -- book identifier.
    save -- if true, write the fetched blob to disk.
    max_age -- cache tolerance in minutes; negative means "default".
    filename -- where to save (derived from the book name if None).

    Returns a (blob, filename) tuple; filename is None when the zip
    was fetched fresh and not saved.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        # NOTE(review): HTTP_HOST is presumably a module-level name
        # defined earlier in this file -- confirm it is in scope here.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s' % url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # write in binary mode: the blob is a zip, not text (the
        # original used 'w', which mangles bytes on some platforms)
        f = open(filename, 'wb')
        try:
            f.write(blob)
        finally:
            f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    html -- the marked-up text to split.
    compressed_size -- zip-compressed size of the html, if already
        known (calculated with zlib otherwise).
    fix_markup -- reparse and reserialise first, which makes the
        split-marker insertion more reliable.

    Returns a list of html strings (a single-element list when no
    split is needed).
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    # split when either the compressed or the raw size busts its limit
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        # cut just before the next tag boundary after the target point
        e = html.find('<', target * (i + 1))
        if e == -1:
            # no tag after the target: nothing left to split.
            # (previously -1 was used as a slice index, silently
            # dropping the last character and scrambling later cuts)
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]