try import from stdlib before simplejson: errors will be more informative
[objavi2.git] / objavi / fmbook.py
blobde933a91ca910ca7646cbc74d3e01ddf23d36eab
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 import copy
28 from subprocess import Popen, check_call, PIPE
29 from cStringIO import StringIO
30 from urllib2 import urlopen, HTTPError
31 import zipfile
32 import traceback
33 from string import ascii_letters
34 from pprint import pformat
36 try:
37 import json
38 except ImportError:
39 import simplejson as json
41 import lxml.html
42 from lxml import etree
44 from objavi import config, epub_utils
45 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
46 from objavi.book_utils import ObjaviError, log_types
47 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
48 from objavi.epub import add_guts, _find_tag
49 from objavi.xhtml_utils import EpubChapter, split_tree
50 from objavi.cgi_utils import url2path, path2url
52 from iarchive import epub as ia_epub
53 from booki.bookizip import get_metadata, add_metadata
# Absolute path of the scratch directory used for per-book workdirs.
TMPDIR = os.path.abspath(config.TMPDIR)
# Web document root: taken from the CGI environment when present,
# falling back to the configured htdocs directory.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Host header of the current request ('' when not running under CGI).
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the archive.org S3 upload URL and the public details
    page URL for the book with the given id and file name."""
    upload_url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
    details_url = 'http://archive.org/details/booki-%s' % (bookid,)
    return (upload_url, details_url)
64 def _get_best_title(tocpoint):
65 if 'html_title' in tocpoint:
66 return tocpoint['html_title']
67 if 'title' in tocpoint:
68 return tocpoint['title']
69 return 'Untitled'
72 def _add_initial_number(e, n):
73 """Put a styled chapter number n at the beginning of element e."""
74 initial = e.makeelement("strong", Class="initial")
75 e.insert(0, initial)
76 initial.tail = ' '
77 if e.text is not None:
78 initial.tail += e.text
79 e.text = ''
80 initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience.

    Annotates every point (recursively) with its depth, a serial
    index, and its url split into filename and fragment (fragment is
    None when the url has no '#').  Returns the next free index.
    """
    for point in toc:
        url = point['url'].lstrip('/')
        parts = url.split('#', 1)
        point['depth'] = depth
        point['filename'] = parts[0]
        point['fragment'] = parts[1] if len(parts) == 2 else None
        point['index'] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
98 def _serialise(rtoc, stoc, depth):
99 for item in rtoc:
100 url = item['url'].lstrip('/')
101 bits = url.split('#', 1)
102 filename = bits[0]
103 fragment = (bits[1] if len(bits) == 2 else None)
104 stoc.append({"depth": depth,
105 "title": item['title'],
106 "url": url,
107 "filename": filename,
108 "fragment": fragment,
109 "type": item['type']
111 if 'children' in item:
112 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points (via _serialise), then stamp each point with its
    position in that flat list."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that live in it,
    in depth-first pre-order.  Points must already carry a 'filename'
    key (see expand_toc)."""
    tocmap = {}
    #log(rtoc)
    def walk(points):
        for point in points:
            #log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            walk(point.get('children', ()))
    walk(rtoc)
    return tocmap
def save_data(fn, data):
    """Write data to the file fn, first encoding unicode text as
    UTF-8 (silently dropping unencodable characters) so the write
    cannot trip up on non-ascii content."""
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    with open(fn, 'w') as f:
        f.write(data)
class Book(object):
    # Page numbering styles handed to number_pdf when stamping page
    # numbers onto the generated PDFs: arabic numerals for the body,
    # roman numerals for the preamble (front matter).
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'
149 def notify_watcher(self, message=None):
150 if self.watchers:
151 if message is None:
152 #message is the name of the caller
153 message = traceback.extract_stack(None, 2)[0][2]
154 log("notify_watcher called with '%s'" % message)
155 for w in self.watchers:
156 w(message)
    def __enter__(self):
        """Context manager entry: return the Book itself."""
        return self
    def __exit__(self, exc_type, exc_value, tb):
        """Context manager exit: tell watchers processing is finished
        and remove the temporary workdir.  Exceptions are not
        suppressed (nothing truthy is returned)."""
        self.notify_watcher(config.FINISHED_MESSAGE)
        self.cleanup()
        #could deal with exceptions here and return true
167 def __init__(self, book, server, bookname,
168 page_settings=None, watchers=None, isbn=None,
169 license=config.DEFAULT_LICENSE, title=None,
170 max_age=0):
171 log("*** Starting new book %s ***" % bookname)
172 self.watchers = set()
173 if watchers is not None:
174 self.watchers.update(watchers)
175 self.notify_watcher('start')
176 self.bookname = bookname
177 self.book = book
178 self.server = server
179 self.cookie = ''.join(random.sample(ascii_letters, 10))
180 try:
181 blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
182 except HTTPError, e:
183 traceback.print_exc()
184 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
185 #not much to do?
186 #raise 502 Bad Gateway ?
187 sys.exit()
188 f = StringIO(blob)
189 self.notify_watcher('fetch_zip')
190 self.store = zipfile.ZipFile(f, 'r')
191 self.info = json.loads(self.store.read('info.json'))
192 for k in ('manifest', 'metadata', 'spine', 'TOC'):
193 if k not in self.info:
194 raise ObjaviError('info.json of %s lacks vital element "%s"' %
195 (bookname, k))
196 #check types also?
198 self.metadata = self.info['metadata']
199 self.spine = self.info['spine']
200 self.manifest = self.info['manifest']
202 if server == config.LOCALHOST: # [DEPRECATED]
203 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
204 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
206 log(pformat(self.metadata))
207 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
208 if not self.lang:
209 self.lang = guess_lang(server, book)
210 log('guessed lang as %s' % self.lang)
212 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
213 if not self.toc_header:
214 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
216 self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
217 if not self.dir:
218 self.dir = guess_text_dir(server, book)
220 #Patch in the extra metadata. (lang and dir may be set from config)
221 #these should be read from zip -- so should go into zip?
222 for var, key, scheme, ns in (
223 (isbn, 'id', 'ISBN', config.DC),
224 (license, 'rights', 'License', config.DC),
225 (title, 'title', '', config.DC),
226 (self.lang, 'language', '', config.DC),
227 (self.dir, 'dir', '', config.FM),
229 if var is not None:
230 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
232 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
233 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
235 self.toc = self.info['TOC']
236 expand_toc(self.toc)
238 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
239 os.chmod(self.workdir, 0755)
241 self.body_html_file = self.filepath('body.html')
242 self.body_pdf_file = self.filepath('body.pdf')
243 self.preamble_html_file = self.filepath('preamble.html')
244 self.preamble_pdf_file = self.filepath('preamble.pdf')
245 self.tail_html_file = self.filepath('tail.html')
246 self.tail_pdf_file = self.filepath('tail.pdf')
247 self.isbn_pdf_file = None
248 self.pdf_file = self.filepath('final.pdf')
249 self.body_odt_file = self.filepath('body.odt')
250 self.outline_file = self.filepath('outline.txt')
252 self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
254 if page_settings is not None:
255 self.maker = PageSettings(**page_settings)
257 if title is not None:
258 self.title = title
259 else:
260 titles = get_metadata(self.metadata, 'title')
261 if titles:
262 self.title = titles[0]
263 else:
264 self.title = 'A Book About ' + self.book
265 if isinstance(self.title, unicode):
266 self.title = self.title.encode('utf-8')
268 self.notify_watcher()
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            """Best-effort removal of the workdir when the Book is
            garbage collected (only defined when config enables it)."""
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
279 def get_tree_by_id(self, id):
280 """get an HTML tree from the given manifest ID"""
281 name = self.manifest[id]['url']
282 mimetype = self.manifest[id]['mimetype']
283 s = self.store.read(name)
284 f = StringIO(s)
285 if mimetype == 'text/html':
286 try:
287 tree = lxml.html.parse(f)
288 except etree.XMLSyntaxError, e:
289 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
290 (id, name, s[:20], e))
291 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
292 elif 'xml' in mimetype: #XXX or is this just asking for trouble?
293 tree = etree.parse(f)
294 else:
295 tree = f.read()
296 f.close()
297 return tree
299 def filepath(self, fn):
300 return os.path.join(self.workdir, fn)
302 def save_tempfile(self, fn, data):
303 """Save the data in a temporary directory that will be cleaned
304 up when all is done. Return the absolute file path."""
305 fn = self.filepath(fn)
306 save_data(fn, data)
307 return fn
    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script,
        and move it to the publish location."""
        #html2odt drives openoffice, which needs a running X server
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()
    def extract_pdf_outline(self):
        """Get the outline (table of contents) for the PDF, which
        wkhtmltopdf should have written to a file. If that file
        doesn't exist (or config says not to use it), fall back to
        using self._extract_pdf_outline_the_old_way, below.

        Sets self.outline_contents and returns the page count.
        """
        if config.USE_DUMP_OUTLINE:
            try:
                self.outline_contents, number_of_pages = \
                    parse_extracted_outline(self.outline_file)

            except Exception, e:
                #any failure (missing/corrupt outline file) falls back
                traceback.print_exc()
                number_of_pages = self._extract_pdf_outline_the_old_way()
        else:
            number_of_pages = self._extract_pdf_outline_the_old_way()

        self.notify_watcher()
        return number_of_pages
    def _extract_pdf_outline_the_old_way(self):
        """Try to get the PDF outline using pdftk. This doesn't work
        well with all scripts.

        Returns the number of pages; sets self.outline_contents to a
        list of (title, depth, pageno) tuples.
        """
        debugf = self.filepath('extracted-outline.txt')
        self.outline_contents, number_of_pages = \
            parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            #replace every heading's content with an ascii key like 'h2_7',
            #remembering the real title so it can be restored afterwards
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        #h1 keys go inside a styled <strong>, matching
                        #the chapter-number markup used elsewhere
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

            #render a throwaway PDF from the ascii-keyed html and
            #extract its outline instead
            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii-extracted-outline.txt')
            ascii_contents, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    #keep only the key part after any leading text
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))

        return number_of_pages
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit.  Saves the html,
        renders it, extracts the outline (for page count and ToC),
        reshapes the pages and stamps on page numbers."""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')

        #3. get the outline -- also yields the page count
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()
    def make_preamble_pdf(self):
        """Build the front matter PDF: the inside cover page followed
        by the table of contents, numbered with the preamble style
        (roman numerals)."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log_types(self.dir, self.css_url, self.title, inside_cover_html,
                  self.toc_header, contents, self.title)

        #the title is repeated in an invisible trailing div (inside an
        #html comment) -- presumably to pad/force the final page break
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     self.toc_header, contents, self.title)
        save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        #number_start=-2: presumably leaves the first preamble pages
        #unnumbered -- confirm against PageSettings.number_pdf
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()
    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover. If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        #log(end_matter)
        save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        #even_pages=False: the tail is allowed to end on an odd page
        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()
454 def make_book_pdf(self):
455 """A convenient wrapper of a few necessary steps"""
456 # now the Xvfb server is needed. make sure it has had long enough to get going
457 self.wait_for_xvfb()
458 self.make_body_pdf()
459 self.make_preamble_pdf()
460 self.make_end_matter_pdf()
462 concat_pdfs(self.pdf_file, self.preamble_pdf_file,
463 self.body_pdf_file, self.tail_pdf_file,
464 self.isbn_pdf_file)
466 self.notify_watcher('concatenated_pdfs')
    def make_templated_html(self, template=None, zip=False, index=config.TEMPLATING_INDEX_FIRST):
        """Make a templated html version of the book.

        template: html template string; when empty/None the configured
                  default template file is used.
        zip:      NOTE(review): unused in this method body -- confirm
                  whether callers rely on it.
        index:    key into config.TEMPLATING_INDEX_MODES choosing which
                  file is index.html vs contents.html.
        """
        #set up the directory and static files
        self.unpack_static()
        destdir = self.filepath('html')
        os.mkdir(destdir)
        #destdir is absolute, so os.path.join inside filepath yields destdir/static
        os.rename(self.filepath('static'), self.filepath(os.path.join(destdir, 'static')))

        if not template:
            template_tree = lxml.html.parse(config.TEMPLATING_DEFAULT_TEMPLATE).getroot()
        else:
            template_tree = lxml.html.document_fromstring(template)

        tocmap = filename_toc_map(self.toc)
        contents_name, first_name = config.TEMPLATING_INDEX_MODES[index]

        #build a contents page and a contents menu
        #We can't make this in the same pass because the menu needs to
        #go in every page (i.e., into the template)
        menu = etree.Element('ul', Class=config.TEMPLATING_MENU_ELEMENT)
        contents = etree.Element('div', Class=config.TEMPLATING_REPLACED_ELEMENT)
        etree.SubElement(contents, 'h1').text = self.title

        #savename tracks the filename the NEXT chapter link should use;
        #it starts as first_name so the first chapter doubles as index
        savename = first_name
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            #handle any TOC points in this file.
            for point in tocmap[filename]:
                if point['type'] == 'booki-section':
                    etree.SubElement(contents, 'h2').text = point['title']
                    etree.SubElement(menu, 'li', Class='booki-section').text = point['title']
                else:
                    if savename is None:
                        savename = filename
                    div = etree.SubElement(contents, 'div')
                    etree.SubElement(div, 'a', href=savename).text = point['title']
                    li = etree.SubElement(menu, 'li')
                    li.tail = '\n'
                    etree.SubElement(li, 'a', href=savename).text = point['title']
                    savename = None
        #put the menu into the template (if it wants it)
        for e in template_tree.iterdescendants(config.TEMPLATING_MENU_ELEMENT):
            e.getparent().replace(e, menu)

        #function to template content and write to disk
        def save_content(content, title, filename):
            content.set('id', config.TEMPLATING_CONTENTS_ID)
            dest = copy.deepcopy(template_tree)
            for e in dest.iterdescendants(config.TEMPLATING_REPLACED_ELEMENT):
                e.getparent().replace(e, content)
            for e in dest.iterdescendants('title'):
                e.text = title
            self.save_tempfile(os.path.join(destdir, filename), lxml.html.tostring(dest))

        #write the contents to a file. (either index.html or contents.html)
        save_content(contents, self.title, contents_name)

        savename = first_name
        #and now write each chapter to a file
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            try:
                root = self.get_tree_by_id(ID).getroot()
                body = root.find('body')
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e, ID))
                body = etree.Element('body')

            #handle any TOC points in this file. There should only be one!
            for point in tocmap[filename]:
                if point['type'] != 'booki-section':
                    title = point['title']
                    break
            else:
                title = self.title

            if savename is None:
                savename = filename
            save_content(body, title, savename)
            savename = None
        log(destdir, self.publish_file)
        os.rename(destdir, self.publish_file)
        self.notify_watcher()
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page. This is used for multicolumn newspapers and for
        web-destined pdfs.

        mode: 'web' skips gutters, reshaping and page numbers.
        """
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            #no gutter needed when nobody will bind the pages
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()
592 def rotate180(self):
593 """Rotate the pdf 180 degrees so an RTL book can print on LTR
594 presses."""
595 rotated = self.filepath('final-rotate.pdf')
596 unrotated = self.filepath('final-pre-rotate.pdf')
597 #leave the unrotated pdf intact at first, in case of error.
598 rotate_pdf(self.pdf_file, rotated)
599 os.rename(self.pdf_file, unrotated)
600 os.rename(rotated, self.pdf_file)
601 self.notify_watcher()
    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()
    def publish_bookizip(self):
        """Publish the bookizip. For this, copy rather than move,
        because the bookizip might be used by further processing. If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            #try a hard link first; NOTE(review): whether a failed
            #'cp -l' surfaces here as OSError depends on the project's
            #run() helper -- confirm.
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()
    def concat_html(self):
        """Join all the chapters together into one tree. Keep the TOC
        up-to-date along the way.

        Returns the combined lxml.html document; each TOC point gains
        an 'html_id' (and possibly 'html_title') along the way.
        """

        #each manifest item looks like:
        #{'contributors': []
        #'license': [],
        #'mimetype': '',
        #'rightsholders': []
        #'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            # ACO changed this (comment translated from Croatian: "ACO MIJENJAO")
            try:
                root = self.get_tree_by_id(ID).getroot()
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
                continue
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    #self.cookie makes the generated id unique per run
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            #log('chapter has title "%s", found html title "%s"' %
                            #    (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        #no suitable first tag: insert a hidden marker div
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                point['html_id'] = fragment

            add_guts(root, doc)
        return doc
674 def unpack_static(self):
675 """Extract static files from the zip for the html to refer to."""
676 static_files = [x['url'] for x in self.manifest.values()
677 if x['url'].startswith('static')]
678 if static_files:
679 os.mkdir(self.filepath('static'))
681 for name in static_files:
682 s = self.store.read(name)
683 f = open(self.filepath(name), 'w')
684 f.write(s)
685 f.close()
686 self.notify_watcher()
688 def load_book(self):
689 """"""
690 #XXX concatenate the HTML to match how TWiki version worked.
691 # This is perhaps foolishly early -- throwing away useful boundaries.
692 self.unpack_static()
693 self.tree = self.concat_html()
694 self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
696 self.headings = [x for x in self.tree.cssselect('h1')]
697 if self.headings:
698 self.headings[0].set('class', "first-heading")
699 for h1 in self.headings:
700 h1.title = h1.text_content().strip()
701 self.notify_watcher()
    def make_contents(self):
        """Generate HTML containing the table of contents. This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline.

        Returns a utf-8 encoded html table string.
        """
        header = '<table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        #log(self.outline_contents)
        outline_contents = iter(self.outline_contents)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    #skip outline entries until the next top-level
                    #(level 1) heading, whose page number we want
                    level = 99
                    while level > 1:
                        h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        if isinstance(doc, unicode):
            doc = doc.encode('utf-8')
        self.notify_watcher()
        return doc
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies. These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        chapter = 1
        section = None
        #log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        #chapter already has an in-document heading:
                        #number that heading in place
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#'+ child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
                #insert the section page just before the section's anchor
                location = self.tree.cssselect('#'+ t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()
    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to.

        When css is empty, a default is chosen from the server config,
        falling back to a per-language default.  Sets self.css_url and
        returns the chosen url.
        """
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
            if css_default is None:
                #guess from language -- this should come first
                css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                    config.LANGUAGE_CSS['en'])
                css_default = css_modes.get(mode, css_modes[None])
            url = css_default
        elif not re.match(r'^http://\S+$', css):
            #not a url: treat as literal css, save it and link to the file
            url = path2url(self.save_tempfile('objavi.css', css), full=True)
        else:
            url = css

        #find the head -- it's probably first child but lets not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the approriate language, otherwise in english.

        template is a filename pattern with one %s slot for the
        language code.  NOTE(review): if every open() fails, 'f' is
        unbound and the read below raises NameError; also note the
        mutable default argument -- confirm both are acceptable.
        """
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template
    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           #NOTE(review): the closing brace of this dict
                           #(and possibly further entries) appears to have
                           #been lost in this copy -- restore before use.
    def compose_end_matter(self):
        """create the markup for the end_matter inside cover. If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title
             #NOTE(review): the closing brace of this dict appears to
             #have been lost in this copy -- restore before use.

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            #force the spill onto a second page when there is no barcode page
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d
    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive.

        HTML chapters are converted to xhtml (splitting oversized ones
        for reader compatibility); other manifest items are copied
        through.  The ncx ToC, spine and opf metadata are then added
        and the epub written to self.publish_file.
        """
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        def add_file(ID, filename, mediatype, content):
            #helper: register one content item with the epub container
            ebook.add_content({'media-type': mediatype.encode('utf-8'),
                               'id': ID.encode('utf-8'),
                               'href': filename.encode('utf-8'),
                               }, content)
        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        spinemap = {} #map IDs to multi-file chapters
        for ID in self.manifest:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            content = self.store.read(fn)
            if mediatype == 'text/html':
                #convert to application/xhtml+xml, and perhaps split
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                if fn[-5:] == '.html':
                    fnbase = fn[:-5]
                else:
                    fnbase = fn
                fnx = fnbase + '.xhtml'
                mediatype = 'application/xhtml+xml'

                #NOTE(review): split_html is not among this file's visible
                #imports -- presumably defined elsewhere in the module; confirm.
                fragments = split_html(c.as_xhtml(),
                                       compressed_size=self.store.getinfo(fn).compress_size)

                #add the first one as if it is the whole thing (as it often is)
                add_file(ID, fnx, mediatype, fragments[0])
                filemap[fn] = fnx
                if len(fragments) > 1:
                    spine_ids = [ID]
                    spinemap[ID] = spine_ids
                    #add any extras
                    for i in range(1, len(fragments)):
                        # XXX it is possible for duplicates if another
                        # file happens to have this name. Ignore for now
                        _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                        spine_ids.append(_id)
                        add_file(_id,
                                 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                                 mediatype, fragments[i])

            else:
                add_file(ID, fn, mediatype, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            if ID in spinemap:
                for x in spinemap[ID]:
                    ebook.add_spine_item({'idref': x})
            else:
                ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                        #NOTE(review): the closing brace of this dict and
                        #the meta_info_items.append(item) call appear to
                        #have been lost in this copy -- restore before use.
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            #NOTE(review): when has_authors is False this lookup of
            #self.metadata[DC]['creator'] looks like a KeyError -- confirm.
            authors = []
            for x in self.metadata[DC]['creator'].values():
                authors.extend(x)

            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                                   #NOTE(review): closing paren of this append
                                   #appears to have been lost in this copy.

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()
        self.notify_watcher()
972 def publish_s3(self):
973 """Push the book's epub to archive.org, using S3."""
974 #XXX why only epub?
975 secrets = {}
976 for x in ('S3_SECRET', 'S3_ACCESSKEY'):
977 fn = getattr(config, x)
978 f = open(fn)
979 secrets[x] = f.read().strip()
980 f.close()
982 now = time.strftime('%F')
983 s3output = self.filepath('s3-output.txt')
984 s3url, detailsurl = find_archive_urls(self.book, self.bookname)
985 headers = [
986 'x-amz-auto-make-bucket:1',
987 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
988 'x-archive-meta-mediatype:texts',
989 'x-archive-meta-collection:opensource',
990 'x-archive-meta-title:%s' % (self.book,),
991 'x-archive-meta-date:%s' % (now,),
992 'x-archive-meta-creator:FLOSS Manuals Contributors',
995 if self.license in config.LICENSES:
996 headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
998 argv = ['curl', '--location', '-s', '-o', s3output]
999 for h in headers:
1000 argv.extend(('--header', h))
1001 argv.extend(('--upload-file', self.publish_file, s3url,))
1003 log(' '.join(repr(x) for x in argv))
1004 check_call(argv, stdout=sys.stderr)
1005 self.notify_watcher()
1006 return detailsurl, s3url
1009 def spawn_x(self):
1010 """Start an Xvfb instance, using a new server number. A
1011 reference to it is stored in self.xvfb, which is used to kill
1012 it when the pdf is done.
1014 Note that Xvfb doesn't interact well with dbus which is
1015 present on modern desktops.
1017 #Find an unused server number (in case two cgis are running at once)
1018 while True:
1019 servernum = random.randrange(50, 500)
1020 if not os.path.exists('/tmp/.X%s-lock' % servernum):
1021 break
1023 self.xserver_no = ':%s' % servernum
1025 authfile = self.filepath('Xauthority')
1026 os.environ['XAUTHORITY'] = authfile
1028 #mcookie(1) eats into /dev/random, so avoid that
1029 from hashlib import md5
1030 m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
1031 mcookie = m.hexdigest()
1033 check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
1035 self.xvfb = Popen(['Xvfb', self.xserver_no,
1036 '-screen', '0', '1024x768x24',
1037 '-pixdepths', '32',
1038 #'-blackpixel', '0',
1039 #'-whitepixel', str(2 ** 24 -1),
1040 #'+extension', 'Composite',
1041 '-dpi', '96',
1042 #'-kb',
1043 '-nolisten', 'tcp',
1046 # We need to wait a bit before the Xvfb is ready. but the
1047 # downloads are so slow that that probably doesn't matter
1049 self.xvfb_ready_time = time.time() + 2
1051 os.environ['DISPLAY'] = self.xserver_no
1052 log(self.xserver_no)
1054 def wait_for_xvfb(self):
1055 """wait until a previously set time before continuing. This
1056 is so Xvfb has time to properly start."""
1057 if hasattr(self, 'xvfb'):
1058 d = self.xvfb_ready_time - time.time()
1059 if d > 0:
1060 time.sleep(d)
1061 self.notify_watcher()
1063 def cleanup_x(self):
1064 """Try very hard to kill off Xvfb. In addition to killing
1065 this instance's xvfb, occasionally (randomly) search for
1066 escaped Xvfb instances and kill those too."""
1067 if not hasattr(self, 'xvfb'):
1068 return
1069 check_call(['xauth', 'remove', self.xserver_no])
1070 p = self.xvfb
1071 log("trying to kill Xvfb %s" % p.pid)
1072 os.kill(p.pid, 15)
1073 for i in range(10):
1074 if p.poll() is not None:
1075 log("%s died with %s" % (p.pid, p.poll()))
1076 break
1077 log("%s not dead yet" % p.pid)
1078 time.sleep(0.2)
1079 else:
1080 log("Xvfb would not die! kill -9! kill -9!")
1081 try:
1082 os.kill(p.pid, 9)
1083 except OSError, e:
1084 log(e)
1086 if random.random() < 0.1:
1087 # occasionally kill old xvfbs and soffices, if there are any.
1088 self.kill_old_processes()
1090 def kill_old_processes(self):
1091 """Sometimes, despite everything, Xvfb or soffice instances
1092 hang around well after they are wanted -- for example if the
1093 cgi process dies particularly badly. So kill them if they have
1094 been running for a long time."""
1095 log("running kill_old_processes")
1096 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1097 os.path.basename(config.HTML2ODT),
1098 os.path.basename(config.WKHTMLTOPDF),
1100 p = Popen(['ps', '-C', killable_names,
1101 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1102 data = p.communicate()[0].strip()
1103 if data:
1104 lines = data.split('\n')
1105 pids = []
1106 for line in lines:
1107 log('dealing with ps output "%s"' % line)
1108 try:
1109 pid, days, hours, minutes, seconds \
1110 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1111 except AttributeError:
1112 log("Couldn't parse that line!")
1113 # 50 minutes should be enough xvfb time for anyone
1114 if days or hours or int(minutes) > 50:
1115 pid = int(pid)
1116 log("going to kill pid %s" % pid)
1117 os.kill(pid, 15)
1118 pids.append(pid)
1120 time.sleep(1.0)
1121 for pid in pids:
1122 #try again in case any are lingerers
1123 try:
1124 os.kill(int(pid), 9)
1125 except OSError, e:
1126 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1127 continue
1128 log('killing %s with -9' % pid)
1129 self.notify_watcher()
1131 def cleanup(self):
1132 self.cleanup_x()
1133 if not config.KEEP_TEMP_FILES:
1134 for fn in os.listdir(self.workdir):
1135 os.remove(os.path.join(self.workdir, fn))
1136 os.rmdir(self.workdir)
1137 else:
1138 log("NOT removing '%s', containing the following files:" % self.workdir)
1139 log(*os.listdir(self.workdir))
1141 self.notify_watcher()
def use_cache():
    """True when this host is configured to always use cached booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
def _read_cached_zip(server, book, max_age):
    """Return (blob, filename) for the most recent cached booki-zip
    for this book, or None when there is no cache, the newest zip is
    older than max_age minutes, or its name cannot be parsed."""
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        # the timestamp is encoded in the filename
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            # binary mode inside a with-block: the blob is zip data,
            # not text, and the handle must close even on error
            with open(zipname, 'rb') as f:
                blob = f.read()
            return blob, zipname
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError) as e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book, optionally consulting and
    populating the local cache.

    Returns a (blob, filename) tuple; filename is None unless the
    blob came from, or was saved to, a local file.

    Raises NotImplementedError when the server's interface has no
    configured zip URL.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        # NOTE(review): HTTP_HOST appears to be a module-level
        # constant defined earlier in this file -- confirm.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        # close the connection even if the read fails
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # 'wb', not 'w': the blob is binary zip data
        with open(filename, 'wb') as f:
            f.write(blob)
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    Returns a list of html strings; if no split is needed the list
    holds the original html unchanged.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        # place each marker at the first tag boundary after the target
        e = html.find('<', target * (i + 1))
        if e == -1:
            # no tag after the target point: without this guard the
            # slice below would be html[s:-1], silently dropping the
            # tail of the document
            e = len(html)
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]