Add max-age variable, allowing for caching of booki-zips
[objavi2.git] / objavi / fmbook.py
blob 3d43c18ec6e067857d9f188534516adecac920db
# Part of Objavi2, which turns html manuals into books.
# This contains classes representing books and coordinates their processing.
#
# Copyright (C) 2009 Douglas Bagnall
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

20 """Library module representing a complete FM book being turned into a
21 PDF"""
import os, sys
import tempfile
import re, time
import random
from subprocess import Popen, check_call, PIPE
from cStringIO import StringIO
from urllib2 import urlopen, HTTPError
import zipfile
import traceback
from string import ascii_letters
from pprint import pformat

try:
    import simplejson as json
except ImportError:
    import json

import lxml, lxml.html
from lxml import etree

from objavi import config, epub_utils
from objavi.cgi_utils import log, run, shift_file, make_book_name, guess_lang, guess_text_dir
from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
from objavi.epub import add_guts, _find_tag

from iarchive import epub as ia_epub
from booki.xhtml_utils import EpubChapter
from booki.bookizip import get_metadata, add_metadata, clear_metadata, get_metadata_schemes

TMPDIR = os.path.abspath(config.TMPDIR)
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')
PUBLISH_PATH = "%s/books/" % DOC_ROOT

def _get_best_title(tocpoint):
    if 'html_title' in tocpoint:
        return tocpoint['html_title']
    if 'title' in tocpoint:
        return tocpoint['title']
    return 'Untitled'

def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    initial = e.makeelement("strong", Class="initial")
    e.insert(0, initial)
    initial.tail = ' '
    if e.text is not None:
        initial.tail += e.text
    e.text = ''
    initial.text = "%s." % n
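# For illustration: given a hypothetical heading <h1>Getting Started</h1>,
# _add_initial_number(h1, 3) produces markup along the lines of
# <h1><strong class="initial">3.</strong> Getting Started</h1>.
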
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience"""
    for item in toc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        item['depth'] = depth
        item["filename"] = filename
        item["fragment"] = fragment
        item["index"] = index
        index += 1
        if 'children' in item:
            index = expand_toc(item['children'], depth + 1, index)
    return index
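# For illustration (hypothetical TOC item): expand_toc() rewrites
#   {'title': 'Intro', 'url': '/intro.html#start', 'children': [...]}
# in place, adding 'depth': 1, 'filename': 'intro.html', 'fragment': 'start'
# and a serial 'index', then recurses into 'children' at depth + 1.
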
def _serialise(rtoc, stoc, depth):
    for item in rtoc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        stoc.append({"depth": depth,
                     "title": item['title'],
                     "url": url,
                     "filename": filename,
                     "fragment": fragment,
                     "type": item['type']
                     })
        if 'children' in item:
            _serialise(item['children'], stoc, depth + 1)

def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points.  Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for i, x in enumerate(stoc):
        x['position'] = i
    return stoc
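# For illustration: serialise_toc() flattens the nested TOC into a list of
# dicts, each carrying 'depth' and a sequential 'position', so callers can
# walk the contents in reading order without recursion.
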
def filename_toc_map(rtoc):
    tocmap = {}
    log(rtoc)
    def traverse(toc):
        for point in toc:
            log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                traverse(point['children'])
    traverse(rtoc)
    return tocmap
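# For illustration: the resulting map looks roughly like
#   {'intro.html': [<toc point>, ...], 'chapter-01.html': [<toc point>], ...}
# which lets concat_html() find every TOC point that lives in a given
# chapter file.
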
class Book(object):
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'

    def notify_watcher(self, message=None):
        if self.watcher:
            if message is None:
                #message is the name of the caller
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            self.watcher(message)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()
        #could deal with exceptions here and return true

    def __init__(self, book, server, bookname, project=None,
                 page_settings=None, watcher=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        log("*** Starting new book %s ***" % bookname,
            "starting zipbook with", server, book, project)
        self.watcher = watcher
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        self.project = project
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob = fetch_zip(server, book, project, save=True, max_age=max_age)
        except HTTPError, e:
            #log(e.url)
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            sys.exit()
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
            #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
        if not self.dir:
            self.dir = guess_text_dir(server, book)

        #Patch in the extra metadata.  (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
            (isbn, 'id', 'ISBN', config.DC),
            (license, 'rights', 'License', config.DC),
            (title, 'title', '', config.DC),
            (self.lang, 'language', '', config.DC),
            (self.dir, 'dir', '', config.FM),
            ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)

        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')

        self.publish_file = os.path.join(PUBLISH_PATH, bookname)
        self.publish_url = os.path.join(config.PUBLISH_URL, bookname)

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        titles = get_metadata(self.metadata, 'title')
        if titles:
            self.title = titles[0]
        else:
            self.title = 'A Manual About ' + self.book

        self.notify_watcher()
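    # Illustrative usage sketch (the server/book values are hypothetical):
    # a CGI front end would drive the pipeline roughly like
    #     with Book(book, server, bookname, max_age=30) as b:
    #         b.spawn_x()
    #         b.load_book()
    #         b.add_css(None, 'book')
    #         b.add_section_titles()
    #         b.make_book_pdf()
    #         b.publish_pdf()
    # max_age is passed straight through to fetch_zip() (minutes of acceptable
    # booki-zip cache age; the default 0 forces a fresh fetch).
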
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Don't even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()

    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID"""
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            tree = f.read()
        f.close()
        return tree

    def filepath(self, fn):
        return os.path.join(self.workdir, fn)

    def save_data(self, fn, data):
        """Save without tripping up on unicode"""
        if isinstance(data, unicode):
            data = data.encode('utf8', 'ignore')
        f = open(fn, 'w')
        f.write(data)
        f.close()

    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done.  Return the absolute file path."""
        fn = self.filepath(fn)
        self.save_data(fn, data)
        return fn

    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script."""
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()

    def extract_pdf_outline(self):
        #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        debugf = self.filepath('outline.txt')
        self.outline_contents, self.outline_text, number_of_pages = \
                parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" % (key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            self.save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii_outline.txt')
            ascii_contents, ascii_text, number_of_ascii_pages = \
                    parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log("number of pages: %s, post ascii: %s" %
                (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))
                self.outline_contents.append((title, depth, pageno))
        else:
            for x in self.outline_contents:
                log(x)

        self.notify_watcher()
        return number_of_pages

    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
        self.notify_watcher('generate_pdf')

        #3. extract the outline (and page count) for the table of contents
        n_pages = self.extract_pdf_outline()

        log("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()

    def make_preamble_pdf(self):
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log(self.dir, self.css_url, self.title, inside_cover_html,
            self.toc_header, contents, self.title)

        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
                     self.toc_header, contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()

    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover.  If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        log(end_matter)
        self.save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()

    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
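    # For illustration: the final pdf is stitched together in this order:
    # preamble (title page and contents), body, end matter, then the ISBN
    # barcode page when an ISBN was supplied (self.isbn_pdf_file stays None
    # otherwise, and concat_pdfs is presumably expected to skip None inputs).
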
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page.  This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to beginning of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to the final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()

    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()

    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()

    def concat_html(self):
        """Join all the chapters together into one tree.  Keep the TOC
        up-to-date along the way."""

        #each manifest item looks like:
        #{'contributors': []
        #'license': [],
        #'mimetype': '',
        #'rightsholders': []
        #'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            log(ID, pformat(details))
            root = self.get_tree_by_id(ID).getroot()
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            log('chapter has title "%s", found html title "%s"' %
                                (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                point['html_id'] = fragment

            add_guts(root, doc)
        return doc
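    # For illustration: a TOC point whose url has no #fragment gets an anchor
    # id of the form '<cookie>_<index>' (e.g. 'aBcDeFgHiJ_4', the cookie being
    # the random ten-letter string chosen in __init__), set either on the
    # chapter's first element or on a hidden marker div.
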
    def unpack_static(self):
        """Extract static files from the zip for the html to refer to."""
        static_files = [x['url'] for x in self.manifest.values()
                        if x['url'].startswith('static')]
        if static_files:
            os.mkdir(self.filepath('static'))

        for name in static_files:
            s = self.store.read(name)
            f = open(self.filepath(name), 'w')
            f.write(s)
            f.close()
        self.notify_watcher()

    def load_book(self):
        """Unpack the static files and build the single concatenated HTML tree."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()

    def make_contents(self):
        """Generate HTML containing the table of contents.  This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline."""
        header = '<h1>Table of Contents</h1><table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        subsections = [] # for the subsection heading pages.

        outline_contents = iter(self.outline_contents)
        headings = iter(self.headings)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        self.notify_watcher()
        return doc

    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        headings = iter(self.headings)
        chapter = 1
        section = None
        log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#'+ child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                location = self.tree.cssselect('#'+ t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()

    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to."""
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            defaults = config.SERVER_DEFAULTS[self.server]
            url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
        elif not re.match(r'^http://\S+$', css):
            fn = self.save_tempfile('objavi.css', css)
            url = 'file://' + fn
        else:
            url = css
        #XXX for debugging and perhaps sensible anyway
        #url = url.replace('file:///home/douglas/objavi2', '')

        #find the head -- it's probably first child but let's not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url
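    # For illustration: add_css('h1 {color: red}') saves the text as
    # objavi.css in the workdir and links file://<workdir>/objavi.css;
    # add_css('http://example.com/style.css') (a hypothetical URL) is linked
    # directly; with no css argument the server's default stylesheet for the
    # given mode is used.
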
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the appropriate language, otherwise in english."""
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template

    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }

    def compose_end_matter(self):
        """create the markup for the end_matter inside cover.  If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d

    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive."""
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        for ID in self.manifest:
            details = self.manifest[ID]
            log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            oldfn = fn
            content = self.store.read(fn)
            if mediatype == 'text/html':
                log('CONVERTING')
                #convert to application/xhtml+xml
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                c.prepare_for_epub()
                content = c.as_xhtml()
                if fn[-5:] == '.html':
                    fn = fn[:-5]
                fn += '.xhtml'
                mediatype = 'application/xhtml+xml'
                filemap[oldfn] = fn

            info = {'id': ID.encode('utf-8'),
                    'href': fn.encode('utf-8'),
                    'media-type': mediatype.encode('utf-8')}
            ebook.add_content(info, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                            }
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}
                        meta_info_items.append(item)

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                                   )

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()

    def publish_s3(self):
        """Push the book's epub to archive.org, using S3."""
        #XXX why only epub?
        secrets = {}
        for x in ('S3_SECRET', 'S3_ACCESSKEY'):
            fn = getattr(config, x)
            f = open(fn)
            secrets[x] = f.read().strip()
            f.close()

        log(secrets)
        now = time.strftime('%F')
        s3output = self.filepath('s3-output.txt')
        s3url = 'http://s3.us.archive.org/booki-%s-%s/%s' % (self.project, self.book, self.bookname)
        detailsurl = 'http://archive.org/details/booki-%s-%s' % (self.project, self.book)
        headers = [
            'x-amz-auto-make-bucket:1',
            "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
            'x-archive-meta-mediatype:texts',
            'x-archive-meta-collection:opensource',
            'x-archive-meta-title:%s' % (self.book,),
            'x-archive-meta-date:%s' % (now,),
            'x-archive-meta-creator:FLOSS Manuals Contributors',
            ]

        if self.license in config.LICENSES:
            headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

        argv = ['curl', '--location', '-s', '-o', s3output]
        for h in headers:
            argv.extend(('--header', h))
        argv.extend(('--upload-file', self.publish_file, s3url,))

        log(' '.join(repr(x) for x in argv))
        check_call(argv, stdout=sys.stderr)
        return detailsurl, s3url
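    # For illustration, the assembled command is roughly equivalent to
    #   curl --location -s -o <workdir>/s3-output.txt \
    #        --header 'x-amz-auto-make-bucket:1' \
    #        --header 'authorization: LOW <accesskey>:<secret>' \
    #        --header 'x-archive-meta-mediatype:texts' ... \
    #        --upload-file <publish_file> \
    #        http://s3.us.archive.org/booki-<project>-<book>/<bookname>
    # (angle-bracketed values are placeholders filled in from config and the
    # Book instance).
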
    def spawn_x(self):
        """Start an Xvfb instance, using a new server number.  A
        reference to it is stored in self.xvfb, which is used to kill
        it when the pdf is done.

        Note that Xvfb doesn't interact well with dbus which is
        present on modern desktops.
        """
        #Find an unused server number (in case two cgis are running at once)
        while True:
            servernum = random.randrange(50, 500)
            if not os.path.exists('/tmp/.X%s-lock' % servernum):
                break

        self.xserver_no = ':%s' % servernum

        authfile = self.filepath('Xauthority')
        os.environ['XAUTHORITY'] = authfile

        #mcookie(1) eats into /dev/random, so avoid that
        from hashlib import md5
        m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
        mcookie = m.hexdigest()

        check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

        self.xvfb = Popen(['Xvfb', self.xserver_no,
                           '-screen', '0', '1024x768x24',
                           '-pixdepths', '32',
                           #'-blackpixel', '0',
                           #'-whitepixel', str(2 ** 24 -1),
                           #'+extension', 'Composite',
                           '-dpi', '96',
                           '-kb',
                           '-nolisten', 'tcp',
                           ])

        # We need to wait a bit before the Xvfb is ready.  but the
        # downloads are so slow that that probably doesn't matter
        self.xvfb_ready_time = time.time() + 2

        os.environ['DISPLAY'] = self.xserver_no
        log(self.xserver_no)

    def wait_for_xvfb(self):
        """wait until a previously set time before continuing.  This
        is so Xvfb has time to properly start."""
        if hasattr(self, 'xvfb'):
            d = self.xvfb_ready_time - time.time()
            if d > 0:
                time.sleep(d)
            self.notify_watcher()

    def cleanup_x(self):
        """Try very hard to kill off Xvfb.  In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            log("Xvfb would not die! kill -9! kill -9!")
            os.kill(p.pid, 9)

        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()

    def kill_old_processes(self):
        """Sometimes, despite everything, Xvfb or soffice instances
        hang around well after they are wanted -- for example if the
        cgi process dies particularly badly.  So kill them if they have
        been running for a long time."""
        log("running kill_old_processes")
        p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
                   '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
        data = p.communicate()[0].strip()
        if data:
            lines = data.split('\n')
            pids = []
            for line in lines:
                log('dealing with ps output "%s"' % line)
                try:
                    pid, days, hours, minutes, seconds \
                         = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
                except AttributeError:
                    log("Couldn't parse that line!")
                    continue
                # 50 minutes should be enough xvfb time for anyone
                if days or hours or int(minutes) > 50:
                    pid = int(pid)
                    log("going to kill pid %s" % pid)
                    os.kill(pid, 15)
                    pids.append(pid)

            time.sleep(1.0)
            for pid in pids:
                #try again in case any are lingerers
                try:
                    os.kill(int(pid), 9)
                except OSError, e:
                    log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                    continue
                log('killing %s with -9' % pid)
        self.notify_watcher()

    def cleanup(self):
        self.cleanup_x()
        if not config.KEEP_TEMP_FILES:
            for fn in os.listdir(self.workdir):
                os.remove(os.path.join(self.workdir, fn))
            os.rmdir(self.workdir)
        else:
            log("NOT removing '%s', containing the following files:" % self.workdir)
            log(*os.listdir(self.workdir))

        self.notify_watcher()


def use_cache():
    return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)

def _read_cached_zip(server, book, max_age):
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            f = open(zipname)
            blob = f.read()
            f.close()
            return blob
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError), e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
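# For illustration: with a hypothetical BOOKI_BOOK_DIR of '/var/www/booki-books',
# a cached zip might be named
#   /var/www/booki-books/mybook-en-2009.11.02-17.35.12.zip
# and is reused only if its embedded timestamp is newer than now minus max_age
# minutes; anything older (or with an unparseable name) falls through to a
# fresh fetch in fetch_zip().
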
def fetch_zip(server, book, project, save=False, max_age=-1):
    interface = config.SERVER_DEFAULTS[server]['interface']
    if interface not in ('Booki', 'TWiki'):
        raise NotImplementedError("Can't handle '%s' interface" % interface)
    if interface == 'Booki':
        url = config.BOOKI_ZIP_URL % {'server': server, 'project': project, 'book': book}
    else:
        url = config.TWIKI_GATEWAY_URL % (HTTP_HOST, server, book)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob = _read_cached_zip(server, book, max_age)
        if blob is not None:
            return blob

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        zipname = make_book_name(book, server, '.zip')
        f = open('%s/%s' % (config.BOOKI_BOOK_DIR, zipname), 'w')
        f.write(blob)
        f.close()
    return blob
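# For illustration: max_age is measured in minutes.  max_age=0 (the Book
# default) always fetches a fresh booki-zip; max_age=-1 lets hosts listed in
# config.USE_ZIP_CACHE_ALWAYS_HOSTS fall back to a 12-hour cache; any positive
# value accepts a cached zip up to that many minutes old, e.g.
#   blob = fetch_zip(server, book, project, save=True, max_age=30)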